mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.c

3814 lines
118 KiB
C
Raw Normal View History

2009-08-01 17:27:05 +01:00
/**************************************************************************
*
* Copyright 2009-2010 VMware, Inc.
2009-08-01 17:27:05 +01:00
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
/**
* @file
* Helper
*
* LLVM IR doesn't support all basic arithmetic operations we care about (most
* notably min/max and saturated operations), and it is often necessary to
* resort to machine-specific intrinsics directly. The functions here hide all
* these implementation details from the other modules.
*
* We also do simple expression simplification here. Reasons are:
* - it is very easy given we have all necessary information readily available
* - LLVM optimization passes fail to simplify several vector expressions
* - We often know value constraints which the optimization passes have no way
* of knowing, such as when source arguments are known to be in [0, 1] range.
*
* @author Jose Fonseca <jfonseca@vmware.com>
*/
#include <float.h>
#include "util/u_memory.h"
2009-08-03 22:31:08 +01:00
#include "util/u_debug.h"
2009-10-25 11:48:17 +00:00
#include "util/u_math.h"
#include "util/u_cpu_detect.h"
2009-08-01 17:27:05 +01:00
#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
2009-08-01 17:27:05 +01:00
#include "lp_bld_arit.h"
#include "lp_bld_flow.h"
2009-08-01 17:27:05 +01:00
/* Pull in the SSE intrinsics header only when PIPE_ARCH_SSE is defined. */
#if defined(PIPE_ARCH_SSE)
#include <xmmintrin.h>
#endif
/*
 * Fallback definitions of the MXCSR control masks, used only when none of the
 * headers included above already provides them (hence the #ifndef guards).
 * NOTE(review): the code that consumes these masks is outside this chunk.
 */
/* Bit 6 (0x0040): denormals-are-zero (DAZ) control bit of MXCSR. */
#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif
/* Bit 15 (0x8000): flush-to-zero (FTZ) control bit of MXCSR. */
#ifndef _MM_FLUSH_ZERO_MASK
#define _MM_FLUSH_ZERO_MASK 0x8000
#endif
2009-08-01 17:27:05 +01:00
/*
 * Polynomial degrees. Going by the names, these presumably select the order
 * of the polynomial approximations used for exp and log elsewhere in this
 * file; the users are not visible in this chunk -- TODO confirm.
 */
#define EXP_POLY_DEGREE 5
#define LOG_POLY_DEGREE 4
2009-08-19 20:13:49 +01:00
/**
* Generate min(a, b)
* No checks for special case values of a or b = 1 or 0 are done.
* NaN's are handled according to the behavior specified by the
* nan_behavior argument.
2009-08-19 20:13:49 +01:00
*/
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b,
enum gallivm_nan_behavior nan_behavior)
2009-08-01 17:27:05 +01:00
{
const struct lp_type type = bld->type;
const char *intrinsic = NULL;
unsigned intr_size = 0;
LLVMValueRef cond;
assert(lp_check_value(type, a));
assert(lp_check_value(type, b));
/* TODO: optimize the constant case */
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrarily large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower.
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though.
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection.
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length.
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling The r coordinate is always used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
if (type.floating && util_cpu_caps.has_sse) {
if (type.width == 32) {
if (type.length == 1) {
intrinsic = "llvm.x86.sse.min.ss";
intr_size = 128;
}
else if (type.length <= 4 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse.min.ps";
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are many places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertedly lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent the vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbagge collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbagge collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling The r coordinate is always used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
intr_size = 128;
}
else {
intrinsic = "llvm.x86.avx.min.ps.256";
intr_size = 256;
}
}
if (type.width == 64 && util_cpu_caps.has_sse2) {
if (type.length == 1) {
intrinsic = "llvm.x86.sse2.min.sd";
intr_size = 128;
}
else if (type.length == 2 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse2.min.pd";
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are many places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meanigful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review . Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider prefering the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
intr_size = 128;
}
else {
intrinsic = "llvm.x86.avx.min.pd.256";
intr_size = 256;
}
}
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are many places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
}
else if (type.floating && util_cpu_caps.has_altivec) {
if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
2013-11-11 14:29:25 +00:00
debug_printf("%s: altivec doesn't support nan return nan behavior\n",
__FUNCTION__);
}
if (type.width == 32 && type.length == 4) {
intrinsic = "llvm.ppc.altivec.vminfp";
intr_size = 128;
}
} else if (HAVE_LLVM < 0x0309 &&
util_cpu_caps.has_avx2 && type.length > 4) {
intr_size = 256;
switch (type.width) {
case 8:
intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
break;
case 16:
intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
break;
case 32:
intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
break;
}
} else if (HAVE_LLVM < 0x0309 &&
util_cpu_caps.has_sse2 && type.length >= 2) {
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are many places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always being run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2()). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though.
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection.
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length.
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling The r coordinate is always used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
intr_size = 128;
if ((type.width == 8 || type.width == 16) &&
(type.width * type.length <= 64) &&
(gallivm_debug & GALLIVM_DEBUG_PERF)) {
debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
__FUNCTION__);
}
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always being run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but tests neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection.
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length.
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling The r coordinate is always used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
if (type.width == 8 && !type.sign) {
intrinsic = "llvm.x86.sse2.pminu.b";
}
else if (type.width == 16 && type.sign) {
intrinsic = "llvm.x86.sse2.pmins.w";
}
if (util_cpu_caps.has_sse4_1) {
if (type.width == 8 && type.sign) {
intrinsic = "llvm.x86.sse41.pminsb";
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always being run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit an (unnecessary) assertion in lp_build_pack2()). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrarily large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meanigful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review . Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider prefering the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
}
if (type.width == 16 && !type.sign) {
intrinsic = "llvm.x86.sse41.pminuw";
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are many places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added an error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit an (unnecessary) assertion in lp_build_pack2()). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrarily large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will choose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much either since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection.
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length.
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
}
if (type.width == 32 && !type.sign) {
intrinsic = "llvm.x86.sse41.pminud";
}
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751.
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are many places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly.
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always being run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrarily large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will choose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much either since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling The r coordinate is always used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
if (type.width == 32 && type.sign) {
intrinsic = "llvm.x86.sse41.pminsd";
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to an arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always being run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrarily large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will choose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertedly lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent the vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbagge collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbagge collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meanigful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review . Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider prefering the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
}
}
} else if (util_cpu_caps.has_altivec) {
intr_size = 128;
if (type.width == 8) {
if (!type.sign) {
intrinsic = "llvm.ppc.altivec.vminub";
} else {
intrinsic = "llvm.ppc.altivec.vminsb";
}
} else if (type.width == 16) {
if (!type.sign) {
intrinsic = "llvm.ppc.altivec.vminuh";
} else {
intrinsic = "llvm.ppc.altivec.vminsh";
}
} else if (type.width == 32) {
if (!type.sign) {
intrinsic = "llvm.ppc.altivec.vminuw";
} else {
intrinsic = "llvm.ppc.altivec.vminsw";
}
}
}
if (intrinsic) {
/* We need to handle nan's for floating point numbers. If one of the
* inputs is nan the other should be returned (required by both D3D10+
* and OpenCL).
* The sse intrinsics return the second operand in case of nan by
* default so we need special code to handle those.
*/
if (util_cpu_caps.has_sse && type.floating &&
nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
LLVMValueRef isnan, min;
min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
type,
intr_size, a, b);
if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
isnan = lp_build_isnan(bld, b);
return lp_build_select(bld, isnan, a, min);
} else {
assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
isnan = lp_build_isnan(bld, a);
return lp_build_select(bld, isnan, a, min);
}
} else {
return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
type,
intr_size, a, b);
}
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1."
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit an (unnecessary) assertion in lp_build_pack2()). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles.
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower.
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will choose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much either since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes.
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though.
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertedly lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent the vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbagge collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbagge collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meanigful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review . Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider prefering the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
}
if (type.floating) {
switch (nan_behavior) {
case GALLIVM_NAN_RETURN_NAN: {
LLVMValueRef isnan = lp_build_isnan(bld, b);
cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
return lp_build_select(bld, cond, a, b);
}
break;
case GALLIVM_NAN_RETURN_OTHER: {
LLVMValueRef isnan = lp_build_isnan(bld, a);
cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
return lp_build_select(bld, cond, a, b);
}
break;
2013-11-11 14:29:25 +00:00
case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
return lp_build_select(bld, cond, a, b);
case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
return lp_build_select(bld, cond, b, a);
case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
return lp_build_select(bld, cond, a, b);
break;
default:
assert(0);
cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
return lp_build_select(bld, cond, a, b);
}
} else {
cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
return lp_build_select(bld, cond, a, b);
}
2009-08-01 17:27:05 +01:00
}
/**
 * Generate a * b + c.
 *
 * On LLVM 3.4 and later this emits a call to the llvm.fmuladd intrinsic,
 * overloaded on the operand type; on older LLVM it emits a separate
 * floating-point multiply followed by a floating-point add.
 *
 * @param builder  LLVM IR builder used to emit the instructions.
 * @param a        first multiplicand.
 * @param b        second multiplicand; must have the same LLVM type as a.
 * @param c        addend; must have the same LLVM type as a.
 * @return the value holding the result, of the same LLVM type as a.
 */
LLVMValueRef
lp_build_fmuladd(LLVMBuilderRef builder,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 LLVMValueRef c)
{
   LLVMTypeRef operand_type = LLVMTypeOf(a);
   char intrinsic_name[32];
   LLVMValueRef operands[] = { a, b, c };

   /* All three operands are required to share one type. */
   assert(operand_type == LLVMTypeOf(b));
   assert(operand_type == LLVMTypeOf(c));

   if (HAVE_LLVM < 0x0304) {
      /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when
       * FMA is not supported, and instead it falls back to a C function,
       * so emit the multiply and the add explicitly.
       */
      LLVMValueRef product = LLVMBuildFMul(builder, a, b, "");
      return LLVMBuildFAdd(builder, product, c, "");
   }

   /* Build the type-suffixed intrinsic name and call it with (a, b, c). */
   lp_format_intrinsic(intrinsic_name, sizeof intrinsic_name,
                       "llvm.fmuladd", operand_type);
   return lp_build_intrinsic(builder, intrinsic_name, operand_type,
                             operands, 3, 0);
}
2009-08-19 20:13:49 +01:00
/**
* Generate max(a, b)
* No checks for special case values of a or b = 1 or 0 are done.
* NaNs are handled according to the behavior specified by the
* nan_behavior argument.
2009-08-19 20:13:49 +01:00
*/
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b,
enum gallivm_nan_behavior nan_behavior)
2009-08-01 17:27:05 +01:00
{
const struct lp_type type = bld->type;
const char *intrinsic = NULL;
unsigned intr_size = 0;
LLVMValueRef cond;
assert(lp_check_value(type, a));
assert(lp_check_value(type, b));
/* TODO: optimize the constant case */
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit an (unnecessary) assertion in lp_build_pack2()). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrarily large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use a function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
if (type.floating && util_cpu_caps.has_sse) {
if (type.width == 32) {
if (type.length == 1) {
intrinsic = "llvm.x86.sse.max.ss";
intr_size = 128;
}
else if (type.length <= 4 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse.max.ps";
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1."
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to an arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win).
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit an (unnecessary) assertion in lp_build_pack2()). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles.
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrarily large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower.
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will choose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much either since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes.
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though.
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should have been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close).
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection.
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length.
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
intr_size = 128;
}
else {
intrinsic = "llvm.x86.avx.max.ps.256";
intr_size = 256;
}
}
if (type.width == 64 && util_cpu_caps.has_sse2) {
if (type.length == 1) {
intrinsic = "llvm.x86.sse2.max.sd";
intr_size = 128;
}
else if (type.length == 2 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse2.max.pd";
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrarily large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will choose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much either since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should have been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling The r coordinate is always used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
intr_size = 128;
}
else {
intrinsic = "llvm.x86.avx.max.pd.256";
intr_size = 256;
}
}
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added an error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection.
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length.
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
}
else if (type.floating && util_cpu_caps.has_altivec) {
if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
2013-11-11 14:29:25 +00:00
debug_printf("%s: altivec doesn't support nan return nan behavior\n",
__FUNCTION__);
}
if (type.width == 32 || type.length == 4) {
intrinsic = "llvm.ppc.altivec.vmaxfp";
intr_size = 128;
}
} else if (HAVE_LLVM < 0x0309 &&
util_cpu_caps.has_avx2 && type.length > 4) {
intr_size = 256;
switch (type.width) {
case 8:
intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
break;
case 16:
intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
break;
case 32:
intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
break;
}
} else if (HAVE_LLVM < 0x0309 &&
util_cpu_caps.has_sse2 && type.length >= 2) {
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are many places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrarily large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertedly lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent the vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbagge collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbagge collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling The r coordinate is always used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
intr_size = 128;
if ((type.width == 8 || type.width == 16) &&
(type.width * type.length <= 64) &&
(gallivm_debug & GALLIVM_DEBUG_PERF)) {
debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
__FUNCTION__);
}
if (type.width == 8 && !type.sign) {
intrinsic = "llvm.x86.sse2.pmaxu.b";
intr_size = 128;
}
else if (type.width == 16 && type.sign) {
intrinsic = "llvm.x86.sse2.pmaxs.w";
}
if (util_cpu_caps.has_sse4_1) {
if (type.width == 8 && type.sign) {
intrinsic = "llvm.x86.sse41.pmaxsb";
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrarily large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length.
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
}
if (type.width == 16 && !type.sign) {
intrinsic = "llvm.x86.sse41.pmaxuw";
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751.
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrarily large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower.
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though.
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection.
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length.
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
}
if (type.width == 32 && !type.sign) {
intrinsic = "llvm.x86.sse41.pmaxud";
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751.
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1."
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
}
if (type.width == 32 && type.sign) {
intrinsic = "llvm.x86.sse41.pmaxsd";
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are many places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrarily large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower.
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection.
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length.
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
}
}
} else if (util_cpu_caps.has_altivec) {
intr_size = 128;
if (type.width == 8) {
if (!type.sign) {
intrinsic = "llvm.ppc.altivec.vmaxub";
} else {
intrinsic = "llvm.ppc.altivec.vmaxsb";
}
} else if (type.width == 16) {
if (!type.sign) {
intrinsic = "llvm.ppc.altivec.vmaxuh";
} else {
intrinsic = "llvm.ppc.altivec.vmaxsh";
}
} else if (type.width == 32) {
if (!type.sign) {
intrinsic = "llvm.ppc.altivec.vmaxuw";
} else {
intrinsic = "llvm.ppc.altivec.vmaxsw";
}
}
}
if (intrinsic) {
if (util_cpu_caps.has_sse && type.floating &&
nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
LLVMValueRef isnan, max;
max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
type,
intr_size, a, b);
if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
isnan = lp_build_isnan(bld, b);
return lp_build_select(bld, isnan, a, max);
} else {
assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
isnan = lp_build_isnan(bld, a);
return lp_build_select(bld, isnan, a, max);
}
} else {
return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
type,
intr_size, a, b);
}
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertedly lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent the vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbagge collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbagge collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meanigful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review . Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider prefering the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
}
if (type.floating) {
switch (nan_behavior) {
case GALLIVM_NAN_RETURN_NAN: {
LLVMValueRef isnan = lp_build_isnan(bld, b);
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
return lp_build_select(bld, cond, a, b);
}
break;
case GALLIVM_NAN_RETURN_OTHER: {
LLVMValueRef isnan = lp_build_isnan(bld, a);
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
return lp_build_select(bld, cond, a, b);
}
break;
2013-11-11 14:29:25 +00:00
case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
return lp_build_select(bld, cond, a, b);
case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
return lp_build_select(bld, cond, b, a);
case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
return lp_build_select(bld, cond, a, b);
break;
default:
assert(0);
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
return lp_build_select(bld, cond, a, b);
}
} else {
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
return lp_build_select(bld, cond, a, b);
}
2009-08-01 17:27:05 +01:00
}
2009-08-19 20:13:49 +01:00
/**
* Generate 1 - a, or ~a depending on bld->type.
*/
2009-08-01 17:27:05 +01:00
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
LLVMValueRef a)
2009-08-01 17:27:05 +01:00
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
2010-08-09 17:30:33 +01:00
assert(lp_check_value(type, a));
if(a == bld->one)
return bld->zero;
if(a == bld->zero)
return bld->one;
if(type.norm && !type.floating && !type.fixed && !type.sign) {
if(LLVMIsConstant(a))
return LLVMConstNot(a);
else
return LLVMBuildNot(builder, a, "");
}
if(LLVMIsConstant(a))
if (type.floating)
return LLVMConstFSub(bld->one, a);
else
return LLVMConstSub(bld->one, a);
2009-08-01 17:27:05 +01:00
else
if (type.floating)
return LLVMBuildFSub(builder, bld->one, a, "");
else
return LLVMBuildSub(builder, bld->one, a, "");
2009-08-01 17:27:05 +01:00
}
2009-08-19 20:13:49 +01:00
/**
* Generate a + b
*/
2009-08-01 17:27:05 +01:00
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
2009-08-01 17:27:05 +01:00
LLVMValueRef a,
LLVMValueRef b)
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
LLVMValueRef res;
2009-08-01 17:27:05 +01:00
assert(lp_check_value(type, a));
assert(lp_check_value(type, b));
if(a == bld->zero)
return b;
if(b == bld->zero)
return a;
if(a == bld->undef || b == bld->undef)
return bld->undef;
2009-08-01 17:27:05 +01:00
if(bld->type.norm) {
const char *intrinsic = NULL;
2009-08-01 17:27:05 +01:00
if(a == bld->one || b == bld->one)
return bld->one;
2009-08-01 17:27:05 +01:00
if (!type.floating && !type.fixed) {
if (type.width * type.length == 128) {
if(util_cpu_caps.has_sse2) {
if(type.width == 8)
intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
if(type.width == 16)
intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
} else if (util_cpu_caps.has_altivec) {
if(type.width == 8)
intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
if(type.width == 16)
intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
}
}
if (type.width * type.length == 256) {
if(util_cpu_caps.has_avx2) {
if(type.width == 8)
intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
if(type.width == 16)
intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
}
}
}
if (intrinsic)
return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
}
if(type.norm && !type.floating && !type.fixed) {
if (type.sign) {
uint64_t sign = (uint64_t)1 << (type.width - 1);
LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
/* a_clamp_max is the maximum a for positive b,
a_clamp_min is the minimum a for negative b. */
LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
} else {
a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}
}
if(LLVMIsConstant(a) && LLVMIsConstant(b))
if (type.floating)
res = LLVMConstFAdd(a, b);
else
res = LLVMConstAdd(a, b);
else
if (type.floating)
res = LLVMBuildFAdd(builder, a, b, "");
else
res = LLVMBuildAdd(builder, a, b, "");
2009-08-19 20:13:49 +01:00
/* clamp to ceiling of 1.0 */
if(bld->type.norm && (bld->type.floating || bld->type.fixed))
res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
2009-08-19 20:13:49 +01:00
/* XXX clamp to floor of -1 or 0??? */
return res;
2009-08-01 17:27:05 +01:00
}
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrarily large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertedly lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent the vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbagge collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbagge collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
/** Return the scalar sum of the elements of a.
* Callers should avoid this operation whenever possible.
*/
2010-03-11 23:26:12 +00:00
LLVMValueRef
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection.
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length.
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling The r coordinate is always used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
lp_build_horizontal_add(struct lp_build_context *bld,
LLVMValueRef a)
2010-03-11 23:26:12 +00:00
{
LLVMBuilderRef builder = bld->gallivm->builder;
2010-03-11 23:26:12 +00:00
const struct lp_type type = bld->type;
LLVMValueRef index, res;
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
unsigned i, length;
LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
LLVMValueRef vecres, elem2;
2010-03-11 23:26:12 +00:00
2010-08-09 17:30:33 +01:00
assert(lp_check_value(type, a));
if (type.length == 1) {
return a;
}
2010-03-11 23:26:12 +00:00
assert(!bld->type.norm);
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are many places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes have always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertedly lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent the vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbagge collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbagge collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling The r coordinate is always used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
/*
* For byte vectors we could do much better with psadbw.
* Repeated shuffles/adds are used here instead. Note that with multiple
* vectors this can be done more efficiently, as outlined in the Intel
* optimization manual.
* Note: this could cause data rearrangement if used with smaller element
* sizes.
*/
2010-03-11 23:26:12 +00:00
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are many places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes have always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use a function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meanigful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review . Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider prefering the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
vecres = a;
length = type.length / 2;
while (length > 1) {
LLVMValueRef vec1, vec2;
for (i = 0; i < length; i++) {
shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
}
vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
LLVMConstVector(shuffles1, length), "");
vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
LLVMConstVector(shuffles2, length), "");
if (type.floating) {
vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
}
else {
vecres = LLVMBuildAdd(builder, vec1, vec2, "");
}
length = length >> 1;
2010-03-11 23:26:12 +00:00
}
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always being run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but tests neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertedly lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent the vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbagge collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbagge collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling The r coordinate is always used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
/* always have vector of size 2 here */
assert(length == 1);
index = lp_build_const_int32(bld->gallivm, 0);
res = LLVMBuildExtractElement(builder, vecres, index, "");
index = lp_build_const_int32(bld->gallivm, 1);
elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
if (type.floating)
res = LLVMBuildFAdd(builder, res, elem2, "");
else
res = LLVMBuildAdd(builder, res, elem2, "");
2010-03-11 23:26:12 +00:00
return res;
}
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always being run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit an (unnecessary) assertion in lp_build_pack2()). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertedly lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent the vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbagge collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbagge collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meanigful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review . Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider prefering the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
/**
 * Horizontally sum four 4-wide float vectors in one go.
 *
 * Element i of the returned vector is the sum of the four elements of
 * src[i].  The reduction is a two level shuffle/add tree (the approach
 * described in the Intel Optimization Manual):
 *   1. for each pair of inputs, add the low halves to the high halves,
 *   2. add the even lanes of those partial sums to the odd lanes.
 *
 * @param bld  build context supplying the gallivm state / IR builder
 * @param src  the four input vectors
 * @return     vector holding one horizontal sum per input vector
 */
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   /* Lane selectors; indices 4..7 address the second shuffle operand. */
   static const int lane_sel[4][4] = {
      { 0, 1, 4, 5 },   /* lower half of both operands */
      { 2, 3, 6, 7 },   /* upper half of both operands */
      { 0, 2, 4, 6 },   /* even lanes of both operands */
      { 1, 3, 5, 7 },   /* odd lanes of both operands */
   };
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef mask[4];
   LLVMValueRef lo01, lo23, hi01, hi23;
   LLVMValueRef sum01, sum23;
   LLVMValueRef even, odd;
   unsigned m, e;

   /* Materialize the four constant shuffle masks up front. */
   for (m = 0; m < 4; m++) {
      LLVMValueRef elems[4];

      for (e = 0; e < 4; e++) {
         elems[e] = lp_build_const_int32(gallivm, lane_sel[m][e]);
      }
      mask[m] = LLVMConstVector(elems, 4);
   }

   /* Level 1: split each input pair into low and high halves ... */
   lo01 = LLVMBuildShuffleVector(builder, src[0], src[1], mask[0], "");
   lo23 = LLVMBuildShuffleVector(builder, src[2], src[3], mask[0], "");
   hi01 = LLVMBuildShuffleVector(builder, src[0], src[1], mask[1], "");
   hi23 = LLVMBuildShuffleVector(builder, src[2], src[3], mask[1], "");

   /* ... and add them, leaving two partial sums per input vector. */
   sum01 = LLVMBuildFAdd(builder, lo01, hi01, "");
   sum23 = LLVMBuildFAdd(builder, lo23, hi23, "");

   /* Level 2: pair up the remaining partial sums and add them. */
   even = LLVMBuildShuffleVector(builder, sum01, sum23, mask[2], "");
   odd = LLVMBuildShuffleVector(builder, sum01, sum23, mask[3], "");

   return LLVMBuildFAdd(builder, even, odd, "");
}
/*
 * Partial horizontal add of 2-4 float vectors of length n*4.
 *
 * Only groups of four adjacent elements are summed, i.e. the values are
 * assumed to be organized in groups of 4, which also fixes the output
 * order.
 *
 * The result has the same length as the inputs; elements not holding a
 * sum (if any) are undefined.  The element order does not depend on the
 * number of input vectors.  For 3 vectors x0x1x2x3x4x5x6x7,
 * y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7 the output order is
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 *
 * @param bld       build context; bld->type must be a floating type
 * @param vectors   the input vectors (at least num_vecs entries)
 * @param num_vecs  number of input vectors, 2 to 4
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   const char *hadd_name = NULL;
   LLVMValueRef in[4];
   unsigned v;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   /* Always work on four operands; missing ones alias the first vector. */
   in[0] = vectors[0];
   in[1] = vectors[1];
   for (v = 2; v < 4; v++) {
      in[v] = (num_vecs > v) ? vectors[v] : vectors[0];
   }

   /* Pick a native horizontal add if the cpu and vector shape allow it. */
   if (bld->type.width == 32) {
      if (util_cpu_caps.has_sse3 && bld->type.length == 4) {
         hadd_name = "llvm.x86.sse3.hadd.ps";
      }
      else if (util_cpu_caps.has_avx && bld->type.length == 8) {
         hadd_name = "llvm.x86.avx.hadd.ps.256";
      }
   }

   if (hadd_name) {
      LLVMValueRef pair01, pair23;

      pair01 = lp_build_intrinsic_binary(builder, hadd_name,
                                         lp_build_vec_type(gallivm, bld->type),
                                         in[0], in[1]);
      /* With only two inputs the first partial result is reused. */
      pair23 = pair01;
      if (num_vecs > 2) {
         pair23 = lp_build_intrinsic_binary(builder, hadd_name,
                                            lp_build_vec_type(gallivm, bld->type),
                                            in[2], in[3]);
      }
      return lp_build_intrinsic_binary(builder, hadd_name,
                                       lp_build_vec_type(gallivm, bld->type),
                                       pair01, pair23);
   }

   /* Generic path: a single group of four needs no splitting. */
   if (bld->type.length == 4) {
      return lp_build_horizontal_add4x4f(bld, in);
   }
   else {
      /* Reduce each group of four separately, then glue the pieces. */
      LLVMValueRef chunk_sums[LP_MAX_VECTOR_LENGTH/4];
      struct lp_type chunk_type = bld->type;
      unsigned num_chunks = bld->type.length / 4;
      unsigned chunk;

      chunk_type.length = 4;

      for (chunk = 0; chunk < num_chunks; chunk++) {
         LLVMValueRef chunk_src[4];

         for (v = 0; v < 4; v++) {
            chunk_src[v] = lp_build_extract_range(gallivm, in[v], chunk*4, 4);
         }
         chunk_sums[chunk] = lp_build_horizontal_add4x4f(bld, chunk_src);
      }
      return lp_build_concat(gallivm, chunk_sums, chunk_type, num_chunks);
   }
}
2010-03-11 23:26:12 +00:00
2009-08-19 20:13:49 +01:00
/**
* Generate a - b
*/
2009-08-01 17:27:05 +01:00
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
2009-08-01 17:27:05 +01:00
LLVMValueRef a,
LLVMValueRef b)
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
LLVMValueRef res;
2009-08-01 17:27:05 +01:00
assert(lp_check_value(type, a));
assert(lp_check_value(type, b));
if(b == bld->zero)
return a;
if(a == bld->undef || b == bld->undef)
return bld->undef;
if(a == b)
return bld->zero;
2009-08-01 17:27:05 +01:00
if(bld->type.norm) {
const char *intrinsic = NULL;
2009-08-01 17:27:05 +01:00
if(b == bld->one)
return bld->zero;
2009-08-01 17:27:05 +01:00
if (!type.floating && !type.fixed) {
if (type.width * type.length == 128) {
if (util_cpu_caps.has_sse2) {
if(type.width == 8)
intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
if(type.width == 16)
intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
} else if (util_cpu_caps.has_altivec) {
if(type.width == 8)
intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
if(type.width == 16)
intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
}
}
if (type.width * type.length == 256) {
if (util_cpu_caps.has_avx2) {
if(type.width == 8)
intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
if(type.width == 16)
intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
}
}
}
if (intrinsic)
return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
}
if(type.norm && !type.floating && !type.fixed) {
if (type.sign) {
uint64_t sign = (uint64_t)1 << (type.width - 1);
LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
/* a_clamp_max is the maximum a for negative b,
a_clamp_min is the minimum a for positive b. */
LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
} else {
a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}
}
if(LLVMIsConstant(a) && LLVMIsConstant(b))
if (type.floating)
res = LLVMConstFSub(a, b);
else
res = LLVMConstSub(a, b);
else
if (type.floating)
res = LLVMBuildFSub(builder, a, b, "");
else
res = LLVMBuildSub(builder, a, b, "");
if(bld->type.norm && (bld->type.floating || bld->type.fixed))
res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
return res;
2009-08-01 17:27:05 +01:00
}
/**
 * Normalized multiplication.
 *
 * Multiplies two normalized integer vectors that have already been widened
 * to wide_type, and rescales the product back into the normalized range.
 *
 * Taking 8-bit normalized multiplication as the example, the product has
 * to be divided by 255.  There are several ways to approximate that:
 *
 * - alpha plus one
 *
 *     approximates the division as (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the OpenGL criteria
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     uses the geometric series expansion of the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     keeping only the first two terms so it fits in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that by itself this does not satisfy the OpenGL criteria, as
 *     255*255 = 254, so either the special case b = 255 must be handled
 *     or roundoff must be used.
 *
 * - geometric series plus rounding
 *
 *     same as above, but rounding instead of truncating the result
 *     (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving the exact results.
 *
 * The code below emits the last form, generalized to n bits.
 *
 * @param gallivm    gallivm state providing the IR builder
 * @param wide_type  non-floating type of a and b
 * @param a          first factor
 * @param b          second factor
 * @return           the rescaled product, still in wide_type
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
static LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context wide_bld;
   unsigned bits;
   LLVMValueRef prod;
   LLVMValueRef folded;
   LLVMValueRef rounding;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&wide_bld, gallivm, wide_type);

   /* Shift amount: half the wide width, one bit less for signed types. */
   bits = wide_type.width / 2;
   if (wide_type.sign) {
      bits -= 1;
   }

   /*
    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */
   prod = LLVMBuildMul(builder, a, b, "");
   folded = lp_build_shr_imm(&wide_bld, prod, bits);
   prod = LLVMBuildAdd(builder, prod, folded, "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */
   rounding = lp_build_const_int_vec(gallivm, wide_type, 1LL << (bits - 1));
   if (wide_type.sign) {
      LLVMValueRef neg_rounding = LLVMBuildNeg(builder, rounding, "");
      /* Shifting the top bit all the way down yields the select mask. */
      LLVMValueRef is_negative = lp_build_shr_imm(&wide_bld, prod, wide_type.width - 1);

      rounding = lp_build_select(&wide_bld, is_negative, neg_rounding, rounding);
   }
   prod = LLVMBuildAdd(builder, prod, rounding, "");

   /* Final division */
   return lp_build_shr_imm(&wide_bld, prod, bits);
}
2009-08-19 20:13:49 +01:00
/**
* Generate a * b
*/
2009-08-01 17:27:05 +01:00
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b)
2009-08-01 17:27:05 +01:00
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
LLVMValueRef shift;
LLVMValueRef res;
assert(lp_check_value(type, a));
assert(lp_check_value(type, b));
if(a == bld->zero)
return bld->zero;
if(a == bld->one)
2009-08-01 17:27:05 +01:00
return b;
if(b == bld->zero)
return bld->zero;
if(b == bld->one)
2009-08-01 17:27:05 +01:00
return a;
if(a == bld->undef || b == bld->undef)
return bld->undef;
2009-08-01 17:27:05 +01:00
if (!type.floating && !type.fixed && type.norm) {
struct lp_type wide_type = lp_wider_type(type);
LLVMValueRef al, ah, bl, bh, abl, abh, ab;
lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
/* PMULLW, PSRLW, PADDW */
abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
return ab;
}
if(type.fixed)
shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
else
shift = NULL;
if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
if (type.floating)
res = LLVMConstFMul(a, b);
else
res = LLVMConstMul(a, b);
if(shift) {
if(type.sign)
res = LLVMConstAShr(res, shift);
else
res = LLVMConstLShr(res, shift);
}
}
else {
if (type.floating)
res = LLVMBuildFMul(builder, a, b, "");
else
res = LLVMBuildMul(builder, a, b, "");
if(shift) {
if(type.sign)
res = LLVMBuildAShr(builder, res, shift, "");
else
res = LLVMBuildLShr(builder, res, shift, "");
}
}
return res;
2009-08-01 17:27:05 +01:00
}
/*
* Widening mul, valid for 32x32 bit -> 64bit only.
* Result is low 32bits, high bits returned in res_hi.
*
* Emits code that is meant to be compiled for the host CPU.
*
* The context type must be 32 bits wide and neither floating, fixed nor
* normalized (asserted below).  The x86 intrinsic path is taken only for
* vectors of length 4 or 8, and only if SSE2 is available for unsigned
* types or SSE4.1 is available; every other case is forwarded to
* lp_build_mul_32_lohi().
*
* @param bld     build context describing the operand type
* @param a       first factor
* @param b       second factor
* @param res_hi  receives the high 32 bits of each product
* @return        the low 32 bits of each product
*/
LLVMValueRef
lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b,
LLVMValueRef *res_hi)
{
struct gallivm_state *gallivm = bld->gallivm;
LLVMBuilderRef builder = gallivm->builder;
assert(bld->type.width == 32);
assert(bld->type.floating == 0);
assert(bld->type.fixed == 0);
assert(bld->type.norm == 0);
/*
* XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
* for x86 simd is atrocious (even if the high bits weren't required),
* trying to handle real 64bit inputs (which of course can't happen due
* to using 64bit umul with 32bit numbers zero-extended to 64bit, but
* apparently llvm does not recognize this widening mul). This includes 6
* (instead of 2) pmuludq plus extra adds and shifts
* The same story applies to signed mul, albeit fixing this requires sse41.
* https://llvm.org/bugs/show_bug.cgi?id=30845
* So, whip up our own code, albeit only for length 4 and 8 (which
* should be good enough)...
*/
if ((bld->type.length == 4 || bld->type.length == 8) &&
((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
util_cpu_caps.has_sse4_1)) {
const char *intrinsic = NULL;
LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
struct lp_type type_wide = lp_wider_type(bld->type);
LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
unsigned i;
/*
* Shuffle mask that moves each odd source element (i + 1) into the even
* position i; the odd positions of the result are left undefined.
*/
for (i = 0; i < bld->type.length; i += 2) {
shuf[i] = lp_build_const_int32(gallivm, i+1);
shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
}
shuf_vec = LLVMConstVector(shuf, bld->type.length);
/*
* The unshuffled inputs supply the even elements; the shuffled copies
* supply the odd elements in even positions.
*/
aeven = a;
beven = b;
aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
/* With AVX2 the 8-wide case is handled by a single intrinsic call each. */
if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
if (bld->type.sign) {
intrinsic = "llvm.x86.avx2.pmul.dq";
} else {
intrinsic = "llvm.x86.avx2.pmulu.dq";
}
muleven = lp_build_intrinsic_binary(builder, intrinsic,
wider_type, aeven, beven);
mulodd = lp_build_intrinsic_binary(builder, intrinsic,
wider_type, aodd, bodd);
}
else {
/* for consistent naming look elsewhere... */
if (bld->type.sign) {
intrinsic = "llvm.x86.sse41.pmuldq";
} else {
intrinsic = "llvm.x86.sse2.pmulu.dq";
}
/*
* XXX If we only have AVX but not AVX2 this is a pain.
* lp_build_intrinsic_binary_anylength() can't handle it
* (due to src and dst type not being identical).
*/
if (bld->type.length == 8) {
/*
* Split every 8-wide operand into two 4-element halves, multiply
* the halves separately into 2-element wide results, and concatenate
* those back into full-width product vectors.
*/
LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
LLVMValueRef muleven2[2], mulodd2[2];
struct lp_type type_wide_half = type_wide;
LLVMTypeRef wtype_half;
type_wide_half.length = 2;
wtype_half = lp_build_vec_type(gallivm, type_wide_half);
aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
wtype_half, aevenlo, bevenlo);
mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
wtype_half, aoddlo, boddlo);
muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
wtype_half, aevenhi, bevenhi);
mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
wtype_half, aoddhi, boddhi);
muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
}
else {
muleven = lp_build_intrinsic_binary(builder, intrinsic,
wider_type, aeven, beven);
mulodd = lp_build_intrinsic_binary(builder, intrinsic,
wider_type, aodd, bodd);
}
}
/*
* Reinterpret the wide products as vectors of the original 32-bit type,
* so each product spans an even/odd pair of elements.
*/
muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
/*
* High halves: interleave the odd element of each pair from muleven
* (index i + 1) and from mulodd (index i + 1 + length, i.e. the second
* shuffle operand).
*/
for (i = 0; i < bld->type.length; i += 2) {
shuf[i] = lp_build_const_int32(gallivm, i + 1);
shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
}
shuf_vec = LLVMConstVector(shuf, bld->type.length);
*res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
/*
* Low halves: same interleave, but taking the even element of each pair
* (index i from muleven, index i + length from mulodd).
*/
for (i = 0; i < bld->type.length; i += 2) {
shuf[i] = lp_build_const_int32(gallivm, i);
shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
}
shuf_vec = LLVMConstVector(shuf, bld->type.length);
return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
}
else {
/* No suitable intrinsic path: emit the generic widening multiply. */
return lp_build_mul_32_lohi(bld, a, b, res_hi);
}
}
/*
 * Widening mul, valid for 32x32 bit -> 64bit only.
 * Result is low 32bits, high bits returned in res_hi.
 *
 * Emits generic code: both operands are extended to twice the element
 * width (sign- or zero-extended according to the context type), multiplied,
 * and the product is truncated back twice, once as is for the low half and
 * once after shifting right by 32 for the high half.
 */
LLVMValueRef
lp_build_mul_32_lohi(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   const int is_signed = bld->type.sign;
   struct lp_type ext_type = bld->type;
   LLVMTypeRef vec_narrow, vec_wide;
   LLVMValueRef shift_by_32, wide_a, wide_b, wide_prod, low;

   /* Vector types for the original and the doubled element width. */
   vec_narrow = lp_build_vec_type(gallivm, ext_type);
   ext_type.width *= 2;
   vec_wide = lp_build_vec_type(gallivm, ext_type);
   shift_by_32 = lp_build_const_vec(gallivm, ext_type, 32);

   wide_a = is_signed ? LLVMBuildSExt(builder, a, vec_wide, "")
                      : LLVMBuildZExt(builder, a, vec_wide, "");
   wide_b = is_signed ? LLVMBuildSExt(builder, b, vec_wide, "")
                      : LLVMBuildZExt(builder, b, vec_wide, "");

   wide_prod = LLVMBuildMul(builder, wide_a, wide_b, "");
   low = LLVMBuildTrunc(builder, wide_prod, vec_narrow, "");

   /* Since we truncate anyway, LShr and AShr are equivalent. */
   wide_prod = LLVMBuildLShr(builder, wide_prod, shift_by_32, "");
   *res_hi = LLVMBuildTrunc(builder, wide_prod, vec_narrow, "");

   return low;
}
/*
 * Generate a * b + c.
 *
 * Floating point types go through lp_build_fmuladd(); all other types are
 * composed from lp_build_mul() followed by lp_build_add().
 */
LLVMValueRef
lp_build_mad(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b,
             LLVMValueRef c)
{
   LLVMValueRef product;

   if (bld->type.floating) {
      return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
   }

   product = lp_build_mul(bld, a, b);
   return lp_build_add(bld, product, c);
}
2009-10-25 11:48:17 +00:00
/**
* Small vector x scale multiplication optimization.
*/
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
LLVMValueRef a,
int b)
{
LLVMBuilderRef builder = bld->gallivm->builder;
2009-10-25 11:48:17 +00:00
LLVMValueRef factor;
2010-08-09 17:30:33 +01:00
assert(lp_check_value(bld->type, a));
2009-10-25 11:48:17 +00:00
if(b == 0)
return bld->zero;
2009-10-25 11:48:17 +00:00
if(b == 1)
return a;
if(b == -1)
return lp_build_negate(bld, a);
2009-10-25 11:48:17 +00:00
if(b == 2 && bld->type.floating)
return lp_build_add(bld, a, a);
if(util_is_power_of_two(b)) {
2009-10-25 11:48:17 +00:00
unsigned shift = ffs(b) - 1;
if(bld->type.floating) {
#if 0
/*
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling The r coordinate is always used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
* Power of two multiplication by directly manipulating the exponent.
2009-10-25 11:48:17 +00:00
*
* XXX: This might not always be faster; it will introduce a small error
* for multiplication by zero, and it will produce wrong results
* for Inf and NaN.
*/
unsigned mantissa = lp_mantissa(bld->type);
factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
a = LLVMBuildAdd(builder, a, factor, "");
a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
2009-10-25 11:48:17 +00:00
return a;
#endif
}
else {
factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
return LLVMBuildShl(builder, a, factor, "");
2009-10-25 11:48:17 +00:00
}
}
factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
2009-10-25 11:48:17 +00:00
return lp_build_mul(bld, a, factor);
2009-08-01 17:27:05 +01:00
}
2009-08-19 20:13:49 +01:00
/**
* Generate a / b
*/
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b)
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
assert(lp_check_value(type, a));
assert(lp_check_value(type, b));
if(a == bld->zero)
return bld->zero;
if(a == bld->one && type.floating)
return lp_build_rcp(bld, b);
if(b == bld->zero)
return bld->undef;
if(b == bld->one)
return a;
if(a == bld->undef || b == bld->undef)
return bld->undef;
if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
if (type.floating)
return LLVMConstFDiv(a, b);
else if (type.sign)
return LLVMConstSDiv(a, b);
else
return LLVMConstUDiv(a, b);
}
/* fast rcp is disabled (just uses div), so makes no sense to try that */
if(FALSE &&
((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are many places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertedly lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent the vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbagge collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbagge collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meanigful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review . Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider prefering the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
(util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
type.floating)
return lp_build_mul(bld, a, lp_build_rcp(bld, b));
if (type.floating)
return LLVMBuildFDiv(builder, a, b, "");
else if (type.sign)
return LLVMBuildSDiv(builder, a, b, "");
else
return LLVMBuildUDiv(builder, a, b, "");
}
/**
 * Linear interpolation helper.
 *
 * Emits IR for v0 + x * (v1 - v0) using the type described by bld->type.
 *
 * @param bld    build context; bld->type selects which code path is emitted
 * @param x      interpolation weight
 * @param v0     value the result is offset from (added to x * delta)
 * @param v1     value used to form delta = v1 - v0
 * @param flags  bitmask of LP_BLD_LERP_* flags; must be zero for floating
 *               point types.
 *               - LP_BLD_LERP_WIDE_NORMALIZED: we are interpolating
 *                 normalized values, encoded in normalized integers,
 *                 twice as wide.
 *               - LP_BLD_LERP_PRESCALED_WEIGHTS: skip the rescaling of x
 *                 from [0, 2**n - 1] to [0, 2**n]. Only accepted on the
 *                 unsigned wide-normalized path (asserted elsewhere).
 *                 NOTE(review): presumably the caller has already scaled
 *                 the weights -- confirm against the flag's definition.
 *
 * @return the interpolated value, of type bld->vec_type
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static inline LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1,
                     unsigned flags)
{
   unsigned half_width = bld->type.width/2;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   if (bld->type.floating) {
      /* Floats need no rescaling or masking: a single mad does it all. */
      assert(flags == 0);
      return lp_build_mad(bld, x, delta, v0);
   }

   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
      if (!bld->type.sign) {
         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
            /*
             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
             * most-significant-bit to the lowest-significant-bit, so that
             * later we can just divide by 2**n instead of 2**n - 1.
             */
            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
         }

         /* (x * delta) >> n */
         res = lp_build_mul(bld, x, delta);
         res = lp_build_shr_imm(bld, res, half_width);
      } else {
         /*
          * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
          * instead.
          */
         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
      }
   } else {
      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
      res = lp_build_mul(bld, x, delta);
   }

   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
      /*
       * At this point both res and v0 only use the lower half of the bits,
       * the rest is zero. Instead of add / mask, do add with half wide type.
       */
      struct lp_type narrow_type;
      struct lp_build_context narrow_bld;

      memset(&narrow_type, 0, sizeof narrow_type);
      narrow_type.sign   = bld->type.sign;
      narrow_type.width  = bld->type.width/2;
      narrow_type.length = bld->type.length*2;

      lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);

      /* Reinterpret as twice as many half-width lanes, add, and cast back. */
      res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
      v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
      res = lp_build_add(&narrow_bld, v0, res);
      res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
   } else {
      res = lp_build_add(bld, v0, res);

      if (bld->type.fixed) {
         /*
          * We need to mask out the high order bits when lerping 8bit
          * normalized colors stored on 16bits
          */
         /* XXX: This step is necessary for lerping 8bit colors stored on
          * 16bits, but it will be wrong for true fixed point use cases.
          * Basically we need a more powerful lp_type, capable of further
          * distinguishing the values interpretation from the value storage.
          */
         LLVMValueRef low_bits;

         /*
          * Build the mask in a 64bit unsigned type: shifting a plain int
          * by half_width would overflow once half_width reaches 31/32.
          */
         low_bits = lp_build_const_int_vec(bld->gallivm, bld->type,
                                           (1ULL << half_width) - 1);
         res = LLVMBuildAnd(builder, res, low_bits, "");
      }
   }

   return res;
}
/**
 * Linear interpolation.
 *
 * For normalized types the operands are unpacked into two vectors of
 * elements twice as wide (and half as many), each half is interpolated
 * with LP_BLD_LERP_WIDE_NORMALIZED set, and the two results are packed
 * back into the original type. All other types are handed directly to
 * lp_build_lerp_simple().
 *
 * @param bld    build context describing the type of x, v0 and v1
 * @param x      interpolation weight
 * @param v0     first value
 * @param v1     second value
 * @param flags  LP_BLD_LERP_* flags; callers must not pass
 *               LP_BLD_LERP_WIDE_NORMALIZED themselves (asserted), it is
 *               added internally for the normalized path.
 *
 * @return the interpolated value, in the same type as the inputs
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1,
              unsigned flags)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;

      /* The wide type has length/2 elements, so we need at least two. */
      assert(type.length >= 2);

      /*
       * Create a wider integer type, enough to hold the
       * intermediate result of the multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.sign   = type.sign;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Lerp both halves.
       */

      flags |= LP_BLD_LERP_WIDE_NORMALIZED;

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);

      res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1, flags);
   }

   return res;
}
gallivm: Eliminate 8.8 fixed point intermediates from AoS sampling path. This change was meant as a stepping stone to use PMADDUBSW SSSE3 instruction, but actually this refactoring by itself yields a 10% speedup on texture intensive shaders (e.g, Google Earth's ocean water w/o S3TC on a Ivy Bridge machine), while giving yielding exactly the same results, whereas PMADDUBSW only gave an extra 5%, at the expense of 2bits of precision in the interpolation. I belive that the speedup of this change comes from the reduced register pressure (as 8.8 fixed point intermediates take twice the space of 8bit unorm). Also, not dealing with 8.8 simplifies lp_bld_sample_aos.c code substantially -- it's no longer necessary to have code duplicated for low and high register halfs. Note about lp_build_sample_mipmap(): the path for num_quads > 1 is never executed (as it is faster on AVX to split the 256bit wide texture computation into two 128bit chunks, in order to leverage integer opcodes). This path might be useful in the future, so in order to verify this change did not break that path I had to apply this change: @@ -1662,11 +1662,11 @@ lp_build_sample_soa(struct gallivm_state *gallivm, /* * we only try 8-wide sampling with soa as it appears to * be a loss with aos with AVX (but it should work). * (It should be faster if we'd support avx2) */ - if (num_quads == 1 || !use_aos) { + if (/* num_quads == 1 || ! */ use_aos) { if (num_quads > 1) { if (mip_filter == PIPE_TEX_MIPFILTER_NONE) { LLVMValueRef index0 = lp_build_const_int32(gallivm, 0); /* and then run texfilt mesademo: LP_NATIVE_VECTOR_WIDTH=256 ./texfilt Ran whole piglit without regressions. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2013-04-21 22:23:31 +01:00
/**
 * Bilinear interpolation.
 *
 * Value indices are in v_{yx}: v00 and v01 are interpolated along x, as
 * are v10 and v11, and the two intermediate results are then interpolated
 * along y.
 *
 * @param bld    build context describing the type of all operands
 * @param x      interpolation weight along the x axis
 * @param y      interpolation weight along the y axis
 * @param v00    value at y = 0, x = 0
 * @param v01    value at y = 0, x = 1
 * @param v10    value at y = 1, x = 0
 * @param v11    value at y = 1, x = 1
 * @param flags  LP_BLD_LERP_* flags, forwarded unchanged to every
 *               lp_build_lerp() call
 *
 * @return the bilinearly interpolated value
 */
LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11,
                 unsigned flags)
{
   /* Collapse the x axis first, one lerp per row... */
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);

   /* ...then blend the two rows along y. */
   return lp_build_lerp(bld, y, v0, v1, flags);
}
/**
 * Trilinear interpolation.
 *
 * The eight input values are named v_{zyx}. Two bilinear interpolations
 * (one per z position) are built with lp_build_lerp_2d() and then blended
 * along z.
 *
 * @param flags  passed through unchanged to every interpolation step
 */
LLVMValueRef
lp_build_lerp_3d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef z,
                 LLVMValueRef v000,
                 LLVMValueRef v001,
                 LLVMValueRef v010,
                 LLVMValueRef v011,
                 LLVMValueRef v100,
                 LLVMValueRef v101,
                 LLVMValueRef v110,
                 LLVMValueRef v111,
                 unsigned flags)
{
   /* Bilinear result for the z == 0 set of values. */
   LLVMValueRef slice_z0 =
      lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);

   /* Bilinear result for the z == 1 set of values. */
   LLVMValueRef slice_z1 =
      lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);

   return lp_build_lerp(bld, z, slice_z0, slice_z1, flags);
}
2009-08-19 20:13:49 +01:00
/**
* Generate min(a, b)
* Do checks for special cases but not for nans.
2009-08-19 20:13:49 +01:00
*/
2009-08-01 17:27:05 +01:00
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b)
2009-08-01 17:27:05 +01:00
{
2010-08-09 17:30:33 +01:00
assert(lp_check_value(bld->type, a));
assert(lp_check_value(bld->type, b));
if(a == bld->undef || b == bld->undef)
return bld->undef;
if(a == b)
return a;
if (bld->type.norm) {
if (!bld->type.sign) {
if (a == bld->zero || b == bld->zero) {
return bld->zero;
}
}
if(a == bld->one)
return b;
if(b == bld->one)
return a;
}
return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
2009-08-01 17:27:05 +01:00
}
/**
 * Generate min(a, b)
 *
 * Same shortcut checks as lp_build_min(), but the caller selects how NaNs
 * are treated through the nan_behavior argument, which is handed to
 * lp_build_min_simple() whenever no shortcut applies.
 */
LLVMValueRef
lp_build_min_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   /* An undefined operand makes the whole result undefined. */
   if (a == bld->undef || b == bld->undef) {
      return bld->undef;
   }

   /* min(a, a) == a */
   if (a == b) {
      return a;
   }

   if (bld->type.norm) {
      /*
       * Only for unsigned normalized types: if either operand is the
       * zero constant the result is zero. Signed types skip this check.
       */
      if (!bld->type.sign && (a == bld->zero || b == bld->zero)) {
         return bld->zero;
      }

      /* If one operand is the one constant, return the other operand. */
      if (a == bld->one) {
         return b;
      }
      if (b == bld->one) {
         return a;
      }
   }

   return lp_build_min_simple(bld, a, b, nan_behavior);
}
2009-08-19 20:13:49 +01:00
/**
* Generate max(a, b)
* Do checks for special cases, but NaN behavior is undefined.
2009-08-19 20:13:49 +01:00
*/
2009-08-01 17:27:05 +01:00
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b)
2009-08-01 17:27:05 +01:00
{
2010-08-09 17:30:33 +01:00
assert(lp_check_value(bld->type, a));
assert(lp_check_value(bld->type, b));
if(a == bld->undef || b == bld->undef)
return bld->undef;
if(a == b)
return a;
if(bld->type.norm) {
if(a == bld->one || b == bld->one)
return bld->one;
if (!bld->type.sign) {
if (a == bld->zero) {
return b;
}
if (b == bld->zero) {
return a;
}
}
}
return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
2009-08-01 17:27:05 +01:00
}
/**
 * Generate max(a, b)
 *
 * Same shortcut checks as lp_build_max(), but the caller selects how NaNs
 * are treated through the nan_behavior argument, which is handed to
 * lp_build_max_simple() whenever no shortcut applies.
 */
LLVMValueRef
lp_build_max_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   /* An undefined operand makes the whole result undefined. */
   if (a == bld->undef || b == bld->undef) {
      return bld->undef;
   }

   /* max(a, a) == a */
   if (a == b) {
      return a;
   }

   if (bld->type.norm) {
      /* If either operand is the one constant the result is one. */
      if (a == bld->one || b == bld->one) {
         return bld->one;
      }

      /*
       * Only for unsigned normalized types: if one operand is the zero
       * constant, return the other operand. Signed types skip this check.
       */
      if (!bld->type.sign && a == bld->zero) {
         return b;
      }
      if (!bld->type.sign && b == bld->zero) {
         return a;
      }
   }

   return lp_build_max_simple(bld, a, b, nan_behavior);
}
/**
 * Generate clamp(a, min, max)
 *
 * Built as max(min(a, max), min) using lp_build_min() and lp_build_max(),
 * so their special-case shortcuts apply. NaN behavior (for any of a, min,
 * max) is undefined.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   LLVMValueRef upper_bounded;

   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   /* Apply the upper bound first, then the lower bound. */
   upper_bounded = lp_build_min(bld, a, max);

   return lp_build_max(bld, upper_bounded, min);
}
2013-11-11 14:29:25 +00:00
/**
 * Generate clamp(a, 0, 1)
 *
 * A NaN will get converted to zero: the lower bound is applied first with
 * GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN and zero as the second operand,
 * and only then is the result limited to one.
 */
LLVMValueRef
lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
                                LLVMValueRef a)
{
   LLVMValueRef lower_bounded =
      lp_build_max_ext(bld, a, bld->zero,
                       GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);

   return lp_build_min(bld, lower_bounded, bld->one);
}
2009-08-19 20:13:49 +01:00
/**
* Generate abs(a)
*/
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
LLVMValueRef a)
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2010-08-09 17:30:33 +01:00
assert(lp_check_value(type, a));
if(!type.sign)
return a;
if(type.floating) {
if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
/* Workaround llvm.org/PR27332 */
LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
unsigned long long absMask = ~(1ULL << (type.width - 1));
LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
a = LLVMBuildBitCast(builder, a, int_vec_type, "");
a = LLVMBuildAnd(builder, a, mask, "");
a = LLVMBuildBitCast(builder, a, vec_type, "");
return a;
} else {
char intrinsic[32];
lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}
}
if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
switch(type.width) {
case 8:
return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
case 16:
return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
case 32:
return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
}
}
else if (type.width*type.length == 256 && util_cpu_caps.has_avx2) {
switch(type.width) {
case 8:
return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
case 16:
return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
case 32:
return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
}
}
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertedly lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent the vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbagge collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbagge collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meanigful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review . Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider prefering the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
(gallivm_debug & GALLIVM_DEBUG_PERF) &&
(type.width == 8 || type.width == 16 || type.width == 32)) {
debug_printf("%s: inefficient code, should split vectors manually\n",
__FUNCTION__);
}
return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}
2010-03-04 15:36:26 +00:00
/**
 * Emit the negation of a value.
 *
 * Picks the LLVM instruction according to the build context's type:
 * LLVMBuildFNeg when bld->type.floating is set, LLVMBuildNeg otherwise.
 *
 * @param bld  build context supplying the builder and the element type
 * @param a    value to negate; asserted to match bld->type
 * @return     the newly built negated value
 */
LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef b = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

   /* Non-float types take the plain integer negation. */
   if (!bld->type.floating) {
      return LLVMBuildNeg(b, a, "");
   }

   return LLVMBuildFNeg(b, a, "");
}
/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
LLVMValueRef a)
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
LLVMValueRef cond;
LLVMValueRef res;
2010-08-09 17:30:33 +01:00
assert(lp_check_value(type, a));
/* Handle non-zero case */
if(!type.sign) {
/* if not zero then sign must be positive */
res = bld->one;
}
else if(type.floating) {
LLVMTypeRef vec_type;
LLVMTypeRef int_type;
LLVMValueRef mask;
LLVMValueRef sign;
LLVMValueRef one;
unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
int_type = lp_build_int_vec_type(bld->gallivm, type);
vec_type = lp_build_vec_type(bld->gallivm, type);
mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
/* Take the sign bit and add it to 1 constant */
sign = LLVMBuildBitCast(builder, a, int_type, "");
sign = LLVMBuildAnd(builder, sign, mask, "");
one = LLVMConstBitCast(bld->one, int_type);
res = LLVMBuildOr(builder, sign, one, "");
res = LLVMBuildBitCast(builder, res, vec_type, "");
}
else
{
/* signed int/norm/fixed point */
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrarily large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should have been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertedly lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent the vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbagge collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbagge collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meanigful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review . Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider prefering the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
/* could use psign with sse3 and appropriate vectors here */
LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
res = lp_build_select(bld, cond, bld->one, minus_one);
}
/* Handle zero */
cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
res = lp_build_select(bld, cond, bld->zero, res);
return res;
}
2010-03-04 16:45:34 +00:00
/**
 * Overwrite the sign bit of float value(s) 'a' with the bit supplied
 * in 'sign', keeping the remaining bits of 'a' as they are.
 *
 * 'sign' is shifted and or'ed as an integer, so it is expected to be an
 * integer value with the same element width (and length) as 'a'.
 *
 *   sign == 0  ->  abs(a)
 *   sign == 1  ->  -abs(a)
 *
 * Any other value of 'sign' gives undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   const struct lp_type type = bld->type;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef float_vec_type = lp_build_vec_type(bld->gallivm, type);
   /* number of bits the incoming 0/1 has to travel to reach the sign bit */
   LLVMValueRef sign_pos =
      lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   /* all ones except for the topmost (sign) bit of each element */
   LLVMValueRef magnitude_mask =
      lp_build_const_int_vec(bld->gallivm, type,
                             ~(1ULL << (type.width - 1)));
   LLVMValueRef magnitude_bits, sign_bits, merged;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* look at 'a' as raw integer bits and drop its current sign bit */
   magnitude_bits = LLVMBuildBitCast(builder, a, int_vec_type, "");
   magnitude_bits = LLVMBuildAnd(builder, magnitude_bits, magnitude_mask, "");

   /* move the requested sign into the sign bit position */
   sign_bits = LLVMBuildShl(builder, sign, sign_pos, "");

   /* combine both parts and hand the bits back as float */
   merged = LLVMBuildOr(builder, magnitude_bits, sign_bits, "");

   return LLVMBuildBitCast(builder, merged, float_vec_type, "");
}
/**
 * Signed int -> float conversion.
 *
 * Works on a vector or a scalar 'a'; the destination type is the one
 * described by bld->type, which has to be a floating point type.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   const struct lp_type dst = bld->type;
   LLVMTypeRef dst_llvm_type = lp_build_vec_type(bld->gallivm, dst);

   assert(dst.floating);

   /* the source is always interpreted as signed */
   return LLVMBuildSIToFP(bld->gallivm->builder, a, dst_llvm_type, "");
}
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1."
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrarily large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower.
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection.
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length.
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
static boolean
arch_rounding_available(const struct lp_type type)
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though.
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection.
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length.
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling The r coordinate is always used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
{
if ((util_cpu_caps.has_sse4_1 &&
(type.length == 1 || type.width*type.length == 128)) ||
(util_cpu_caps.has_avx && type.width*type.length == 256))
return TRUE;
else if ((util_cpu_caps.has_altivec &&
(type.width == 32 && type.length == 4)))
return TRUE;
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751.
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are many places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly.
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
return FALSE;
}
/**
 * Identifiers for the four float rounding behaviours.
 *
 * Every enumerator carries an explicit numeric value.
 * NOTE(review): the explicit numbering is presumably relied upon by the code
 * that consumes this enum (e.g. passed on as an encoded rounding-mode
 * operand) -- confirm against the users before renumbering or reordering.
 */
enum lp_build_round_mode
{
   /* Round to the nearest integer. */
   LP_BUILD_ROUND_NEAREST  = 0,

   /* Round down (floor). */
   LP_BUILD_ROUND_FLOOR    = 1,

   /* Round up (ceil). */
   LP_BUILD_ROUND_CEIL     = 2,

   /* Discard the fractional part (truncate). */
   LP_BUILD_ROUND_TRUNCATE = 3
};
static inline LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
LLVMValueRef a)
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
const char *intrinsic;
LLVMValueRef res;
assert(type.floating);
/* using the double precision conversions is a bit more complicated */
assert(type.width == 32);
assert(lp_check_value(type, a));
assert(util_cpu_caps.has_sse2);
/* This is relying on MXCSR rounding mode, which should always be nearest. */
if (type.length == 1) {
LLVMTypeRef vec_type;
LLVMValueRef undef;
LLVMValueRef arg;
LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
vec_type = LLVMVectorType(bld->elem_type, 4);
intrinsic = "llvm.x86.sse.cvtss2si";
undef = LLVMGetUndef(vec_type);
arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
res = lp_build_intrinsic_unary(builder, intrinsic,
ret_type, arg);
}
else {
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrarily large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will choose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much either since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
if (type.width* type.length == 128) {
intrinsic = "llvm.x86.sse2.cvtps2dq";
}
else {
assert(type.width*type.length == 256);
assert(util_cpu_caps.has_avx);
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are many places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meanigful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review . Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider prefering the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
}
res = lp_build_intrinsic_unary(builder, intrinsic,
ret_type, a);
}
return res;
}
/**
 * Round a float vector using the PowerPC AltiVec vrfi* intrinsics.
 *
 * @param bld   build context; bld->type must be a floating point type
 * @param a     value to round
 * @param mode  rounding mode; selects which intrinsic gets emitted:
 *              NEAREST -> vrfin, FLOOR -> vrfim, CEIL -> vrfip,
 *              TRUNCATE -> vrfiz
 * @return the value produced by the intrinsic call, typed as bld->vec_type
 *
 * Must only be called when util_cpu_caps.has_altivec is set (asserted).
 */
static inline LLVMValueRef
lp_build_round_altivec(struct lp_build_context *bld,
                       LLVMValueRef a,
                       enum lp_build_round_mode mode)
{
   /* Stays NULL if mode is not one of the four known rounding modes. */
   const char *name = NULL;

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, a));
   assert(util_cpu_caps.has_altivec);

   if (mode == LP_BUILD_ROUND_NEAREST) {
      name = "llvm.ppc.altivec.vrfin";
   }
   else if (mode == LP_BUILD_ROUND_FLOOR) {
      name = "llvm.ppc.altivec.vrfim";
   }
   else if (mode == LP_BUILD_ROUND_CEIL) {
      name = "llvm.ppc.altivec.vrfip";
   }
   else if (mode == LP_BUILD_ROUND_TRUNCATE) {
      name = "llvm.ppc.altivec.vrfiz";
   }

   return lp_build_intrinsic_unary(bld->gallivm->builder, name,
                                   bld->vec_type, a);
}
/**
 * Round a float vector with whatever native rounding support the CPU has.
 *
 * With SSE4.1 the generic llvm.nearbyint / llvm.floor / llvm.ceil /
 * llvm.trunc intrinsic is emitted for bld->vec_type; otherwise the work is
 * handed to lp_build_round_altivec().
 *
 * @param bld   build context; bld->type must be a floating point type
 * @param a     value to round
 * @param mode  rounding mode to apply
 * @return the rounded value, typed as bld->vec_type
 */
static inline LLVMValueRef
lp_build_round_arch(struct lp_build_context *bld,
                    LLVMValueRef a,
                    enum lp_build_round_mode mode)
{
   const char *root;
   char name[32];

   if (!util_cpu_caps.has_sse4_1) {
      /* the only other native option handled here is AltiVec */
      return lp_build_round_altivec(bld, a, mode);
   }

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, a));

   switch (mode) {
   case LP_BUILD_ROUND_NEAREST:
      root = "llvm.nearbyint";
      break;
   case LP_BUILD_ROUND_FLOOR:
      root = "llvm.floor";
      break;
   case LP_BUILD_ROUND_CEIL:
      root = "llvm.ceil";
      break;
   case LP_BUILD_ROUND_TRUNCATE:
      root = "llvm.trunc";
      break;
   }

   /* lp_format_intrinsic() is given the root name and the vector type and
    * fills in the full intrinsic name used for the call below. */
   lp_format_intrinsic(name, sizeof name, root, bld->vec_type);
   return lp_build_intrinsic_unary(bld->gallivm->builder, name,
                                   bld->vec_type, a);
}
/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is a float (vector).
 * Ex: trunc(-1.5) = -1.0
 *
 * @param bld  build context; bld->type must be a floating point type
 * @param a    value to truncate
 * @return the truncated value, typed as bld->vec_type
 *
 * Three code paths exist:
 *  - native rounding (see lp_build_round_arch()) when available for the type;
 *  - the generic llvm.trunc intrinsic for float widths other than 32 bits,
 *    mirroring what lp_build_floor() and lp_build_ceil() do, because the
 *    emulation below is only valid for 32-bit floats;
 *  - for 32-bit floats, a float -> int -> float round trip, keeping the
 *    original value wherever it is too large (or special) to convert.
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
   }
   else {
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         /*
          * The 2^24 threshold used below only makes sense for 32-bit
          * floats, so let LLVM lower the generic intrinsic instead of
          * silently producing wrong results in non-debug builds.
          */
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.trunc", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * Keep the original value wherever anosign > 2^24.
       * This should work both for large ints (all rounding is a no-op for
       * them because such floats are always exact) as well as special cases
       * like NaNs and Infs (taking advantage of the fact they use the max
       * exponent, so their bit pattern compares greater as an integer).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 *
 * @param bld  build context; bld->type must be a floating point type
 * @param a    value to round
 * @return the rounded value, typed as bld->vec_type
 *
 * Three code paths exist:
 *  - native rounding (see lp_build_round_arch()) when available for the type;
 *  - the generic llvm.nearbyint intrinsic (the same one
 *    lp_build_round_arch() uses for LP_BUILD_ROUND_NEAREST) for float widths
 *    other than 32 bits, mirroring what lp_build_floor() and lp_build_ceil()
 *    do, because the emulation below is only valid for 32-bit floats;
 *  - for 32-bit floats, lp_build_iround() followed by an int -> float
 *    conversion, keeping the original value wherever it is too large (or
 *    special) to convert.
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         /*
          * The 2^24 threshold used below only makes sense for 32-bit
          * floats, so let LLVM lower the generic intrinsic instead of
          * silently producing wrong results in non-debug builds.
          */
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.nearbyint", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * Keep the original value wherever anosign > 2^24.
       * This should work both for large ints (all rounding is a no-op for
       * them because such floats are always exact) as well as special cases
       * like NaNs and Infs (taking advantage of the fact they use the max
       * exponent, so their bit pattern compares greater as an integer).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
/**
 * Compute floor(a) for a float (vector); the result is a float (vector).
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 *
 * @param bld  build context; bld->type must be a floating point type
 * @param a    value to round down
 * @return the rounded value, typed as bld->vec_type
 *
 * Uses native rounding when available for the type, the generic llvm.floor
 * intrinsic for float widths other than 32 bits, and otherwise emulates the
 * operation with a float -> int -> float round trip plus a fix-up.
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMTypeRef vec_type = bld->vec_type;
   struct lp_type int_type;
   struct lp_build_context int_bld;
   LLVMValueRef limit, as_int, result, abs_bits, is_large;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
   }

   limit = lp_build_const_vec(bld->gallivm, type, 1<<24);

   if (type.width != 32) {
      char name[32];
      lp_format_intrinsic(name, sizeof name, "llvm.floor", vec_type);
      return lp_build_intrinsic_unary(builder, name, vec_type, a);
   }

   /* everything below is the 32-bit float emulation */
   assert(type.width == 32);

   int_type = type;
   int_type.floating = 0;
   lp_build_context_init(&int_bld, bld->gallivm, int_type);

   /* truncate through an integer conversion and back */
   as_int = LLVMBuildFPToSI(builder, a, int_vec_type, "");
   result = LLVMBuildSIToFP(builder, as_int, vec_type, "floor.trunc");

   if (type.sign) {
      /*
       * Truncation rounds toward zero, so wherever the truncated value
       * ended up above 'a' it has to be lowered by one. Presumably skipped
       * for unsigned types because truncation already equals floor there.
       */
      LLVMValueRef overshoot, one_bits, step_bits, step;

      overshoot = lp_build_cmp(bld, PIPE_FUNC_GREATER, result, a);
      /* step = (trunc > a) ? 1.0 : 0.0, built by AND-ing the bits of 1.0 */
      one_bits = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
      step_bits = lp_build_and(&int_bld, overshoot, one_bits);
      step = LLVMBuildBitCast(builder, step_bits, vec_type, "");
      result = lp_build_sub(bld, result, step);
   }

   /*
    * Pass 'a' through untouched wherever |a| > 2^24, comparing the raw bit
    * patterns as integers. That covers large values (already integral, so
    * rounding is a no-op) and NaNs/Infs (they use the maximum exponent).
    * 2^24 is arbitrary; anything between 2^24 and 2^31 should work.
    */
   abs_bits = lp_build_abs(bld, a);
   abs_bits = LLVMBuildBitCast(builder, abs_bits, int_vec_type, "");
   limit = LLVMBuildBitCast(builder, limit, int_vec_type, "");
   is_large = lp_build_cmp(&int_bld, PIPE_FUNC_GREATER, abs_bits, limit);

   return lp_build_select(bld, is_large, a, result);
}
/**
 * Compute ceil(a) for a float (vector); the result is a float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 *
 * @param bld  build context; bld->type must be a floating point type
 * @param a    value to round up
 * @return the rounded value, typed as bld->vec_type
 *
 * Uses native rounding when available for the type, the generic llvm.ceil
 * intrinsic for float widths other than 32 bits, and otherwise emulates the
 * operation with a float -> int -> float round trip plus a fix-up.
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMTypeRef vec_type = bld->vec_type;
   struct lp_type int_type;
   struct lp_build_context int_bld;
   LLVMValueRef limit, truncated, undershoot, one_bits, step_bits, step;
   LLVMValueRef result, abs_bits, is_large;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }

   limit = lp_build_const_vec(bld->gallivm, type, 1<<24);

   if (type.width != 32) {
      char name[32];
      lp_format_intrinsic(name, sizeof name, "llvm.ceil", vec_type);
      return lp_build_intrinsic_unary(builder, name, vec_type, a);
   }

   /* everything below is the 32-bit float emulation */
   assert(type.width == 32);

   int_type = type;
   int_type.floating = 0;
   lp_build_context_init(&int_bld, bld->gallivm, int_type);

   /* truncate through an integer conversion and back */
   truncated = LLVMBuildFPToSI(builder, a, int_vec_type, "");
   truncated = LLVMBuildSIToFP(builder, truncated, vec_type, "ceil.trunc");

   /*
    * Truncation rounds toward zero, so wherever the truncated value ended
    * up below 'a' it has to be raised by one.
    */
   undershoot = lp_build_cmp(bld, PIPE_FUNC_LESS, truncated, a);
   /* step = (trunc < a) ? 1.0 : 0.0, built by AND-ing the bits of 1.0 */
   one_bits = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
   step_bits = lp_build_and(&int_bld, undershoot, one_bits);
   step = LLVMBuildBitCast(builder, step_bits, vec_type, "");
   result = lp_build_add(bld, truncated, step);

   /*
    * Pass 'a' through untouched wherever |a| > 2^24, comparing the raw bit
    * patterns as integers. That covers large values (already integral, so
    * rounding is a no-op) and NaNs/Infs (they use the maximum exponent).
    * 2^24 is arbitrary; anything between 2^24 and 2^31 should work.
    */
   abs_bits = lp_build_abs(bld, a);
   abs_bits = LLVMBuildBitCast(builder, abs_bits, int_vec_type, "");
   limit = LLVMBuildBitCast(builder, limit, int_vec_type, "");
   is_large = lp_build_cmp(&int_bld, PIPE_FUNC_GREATER, abs_bits, limit);

   return lp_build_select(bld, is_large, a, result);
}
2010-03-04 17:50:26 +00:00
/**
 * Return the fractional part of 'a', computed as a - floor(a).
 * Typically used in texture coord arithmetic.
 *
 * @param bld  build context; bld->type must be a floating point type
 * @param a    input value
 * @return a - floor(a), as built by lp_build_sub()
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMValueRef floored;

   assert(bld->type.floating);

   floored = lp_build_floor(bld, a);
   return lp_build_sub(bld, a, floored);
}
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection.
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length.
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
/**
* Prevent returning 1.0 for very small negative values of 'a' by clamping
* against 0.99999(9). (Will also return that value for NaNs.)
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751.
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are many places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly.
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection.
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling The r coordinate is always used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
*/
static inline LLVMValueRef
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
{
LLVMValueRef max;
/* this is the largest number smaller than 1.0 representable as float */
max = lp_build_const_vec(bld->gallivm, bld->type,
1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
return lp_build_min_ext(bld, fract, max,
GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751.
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length.
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling The r coordinate is always used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
}
/**
* Same as lp_build_fract, but guarantees that the result is always smaller
* than one. Will also return the smaller-than-one value for infs, NaNs.
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meanigful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review . Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider prefering the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
*/
LLVMValueRef
lp_build_fract_safe(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   /* Take the plain fractional part first ... */
   LLVMValueRef fract = lp_build_fract(bld, a);

   /* ... then pass it through clamp_fract() before handing it back. */
   return clamp_fract(bld, fract);
}
/**
 * Convert a float (vector) value to a signed integer (vector) by dropping
 * the fractional part, i.e. rounding toward zero.
 * The context type must be a floating point type.
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   const struct lp_type src_type = bld->type;
   /* Integer type with the same layout as the context's float type. */
   LLVMTypeRef dst_vec_type = lp_build_int_vec_type(bld->gallivm, src_type);

   assert(src_type.floating);
   assert(lp_check_value(src_type, a));

   return LLVMBuildFPToSI(bld->gallivm->builder, a, dst_vec_type, "");
}
/**
* Return float (vector) rounded to nearest integer (vector). The returned
* value is an integer (vector).
* Ex: iround(0.9) = 1
* Ex: iround(-1.5) = -2
*/
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
LLVMValueRef a)
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
LLVMTypeRef int_vec_type = bld->int_vec_type;
LLVMValueRef res;
assert(type.floating);
assert(lp_check_value(type, a));
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
if ((util_cpu_caps.has_sse2 &&
((type.width == 32) && (type.length == 1 || type.length == 4))) ||
(util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
return lp_build_iround_nearest_sse2(bld, a);
}
if (arch_rounding_available(type)) {
res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
}
else {
LLVMValueRef half;
half = lp_build_const_vec(bld->gallivm, type, 0.5);
if (type.sign) {
LLVMTypeRef vec_type = bld->vec_type;
LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
(unsigned long long)1 << (type.width - 1));
LLVMValueRef sign;
/* get sign bit */
sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
sign = LLVMBuildAnd(builder, sign, mask, "");
/* sign * 0.5 */
half = LLVMBuildBitCast(builder, half, int_vec_type, "");
half = LLVMBuildOr(builder, sign, half, "");
half = LLVMBuildBitCast(builder, half, vec_type, "");
}
res = LLVMBuildFAdd(builder, a, half, "");
}
res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
return res;
}
/**
* Return floor of float (vector), result is an int (vector)
* Ex: ifloor(1.1) = 1
* Ex: ifloor(-1.1) = -2
*/
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
LLVMValueRef a)
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
LLVMTypeRef int_vec_type = bld->int_vec_type;
LLVMValueRef res;
assert(type.floating);
assert(lp_check_value(type, a));
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling The r coordinate is always used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
res = a;
if (type.sign) {
if (arch_rounding_available(type)) {
res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are many places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always being run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertedly lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent the vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbagge collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbagge collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meanigful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review . Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider prefering the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
}
else {
struct lp_type inttype;
struct lp_build_context intbld;
LLVMValueRef trunc, itrunc, mask;
assert(type.floating);
assert(lp_check_value(type, a));
inttype = type;
inttype.floating = 0;
lp_build_context_init(&intbld, bld->gallivm, inttype);
/* round by truncation */
itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
/*
* fix values if rounding is wrong (for non-special cases)
* - this is the case if trunc > a
* The results of doing this with NaNs, very large values etc.
* are undefined but this seems to be the case anyway.
*/
mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
/* cheapie minus one with mask since the mask is minus one / zero */
return lp_build_add(&intbld, itrunc, mask);
}
}
/* round to nearest (toward zero) */
res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
return res;
}
/**
 * Compute the ceiling of a float (vector), producing an int (vector).
 *
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 *
 * @param bld  build context; its type must be a floating point type
 * @param a    value to round up; must pass lp_check_value() for bld->type
 * @return     the converted value, built with bld->int_vec_type
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef ceiled;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (!arch_rounding_available(type)) {
      /*
       * Generic fallback: truncate to integer, then bump the result by one
       * wherever truncation landed below the input.
       */
      struct lp_build_context int_bld;
      struct lp_type int_type = type;
      LLVMValueRef truncated_int, truncated_flt, below_input;

      /* same preconditions as checked on entry */
      assert(type.floating);
      assert(lp_check_value(type, a));

      /* integer flavour of the context type, used for the final fixup */
      int_type.floating = 0;
      lp_build_context_init(&int_bld, bld->gallivm, int_type);

      /* float -> int conversion, and back to float for the comparison */
      truncated_int = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      truncated_flt = LLVMBuildSIToFP(builder, truncated_int, bld->vec_type,
                                      "iceil.trunc");

      /*
       * The truncated value is only wrong (for non-special inputs) when it
       * is smaller than the input.  NaNs, very large values etc. give
       * undefined results here, which seems to be the case anyway.
       */
      below_input = lp_build_cmp(bld, PIPE_FUNC_LESS, truncated_flt, a);

      /*
       * The comparison mask is minus one / zero, so subtracting it is a
       * cheap way of adding one exactly where the fixup is needed.
       */
      return lp_build_sub(&int_bld, truncated_int, below_input);
   }

   /*
    * Arch rounding is available: round with LP_BUILD_ROUND_CEIL first
    * (presumably a native rounding instruction -- helper not visible here),
    * so the float -> int conversion below only has to change the type.
    */
   ceiled = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);

   return LLVMBuildFPToSI(builder, ceiled, int_vec_type, "iceil.res");
}
/**
* Combined ifloor() & fract().
*
* Preferred to calling the functions separately, as it will ensure that the
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertedly lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent the vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbagge collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbagge collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meanigful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review . Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider prefering the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
* strategy (floor() vs ifloor()) that results in less redundant work is used.
*/
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   /*
    * Emits IR that splits the floating-point value 'a' into two results:
    *   *out_ipart - the floored value as an integer (bld->int_vec_type)
    *   *out_fpart - 'a' minus the floored value, in floating point
    *
    * Which of the two code paths is taken depends solely on what
    * arch_rounding_available() reports for the context's type.
    */
   const struct lp_type type = bld->type;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef floor_as_float;

   /* Only meaningful for floating-point contexts and matching values. */
   assert(type.floating);
   assert(lp_check_value(type, a));

   if (!arch_rounding_available(type)) {
      /*
       * ifloor() is easier: obtain the integer part first, then convert
       * it back to floating point so it can be subtracted from 'a'.
       */
      LLVMValueRef floor_as_int = lp_build_ifloor(bld, a);

      *out_ipart = floor_as_int;
      floor_as_float = LLVMBuildSIToFP(builder, floor_as_int,
                                       bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, floor_as_float, "fpart");
      return;
   }

   /*
    * floor() is easier: keep the floored value in floating point for the
    * subtraction, and convert it to an integer only for the ipart output.
    */
   floor_as_float = lp_build_floor(bld, a);
   *out_fpart = LLVMBuildFSub(builder, a, floor_as_float, "fpart");
   *out_ipart = LLVMBuildFPToSI(builder, floor_as_float,
                                bld->int_vec_type, "ipart");
}
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept expressions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertedly lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent the vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbagge collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbagge collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meanigful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review . Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider prefering the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
/**
 * Variant of lp_build_ifloor_fract() whose fractional output is additionally
 * passed through clamp_fract(), so that the fractional part is guaranteed to
 * always be smaller than one.
 *
 * @param bld        build context
 * @param a          value to split
 * @param out_ipart  receives the integer part, as written by
 *                   lp_build_ifloor_fract()
 * @param out_fpart  receives the clamped fractional part
 */
void
lp_build_ifloor_fract_safe(struct lp_build_context *bld,
                           LLVMValueRef a,
                           LLVMValueRef *out_ipart,
                           LLVMValueRef *out_fpart)
{
   LLVMValueRef fract;

   lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);

   /* Replace the raw fraction with its clamped counterpart. */
   fract = *out_fpart;
   *out_fpart = clamp_fract(bld, fract);
}
/**
 * Generate sqrt(a) by calling the llvm.sqrt intrinsic.
 *
 * The intrinsic name is built by lp_format_intrinsic() from the "llvm.sqrt"
 * prefix and the type that lp_build_vec_type() returns for the context type
 * (presumably an overload suffix matching that type).
 *
 * @param bld  build context; its type must be a floating point type
 * @param a    value to take the square root of
 * @return     the result of the intrinsic call
 */
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   /* Only floating point types are handled here. */
   assert(type.floating);

   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}
/**
 * Improve the precision of a reciprocal estimate with a single
 * Newton-Raphson iteration:
 *
 *   x_{i+1} = x_i * (2 - a * x_i)
 *
 * where x_i is the current estimate (rcp_a) of 1/a.
 *
 * XXX: Unfortunately the result is not IEEE-754 conformant for 0 or +/-Inf;
 * NaN is produced instead. Certain applications rely on the IEEE-754
 * results, such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the
 * Earth's halo. It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 *
 * @param bld    build context
 * @param a      value whose reciprocal is being computed
 * @param rcp_a  current estimate of 1/a
 * @return       the refined estimate
 */
static inline LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
   LLVMValueRef a_times_rcp;
   LLVMValueRef correction;

   /* a * x_i */
   a_times_rcp = LLVMBuildFMul(builder, a, rcp_a, "");

   /* 2 - a * x_i */
   correction = LLVMBuildFSub(builder, two, a_times_rcp, "");

   /* x_i * (2 - a * x_i) */
   return LLVMBuildFMul(builder, rcp_a, correction, "");
}
LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
LLVMValueRef a)
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
2010-08-09 17:30:33 +01:00
assert(lp_check_value(type, a));
if(a == bld->zero)
return bld->undef;
if(a == bld->one)
return bld->one;
if(a == bld->undef)
return bld->undef;
assert(type.floating);
if(LLVMIsConstant(a))
return LLVMConstFDiv(bld->one, a);
/*
* We don't use RCPPS because:
* - it only has 10bits of precision
* - it doesn't even get the reciprocal of 1.0 exactly
* - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
* - for recent processors the benefit over DIVPS is marginal, a case
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertedly lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent the vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbagge collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbagge collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meanigful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review . Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider prefering the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
* dependent
*
* We could still use it on certain processors if benchmarks show that the
* RCPPS plus necessary workarounds are still preferable to DIVPS; or for
* particular uses that require fewer workarounds.
*/
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling The r coordinate is always used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
(util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
const unsigned num_iterations = 0;
LLVMValueRef res;
unsigned i;
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are many places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eliminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitrarily large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will choose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much either since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of uninitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should have been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertently lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbage collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbage collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unnecessary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather than scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meaningful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider preferring the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
const char *intrinsic = NULL;
if (type.length == 4) {
intrinsic = "llvm.x86.sse.rcp.ps";
}
else {
intrinsic = "llvm.x86.avx.rcp.ps.256";
}
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifiers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1."
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertedly lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent the vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbagge collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbagge collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meanigful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review . Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider prefering the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
for (i = 0; i < num_iterations; ++i) {
res = lp_build_rcp_refine(bld, a, res);
}
return res;
}
return LLVMBuildFDiv(builder, bld->one, a, "");
}
/**
 * Refine an approximate reciprocal square root with a single
 * Newton-Raphson iteration:
 *
 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * @param bld      build context supplying the builder and the value type
 * @param a        value whose reciprocal square root is being computed
 * @param rsqrt_a  current estimate x_i of 1/sqrt(a)
 * @return the refined estimate x_{i+1}
 *
 * See also Intel 64 and IA-32 Architectures Optimization Manual.
 */
static inline LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const LLVMValueRef c_half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   const LLVMValueRef c_three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);

   /* x_i * x_i */
   LLVMValueRef x_sq = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   /* a * x_i * x_i */
   LLVMValueRef a_x_sq = LLVMBuildFMul(builder, a, x_sq, "");
   /* 3.0 - a * x_i * x_i */
   LLVMValueRef correction = LLVMBuildFSub(builder, c_three, a_x_sq, "");
   /* x_i * (3.0 - a * x_i * x_i) */
   LLVMValueRef scaled = LLVMBuildFMul(builder, rsqrt_a, correction, "");

   /* The final halving yields x_{i+1}. */
   return LLVMBuildFMul(builder, c_half, scaled, "");
}
2009-08-19 20:13:49 +01:00
/**
 * Generate 1/sqrt(a).
 * The result is undefined for a < 0 and is infinity for a = +0.
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
LLVMValueRef a)
{
const struct lp_type type = bld->type;
2010-08-09 17:30:33 +01:00
assert(lp_check_value(type, a));
assert(type.floating);
/*
* This should be faster but all denormals will end up as infinity.
*/
if (0 && lp_build_fast_rsqrt_available(type)) {
const unsigned num_iterations = 1;
LLVMValueRef res;
unsigned i;
gallivm,draw,llvmpipe: Support wider native registers. Squashed commit of the following: commit 7acb7b4f60dc505af3dd00dcff744f80315d5b0e Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:46:31 2012 +0100 draw: Don't use dynamically sized arrays. Not supported by MSVC. commit 5810c28c83647612cb372d1e763fd9d7780df3cb Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:44:16 2012 +0100 gallivm,llvmpipe: Don't use expressions with PIPE_ALIGN_VAR(). MSVC doesn't accept exceptions in _declspec(align(...)). Use a define instead. commit 8aafd1457ba572a02b289b3f3411e99a3c056072 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 9 17:41:56 2012 +0100 gallium/util: Make u_cpu_detect.h header C++ safe. commit 5795248350771f899cfbfc1a3a58f1835eb2671d Author: José Fonseca <jfonseca@vmware.com> Date: Mon Jul 2 12:08:01 2012 +0100 gallium/util: Add ULL suffix to large constants. As suggested by Andy Furniss: it looks like some old gcc versions require it. commit 4c66c22727eff92226544c7d43c4eb94de359e10 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Truly disable INF/NAN tests on MSVC. Thanks to Brian for spotting this. commit 8bce274c7fad578d7eb656d9a1413f5c0844c94e Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 13:39:07 2012 +0100 gallium/util: Disable INF/NAN tests on MSVC. Somehow they are not recognized as constants. commit 6868649cff8d7fd2e2579c28d0b74ef6dd4f9716 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 15:05:24 2012 +0200 gallivm: Cleanup the 2 x 8 float -> 16 ub special path in lp_build_conv. No behaviour change intended, like 7b98455fb40c2df84cfd3cdb1eb7650f67c8a751. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 5147a0949c4407e8bce9e41d9859314b4a9ccf77 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jul 5 14:28:19 2012 +0200 gallivm: (trivial) fix issues with multiple-of-4 texture fetch Some formats can't handle non-multiple of 4 fetches I believe, but everything must support length 1 and multiples of 4. So avoid going to scalar fetch (which is very costly) just because length isn't 4. Also extend the hack to not use shift with variable count for yuv formats to arbitrary length (larger than 1) - doesn't matter how many elements we have we always want to avoid it unless we have variable shift count instruction (which we should get with avx2). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 87ebcb1bd71fa4c739451ec8ca89a7f29b168c08 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jul 4 02:09:55 2012 +0200 gallivm: (trivial) fix typo for wrap repeat mode in linear filtering aos code This would lead to bogus coordinates at the edges. (undetected by piglit because this path is only taken for block-based formats). Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 3a42717101b1619874c8932a580c0b9e6896b557 Author: José Fonseca <jfonseca@vmware.com> Date: Tue Jul 3 19:42:49 2012 +0100 gallivm: Fix TGSI integer translation with AVX. commit d71ff104085c196b16426081098fb0bde128ce4f Author: José Fonseca <jfonseca@vmware.com> Date: Fri Jun 29 15:17:41 2012 +0100 llvmpipe: Fix LLVM JIT linear path. It was not working properly because it was looking at the JIT function before it was actually compiled. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit a94df0386213e1f5f9a6ed470c535f9688ec0a1b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Jun 28 18:07:10 2012 +0100 gallivm: Refactor lp_build_broadcast(_scalar) to share code. Doesn't really change the generated assembly, but produces more compact IR, and of course, makes code more consistent. 
Reviewed-by: Brian Paul <brianp@vmware.com> commit 66712ba2731fc029fa246d4fc477d61ab785edb5 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 17:30:13 2012 +0100 gallivm: Make LLVMContextRef a singleton. There are any places inside LLVM that depend on it. Too many to attempt to fix. Reviewed-by: Brian Paul <brianp@vmware.com> commit ff5fb7897495ac263f0b069370fab701b70dccef Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 28 18:15:27 2012 +0200 gallivm: don't use 8-wide texture fetch in aos path This appears to be a slight loss usually. There are probably several reasons for that: - fetching itself is scalar - filtering is pure int code hence needs splitting anyway, same for the final texel offset calculations - texture wrap related code, which can be done 8-wide, is slightly more complex with floats (with clamp_to_edge) and float operations generally more costly hence probably not much faster overall - the code needed to split when encountering different mip levels for the quads, adding complexity So, just split always for aos path (but leave it 8-wide for soa, since we do 8-wide filtering there when possible). This should certainly be revisited if we'd have avx2 support. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ce8032b43dcd8e8d816cbab6428f54b0798f945d Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:41:19 2012 +0200 gallivm: (trivial) don't extract fparts variable if not needed Did not have any consequences but unnecessary. commit aaa9aaed8f80dc282492f62aa583a7ee23a4c6d5 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 18:09:06 2012 +0200 gallivm: fix precision issue in aos linear int wrap code now not just passes at a quick glance but also with piglit... If we do the wrapping with floats, we also need to set the weights accordingly. 
We can potentially end up with different (integer) coordinates than what the integer calculations would have chosen, which means the integer weights calculated previously in this case are completely wrong. Well at least that's what I think happens, at least recalculating the weights helps. (Some day really should refactor all the wrapping, so we do whatever is fastest independent of 16bit int aos or 32bit float soa filtering.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit fd6f18588ced7ac8e081892f3bab2916623ad7a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 27 11:15:53 2012 +0100 gallium/util: Fix parsing of options with underscore. For example GALLIVM_DEBUG=no_brilinear which was being parsed as two options, "no" and "brilinear". commit 09a8f809088178a03e49e409fa18f1ac89561837 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 26 15:00:14 2012 +0100 gallivm: Added a generic lp_build_print_value which prints a LLVMValueRef. Updated lp_build_printf to share common code. Removed specific lp_build_print_vecX. Reviewed-by: José Fonseca <jfonseca@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit e59bdcc2c075931bfba2a84967a5ecd1dedd6eb0 Author: José Fonseca <jfonseca@vmware.com> Date: Wed May 16 15:00:23 2012 +0100 draw,llvmpipe: Avoid named struct types on LLVM 3.0 and later. Starting with LLVM 3.0, named structures are meant not for debugging, but for recursive data types, previously also known as opaque types. The recursive nature of these types leads to several memory management difficulties. Given that we don't actually need recursive types, avoid them altogether. This is an attempt to address fdo bugs 41791 and 44466. The issue is somewhat random so there's no easy way to check how effective this is. 
Cherry-picked from 9af1ba565dfd5cef9ee938bb7c04767d14878fbf commit df6070f618a203c7a876d984c847cde4cbc26bdb Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 27 14:42:53 2012 +0200 gallivm: (trivial) fix typo in faster aos linear int wrap code no longer crashes, now REALLY tested. commit d8f98dce452c867214e6782e86dc08562643c862 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 18:20:58 2012 +0200 llvmpipe: (trivial) remove bogus optimization for float aos repeat wrap This optimization for nearest filtering on the linear path generated likely bogus results, and the int path didn't have any optimizations there since the only shader using force_nearest apparently uses clamp_to_edge not repeat wrap anyway. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit c4e271a0631087c795e756a5bb6b046043b5099d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 23:01:52 2012 +0200 gallivm: faster repeat wrap for linear aos path too Even if we already have scaled integer coords, it's way faster to use the original float coord (plus some conversions) rather than use URem. The choice of what to do for texture wrapping is not really tied to int aos or float soa filtering though for some modes there can be some gains (because of easier weight calculations). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 1174a75b1806e92aee4264ffe0ffe7e70abbbfa3 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 14:39:22 2012 +0200 gallivm: improve npot tex wrap repeat in linear soa path URem gets translated into series of scalar divisions so just about anything else is faster. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit f849ffaa499ed96fa0efd3594fce255c7f22891b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 26 00:40:35 2012 +0100 gallivm: (trivial) fix near-invisible shift-space typo I blame the keyboard. 
commit 5298a0b19fe672aebeb70964c0797d5921b51cf0 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:24:28 2012 +0200 gallivm: add new intrinsic helper to deal with arbitrary vector length This helper will split vectors which are too large for the hw, or expand them if they are too small, so a caller of a function using intrinsics which uses such sizes need not split (or expand) the vectors manually and the function will still use the intrinsic instead of dropping back to generic llvm code. It can also accept scalars for use with pseudo-vector intrinsics (only useful for float arguments, all x86 scalar simd float intrinsics use 4vf32). Only used for lp_build_min/max() for now (also added the scalar float case for these while there). (Other basic binary functions could use it easily, whereas functions with a different interface would need different helpers.) Expanding vectors isn't widely used, because we always try to use build contexts with native hw vector sizes. But it might (or not) be nicer if this wouldn't need to be done, the generated code should in theory stay the same (it does get hit by lp_build_rho though already since we didn't have a intrinsic for the scalar lp_build_max case before). v2: incorporated Brian's feedback, and also made the scalar min/max case work instead of crash (all scalar simd float intrinsics take 4vf32 as argument, probably the reason why it wasn't used before). Moved to lp_bld_intr based on José's request, and passing intrinsic size instead of length. Ideally we'd derive the source type info from the passed in llvm value refs and process some llvmtype return type so we could handle intrinsics where the source and destination type isn't the same (like float/int conversions, packing instructions) but that's a bit too complicated for now. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 01aa760b99ec0b2dc8ce57a43650e83f8c1becdf Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 16:19:18 2012 +0200 gallivm: (trivial) increase max code size for shader disassembly 64kB was just short of what I needed (which caused a crash) hence increase to 96kB (should probably be smarter about that). commit 74aa739138d981311ce13076388382b5e89c6562 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:53:29 2012 +0100 gallivm: simplify aos float tex wrap repeat nearest just handle pot and npot the same. The previous pot handling ended up with exactly the same instructions plus 2 more (leave it in the soa path though since it is probably still cheaper there). While here also fix a issue which would cause a crash after an assert. commit 0e1e755645e9e49cfaa2025191e3245ccd723564 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:29:24 2012 +0100 gallivm: (trivial) skip floor rounding in ifloor when not signed This was only done for the non-sse41 case before, but even with sse41 this is obviously unnecessary (some callers already call itrunc in this case anyway but some might not). commit 7f01a62f27dcb1d52597b24825931e88bae76f33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 25 11:23:12 2012 +0100 gallivm: (trivial) fix bogus comments commit 5c85be25fd82e28490274c468ce7f3e6e8c1d416 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Jun 20 11:51:57 2012 +0100 translate: Free elt8_func/elt16_func too. These were leaking. 
Reviewed-by: Brian Paul <brianp@vmware.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 0ad498f36fb6f7458c7cffa73b6598adceee0a6c Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 15:55:34 2012 +0200 gallivm: fix bug for tex wrap repeat with linear sampling in aos float path The comparison needs to be against length not length_minus_one, otherwise the max texel is never chosen (for the second coordinate). Fixes piglit texwrap-1D-npot-proj (and 2D/3D versions). Reviewed-by: José Fonseca <jfonseca@vmware.com> commit d1ad65937c5b76407dc2499b7b774ab59341209e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Jun 19 16:13:43 2012 +0200 gallivm: simplify soa tex wrap repeat with npot textures and no mip filtering Similar to what is already done in aos sampling for the float path (but not the int path since we don't get normalized float coordinates there). URem is expensive and the calculation is done trivially with normalized floats instead (at least with sse41-capable cpus). (Some day should probably do the same for the mip filter path but it's much more complicated there hence the gain is smaller.) Reviewed-by: José Fonseca <jfonseca@vmware.com> commit e1e23f57ba9b910295c306d148f15643acc3fc83 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:38:56 2012 +0200 llvmpipe: (trivial) remove duplicated function declaration Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 07ca57eb09e04c48a157733255427ef5de620861 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 20:37:34 2012 +0200 llvmpipe: destroy setup variants on context destruction lp_delete_setup_variants() used to be called in garbage collection, but this no longer exists hence the setup shaders never got freed. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit ed0003c633859a45f9963a479f4c15ae0ef1dca3 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 18 16:25:29 2012 +0100 gallivm: handle different ilod parts for multiple quad sampling This fixes filtering when the integer part of the lod is not the same for all quads. I'm not fully convinced of that solution yet as it just splits the vector if the levels to be sampled from are different. But otherwise we'd need to do things like some minify steps, and getting mip level base address separately anyway hence it wouldn't really look like much of a win (and making the code even more complex). This should now give identical results to single quad sampling. commit 8580ac4cfc43a64df55e84ac71ce1a774d33c0d2 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:14:47 2012 +0200 gallivm: de-duplicate sample code common to soa and aos sampling There doesn't seem to be any reason why this code dealing with cube face selection, lod and mip level calculation is separate in aos and soa sampling, and I am sick of having it to change in both places. commit fb541e5f957408ce305b272100196f1e12e5b1e8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Jun 14 18:15:41 2012 +0200 gallivm: do mip filtering with per quad lod_fpart This gives better results for mip filtering, though the generated code might not be optimal. For now it also creates some artifacts if the lod_ipart isn't the same for all quads, since instead of using the same mip weight for all quads as previously (which just caused non-smooth gradients) this now will use the right weights but with the wrong mip level in this case (can easily be seen with things like texfilt, mipmap_tunnel). 
v2: use logic helper suggested by José, and fix issue with negative lod_fpart values commit f1cc84eef7d826a20fab6cd8ccef9a275ff78967 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Jun 13 18:35:25 2012 +0200 gallivm: (trivial) fix bogus assert in lp_build_unpack_broadcast_aos_scalars commit 7c17dbae8ae290df9ce0f50781a09e8ed640c044 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:11:14 2012 +0100 util: Reimplement half <-> float conversions. Removed u_half.py used to generate the table for previous method. Previous implementation of float to half conversion was faulty for denormalised and NaNs and would require extra logic to fix, thus making the speedup of using tables irrelevant. commit 7762f59274070e1dd4b546f5cb431c2eb71ae5c3 Author: James Benton <jbenton@vmware.com> Date: Tue Jun 12 12:12:16 2012 +0100 tests: Updated tests to properly handle NaN for half floats. commit fa94c135aea5911fd93d5dfb6e6f157fb40dce5e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:33:10 2012 +0200 gallivm: do mip level calculations per quad This is the final piece which shouldn't change the rendering output yet. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 23cbeaddfe03c09ca18c45d28955515317ffcf4c Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Jun 9 00:54:21 2012 +0200 gallivm: do per-quad cube face selection Doesn't quite fix the piglit cubemap test (not sure why actually) but doing per-quad face selection is doing the right thing and definitely an improvement. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit abfb372b3702ac97ac8b5aa80ad1b94a2cc39d33 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Jun 11 18:22:59 2012 +0200 gallivm: do all lod calculations per quad Still no functional change but lod is now converted to scalar after lod calculations. 
Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 519368632747ae03feb5bca9c655eccbc5b751b4 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:46:10 2012 +0100 gallivm: Added support for half-float to float conversion in lp_build_conv. Updated various utility functions to support this change. commit 135b4d683a4c95f7577ba27b9bffa4a6fbd2c2e7 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 16:02:46 2012 +0100 gallivm: Added function for half-float to float conversion. Updated lp_build_format_aos_array to support half-float source. commit 37d648827406a20c5007abeb177698723ed86673 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:55:18 2012 +0100 util: Updated u_format_tests to rigidly test half-float boundary values. commit 2ad18165d96e578aa9046df7c93cb1c3284d8c6b Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:54:16 2012 +0100 llvmpipe: Updated lp_test_format to properly handle Inf/NaN results. commit 78740acf25aeba8a7d146493dd5c966e22c27b73 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 14:53:30 2012 +0100 util: Added functions for checking NaN / Inf for double and half-floats. commit 35e9f640ae01241f9e0d67fe893bbbf564c05809 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:05:13 2012 +0200 gallivm: Fix calculating rho for 3d textures for the single-quad case Discovered by accident, this looks like a very old typo bug. commit fc1220c636326536fd0541913154e62afa7cd1d8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 24 21:04:59 2012 +0200 gallivm: do calcs per-quad in lp_build_rho Still convert to scalar at the end of the function. commit 50a887ffc550bf310a6988fa2cea5c24d38c1a41 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 21 23:21:50 2012 +0200 gallivm: (trivial) return scalar in lp_build_extract_range for length 1 vectors Our type system on top of llvm's one doesn't generally support vectors of length 1, instead using scalars. 
So we should return a scalar from this function instead of having to bitcast the vector with length 1 later elsewhere. commit 80c71c621f9391f0f9230460198d861643324876 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 17:49:15 2012 +0100 draw: Fixed bad merge error commit c47401cfad0c9167de20ff560654f533579f452c Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:29:30 2012 +0100 draw: Updated store_clip to store whole vectors instead of individual elements. commit 2d9c1ad74b0b0b41861fffcecde39f09cc27f1cf Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:28:32 2012 +0100 gallivm: Added lp_build_fetch_rgba_aos_array. A version of lp_build_fetch_rgba_aos which is targeted at simple array formats. Reads the whole vector from memory in one, instead of reading each element individually. Tested with mesa tests and demos. commit ff7805dc2b6ef6d8b11ec4e54aab1633aef29ac8 Author: James Benton <jbenton@vmware.com> Date: Tue May 22 15:27:40 2012 +0100 gallivm: Added lp_build_pad_vector. This function pads a vector with undef to a desired length. commit 701f50acef24a2791dabf4730e5b5687d6eb875d Author: James Benton <jbenton@vmware.com> Date: Fri May 18 17:27:19 2012 +0100 util: Added util_format_is_array. This function checks whether a format description is in a simple array format. commit 5e0a7fa543dcd009de26f34a7926674190fa6246 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:13:47 2012 +0100 draw: Removed draw_llvm_translate_from and draw/draw_llvm_translate.c. This is "replaced" by adding an optimised path in lp_build_fetch_rgba_aos in an upcoming patch. commit 8c886d6a7dd3fb464ecf031de6f747cb33e5361d Author: James Benton <jbenton@vmware.com> Date: Wed May 16 15:02:31 2012 +0100 draw: Modified store_aos to write the vector as one, not individual elements. 
commit 37337f3d657e21dfd662c7b26d61cb0f8cfa6f17 Author: James Benton <jbenton@vmware.com> Date: Wed May 16 14:16:23 2012 +0100 draw: Changed aos_to_soa to use lp_build_transpose_aos. commit bd2b69ce5d5c94b067944d1dcd5df9f8e84548f1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 19:14:27 2012 +0100 draw: Changed soa_to_aos to use lp_build_transpose_aos. commit 0b98a950d29a116e82ce31dfe7b82cdadb632f2b Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:45 2012 +0100 gallivm: Added lp_build_transpose_aos which converts between aos and soa. commit 69ea84531ad46fd145eb619ed1cedbe97dde7cb5 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 18:57:01 2012 +0100 gallivm: Added lp_build_interleave2_half aimed at AVX unpack instructions. commit 7a4cb1349dd35c18144ad5934525cfb9436792f9 Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 22 11:54:14 2012 +0100 gallivm: Fix build on Windows. MC-JIT not yet supported there. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit afd105fc16bb75d874e418046b80d9cc578818a1 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:17:26 2012 +0100 llvmpipe: Added a error counter to lp_test_conv. Useful for keeping track of progress when fixing errors! Signed-off-by: José Fonseca <jfonseca@vmware.com> commit b644907d08c10a805657841330fc23db3963d59c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:16:46 2012 +0100 llvmpipe: Changed known failures in lp_test_conv. To comply with the recent fixes to lp_bld_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit d7061507bd94f6468581e218e61261b79c760d4f Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:14:38 2012 +0100 llvmpipe: Added fixed point types tests to lp_test_conv. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 146b3ea39b4726dbe125ac666bd8902ea3d6ca8c Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:26:35 2012 +0100 llvmpipe: Changed lp_test_conv src/dst alignment to be correct. 
Now based on the define rather than a fixed number. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit f3b57441f834833a4b142a951eb98df0aa874536 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:06:44 2012 +0100 gallivm: Fixed erroneous optimisation in lp_build_min/max. Previously assumed normalised was 0 to 1, but it can be -1 to 1 if type is signed. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a0613382e5a215cd146bb277646a6b394d376ae4 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:04:49 2012 +0100 gallivm: Compensate for lp_const_offset in lp_build_conv. Fixing a /*FIXME*/ to remove errors in integer conversion in lp_build_conv. Tested using lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit a3d2bf15ea345bc8a0664f8f441276fd566566f3 Author: James Benton <jbenton@vmware.com> Date: Fri May 18 16:01:25 2012 +0100 gallivm: Fixed overflow in lp_build_clamped_float_to_unsigned_norm. Tested with lp_test_conv and lp_test_format, reduced errors. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit e7b1e76fe237613731fa6003b5e1601a2e506207 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 21 20:07:51 2012 +0100 gallivm: Fix build with LLVM 2.6 Trivial, and useful. commit d3c6bbe5c7f5ba1976710831281ab1b6a631082d Author: José Fonseca <jfonseca@vmware.com> Date: Tue May 15 17:15:59 2012 +0100 gallivm: Enable MCJIT/AVX with vanilla LLVM 3.1. Add the necessary C++ glue, so that we don't need any modifications to the soon to be released LLVM 3.1. Reviewed-by: Roland Scheidegger <sroland@vmware.com> commit 724a019a14d40fdbed21759a204a2bec8a315636 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 22:04:06 2012 +0100 gallivm: Use HAVE_LLVM 0x0301 consistently. 
commit af6991e2a3868e40ad599b46278551b794839748 Author: José Fonseca <jfonseca@vmware.com> Date: Mon May 14 21:49:06 2012 +0100 gallivm: Add MCRegisterInfo.h to silence benign warnings about missing implementation. Trivial. commit 6f8a1d75458daae2503a86c6b030ecc4bb494e23 Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Apr 2 22:14:15 2012 -0700 gallivm: Pass in a MCInstrInfo to createMCInstPrinter on llvm-3.1. llvm-3.1svn r153860 makes MCInstrInfo available to the MCInstPrinter. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 62555b6ed8760545794f83064e27cddcb3ce5284 Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 27 21:51:17 2012 -0700 gallivm: Fix method overriding in raw_debug_ostream. Use matching type qualifers to avoid method hiding. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: José Fonseca <jfonseca@vmware.com> commit 6a9bd784f4ac68ad0a731dcd39e5a3c39989f2be Author: Vinson Lee <vlee@freedesktop.org> Date: Tue Mar 13 22:40:52 2012 -0700 gallivm: Fix createOProfileJITEventListener namespace with llvm-3.1. llvm-3.1svn r152620 refactored the OProfile profiling code. createOProfileJITEventListener was moved from the llvm namespace to the llvm::JITEventListener namespace. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit b674955d39adae272a779be85aa1bd665de24e3e Author: Vinson Lee <vlee@freedesktop.org> Date: Mon Mar 5 22:00:40 2012 -0800 gallivm: Pass in a MCRegisterInfo to MCInstPrinter on llvm-3.1. llvm-3.1svn r152043 changes createMCInstPrinter to take an additional MCRegisterInfo argument. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 11ab69971a8a31c62f6de74905dbf8c02884599f Author: Vinson Lee <vlee@freedesktop.org> Date: Wed Feb 29 21:20:53 2012 -0800 Revert "gallivm: Change getExtent and readByte to non-const with llvm-3.1." 
This reverts commit d5a6c172547d8964f4d4bb79637651decaf9deee. llvm-3.1svn r151687 makes MemoryObject accessor members const again. Signed-off-by: Vinson Lee <vlee@freedesktop.org> Reviewed-by: Brian Paul <brianp@vmware.com> commit 339960c82d2a9f5c928ee9035ed31dadb7f45537 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon May 14 16:19:56 2012 +0200 gallivm: (trivial) fix assertion failure for mipmapped 1d textures In lp_build_rho, we may end up with a 1-element vector (for mipmapped 1d textures), but in this case we require the type to be a non-vector type, so need a cast. commit 9d73edb727bd6d196030dc3026b7bf0c574b3e19 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:12:07 2012 +0200 gallivm: prepare for per-quad lod calculations for large vectors to be able to handle multiple quads at once in texture sampling and still do lod calculations per quad, it is necessary to get the per-quad derivatives into the lp_build_rho function. Until now these derivative values were just scalars, which isn't going to work. So we now use vectors, and since the interface needs to change we also do some different (slightly more efficient) packing of the values. For 8-wide vectors the packed derivative values for 3 coords would look like this, this scales to a arbitrary (multiple of 4) vector size: ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ The second vector will be unused for 1d and 2d textures. To facilitate future changes the derivative values are put into a struct, since quite some functions just pass these values through. The generated code seems to be very slightly better for 2d textures (with 4-wide vectors) than before with sse2 (if you have a cpu with physical 128bit simd units - otherwise it's probably not a win). 
v2: suggestions from José, rename variables, add comments, use swizzle helper commit 0aa21de0d31466dac77b05c97005722e902517b8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu May 10 18:10:31 2012 +0200 gallivm: add undefined swizzle handling to lp_build_swizzle_aos This is useful for vectors with "holes", it lets llvm choose the most efficient shuffle instructions if some elements aren't needed without having to worry what elements to manually pick otherwise. commit 00faf3f370e7ce92f5ef51002b0ea42ef856e181 Author: José Fonseca <jfonseca@vmware.com> Date: Fri May 4 17:25:16 2012 +0100 gallivm: Get the LLVM IR optimization passes before JIT compilation. MC-JIT engine compiles the module immediately on creation, so the optimization passes were being run too late. So now we create a target data layout from a string, that matches the ABI parameters reported by the compiler. The backend optimization passes were always been run, so the performance improvement is modest (3% on multiarb mesa demo). Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Brian Paul <brianp@vmware.com> commit 40a43f4e2ce3074b5ce9027179d657ebba68800a Author: Roland Scheidegger <sroland@vmware.com> Date: Wed May 2 16:03:54 2012 +0200 gallivm: (trivial) fix wrong define used in lp_build_pack2 should fix stack-smashing crashes. commit e6371d0f4dffad4eb3b7a9d906c23f1c88a2ab9e Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 30 21:25:29 2012 +0200 gallivm: add perf warnings when not using intrinsics with 256bit vectors Helper functions using integer sse2 intrinsics could split the vectors with AVX instead of using generic fallback (which should be faster). We don't actually expect to hit these paths (hence don't fix them up to actually do the vector splitting) so just emit warnings (for those functions where it's obvious doing split/intrinsic is faster than using generic path). 
Only emit warnings for 256bit vectors since we _really_ don't expect to hit arbitrary large vectors which would affect a lot more functions. The warnings do not actually depend on avx since the same logic applies to plain sse2 too (but of course again there's _really_ no reason we should hit these functions with 256bit vectors without avx). commit 8a9ea701ea7295181e846c6383bf66a5f5e47637 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:37:07 2012 +0200 gallivm: split vectors manually for avx in lp_build_pack2 (v2) There's 2 reasons for this: First, there's a llvm bug (fixed in 3.1) which generates tons of byte inserts/extracts otherwise, and second, more importantly, we want to use pack intrinsics instead of shuffles. We do this in lp_build_pack2 and not the calling code (aos sample path) because potentially other callers might find that useful too, even if for larger sequences of code using non-native vector sizes it might be better to manually split vectors. This should boost texture performance in the aos path considerably. v2: fix issues with intrinsics types with old llvm commit 27ac5b48fa1f2ea3efeb5248e2ce32264aba466e Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:26:22 2012 +0200 llvmpipe: refactor lp_build_pack2 (v2) prettify, and it's unnecessary to assert when there's no intrinsic due to unsupported bit width - the shuffle path will work regardless. In contrast lp_build_packs2, should only rely on lp_build_pack2 doing the clamping for element sizes for which there is a sse2 intrinsic. v2: fix bug spotted by Jose regarding the intrinsic type for packusdw on old llvm versions. 
commit ddf279031f0111de4b18eaf783bdc0a1e47813c8 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue May 1 20:13:59 2012 +0200 gallivm: add src width check in lp_build_packs2() not doing so would skip clamping even if no sse2 pack instruction is available, which is incorrect (in theory only, such widths would also always hit a (unnecessary) assertion in lp_build_pack2(). commit e7f0ad7fe079975eae7712a6e0c54be4fae0114b Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Apr 27 15:57:00 2012 +0200 gallivm: (trivial) fix crash-causing typo for npot textures with avx commit 28a9d7f6f655b6ec508c8a3aa6ffefc1e79793a0 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Apr 25 19:38:45 2012 +0200 gallivm: (trivial) remove code mistakenly added twice. commit d5926537316f8ff67ad0a52e7242f7c5478d919b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Apr 24 21:16:15 2012 +0200 gallivm: add a new avx aos sample path (v2) Try to avoid mixing float and int address calculations. This does texture wrap modes with floats, and then the offset calculations still with ints (because of lack of precision with floats, though we could do some effort to make it work with not too large (16MB) textures). This also handles wrap repeat mode with npot-sized textures differently than either the old soa or aos int path (likely way faster but untested). Otherwise the actual address wrap code is largely similar to the soa path (not quite the same as this one also has some int code), it should get used by avx soa sampling later as well but doesn't handle more complex address modes yet (this will also have the benefit that we can use aos sampling path for all texture address modes). Generated code for that looks reasonable, but still does not split vectors explicitly for fetch/filter which means still get hit by llvm (fixed upstream) which generates hundreds of pinsrb/pextrb instead of two shuffles. 
It is not obvious though if it's much of a win over just doing address calcs 4-wide but with ints, even if it is definitely much less instructions on avx. piglit's texwrap seems to look exactly the same but doesn't test neither the non-normalized nor the npot cases. v2: fix comments, prettify based on Brian's and Jose's feedback. commit bffecd22dea66fb416ecff8cffd10dd4bdb73fce Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Apr 19 01:58:29 2012 +0200 gallivm: refactor aos lp_build_sample_image_nearest/linear split them up to separate address calculations and fetching/filtering. Need this for being able to do 8-wide float address calcs and 4-wide fetch/filter later (for avx). Plus the functions were very big scary monsters anyway (in particular lp_build_sample_image_linear). commit a80b325c57529adddcfa367f96f03557725c4773 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Apr 16 17:17:18 2012 +0200 gallivm: fix lp_build_resize when truncating width but expanding vector size Missed this case which I thought was impossible - the assertion for it was right after the division by zero... (AoS) texture sampling may ask us to do this, for things like 8 4x32int vectors to 1 32x8int vector conversion (eventually, we probably don't want this to happen). commit f9c8337caa3eb185830d18bce8b95676a065b1d7 Author: Roland Scheidegger <sroland@vmware.com> Date: Sat Apr 14 18:00:59 2012 +0200 gallivm: fix cube maps with larger vectors This makes the branchless cube face selection code work with larger vectors. Because the complexity is quite high (cannot really be improved it seems, per-face selection would reduce complexity a lot but this leads to errors unless the derivatives are calculated all from the same face which almost doubles the work to be done) it is still slower than the branching version, hence only enable this with large vectors. 
It doesn't actually do per-quad face selection yet (only makes sense with matching lod selection, in fact it will select the same face for all pixels based on the average of the first four pixels for now) but only different shuffles are required to make it work (the branching version actually should work with larger vectors too now thanks to the improved horizontal add but of course it cannot be extended to really select the face per-quad unless doing branching per quad). commit 7780c58869fc9a00af4f23209902db7e058e8a66 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 21:11:12 2012 +0100 llvmpipe: (trivial) fix compiler warning and also clarify comment regarding availability of popcnt instruction. commit a266dccf477df6d29a611154e988e8895892277e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 30 14:21:07 2012 +0100 gallivm: remove unneeded members in lp_build_sample_context Minor cleanup, the texture width, height, depth aren't accessed in their scalar form anywhere. Makes it more obvious those values should probably be fetched already vectorized (but this requires more invasive changes)... commit b678c57fb474e14f05e25658c829fc04d2792fff Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 29 15:53:55 2012 +0100 gallivm: add a helper for concatenating vectors Similar to the extract_range helper intended to get around slow code generated by llvm for 128bit insertelements. Concatenating two 128bit vectors this way will result in a single vinsertf128 operation rather than two 64bit stores plus one 128bit load, though it might be mildly useful for other purposes as well. commit 415ff228bcd0cf5e44a4c15350a661f0f5520029 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 19:41:15 2012 +0100 gallivm: add a custom 2x8f->1x16ub avx conversion path Similar to the existing 4x4f->1x16ub sse2 path, shaves off a couple instructions (min/max mostly) because it relies on pack intrinsics clamping. 
commit 78c08fc89f8fbcc6dba09779981b1e873e2a0299 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 18:44:07 2012 +0100 gallivm: add avx arithmetic intrinsics Add all avx intrinsics for arithmetic functions (with the exception of the horizontal add function which needs another look). Seems to pass basic tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> commit a586caa2800aa5ce54c173f7c0d4fc48153dbc4e Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 15:31:35 2012 +0100 gallivm: add avx logic intrinsics Add the blend intrinsics for 8-wide float and 4-wide double vectors. Since we lack 256bit int instructions these are used for int vectors as well, though obviously not for byte or word element values. The comparison intrinsics aren't extended for avx since these are only used for pre-2.7 llvm versions. commit 70275e4c13c89315fc2560a4c488c0e6935d5caf Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 28 00:40:53 2012 +0100 gallivm: new helper function for extract shuffles. Based on José's idea as we can need that in a couple places. Note that such shuffles should not be used lightly, since data layout of <4 x i8> is different to <16 x i8> for instance, hence might cause data rearrangement. commit 4d586dbae1b0c55915dda1759d2faea631c0a1c2 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 18:27:25 2012 +0100 gallivm: (trivial) don't overallocate shuffle variable using wrong define meant huge array... commit 06b0ec1f6d665d98c135f9573ddf4ba04b2121ad Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 27 17:54:20 2012 +0100 gallivm: don't do per-element extract/insert for vector element resize Instead of doing per-element extract/insert if the src vectors and dst vector differ in total size (which generates atrocious code) first change the src vectors size by using shuffles to destination vector size. 
We can still do better than that on AVX for packing to color buffer (by exploiting pack intrinsics characteristics hence eleminating the need for some clamps) but this already generates much better code. v2: incorporate feedback from José, Keith and use shuffle instead of bitcasts/extracts. Due to llvm deficiencies the latter cause all data to get moved to GPRs and back in pieces (even though the data in the regs actually stays the same...). commit c9970d70e05f95d3f52fe7d2cd794176a52693aa Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 19:33:19 2012 +0000 gallivm: fix bug in simple position interpolation Accidental use of position attribute instead of just pixel coordinates. Caused failures in piglit glsl-fs-ceil and glsl-fs-floor. commit d0b6fcdb008d04d7f73d3d725615321544da5a7e Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 23 15:31:14 2012 +0000 gallivm: fix emission of ceil opcode lp_build_ceil seems more appropriate than lp_build_trunc. This seems to be never hit though someone performs some ceil to floor magic. commit d97fafed7e62ffa6bf76560a92ea246a1a26d256 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 22 11:46:52 2012 +0000 gallivm: new vectorized path for cubemap calculations should be faster when adapted to multiple quads as only selection masks need to be different. The code is more or less a per-pixel version adapted to only do it per quad. A per pixel version would be much simpler (could drop 2 selects, 6 broadcasts and the messy horizontal add of 3 vectors at the expense of only 2 more absolute value instructions - would also just work for arbitary large vectors). This version doesn't yet work with larger vectors because the horizontal add isn't adjusted to be able to work with 2x4 vectors (and also because face selection wouldn't be done per quad just per block though that would be only a correctness issue just as with lod selection). The downside is this code is quite a bit slower. 
On a Core2 it can be sped up by disabling the hw blend instructions for selection and using logicop fallbacks instead, but it is still slower than the old code, hence leave that in for now. Probably will chose one or the other version based on vector length in the end. commit b375fbb18a3fd46859b7fdd42f3e9908ea4ff9a3 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 21 14:42:29 2012 +0000 gallivm: fix optimized occlusion query intrinsic name commit a9ba0a3b611e48efbb0e79eb09caa85033dbe9a2 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Mar 21 16:19:43 2012 +0000 draw,gallivm,llvmpipe: Call gallivm_verify_function everywhere. commit f94c2238d2bc7383e088b8845b7410439a602071 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 18:54:10 2012 +0000 gallivm: optimize calculations for cube maps a bit this does some more vectorized calculations and uses horizontal adds if possible. A definite win with sse3 otherwise it doesn't seem to make much of a difference. In any case this is arithmetically identical, cannot handle larger vectors. Should be useful as a reference point against larger vector version later... commit 21a2c1cf3c8e1ac648ff49e59fdc0e3be77e2ebb Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 15:16:27 2012 +0000 llvmpipe: slight optimization of occlusion queries using movmskps when available. While this is slightly better for cpus without popcnt we should really sum the vectors ourselves (it is also possible to cast to i4 before doing the popcnt but that doesn't help that much neither since llvm is using some optimized popcnt version for i32) commit 5ab5a35f216619bcdf55eed52b0db275c4a06c1b Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 20 13:32:11 2012 +0000 llvmpipe: fix occlusion queries with larger vectors need to adjust casts etc. commit ff95e6fdf5f16d4ef999ffcf05ea6e8c7160b0d5 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Mar 19 20:15:25 2012 +0000 gallivm: Restore optimization passes. 
commit 57b05b4b36451e351659e98946dae27be0959832 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:34:22 2012 +0000 llvmpipe: use existing min2 macro commit bc9a20e19b4f600a439f45679451f2e87cd4b299 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 19:07:27 2012 +0000 llvmpipe: add some safeguards against really large vectors As per José's suggestion, prevent things from blowing up if some cpu would have 1024bit or larger vectors. commit 0e2b525e5ca1c5bbaa63158bde52ad1c1564a3a9 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:31:08 2012 +0000 llvmpipe: fix mask generation for uberwide vectors this was the only piece preventing 16-wide vectors from working (apart from the LP_MAX_VECTOR_WIDTH define that is), which is the maximum as we don't get more pixels in the fragment shader at once. Hence adjust that so things could be tested properly with that size even though there seems to be no practical value. commit 3c8334162211c97f3a11c7f64e9e5a2a91ad9656 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 18:19:41 2012 +0000 llvmpipe: fix the simple interpolation method with larger vectors so both methods actually _really_ work now. Makes textures look nice with larger vectors... commit 1cb0464ef8871be1778d43b0c56adf9c06843e2d Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 17:26:35 2012 +0000 llvmpipe: fix mask generation and position interpolation with 8-wide vectors trivial bugs, with these things start to look somewhat reasonable. Textures though have some swizzling issues it seems. commit 168277a63ef5b72542cf063c337f2d701053ff4b Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 16:04:03 2012 +0000 llvmpipe: don't overallocate variables we never have more than 16 (stamp size) / 4 (minimum possible vector size). (With larger vectors those variables are still overallocated a bit.) 
commit 409b54b30f81ed0aa9ed0b01affe15c72de9abd2 Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:56:48 2012 +0000 llvmpipe: add some 32f8 formats to lp_test_conv Also add the ability to handle different sized vectors. commit 55dcd3af8366ebdac0af3cdb22c2588f24aa18ce Author: Roland Scheidegger <sroland@vmware.com> Date: Mon Mar 19 15:47:27 2012 +0000 gallivm: handle different sized vectors in conversion / pack only fully generic path for now (extract/insert per element). commit 9c040f78c54575fcd94a8808216cf415fe8868f6 Author: Roland Scheidegger <sroland@vmware.com> Date: Sun Mar 18 00:58:28 2012 +0100 llvmpipe: fix harmless use of unitialized values commit 551e9d5468b92fc7d5aa2265db9a52bb1e368a36 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:31:21 2012 +0100 gallivm: drop special path in extract_broadcast with different sized vectors Not needed, llvm can handle shuffles with different sized result vector just fine. Should hopefully generate the same code in the end, but simpler IR. commit 44da531119ffa07a421eaa041f63607cec88f6f8 Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 16 23:28:49 2012 +0100 llvmpipe: adapt interpolation for handling multiple quads at once this is still WIP there are actually two methods possible not quite sure what makes the most sense, so there's code for both for now: 1) the iterative method as used before (compute attrib values at upper left corner of stamp and upper left corner of each quad initially). It is improved to handle more than one quad at once, and also do some more vectorized calculations initially for slightly better code - newer cpus have full throughput with 4 wide float vectors, hence don't try to code up a path which might be faster if there's just one channel active per attribute. 2) just do straight interpolation for each pixel. Method 2) is more work per quad, but less initially - if all quads are executed significantly more overall though. 
But this might change with larger vector lengths. This method would also be needed if we'd do some kind of active quad merging when operating on multiple quads at once. This path contains some hack to force llvm to generate better code, it is still far from ideal though, still generates far too many unnecessary register spills/reloads. Both methods should work with different sized vectors. Not very well tested yet, still seems to work with four-wide vectors, need changes elsewhere to be able to test with wider vectors. commit be5d3e82e2fe14ad0a46529ab79f65bf2276cd28 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:59:37 2012 +0000 draw: Cleanup. commit f85bc12c7fbacb3de2a94e88c6cd2d5ee0ec0e8d Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 16 20:43:30 2012 +0000 gallivm: More module compilation refactoring. commit d76f093198f2a06a93b2204857e6fea5fd0b3ece Author: José Fonseca <jfonseca@vmware.com> Date: Thu Mar 15 21:29:11 2012 +0000 llvmpipe: Use gallivm_compile/free_function() in linear code. Should had been done before. commit 122e1adb613ce083ad739b153ced1cde61dfc8c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 13 14:47:10 2012 +0100 llvmpipe: generate partial pixel mask for multiple quads still works with one quad, cannot be tested yet with more At least for now always fixed order with multiple quads. commit 4c4f15081d75ed585a01392cd2dcce0ad10e0ea8 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 22:09:24 2012 +0100 llvmpipe: refactor state setup a bit Refactor to make it easier to emit (and potentially later fetch in fs) coefficients for multiple attributes at once. Need to think more about how to make this actually happen however, the problem is different attributes can have different interpolation modes, requiring different handling in both setup and fs (though linear and perspective handling is close). 
commit 9363e49722ff47094d688a4be6f015a03fba9c79 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 19:23:23 2012 +0100 llvmpipe: vectorize tri offset calc cuts number of instructions in quad-offset-factor from 107 to 75. This code actually duplicated the (scalar) code calculating the determinant except it used different vertex order (leading to different sign but it doesn't matter) hence llvm could not have figured out it's the same (of course with determinant vectorized in the other place that wouldn't have worked any longer neither). Note this particular piece doesn't actually vectorize well, not many arithmetic instructions left but tons of shuffle instructions... Probably would need to work on n tris at a time for better vectorization. commit 63169dcb9dd445c94605625bf86d85306e2b4297 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Mar 8 03:11:37 2012 +0100 llvmpipe: vectorize some scalar code in setup reduces number of arithmetic instructions, and avoids loading vector x,y values twice (once as scalars once as vectors). Results in a reduction of instructions from 76 to 64 in fs setup for glxgears (16%) on a cpu with sse41. Since this code uses vec2 disguised as vec4, on old cpus which had physical 64bit sse units (pre-Core2) it probably is less of a win in practice (and if you have no vectors you can only hope llvm eliminates the arithmetic for unneeded elements). commit 732ecb877f951ab89bf503ac5e35ab8d838b58a1 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Mar 7 00:32:24 2012 +0100 draw: fix clipping bug introduced by 4822fea3f0440b5205e957cd303838c3b128419c broke clipping pretty badly (verified with lineclip test) commit ef5d90b86d624c152d200c7c4056f47c3c6d2688 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 23:38:59 2012 +0100 draw: don't store vertex header per attribute storing the vertex header once per attribute is totally unnecessary. 
Some quick look at the generated assembly says llvm in fact cannot optimize away the additional stores (maybe due to potentially aliasing pointers somewhere). Plus, this makes the code cleaner and also allows using a vector "or" instead of scalar ones. commit 6b3a5a57b0b9850854cfbd7b586e4e50102dda71 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Mar 6 19:11:01 2012 +0100 draw: do the per-vertex "boolean" clipmask "or" with vectors no point extracting the values and doing it per component. Doesn't help that much since we still extract the values elsewhere anyway. commit 36519caf1af40e4480251cc79a2d527350b7c61f Author: Roland Scheidegger <sroland@vmware.com> Date: Fri Mar 2 22:27:01 2012 +0100 gallivm: fix lp_build_extract_broadcast with different sized vectors Fix the obviously wrong argument, so it doesn't blow up. commit 76d0ac3ad85066d6058486638013afd02b069c58 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Mar 2 12:16:23 2012 +0000 draw: Compile per module and not per function (WIP). Enough to get gears w/ LLVM draw + softpipe to work on AVX doing: GALLIUM_DRIVER=softpipe SOFTPIPE_USE_LLVM=yes glxgears But still hackish -- will need to rethink and refactor this. commit 78e32b247d2a7a771be9a1a07eb000d1e54ea8bd Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 12:01:05 2012 +0000 llvmpipe: Remove lp_state_setup_fallback. Never used. commit 6895d5e40d19b4972c361e8b83fdb7eecda3c225 Author: José Fonseca <jfonseca@vmware.com> Date: Mon Feb 27 19:14:27 2012 +0000 llvmpipe: Don't emit EMMS on x86 We already take precautions to ensure that LLVM never emits MMX code. commit 4822fea3f0440b5205e957cd303838c3b128419c Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 29 15:58:19 2012 +0100 draw: modifications for larger vector sizes We want to be able to use larger vectors especially for running the vertex shader. With this patch we build soa vectors which might have a different length than 4. 
Note that aos structures really remain the same, only when aos structures are converted to soa potentially different sized vectors are used. Samplers probably don't work yet, didn't look at them. Testing done: glxgears works with both 128bit and 256bit vectors. commit f4950fc1ea784680ab767d3dd0dce589f4e70603 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:57 2012 +0100 gallivm: override native vector width with LP_NATIVE_VECTOR_WIDTH env var for debug commit 6ad6dbf0c92f3bf68ae54e5f2aca035d19b76e53 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 29 15:51:24 2012 +0100 draw: allocate storage with alignment according to native vector width commit 7bf0e3e7c9bd2469ae7279cabf4c5229ae9880c1 Author: José Fonseca <jfonseca@vmware.com> Date: Fri Feb 24 19:06:08 2012 +0000 gallivm: Fix comment grammar. Was missing several words. Spotted by Roland. commit b20f1b28eb890b2fa2de44a0399b9b6a0d453c52 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 19:22:09 2012 +0000 gallivm: Use MC-JIT on LLVM 3.1 + (i.e, SVN) MC-JIT Note: MC-JIT is still WIP. For this to work correctly it requires LLVM changes which are not yet upstream. commit b1af4dfcadfc241fd4023f4c3f823a1286d452c0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 20:03:15 2012 +0100 llvmpipe: use new lp_type_width() helper in lp_test_blend commit 04e0a37e888237d4db2298f31973af459ef9c95f Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 19:50:34 2012 +0100 llvmpipe: clean up lp_test_blend a little Using variables just sized and aligned right makes it a bit more obvious what's going on. The test still only tests vector length 4. For AoS anything else probably isn't going to work. For SoA other lengths should work (at least with floats). commit e61c393d3ec392ddee0a3da170e985fda885a823 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:48:30 2012 +0000 gallivm: Ensure vector width consistency. 
Instead of assuming that everything is the max native size. commit 330081ac7bc41c5754a92825e51456d231bf84dd Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:44:14 2012 +0000 draw: More simd vector width consistency fixes. commit d90ca002753596269e37297e2e6c139b19f29f03 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 17:43:00 2012 +0000 gallivm: Remove unused lp_build_int32_vec4_type() helper. commit cae23417824d75869c202aaf897808d73a2c1db0 Author: Roland Scheidegger <sroland@vmware.com> Date: Thu Feb 23 17:32:16 2012 +0100 gallivm: use global variable for native vector width instead of define We do not know the simd extensions (and hence the simd width we should use) available at compile time. At least for now keep a define for maximum vector width, since a global variable obviously can't be used to adjust alignment of automatic stack variables. Leave the runtime-determined value at 128 for now in all cases. commit 51270ace6349acc2c294fc6f34c025c707be538a Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 15:41:02 2012 +0000 gallivm: Add a hunk inadvertedly lost when rebasing. commit bf256df9cfdd0236637a455cbaece949b1253e98 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:24:23 2012 +0000 llvmpipe: Use consistent vector width in depth/stencil test. commit 5543b0901677146662c44be2cfba655fd55da94b Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 14:19:59 2012 +0000 draw: Use a consistent the vector register width. Instead of 4x32 sometimes, LP_NATIVE_VECTOR_WIDTH other times. commit eada8bbd22a3a61f549f32fe2a7e408222e5c824 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 12:08:04 2012 +0000 gallivm: Remove garbagge collection. MC-JIT will require one compilation per module (as opposed to one compilation per function), therefore no state will be shared, eliminating the need to do garbagge collection. 
commit 556697ea0ed72e0641851e4fbbbb862c470fd7eb Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 10:33:41 2012 +0000 gallivm: Move all native target initialization to lp_set_target_options(). commit c518e8f3f2649d5dc265403511fab4bcbe2cc5c8 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:52:32 2012 +0000 llvmpipe: Create one gallivm instance for each test. commit 90f10af8920ec6be6f2b1e7365cfc477a0cb111d Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:48:08 2012 +0000 gallivm: Avoid LLVMAddGlobalMapping() in lp_bld_assert(). Brittle, complex, and unecesary. Just use function pointer constant. commit 98fde550b33401e3fe006af59db4db628bcbf476 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:21:26 2012 +0000 gallivm: Add a lp_build_const_func_pointer() helper. To be reused in all places where we want to call C code. commit 6cfedadb62c2ce5af8d75969bc95a607f3ece118 Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 09:44:41 2012 +0000 gallivm: Cleanup/simplify lp_build_const_string_variable. - Move to lp_bld_const where it belongs - Rename to lp_build_const_string - take the length from the argument (and don't count the zero terminator twice) - bitcast the constant to generic i8 * commit db1d4018c0f1fa682a9da93c032977659adfb68c Author: José Fonseca <jfonseca@vmware.com> Date: Thu Feb 23 11:52:17 2012 +0000 gallivm: Set NoFramePointerElimNonLeaf to true where supported. commit 088614164aa915baaa5044fede728aa898483183 Author: Roland Scheidegger <sroland@vmware.com> Date: Wed Feb 22 19:38:47 2012 +0100 llvmpipe: pass in/out pointers rather scalar floats in lp_bld_arit we don't want llvm to potentially optimize away the vectors (though it doesn't seem to currently), plus we want to be able to handle in/out vectors of arbitrary length. 
commit 3f5c4e04af8a7592fdffa54938a277c34ae76b51 Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:22:55 2012 +0100 gallivm: fix lp_build_sqrt() for vector length 1 since we optimize away vectors with length 1 need to emit intrinsic without vector type. commit 79d94e5f93ed8ba6757b97e2026722ea31d32c06 Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 17:00:46 2012 +0000 llvmpipe: Remove lp_test_round. commit 81f41b5aeb3f4126e06453cfc78990086b85b78d Author: Roland Scheidegger <sroland@vmware.com> Date: Tue Feb 21 23:56:24 2012 +0100 llvmpipe: subsume lp_test_round into lp_test_arit Much simpler, and since the arguments aren't passed as 128bit values can run on any arch. This also uses the float instead of the double versions of the c functions (which probably was the intention anyway). In contrast to lp_test_round the output is much less verbose however. Tested vector width of 32 to 512 bits - all pass except 32 (length 1) which crashes in lp_build_sqrt() due to wrong type. Signed-off-by: José Fonseca <jfonseca@vmware.com> commit 945b338b421defbd274481d8c4f7e0910fd0e7eb Author: José Fonseca <jfonseca@vmware.com> Date: Wed Feb 22 09:55:03 2012 +0000 gallivm: Centralize the function compilation logic. This simplifies a lot of code. Also doing this in a central place will make it easier to carry out the changes necessary to use MC-JIT in the future. gallivm: Fix typo in explicit derivative shuffle. Trivial. draw: make DEBUG_STORE work again adapt to lp_build_printf() interface changes Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: get rid of vecnf_from_scalar() just use lp_build_broadcast directly (cannot assign a name but don't really need it, vecnf_from_scalar() was producing much uglier IR due to using repeated insertelement instead of insertelement+shuffle). 
Reviewed-by: José Fonseca <jfonseca@vmware.com> llvmpipe: fix typo in complex interpolation code Fixes position interpolation when using complex mode (piglit fp-fragment-position and similar) Reviewed-by: José Fonseca <jfonseca@vmware.com> draw: fix clipvertex/position storing again This appears to be the result of a bad merge. Fixes piglit tests relying on clipping, like a lot of the interpolation tests. Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Fix explicit derivative manipulation. Same counter variable was being used in two nested loops. Use more meanigful variable names for the counter to fix and avoid this. gallivm: Prevent buffer overflow in repeat wrap mode for NPOT. Based on Roland's patch, discussion, and review . Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix dims for TGSI_TEXTURE_1D in emit_tex. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: Fix explicit volume texture derivatives. Reviewed-by: Roland Scheidegger <sroland@vmware.com> gallivm: fix 1d shadow texture sampling Always r coordinate is used, hence need 3 coords not two (the second one is unused). Reviewed-by: José Fonseca <jfonseca@vmware.com> gallivm: Enable AVX support without MCJIT, where available. For now, this just enables AVX on Windows for testing. If the code is stable then we might consider prefering the old JIT wherever possible. No change elsewhere. Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2012-07-13 18:09:30 +01:00
/* rsqrt(1.0) != 1.0 here */
res = lp_build_fast_rsqrt(bld, a);
if (num_iterations) {
/*
* Newton-Raphson will result in NaN instead of infinity for zero,
* and NaN instead of zero for infinity.
* Also, need to ensure rsqrt(1.0) == 1.0.
* All numbers smaller than FLT_MIN will result in +infinity
* (rsqrtps treats all denormals as zero).
*/
LLVMValueRef cmp;
LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
for (i = 0; i < num_iterations; ++i) {
res = lp_build_rsqrt_refine(bld, a, res);
}
cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
res = lp_build_select(bld, cmp, inf, res);
cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
res = lp_build_select(bld, cmp, bld->zero, res);
cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
res = lp_build_select(bld, cmp, bld->one, res);
}
return res;
}
return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}
/**
 * Report whether a fast (approximate) hardware rsqrt instruction can be
 * used for the given vector type.
 *
 * Only two shapes qualify: 4 x 32-bit when the CPU has SSE, and
 * 8 x 32-bit when the CPU has AVX.  Everything else returns false.
 *
 * Callers can use this to decide whether going through the fast rsqrt is
 * worthwhile at all: computing x^0.5 as rsqrt_fast(x) * x is only a win
 * when the instruction exists; without it that would expand to
 * sqrt/div/mul, so calling sqrt directly (skipping the div and the mul)
 * is clearly preferable.
 *
 * @param type  vector type to query; must be a floating point type.
 * @return true if a fast rsqrt instruction is available, false otherwise.
 */
boolean
lp_build_fast_rsqrt_available(struct lp_type type)
{
   assert(type.floating);

   /* Both supported shapes use 32-bit elements. */
   if (type.width != 32) {
      return false;
   }

   switch (type.length) {
   case 4:
      /* 4-wide needs SSE */
      return util_cpu_caps.has_sse ? true : false;
   case 8:
      /* 8-wide needs AVX */
      return util_cpu_caps.has_avx ? true : false;
   default:
      return false;
   }
}
/**
 * Generate a fast, low-precision 1/sqrt(a).
 *
 * The result is undefined for values < 0 and is infinity for +0.
 * Precision is limited: only ~10 bits are guaranteed, so rsqrt(1.0) may
 * not be exactly 1.0, and denorms may be flushed to 0.
 *
 * When lp_build_fast_rsqrt_available() reports no suitable instruction
 * for the context's type, a debug message is printed and the value is
 * built as rcp(sqrt(a)) instead.
 *
 * @param bld  build context; its type selects the intrinsic.
 * @param a    input value, must match the context's type.
 * @return the value holding the approximate reciprocal square root.
 */
LLVMValueRef
lp_build_fast_rsqrt(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   LLVMBuilderRef llvm_builder = bld->gallivm->builder;
   const struct lp_type bld_type = bld->type;
   const char *intrinsic_name;

   assert(lp_check_value(bld_type, a));

   /* No hardware instruction for this type: emulate it. */
   if (!lp_build_fast_rsqrt_available(bld_type)) {
      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
      return lp_build_rcp(bld, lp_build_sqrt(bld, a));
   }

   /* Only 4-wide and 8-wide types pass the availability check. */
   intrinsic_name = (bld_type.length == 4) ? "llvm.x86.sse.rsqrt.ps"
                                           : "llvm.x86.avx.rsqrt.ps.256";

   return lp_build_intrinsic_unary(llvm_builder, intrinsic_name,
                                   bld->vec_type, a);
}
2009-08-19 20:13:49 +01:00
/**
 * Generate sin(a) or cos(a) using polynomial approximation.
 *
 * The inline comments quote the SSE intrinsic sequence this code follows
 * (per those comments, the scheme comes from the cephes sources).
 *
 * @param bld  build context; provides the builder, vector types and lp_type
 * @param a    input vector
 * @param cos  TRUE to generate cos(a), FALSE to generate sin(a)
 * @return     result clamped to [-1, 1], or NaN for lanes where a is
 *             -inf, +inf or NaN
 *
 * NOTE(review): the sign masks below are 32-bit constants and
 * lp_build_isfinite() asserts a 32-bit width, so this presumably only
 * supports 32-bit float vectors -- confirm against callers.
 *
 * TODO: it might be worth recognizing sin and cos using same source
 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
 * would be way cheaper than calculating (nearly) everything twice...
 * Not sure it's common enough to be worth bothering however, scs
 * opcode could also benefit from calculating both though.
 */
static LLVMValueRef
lp_build_sin_or_cos(struct lp_build_context *bld,
LLVMValueRef a,
boolean cos)
{
struct gallivm_state *gallivm = bld->gallivm;
LLVMBuilderRef b = gallivm->builder;
struct lp_type int_type = lp_int_type(bld->type);
/*
 * take the absolute value (clear the sign bit on the integer view of a),
 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
 */
LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
/*
 * scale by 4/Pi
 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
 */
LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
/*
 * store the integer part of y (float -> signed int conversion)
 * emm2 = _mm_cvttps_epi32(y);
 */
LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
/*
 * j=(j+1) & (~1) (see the cephes sources)
 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
 */
LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
/*
 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
 */
LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
/*
 * convert the rounded index back to float
 * y = _mm_cvtepi32_ps(emm2);
 */
LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
/* integer constants used for the sign-bit and polynomial selection below */
LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
/*
 * Argument used for poly selection and sign bit determination
 * is different for sin vs. cos.
 *
 * cos: emm2_2 = emm2_and - 2 and
 *      sign_bit = (4 & ~emm2_2) << 29
 *      (the inverted bit 2 of emm2_2, moved up to bit 31).
 * sin: emm2_2 = emm2_and and
 *      sign_bit = (a ^ (emm2_add << 29)) & 0x80000000
 *      (the sign of the input, flipped by bit 2 of emm2_add).
 */
LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
emm2_and;
LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
LLVMBuildNot(b, emm2_2, ""), ""),
const_29, "sign_bit") :
LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
LLVMBuildShl(b, emm2_add,
const_29, ""), ""),
sign_mask, "sign_bit");
/*
 * get the polynom selection mask
 * there is one polynom for 0 <= x <= Pi/4
 * and another one for Pi/4<x<=Pi/2
 * Both branches will be computed.
 *
 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
 *
 * poly_mask is the result of comparing (emm2_2 & 2) for equality with 0.
 */
LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
LLVMValueRef poly_mask = lp_build_compare(gallivm,
int_type, PIPE_FUNC_EQUAL,
emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
/*
 * _PS_CONST(minus_cephes_DP1, -0.78515625);
 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
 */
LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
/*
 * The magic pass: "Extended precision modular arithmetic"
 * x = ((x - y * DP1) - y * DP2) - y * DP3;
 * (the DP constants above are already negated)
 */
LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
/*
 * Evaluate the first polynom (0 <= x <= Pi/4)
 *
 * z = _mm_mul_ps(x,x);
 */
LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
/*
 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
 */
LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
/*
 * y = *(v4sf*)_ps_coscof_p0;
 * y = _mm_mul_ps(y, z);
 *
 * NOTE(review): assuming lp_build_fmuladd(b, p, q, r) yields p * q + r,
 * this is y = ((coscof_p0 * z + coscof_p1) * z + coscof_p2) * z * z
 * -- confirm against the helper.
 */
LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
/*
 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
 * y = _mm_sub_ps(y, tmp);
 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
 *
 * Note: the IR value names passed below ("y_8", "y_9") lag the C
 * variable names (y_9, y_10) by one.
 */
LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
/*
 * _PS_CONST(sincof_p0, -1.9515295891E-4);
 * _PS_CONST(sincof_p1, 8.3321608736E-3);
 * _PS_CONST(sincof_p2, -1.6666654611E-1);
 */
LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
/*
 * Evaluate the second polynom (Pi/4 < x <= Pi/2)
 *
 * y2 = *(v4sf*)_ps_sincof_p0;
 * y2 = _mm_mul_ps(y2, z);
 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
 * y2 = _mm_mul_ps(y2, z);
 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
 * y2 = _mm_mul_ps(y2, z);
 * y2 = _mm_mul_ps(y2, x);
 * y2 = _mm_add_ps(y2, x);
 */
LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
/*
 * select the correct result from the two polynoms
 * xmm3 = poly_mask;
 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
 * y = _mm_andnot_ps(xmm3, y);
 * y = _mm_or_ps(y,y2);
 *
 * i.e. a bitwise blend on the integer views: y2 is kept where poly_mask
 * is set, y where it is clear.
 */
LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
/*
 * update the sign
 * y = _mm_xor_ps(y, sign_bit);
 */
LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
/* finiteness is tested on the original input a, not on the result */
LLVMValueRef isfinite = lp_build_isfinite(bld, a);
/* clamp output to be within [-1, 1] */
y_result = lp_build_clamp(bld, y_result,
lp_build_const_vec(bld->gallivm, bld->type, -1.f),
lp_build_const_vec(bld->gallivm, bld->type, 1.f));
/* If a is -inf, inf or NaN then return NaN */
y_result = lp_build_select(bld, isfinite, y_result,
lp_build_const_vec(bld->gallivm, bld->type, NAN));
return y_result;
}
2009-08-19 20:13:49 +01:00
/**
 * Generate sin(a).
 *
 * Thin wrapper selecting the sine variant of lp_build_sin_or_cos().
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const boolean want_cos = FALSE;

   return lp_build_sin_or_cos(bld, a, want_cos);
}
/**
 * Generate cos(a).
 *
 * Thin wrapper selecting the cosine variant of lp_build_sin_or_cos().
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const boolean want_cos = TRUE;

   return lp_build_sin_or_cos(bld, a, want_cos);
}
2009-08-19 20:13:49 +01:00
/**
 * Generate pow(x, y), computed as exp2(log2(x) * y).
 */
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMValueRef log2_x;
   LLVMValueRef scaled;

   /* TODO: optimize the constant case */
   if ((gallivm_debug & GALLIVM_DEBUG_PERF) != 0) {
      if (LLVMIsConstant(x) && LLVMIsConstant(y)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }
   }

   log2_x = lp_build_log2(bld, x);
   scaled = lp_build_mul(bld, log2_x, y);

   return lp_build_exp2(bld, scaled);
}
2009-08-19 20:13:49 +01:00
/**
 * Generate exp(x), computed as exp2(log2(e) * x).
 */
LLVMValueRef
lp_build_exp(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log2(e) = 1/log(2) */
   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
                                           1.4426950408889634);

   assert(lp_check_value(bld->type, x));

   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
}
2009-08-19 20:13:49 +01:00
/**
 * Generate log(x), computed as log(2) * log2(x).
 *
 * Behavior is undefined with infs, 0s and nans; use lp_build_log_safe()
 * when those inputs must be handled.
 */
LLVMValueRef
lp_build_log(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
}
/**
 * Generate log(x) with edge cases (infs, 0s and nans) handled.
 *
 * Computed as log(2) * log2_safe(x).
 */
LLVMValueRef
lp_build_log_safe(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   LLVMValueRef ln2;
   LLVMValueRef log2_x;

   /* log(2) */
   ln2 = lp_build_const_vec(bld->gallivm, bld->type, 0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   log2_x = lp_build_log2_safe(bld, x);

   return lp_build_mul(bld, ln2, log2_x);
}
2009-08-19 20:13:49 +01:00
/**
* Generate polynomial.
* Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2009-08-19 20:13:49 +01:00
*/
gallivm: handle srgb-to-linear and linear-to-srgb conversions srgb-to-linear is using 3rd degree polynomial for now which should be _just_ good enough. Reverse is using some rational polynomials and is quite accurate, though not hooked into llvmpipe's blend code yet and hence unused (untested). Using a table might also be an option (for srgb-to-linear especially). This does not enable any new features yet because EXT_texture_srgb was already supported via util_format fallbacks, but performance was lacking probably due to the external function call (the table used by the util_format_srgb code may not be all that much slower on its own). Some performance figures (taken from modified gloss, replaced both base and sphere texture to use GL_SRGB instead of GL_RGB, measured on 1Ghz Sandy Bridge, the numbers aren't terribly accurate): normal gloss, aos, 8-wide: 47 fps normal gloss, aos, 4-wide: 48 fps normal gloss, forced to soa, 8-wide: 48 fps normal gloss, forced to soa, 4-wide: 47 fps patched gloss, old code, soa, 8-wide: 21 fps patched gloss, old code, soa, 4-wide: 24 fps patched gloss, new code, soa, 8-wide: 41 fps patched gloss, new code, soa, 4-wide: 38 fps So there's a performance hit but it seems acceptable, certainly better than using the fallback. Note the new code only works for 4x8bit srgb formats, others (L8/L8A8) will continue to use the old util_format fallback, because I can't be bothered to write code for formats noone uses anyway (as decoding is done as part of lp_build_unpack_rgba_soa which can only handle block type width of 32). Compressed srgb formats should get their own path though eventually (it is going to be expensive in any case, first decompress, then convert). No piglit regressions. v2: use lp_build_polynomial instead of ad-hoc polynomial construction, also since keeping both linear to srgb functions for now make sure both are compiled (since they share quite some code just integrate into the same function). 
v3: formatting fixes and bugfix in the complicated (disabled) linear-to-srgb path. Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
2013-07-13 16:31:52 +01:00
LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
LLVMValueRef x,
const double *coeffs,
unsigned num_coeffs)
{
const struct lp_type type = bld->type;
LLVMValueRef even = NULL, odd = NULL;
LLVMValueRef x2;
unsigned i;
2010-08-09 17:30:33 +01:00
assert(lp_check_value(bld->type, x));
/* TODO: optimize the constant case */
if (gallivm_debug & GALLIVM_DEBUG_PERF &&
LLVMIsConstant(x)) {
debug_printf("%s: inefficient/imprecise constant arithmetic\n",
__FUNCTION__);
}
/*
* Calculate odd and even terms seperately to decrease data dependency
* Ex:
* c[0] + x^2 * c[2] + x^4 * c[4] ...
* + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
*/
x2 = lp_build_mul(bld, x, x);
for (i = num_coeffs; i--; ) {
LLVMValueRef coeff;
coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
if (i % 2 == 0) {
if (even)
even = lp_build_mad(bld, x2, even, coeff);
else
even = coeff;
} else {
if (odd)
odd = lp_build_mad(bld, x2, odd, coeff);
else
odd = coeff;
}
}
if (odd)
return lp_build_mad(bld, odd, x, even);
else if (even)
return even;
else
return bld->undef;
}
/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 *
 * Coefficients are listed lowest order first, which is the layout
 * lp_build_polynomial() takes (coeffs[0] + x * coeffs[1] + ...).
 * lp_build_exp2() evaluates this table on the fractional part of its
 * argument.
 *
 * The table is selected at compile time by EXP_POLY_DEGREE (defined
 * outside this section); degrees 2 to 5 are provided and any other
 * value is a compile error.
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
0.693153073200168932794,
0.240153617044375388211,
0.0558263180532956664775,
0.00898934009049466391101,
0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
1.00000259337069434683,
0.693003834469974940458,
0.24144275689150793076,
0.0520114606103070150235,
0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
0.999925218562710312959,
0.695833540494823811697,
0.226067155427249155588,
0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
1.00172476321474503578,
0.657636275736077639316,
0.33718943461968720704
#else
/* unsupported EXP_POLY_DEGREE */
#error
#endif
};
/**
 * Generate 2**x for 32-bit floats.
 *
 * The (clamped) argument is split into an integer and a fractional part.
 * 2**ipart is built directly as a float bit pattern (biased exponent
 * shifted into place), 2**fpart comes from lp_build_exp2_polynomial, and
 * the two are multiplied.
 */
LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   assert(type.floating && type.width == 32);

   /* We want to preserve NaN and make sure that for exp2 if x > 128,
    * the result is INF and if it's smaller than -126.9 the result is 0 */
   x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
                        GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
   x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
                        x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);

   /* ipart = floor(x) */
   /* fpart = x - ipart */
   lp_build_ifloor_fract(bld, x, &ipart, &fpart);

   /* expipart = (float) (1 << ipart) */
   expipart = LLVMBuildAdd(builder, ipart,
                           lp_build_const_int_vec(bld->gallivm, type, 127), "");
   expipart = LLVMBuildShl(builder, expipart,
                           lp_build_const_int_vec(bld->gallivm, type, 23), "");
   expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");

   /* expfpart = 2**fpart, with fpart in [0, 1[ */
   expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                  ARRAY_SIZE(lp_build_exp2_polynomial));

   res = LLVMBuildFMul(builder, expipart, expfpart, "");

   return res;
}
/**
 * Extract the exponent of a IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
 *
 * Result is an integer value with
 *
 *   ifloor(log2(x)) + bias
 *
 * The exponent field width and the IEEE-754 exponent bias are derived
 * from the type (field width = type.width - mantissa bits - 1 sign bit),
 * giving the usual mask 255 and bias 127 for 32-bit floats.
 *
 * @param bld   build context; bld->type must be a floating point type
 * @param x     input vector
 * @param bias  integer added to the extracted exponent
 */
LLVMValueRef
lp_build_extract_exponent(struct lp_build_context *bld,
                          LLVMValueRef x,
                          int bias)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   unsigned exp_bits;
   long long exp_mask;
   long long exp_bias;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(bld->type, x));

   /* sign (1) + exponent + mantissa make up the whole element */
   exp_bits = type.width - mantissa - 1;
   /* all-ones exponent field: 255 for 32-bit floats */
   exp_mask = (1LL << exp_bits) - 1;
   /* IEEE-754 exponent bias 2^(exp_bits - 1) - 1: 127 for 32-bit floats */
   exp_bias = exp_mask >> 1;

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* shift the exponent field down and mask off the sign bit */
   res = LLVMBuildLShr(builder, x,
                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   res = LLVMBuildAnd(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, exp_mask), "");
   /* remove the format's bias and apply the caller's */
   res = LLVMBuildSub(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, exp_bias - bias), "");

   return res;
}
/**
 * Extract the mantissa of a floating point value.
 *
 * The mantissa bits of x are kept and combined with the bit pattern of
 * 1.0, so the result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 */
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const unsigned mant_bits = lp_mantissa(type);
   LLVMValueRef frac_mask;
   LLVMValueRef one_bits;
   LLVMValueRef bits;

   frac_mask = lp_build_const_int_vec(bld->gallivm, type,
                                      (1ULL << mant_bits) - 1);
   one_bits = LLVMConstBitCast(bld->one, bld->int_vec_type);

   assert(lp_check_value(bld->type, x));
   assert(type.floating);

   bits = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**ipart */
   bits = LLVMBuildAnd(builder, bits, frac_mask, "");
   bits = LLVMBuildOr(builder, bits, one_bits, "");

   return LLVMBuildBitCast(builder, bits, bld->vec_type, "");
}
/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x),
 * for x in range of [0, 1/9[
 *
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 *
 * Coefficients are listed lowest order first, which is the layout
 * lp_build_polynomial() takes. lp_build_log2_approx() evaluates this table
 * at z = y^2 with y = (mant - 1) / (mant + 1).
 *
 * The table is selected at compile time by LOG_POLY_DEGREE (defined
 * outside this section); degrees 3 to 5 are provided and any other value
 * is a compile error. The literals carry a long double suffix but are
 * stored as double.
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
2.88539008148777786488L,
0.961796878841293367824L,
0.577058946784739859012L,
0.412914355135828735411L,
0.308591899232910175289L,
0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
2.88539009343309178325L,
0.961791550404184197881L,
0.577440339438736392009L,
0.403343858251329912514L,
0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
2.88538959748872753838L,
0.961932915889597772928L,
0.571118517972136195241L,
0.493997535084709500285L,
#else
/* unsupported LOG_POLY_DEGREE */
#error
#endif
};
/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 * http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 *
 * Computes up to three log2-related values of x; each output pointer may
 * be NULL, in which case the corresponding work is skipped.
 *
 * @param p_exp         receives the exponent bits of x, still in place
 *                      (x & 0x7f800000) and bitcast back to the float type
 * @param p_floor_log2  receives the unbiased exponent of x as a float
 * @param p_log2        receives the approximated log2(x)
 *
 * If handle_edge_cases is true the function will perform computations
 * to match the required D3D10+ behavior for each of the edge cases.
 * That means that if input is:
 * - less than zero (to and including -inf) then NaN will be returned
 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
 * - +infinity, then +infinity will be returned
 * - NaN, then NaN will be returned
 *
 * Those checks are fairly expensive so if you don't need them make sure
 * handle_edge_cases is false.
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2,
                     boolean handle_edge_cases)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef p_z = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if (p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate
       * enough.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if (p_floor_log2 || p_log2) {
      /* logexp = (float) (biased exponent - 127) */
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if (p_log2) {
      /* mant = 1 + (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
         lp_build_sub(bld, mant, bld->one),
         lp_build_add(bld, mant, bld->one)
      );

      /* z = y^2 */
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
      p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                ARRAY_SIZE(lp_build_log2_polynomial));

      /* y * P(z) + logexp */
      res = lp_build_mad(bld, y, p_z, logexp);

      if (type.floating && handle_edge_cases) {
         LLVMValueRef negmask, infmask, zmask;

         negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
                                lp_build_const_vec(bld->gallivm, type, 0.0f));
         zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
                              lp_build_const_vec(bld->gallivm, type, 0.0f));
         infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
                                lp_build_const_vec(bld->gallivm, type, INFINITY));

         /* If x is equal to inf make sure we return inf */
         res = lp_build_select(bld, infmask,
                               lp_build_const_vec(bld->gallivm, type, INFINITY),
                               res);
         /* If x is equal to 0, return -inf */
         res = lp_build_select(bld, zmask,
                               lp_build_const_vec(bld->gallivm, type, -INFINITY),
                               res);
         /*
          * If x is nan or less than 0, return nan.
          * NOTE(review): negmask is an "x < 0" comparison; whether it is
          * also set for NaN inputs depends on how lp_build_cmp treats
          * unordered operands -- confirm.
          */
         res = lp_build_select(bld, negmask,
                               lp_build_const_vec(bld->gallivm, type, NAN),
                               res);
      }
   }

   if (p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if (p_floor_log2)
      *p_floor_log2 = logexp;

   if (p_log2)
      *p_log2 = res;
}
/*
 * Fast log2: no special code for the edge cases (-inf, 0, inf, NaN),
 * so the results for those inputs are undefined.
 *
 * Requests only the log2 output of lp_build_log2_approx().
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   const boolean handle_edge_cases = FALSE;
   LLVMValueRef log2_x;

   lp_build_log2_approx(bld, x, NULL, NULL, &log2_x, handle_edge_cases);

   return log2_x;
}
/*
 * log2 with all edge cases handled.
 * See the documentation of lp_build_log2_approx for the behavior of
 * each edge case.
 *
 * Requests only the log2 output of lp_build_log2_approx().
 */
LLVMValueRef
lp_build_log2_safe(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   const boolean handle_edge_cases = TRUE;
   LLVMValueRef log2_x;

   lp_build_log2_approx(bld, x, NULL, NULL, &log2_x, handle_edge_cases);

   return log2_x;
}
/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a
 * power of two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef exponent;
   LLVMValueRef exponent_f;
   LLVMValueRef mantissa;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* floor(log2(x)) - 1, first as integer then converted to float */
   exponent = lp_build_extract_exponent(bld, x, -1);
   exponent_f = LLVMBuildSIToFP(builder, exponent, bld->vec_type, "");

   /* x / 2**floor(log2(x)) */
   mantissa = lp_build_extract_mantissa(bld, x);

   /* sum of the two parts */
   return LLVMBuildFAdd(builder, exponent_f, mantissa, "");
}
/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 *
 * x is multiplied by sqrt(2), which adds 0.5 to log2(x), and the
 * exponent of the product is then extracted.
 */
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef scaled;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
   scaled = LLVMBuildFMul(builder, x, sqrt2, "");

   /* floor(log2(x) + 0.5) */
   return lp_build_extract_exponent(bld, scaled, 0);
}
/**
 * Generate the remainder of x / y.
 *
 * Emits FRem for floating point types, SRem for signed integer types
 * and URem for unsigned integer types.
 */
LLVMValueRef
lp_build_mod(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, y));

   if (type.floating) {
      return LLVMBuildFRem(builder, x, y, "");
   }

   if (type.sign) {
      return LLVMBuildSRem(builder, x, y, "");
   }

   return LLVMBuildURem(builder, x, y, "");
}
/*
 * Build a NaN mask for a floating point input: all 1's in channels of x
 * which are NaN, 0 in channels which are not.
 *
 * Implemented as the sign-extended negation of an ordered x == x compare.
 */
LLVMValueRef
lp_build_isnan(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   LLVMValueRef not_nan;
   LLVMValueRef is_nan;

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));

   not_nan = LLVMBuildFCmp(builder, LLVMRealOEQ, x, x, "isnotnan");
   is_nan = LLVMBuildNot(builder, not_nan, "");

   /* widen the compare result to a full-width integer mask */
   return LLVMBuildSExt(builder, is_nan, int_vec_type, "isnan");
}
/*
 * Build a mask which is all 1's for floating point numbers that are
 * finite and all zeros for -inf, inf and nan's.
 *
 * A value is finite when its exponent bits (0x7f800000) are not all set.
 * For non-floating types a zero constant is returned. Floating types
 * must be 32 bits wide.
 */
LLVMValueRef
lp_build_isfinite(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMValueRef bits = LLVMBuildBitCast(builder, x, int_vec_type, "");
   LLVMValueRef exp_all_ones = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                      0x7f800000);
   LLVMValueRef exp_bits;

   if (!bld->type.floating) {
      return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
   }

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));
   assert(bld->type.width == 32);

   /* isolate the exponent field and test that it is not all ones */
   exp_bits = LLVMBuildAnd(builder, bits, exp_all_ones, "");

   return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
                           exp_bits, exp_all_ones);
}
/*
 * Build a mask which is true if the number is nan or inf and false
 * otherwise. The input has to be a floating point vector.
 *
 * The test is that all exponent bits (0x7f800000) of the value are set.
 */
LLVMValueRef
lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
                       const struct lp_type type,
                       LLVMValueRef x)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(type);
   LLVMValueRef exp_all_ones = lp_build_const_int_vec(gallivm, int_type,
                                                      0x7f800000);
   LLVMValueRef bits;
   LLVMValueRef exp_bits;

   assert(type.floating);

   bits = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
   exp_bits = LLVMBuildAnd(builder, bits, exp_all_ones, "");

   return lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
                           exp_bits, exp_all_ones);
}
LLVMValueRef
lp_build_fpstate_get(struct gallivm_state *gallivm)
{
if (util_cpu_caps.has_sse) {
LLVMBuilderRef builder = gallivm->builder;
LLVMValueRef mxcsr_ptr = lp_build_alloca(
gallivm,
LLVMInt32TypeInContext(gallivm->context),
"mxcsr_ptr");
LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
lp_build_intrinsic(builder,
"llvm.x86.sse.stmxcsr",
LLVMVoidTypeInContext(gallivm->context),
&mxcsr_ptr8, 1, 0);
return mxcsr_ptr;
}
return 0;
}
/**
 * Emit code that turns the MXCSR flush-to-zero bit (and, when the CPU
 * supports it, the denormals-are-zero bit) on or off.
 *
 * The current MXCSR is read, the bits are set or cleared, and the result
 * is written back. Does nothing when SSE is not available.
 *
 * @param zero  TRUE to set the bits, FALSE to clear them
 */
void
lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
                                  boolean zero)
{
   LLVMBuilderRef builder;
   LLVMValueRef mxcsr_ptr;
   LLVMValueRef mxcsr;
   LLVMValueRef bits;
   int daz_ftz;

   if (!util_cpu_caps.has_sse) {
      return;
   }

   builder = gallivm->builder;

   /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
   daz_ftz = _MM_FLUSH_ZERO_MASK;
   if (util_cpu_caps.has_daz) {
      /* Enable denormals are zero mode */
      daz_ftz |= _MM_DENORMALS_ZERO_MASK;
   }

   mxcsr_ptr = lp_build_fpstate_get(gallivm);
   mxcsr = LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");

   if (zero) {
      bits = LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0);
      mxcsr = LLVMBuildOr(builder, mxcsr, bits, "");
   } else {
      bits = LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0);
      mxcsr = LLVMBuildAnd(builder, mxcsr, bits, "");
   }

   LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
   lp_build_fpstate_set(gallivm, mxcsr_ptr);
}
/**
 * Emit code that loads MXCSR from the given memory location (via the
 * llvm.x86.sse.ldmxcsr intrinsic).
 *
 * Does nothing when SSE is not available.
 *
 * @param mxcsr_ptr  pointer to the value to load, e.g. the pointer
 *                   returned by lp_build_fpstate_get()
 */
void
lp_build_fpstate_set(struct gallivm_state *gallivm,
                     LLVMValueRef mxcsr_ptr)
{
   LLVMBuilderRef builder;
   LLVMTypeRef i8_ptr_type;

   if (!util_cpu_caps.has_sse) {
      return;
   }

   builder = gallivm->builder;

   /* the intrinsic is passed the location as an i8 pointer */
   i8_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
   mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr, i8_ptr_type, "");

   lp_build_intrinsic(builder,
                      "llvm.x86.sse.ldmxcsr",
                      LLVMVoidTypeInContext(gallivm->context),
                      &mxcsr_ptr, 1, 0);
}