i965: Allocate register sets at screen creation, not context creation.

Register sets depend on the particular hardware generation, but don't
depend on anything in the actual OpenGL context.  Computing them is
fairly expensive, and they take up a large amount of memory.  Putting
them in the screen allows us to compute/allocate them once for all
contexts, saving both time and space.

Improves the performance of a context creation/destruction
microbenchmark by about 3x on my Haswell i7-4750HQ.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Eric Anholt <eric@anholt.net>
Author: Kenneth Graunke
Date:   2014-03-17 13:53:44 -07:00
parent b3e4b769dd
commit 7a0fd3ca1d
6 changed files with 88 additions and 88 deletions
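As a quick illustration of the change's shape, here is a minimal, self-contained C sketch of the pattern (the names and struct layouts below are hypothetical stand-ins, not Mesa's actual API, and error handling is omitted): anything that depends only on the device is computed once when the screen is created, and each context merely keeps a pointer to it.

#include <stdlib.h>

/* Hypothetical stand-in for the expensive, generation-dependent tables. */
struct reg_set { int gen; };

static struct reg_set *build_reg_set(int gen)
{
   struct reg_set *rs = calloc(1, sizeof(*rs));
   rs->gen = gen;               /* stands in for the real table setup */
   return rs;
}

struct screen {
   int gen;                     /* fixed per device, never per context */
   struct reg_set *reg_set;     /* computed once, shared by all contexts */
};

struct context {
   struct screen *screen;       /* contexts only keep a pointer */
};

static struct screen *screen_create(int gen)
{
   struct screen *s = calloc(1, sizeof(*s));
   s->gen = gen;
   s->reg_set = build_reg_set(gen);   /* the one-time cost */
   return s;
}

static struct context *context_create(struct screen *s)
{
   struct context *c = calloc(1, sizeof(*c));
   c->screen = s;               /* context creation stays cheap */
   return c;
}

The shared data is written once at screen creation and only read afterward, so all contexts can share one copy — the time and space savings the commit message describes.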

src/mesa/drivers/dri/i965/brw_context.c

@@ -775,9 +775,6 @@ brwCreateContext(gl_api api,
    if ((flags & __DRI_CTX_FLAG_ROBUST_BUFFER_ACCESS) != 0)
       ctx->Const.ContextFlags |= GL_CONTEXT_FLAG_ROBUST_ACCESS_BIT_ARB;
 
-   brw_fs_alloc_reg_sets(brw);
-   brw_vec4_alloc_reg_set(brw);
-
    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
       brw_init_shader_time(brw);

src/mesa/drivers/dri/i965/brw_context.h

@@ -1272,26 +1272,6 @@ struct brw_context
     */
    struct brw_vue_map vue_map_geom_out;
 
-   /**
-    * Data structures used by all vec4 program compiles (not specific to any
-    * particular program).
-    */
-   struct {
-      struct ra_regs *regs;
-
-      /**
-       * Array of the ra classes for the unaligned contiguous register
-       * block sizes used.
-       */
-      int *classes;
-
-      /**
-       * Mapping for register-allocated objects in *regs to the first
-       * GRF for that object.
-       */
-      uint8_t *ra_reg_to_grf;
-   } vec4;
-
    struct {
       struct brw_stage_state base;
      struct brw_vs_prog_data *prog_data;
@@ -1356,28 +1336,6 @@ struct brw_context
        * Gen6. See brw_update_null_renderbuffer_surface().
        */
       drm_intel_bo *multisampled_null_render_target_bo;
-
-      struct {
-         struct ra_regs *regs;
-
-         /**
-          * Array of the ra classes for the unaligned contiguous register
-          * block sizes used, indexed by register size.
-          */
-         int classes[16];
-
-         /**
-          * Mapping for register-allocated objects in *regs to the first
-          * GRF for that object.
-          */
-         uint8_t *ra_reg_to_grf;
-
-         /**
-          * ra class for the aligned pairs we use for PLN, which doesn't
-          * appear in *classes.
-          */
-         int aligned_pairs_class;
-      } reg_sets[2];
    } wm;
@@ -1607,10 +1565,10 @@ void brw_upload_cs_urb_state(struct brw_context *brw);
 
 /* brw_fs_reg_allocate.cpp
  */
-void brw_fs_alloc_reg_sets(struct brw_context *brw);
+void brw_fs_alloc_reg_sets(struct intel_screen *screen);
 
 /* brw_vec4_reg_allocate.cpp */
-void brw_vec4_alloc_reg_set(struct brw_context *brw);
+void brw_vec4_alloc_reg_set(struct intel_screen *screen);
 
 /* brw_disasm.c */
 int brw_disasm (FILE *file, struct brw_instruction *inst, int gen);

src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp

@@ -71,8 +71,9 @@ fs_visitor::assign_regs_trivial()
 }
 
 static void
-brw_alloc_reg_set(struct brw_context *brw, int reg_width)
+brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
 {
+   const struct brw_device_info *devinfo = screen->devinfo;
    int base_reg_count = BRW_MAX_GRF / reg_width;
    int index = reg_width - 1;
@@ -102,7 +103,7 @@ brw_alloc_reg_set(struct brw_context *brw, int reg_width)
    int class_count;
    int class_sizes[BRW_MAX_MRF];
 
-   if (brw->gen >= 7) {
+   if (devinfo->gen >= 7) {
       for (class_count = 0; class_count < MAX_SAMPLER_MESSAGE_SIZE;
            class_count++)
          class_sizes[class_count] = class_count + 1;
@@ -118,11 +119,11 @@ brw_alloc_reg_set(struct brw_context *brw, int reg_width)
       ra_reg_count += base_reg_count - (class_sizes[i] - 1);
    }
 
-   uint8_t *ra_reg_to_grf = ralloc_array(brw, uint8_t, ra_reg_count);
-   struct ra_regs *regs = ra_alloc_reg_set(brw, ra_reg_count);
-   if (brw->gen >= 6)
+   uint8_t *ra_reg_to_grf = ralloc_array(screen, uint8_t, ra_reg_count);
+   struct ra_regs *regs = ra_alloc_reg_set(screen, ra_reg_count);
+   if (devinfo->gen >= 6)
       ra_set_allocate_round_robin(regs);
-   int *classes = ralloc_array(brw, int, class_count);
+   int *classes = ralloc_array(screen, int, class_count);
    int aligned_pairs_class = -1;
 
    /* Now, add the registers to their classes, and add the conflicts
@@ -160,7 +161,7 @@ brw_alloc_reg_set(struct brw_context *brw, int reg_width)
    /* Add a special class for aligned pairs, which we'll put delta_x/y
    * in on gen5 so that we can do PLN.
    */
-   if (brw->has_pln && reg_width == 1 && brw->gen < 6) {
+   if (devinfo->has_pln && reg_width == 1 && devinfo->gen < 6) {
      aligned_pairs_class = ra_alloc_reg_class(regs);
 
      for (int i = 0; i < pairs_reg_count; i++) {
@@ -172,20 +173,20 @@ brw_alloc_reg_set(struct brw_context *brw, int reg_width)
 
    ra_set_finalize(regs, NULL);
 
-   brw->wm.reg_sets[index].regs = regs;
-   for (unsigned i = 0; i < ARRAY_SIZE(brw->wm.reg_sets[index].classes); i++)
-      brw->wm.reg_sets[index].classes[i] = -1;
+   screen->wm_reg_sets[index].regs = regs;
+   for (unsigned i = 0; i < ARRAY_SIZE(screen->wm_reg_sets[index].classes); i++)
+      screen->wm_reg_sets[index].classes[i] = -1;
    for (int i = 0; i < class_count; i++)
-      brw->wm.reg_sets[index].classes[class_sizes[i] - 1] = classes[i];
-   brw->wm.reg_sets[index].ra_reg_to_grf = ra_reg_to_grf;
-   brw->wm.reg_sets[index].aligned_pairs_class = aligned_pairs_class;
+      screen->wm_reg_sets[index].classes[class_sizes[i] - 1] = classes[i];
+   screen->wm_reg_sets[index].ra_reg_to_grf = ra_reg_to_grf;
+   screen->wm_reg_sets[index].aligned_pairs_class = aligned_pairs_class;
 }
 
 void
-brw_fs_alloc_reg_sets(struct brw_context *brw)
+brw_fs_alloc_reg_sets(struct intel_screen *screen)
 {
-   brw_alloc_reg_set(brw, 1);
-   brw_alloc_reg_set(brw, 2);
+   brw_alloc_reg_set(screen, 1);
+   brw_alloc_reg_set(screen, 2);
 }
 
 int
@@ -420,6 +421,7 @@ fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
 bool
 fs_visitor::assign_regs(bool allow_spilling)
 {
+   struct intel_screen *screen = brw->intelScreen;
    /* Most of this allocation was written for a reg_width of 1
     * (dispatch_width == 8). In extending to SIMD16, the code was
     * left in place and it was converted to have the hardware
@@ -430,7 +432,7 @@ fs_visitor::assign_regs(bool allow_spilling)
    int hw_reg_mapping[this->virtual_grf_count];
    int payload_node_count = (ALIGN(this->first_non_payload_grf, reg_width) /
                              reg_width);
-   int rsi = reg_width - 1; /* Which brw->wm.reg_sets[] to use */
+   int rsi = reg_width - 1; /* Which screen->wm_reg_sets[] to use */
    calculate_live_intervals();
 
    int node_count = this->virtual_grf_count;
@@ -439,16 +441,16 @@ fs_visitor::assign_regs(bool allow_spilling)
    int first_mrf_hack_node = node_count;
    if (brw->gen >= 7)
       node_count += BRW_MAX_GRF - GEN7_MRF_HACK_START;
-   struct ra_graph *g = ra_alloc_interference_graph(brw->wm.reg_sets[rsi].regs,
+   struct ra_graph *g = ra_alloc_interference_graph(screen->wm_reg_sets[rsi].regs,
                                                     node_count);
 
    for (int i = 0; i < this->virtual_grf_count; i++) {
       unsigned size = this->virtual_grf_sizes[i];
      int c;
 
-      assert(size <= ARRAY_SIZE(brw->wm.reg_sets[rsi].classes) &&
+      assert(size <= ARRAY_SIZE(screen->wm_reg_sets[rsi].classes) &&
             "Register allocation relies on split_virtual_grfs()");
-      c = brw->wm.reg_sets[rsi].classes[size - 1];
+      c = screen->wm_reg_sets[rsi].classes[size - 1];
 
      /* Special case: on pre-GEN6 hardware that supports PLN, the
       * second operand of a PLN instruction needs to be an
@@ -459,9 +461,9 @@ fs_visitor::assign_regs(bool allow_spilling)
       * any other interpolation modes). So all we need to do is find
       * that register and set it to the appropriate class.
       */
-      if (brw->wm.reg_sets[rsi].aligned_pairs_class >= 0 &&
+      if (screen->wm_reg_sets[rsi].aligned_pairs_class >= 0 &&
          this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg == i) {
-         c = brw->wm.reg_sets[rsi].aligned_pairs_class;
+         c = screen->wm_reg_sets[rsi].aligned_pairs_class;
      }
 
      ra_set_node_class(g, i, c);
@@ -514,7 +516,7 @@ fs_visitor::assign_regs(bool allow_spilling)
    for (int i = 0; i < this->virtual_grf_count; i++) {
      int reg = ra_get_node_reg(g, i);
 
-      hw_reg_mapping[i] = brw->wm.reg_sets[rsi].ra_reg_to_grf[reg] * reg_width;
+      hw_reg_mapping[i] = screen->wm_reg_sets[rsi].ra_reg_to_grf[reg] * reg_width;
      this->grf_used = MAX2(this->grf_used,
                            hw_reg_mapping[i] + this->virtual_grf_sizes[i] *
                            reg_width);
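Worth noting in the hunks above: the tables are now ralloc'ed with the screen (rather than the context) as their memory parent. ralloc is Mesa's hierarchical allocator, in which freeing a parent also frees every child allocation, so the register sets automatically live exactly as long as the screen. A minimal sketch of that semantics (the function and variable names are illustrative; the header path varies by Mesa tree):

#include "ralloc.h"

static void ralloc_lifetime_demo(void)
{
   void *screen_mem = ralloc_context(NULL);          /* parent node */
   int *classes = ralloc_array(screen_mem, int, 16); /* child of the parent */

   /* ... classes stays valid for as long as screen_mem is alive ... */

   ralloc_free(screen_mem);  /* frees classes (and all other children) too */
}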

src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp

@@ -97,9 +97,10 @@ vec4_visitor::reg_allocate_trivial()
 }
 
 extern "C" void
-brw_vec4_alloc_reg_set(struct brw_context *brw)
+brw_vec4_alloc_reg_set(struct intel_screen *screen)
 {
-   int base_reg_count = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
+   int base_reg_count =
+      screen->devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
 
    /* After running split_virtual_grfs(), almost all VGRFs will be of size 1.
    * SEND-from-GRF sources cannot be split, so we also need classes for each
@@ -114,14 +115,14 @@ brw_vec4_alloc_reg_set(struct brw_context *brw)
      ra_reg_count += base_reg_count - (class_sizes[i] - 1);
    }
 
-   ralloc_free(brw->vec4.ra_reg_to_grf);
-   brw->vec4.ra_reg_to_grf = ralloc_array(brw, uint8_t, ra_reg_count);
-   ralloc_free(brw->vec4.regs);
-   brw->vec4.regs = ra_alloc_reg_set(brw, ra_reg_count);
-   if (brw->gen >= 6)
-      ra_set_allocate_round_robin(brw->vec4.regs);
-   ralloc_free(brw->vec4.classes);
-   brw->vec4.classes = ralloc_array(brw, int, class_count);
+   ralloc_free(screen->vec4_reg_set.ra_reg_to_grf);
+   screen->vec4_reg_set.ra_reg_to_grf = ralloc_array(screen, uint8_t, ra_reg_count);
+   ralloc_free(screen->vec4_reg_set.regs);
+   screen->vec4_reg_set.regs = ra_alloc_reg_set(screen, ra_reg_count);
+   if (screen->devinfo->gen >= 6)
+      ra_set_allocate_round_robin(screen->vec4_reg_set.regs);
+   ralloc_free(screen->vec4_reg_set.classes);
+   screen->vec4_reg_set.classes = ralloc_array(screen, int, class_count);
 
    /* Now, add the registers to their classes, and add the conflicts
    * between them and the base GRF registers (and also each other).
@@ -129,17 +130,17 @@ brw_vec4_alloc_reg_set(struct brw_context *brw)
    int reg = 0;
    for (int i = 0; i < class_count; i++) {
      int class_reg_count = base_reg_count - (class_sizes[i] - 1);
-      brw->vec4.classes[i] = ra_alloc_reg_class(brw->vec4.regs);
+      screen->vec4_reg_set.classes[i] = ra_alloc_reg_class(screen->vec4_reg_set.regs);
 
      for (int j = 0; j < class_reg_count; j++) {
-         ra_class_add_reg(brw->vec4.regs, brw->vec4.classes[i], reg);
+         ra_class_add_reg(screen->vec4_reg_set.regs, screen->vec4_reg_set.classes[i], reg);
 
-         brw->vec4.ra_reg_to_grf[reg] = j;
+         screen->vec4_reg_set.ra_reg_to_grf[reg] = j;
 
         for (int base_reg = j;
              base_reg < j + class_sizes[i];
              base_reg++) {
-            ra_add_transitive_reg_conflict(brw->vec4.regs, base_reg, reg);
+            ra_add_transitive_reg_conflict(screen->vec4_reg_set.regs, base_reg, reg);
         }
 
         reg++;
@@ -147,7 +148,7 @@ brw_vec4_alloc_reg_set(struct brw_context *brw)
    }
    assert(reg == ra_reg_count);
 
-   ra_set_finalize(brw->vec4.regs, NULL);
+   ra_set_finalize(screen->vec4_reg_set.regs, NULL);
 }
 
 void
@@ -177,6 +178,7 @@ vec4_visitor::setup_payload_interference(struct ra_graph *g,
 bool
 vec4_visitor::reg_allocate()
 {
+   struct intel_screen *screen = brw->intelScreen;
    unsigned int hw_reg_mapping[virtual_grf_count];
    int payload_reg_count = this->first_non_payload_grf;
@@ -192,13 +194,13 @@ vec4_visitor::reg_allocate()
    int first_payload_node = node_count;
    node_count += payload_reg_count;
    struct ra_graph *g =
-      ra_alloc_interference_graph(brw->vec4.regs, node_count);
+      ra_alloc_interference_graph(screen->vec4_reg_set.regs, node_count);
 
    for (int i = 0; i < virtual_grf_count; i++) {
      int size = this->virtual_grf_sizes[i];
      assert(size >= 1 && size <= 2 &&
             "Register allocation relies on split_virtual_grfs().");
-      ra_set_node_class(g, i, brw->vec4.classes[size - 1]);
+      ra_set_node_class(g, i, screen->vec4_reg_set.classes[size - 1]);
 
      for (int j = 0; j < i; j++) {
         if (virtual_grf_interferes(i, j)) {
@@ -234,7 +236,7 @@ vec4_visitor::reg_allocate()
    for (int i = 0; i < virtual_grf_count; i++) {
      int reg = ra_get_node_reg(g, i);
 
-      hw_reg_mapping[i] = brw->vec4.ra_reg_to_grf[reg];
+      hw_reg_mapping[i] = screen->vec4_reg_set.ra_reg_to_grf[reg];
      prog_data->total_grf = MAX2(prog_data->total_grf,
                                  hw_reg_mapping[i] + virtual_grf_sizes[i]);
    }

src/mesa/drivers/dri/i965/intel_screen.c

@@ -1354,6 +1354,9 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
    psp->extensions = !intelScreen->has_context_reset_notification
       ? intelScreenExtensions : intelRobustScreenExtensions;
 
+   brw_fs_alloc_reg_sets(intelScreen);
+   brw_vec4_alloc_reg_set(intelScreen);
+
    return (const __DRIconfig**) intel_screen_make_configs(psp);
 }
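With the sets built here at screen initialization, context-side code only needs a pointer chase to reach them. A small illustrative helper (the function name is hypothetical; the fields and indexing are condensed from the assign_regs() hunk above) showing the lookup convention:

/* reg_width is 1 for SIMD8 and 2 for SIMD16; the screen precomputes one
 * FS register set per width, so reg_width - 1 indexes wm_reg_sets[].
 * The vec4 backend has a single set: screen->vec4_reg_set.regs.
 */
static struct ra_regs *
fs_reg_set_for_width(const struct brw_context *brw, int reg_width)
{
   return brw->intelScreen->wm_reg_sets[reg_width - 1].regs;
}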

src/mesa/drivers/dri/i965/intel_screen.h

@@ -62,6 +62,44 @@ struct intel_screen
     */
    unsigned program_id;
 
+   struct {
+      struct ra_regs *regs;
+
+      /**
+       * Array of the ra classes for the unaligned contiguous register
+       * block sizes used.
+       */
+      int *classes;
+
+      /**
+       * Mapping for register-allocated objects in *regs to the first
+       * GRF for that object.
+       */
+      uint8_t *ra_reg_to_grf;
+   } vec4_reg_set;
+
+   struct {
+      struct ra_regs *regs;
+
+      /**
+       * Array of the ra classes for the unaligned contiguous register
+       * block sizes used, indexed by register size.
+       */
+      int classes[16];
+
+      /**
+       * Mapping for register-allocated objects in *regs to the first
+       * GRF for that object.
+       */
+      uint8_t *ra_reg_to_grf;
+
+      /**
+       * ra class for the aligned pairs we use for PLN, which doesn't
+       * appear in *classes.
+       */
+      int aligned_pairs_class;
+   } wm_reg_sets[2];
+
    /**
    * Configuration cache with default values for all contexts
*/