i965: move OA accumulation code to intel/perf

We'll want to reuse this in our Vulkan extension.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Mark Janes <mark.a.janes@intel.com>
This commit is contained in:
Lionel Landwerlin 2018-06-08 15:29:51 +01:00
parent f6bba7760f
commit 41b54b5faf
5 changed files with 229 additions and 199 deletions

View File

@ -433,3 +433,138 @@ gen_perf_load_oa_metrics(struct gen_perf *perf, int fd,
return true;
}
/* Accumulate 32bits OA counters
 *
 * Adds (*report1 - *report0), computed modulo 2^32 so that a single counter
 * wrap between the two snapshots is absorbed, to *accumulator.
 */
static inline void
accumulate_uint32(const uint32_t *report0,
                  const uint32_t *report1,
                  uint64_t *accumulator)
{
   const uint32_t before = *report0;
   const uint32_t after = *report1;
   const uint32_t delta = after - before;

   *accumulator = *accumulator + delta;
}
/* Accumulate 40bits OA counters
 *
 * For A counter a_index, the low 32 bits are read from dword (4 + a_index)
 * of each report and the high 8 bits from byte a_index of the byte array
 * that starts at dword 40. The delta between the two snapshots is taken
 * modulo 2^40, which absorbs a single counter wrap, and added to
 * *accumulator.
 */
static inline void
accumulate_uint40(int a_index,
                  const uint32_t *report0,
                  const uint32_t *report1,
                  uint64_t *accumulator)
{
   const uint64_t mask40 = (1ULL << 40) - 1;
   const uint8_t *msb0 = (const uint8_t *)(report0 + 40);
   const uint8_t *msb1 = (const uint8_t *)(report1 + 40);
   const uint64_t begin =
      ((uint64_t)msb0[a_index] << 32) | report0[a_index + 4];
   const uint64_t finish =
      ((uint64_t)msb1[a_index] << 32) | report1[a_index + 4];

   /* Both values are below 2^40, so masking the 64bit difference yields
    * the same result as explicitly adding 2^40 when the counter wrapped.
    */
   *accumulator += (finish - begin) & mask40;
}
/* Decode the slice and unslice clock frequencies (written in Hz to
 * *slice_freq_hz and *unslice_freq_hz) from the first dword of an OA report.
 */
static void
gen8_read_report_clock_ratios(const uint32_t *report,
                              uint64_t *slice_freq_hz,
                              uint64_t *unslice_freq_hz)
{
   /* The RPT_ID field of the OA reports (report[0]) contains a snapshot of
    * bits coming from the RP_FREQ_NORMAL register, laid out this way:
    *
    * RPT_ID[31:25]: RP_FREQ_NORMAL[20:14] (low squashed_slice_clock_frequency)
    * RPT_ID[10:9]:  RP_FREQ_NORMAL[22:21] (high squashed_slice_clock_frequency)
    * RPT_ID[8:0]:   RP_FREQ_NORMAL[31:23] (squashed_unslice_clock_frequency)
    *
    * RP_FREQ_NORMAL[31:23]: Software Unslice Ratio Request
    *                        Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
    *
    * RP_FREQ_NORMAL[22:14]: Software Slice Ratio Request
    *                        Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
    */
   const uint32_t rpt_id = report[0];
   const uint64_t hz_per_ratio = 16666667ULL;

   const uint32_t unslice_ratio = rpt_id & 0x1ff;
   const uint32_t slice_ratio =
      ((rpt_id >> 25) & 0x7f) | (((rpt_id >> 9) & 0x3) << 7);

   *slice_freq_hz = slice_ratio * hz_per_ratio;
   *unslice_freq_hz = unslice_ratio * hz_per_ratio;
}
/**
 * Decode the slice/unslice frequencies found in the \p start and \p end OA
 * reports and store them in index 0 (start) and index 1 (end) of
 * result->slice_frequency[] and result->unslice_frequency[].
 *
 * Nothing is written when devinfo->gen is below 8.
 */
void
gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result,
                                       const struct gen_device_info *devinfo,
                                       const uint32_t *start,
                                       const uint32_t *end)
{
   const uint32_t *snapshots[2] = { start, end };

   /* Slice/Unslice frequency is only available in the OA reports when the
    * "Disable OA reports due to clock ratio change" field in
    * OA_DEBUG_REGISTER is set to 1. This is how the kernel programs this
    * global register (see drivers/gpu/drm/i915/i915_perf.c)
    *
    * Documentation says this should be available on Gen9+ but experimentation
    * shows that Gen8 reports similar values, so we enable it there too.
    */
   if (devinfo->gen >= 8) {
      for (int s = 0; s < 2; s++) {
         gen8_read_report_clock_ratios(snapshots[s],
                                       &result->slice_frequency[s],
                                       &result->unslice_frequency[s]);
      }
   }
}
/**
 * Add the per-counter deltas between two OA report snapshots to \p result.
 *
 * \param result  Accumulated results. It is expected to have been
 *                initialized with gen_perf_query_result_clear(), which sets
 *                hw_id to the 0xffffffff "invalid" marker tested below.
 * \param query   Query description; only query->oa_format is read, to
 *                select the report layout.
 * \param start   Earlier OA report (array of dwords).
 * \param end     Later OA report (array of dwords).
 *
 * Each call increments result->reports_accumulated. Deltas are written to
 * result->accumulator[] in this order:
 *
 *  - I915_OA_FORMAT_A32u40_A4u32_B8_C8: timestamp (dword 1), clock (dword 3),
 *    32 40bit A counters, 4 32bit A counters (dwords 36..39), then 8 B and
 *    8 C counters (dwords 48..63).
 *  - I915_OA_FORMAT_A45_B8_C8: timestamp (dword 1), then the 61 dwords
 *    starting at dword 3.
 */
void
gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
                                 const struct gen_perf_query_info *query,
                                 const uint32_t *start,
                                 const uint32_t *end)
{
   int i, idx = 0;

   /* Latch the context ID only from the first pair of reports accumulated
    * after a clear. Overwriting it on every call would replace the ID of
    * the context being measured with the ID found in whatever report starts
    * the current interval, while callers compare incoming reports against
    * result->hw_id to detect context switches.
    */
   if (result->hw_id == 0xffffffff)
      result->hw_id = start[2];

   result->reports_accumulated++;

   switch (query->oa_format) {
   case I915_OA_FORMAT_A32u40_A4u32_B8_C8:
      accumulate_uint32(start + 1, end + 1, result->accumulator + idx++); /* timestamp */
      accumulate_uint32(start + 3, end + 3, result->accumulator + idx++); /* clock */

      /* 32x 40bit A counters... */
      for (i = 0; i < 32; i++)
         accumulate_uint40(i, start, end, result->accumulator + idx++);

      /* 4x 32bit A counters... */
      for (i = 0; i < 4; i++)
         accumulate_uint32(start + 36 + i, end + 36 + i, result->accumulator + idx++);

      /* 8x 32bit B counters + 8x 32bit C counters... */
      for (i = 0; i < 16; i++)
         accumulate_uint32(start + 48 + i, end + 48 + i, result->accumulator + idx++);
      break;

   case I915_OA_FORMAT_A45_B8_C8:
      accumulate_uint32(start + 1, end + 1, result->accumulator); /* timestamp */

      for (i = 0; i < 61; i++)
         accumulate_uint32(start + 3 + i, end + 3 + i, result->accumulator + 1 + i);
      break;

   default:
      unreachable("Can't accumulate OA counters in unknown format");
   }
}
/**
 * Reset \p result: every byte is zeroed, then hw_id is set to 0xffffffff to
 * mark it as not yet known.
 */
void
gen_perf_query_result_clear(struct gen_perf_query_result *result)
{
   static const uint32_t invalid_hw_id = 0xffffffff;

   memset(result, 0, sizeof(struct gen_perf_query_result));
   result->hw_id = invalid_hw_id;
}

View File

@ -61,6 +61,44 @@ struct gen_pipeline_stat {
uint32_t denominator;
};
/*
 * Upper bound on the number of counters accumulated from one OA report.
 *
 * The largest OA formats we can use include:
 * For Haswell:
 *   1 timestamp, 45 A counters, 8 B counters and 8 C counters (62 values).
 * For Gen8+
 *   1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters
 *   (54 values).
 */
#define MAX_OA_REPORT_COUNTERS 62
/* Results accumulated from OA reports over the duration of a query. */
struct gen_perf_query_result {
/**
 * Storage for the final accumulated OA counters, one 64bit slot per
 * counter delta.
 */
uint64_t accumulator[MAX_OA_REPORT_COUNTERS];
/**
 * Hw ID used by the context on which the query was running.
 */
uint32_t hw_id;
/**
 * Number of reports accumulated to produce the results.
 */
uint32_t reports_accumulated;
/**
 * Frequency in the slices of the GT at the beginning ([0]) and end ([1])
 * of the query.
 */
uint64_t slice_frequency[2];
/**
 * Frequency in the unslice of the GT at the beginning ([0]) and end ([1])
 * of the query.
 */
uint64_t unslice_frequency[2];
};
struct gen_perf_query_counter {
const char *name;
const char *desc;
@ -208,38 +246,6 @@ gen_perf_query_info_add_basic_stat_reg(struct gen_perf_query_info *query,
gen_perf_query_info_add_stat_reg(query, reg, 1, 1, name, name);
}
/* Accumulate 32bits OA counters
 *
 * The difference between the two snapshots is truncated to 32 bits before
 * being added to *accumulator, so one counter wrap is handled.
 */
static inline void
gen_perf_query_accumulate_uint32(const uint32_t *report0,
                                 const uint32_t *report1,
                                 uint64_t *accumulator)
{
   const uint32_t wrapped_delta = (uint32_t)(report1[0] - report0[0]);

   accumulator[0] += wrapped_delta;
}
/* Accumulate 40bits OA counters
 *
 * Counter a_index is rebuilt from its low dword, at index (a_index + 4) of
 * the report, and its high byte, at byte a_index of the byte array starting
 * at dword 40. If the earlier value is larger than the later one, the
 * counter is assumed to have wrapped once past 2^40.
 */
static inline void
gen_perf_query_accumulate_uint40(int a_index,
                                 const uint32_t *report0,
                                 const uint32_t *report1,
                                 uint64_t *accumulator)
{
   const uint64_t wrap = 1ULL << 40;
   const uint8_t *upper0 = (const uint8_t *)(report0 + 40);
   const uint8_t *upper1 = (const uint8_t *)(report1 + 40);

   const uint64_t before =
      report0[a_index + 4] | ((uint64_t)upper0[a_index] << 32);
   const uint64_t after =
      report1[a_index + 4] | ((uint64_t)upper1[a_index] << 32);

   *accumulator += (before > after) ? (wrap + after - before)
                                    : (after - before);
}
static inline struct gen_perf *
gen_perf_new(void *ctx, int (*ioctl_cb)(int, unsigned long, void *))
{
@ -255,4 +261,15 @@ bool gen_perf_load_oa_metrics(struct gen_perf *perf, int fd,
bool gen_perf_load_metric_id(struct gen_perf *perf, const char *guid,
uint64_t *metric_id);
void gen_perf_query_result_read_frequencies(struct gen_perf_query_result *result,
const struct gen_device_info *devinfo,
const uint32_t *start,
const uint32_t *end);
void gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
const struct gen_perf_query_info *query,
const uint32_t *start,
const uint32_t *end);
void gen_perf_query_result_clear(struct gen_perf_query_result *result);
#endif /* GEN_PERF_H */

View File

@ -542,55 +542,6 @@ drop_from_unaccumulated_query_list(struct brw_context *brw,
reap_old_sample_buffers(brw);
}
/**
 * Accumulate into obj->oa.accumulator the delta of every counter between
 * the OA snapshots at \p start and \p end, and count one more accumulated
 * report in obj->oa.reports_accumulated.
 *
 * The snapshot layout is selected by obj->query->oa_format.
 */
static void
add_deltas(struct brw_context *brw,
           struct brw_perf_query_object *obj,
           const uint32_t *start,
           const uint32_t *end)
{
   uint64_t *out = obj->oa.accumulator;

   obj->oa.reports_accumulated++;

   switch (obj->query->oa_format) {
   case I915_OA_FORMAT_A45_B8_C8:
      /* timestamp, followed by the 61 dwords starting at dword 3 */
      gen_perf_query_accumulate_uint32(start + 1, end + 1, out);
      for (int n = 0; n < 61; n++) {
         gen_perf_query_accumulate_uint32(start + 3 + n, end + 3 + n,
                                          out + 1 + n);
      }
      break;

   case I915_OA_FORMAT_A32u40_A4u32_B8_C8:
      gen_perf_query_accumulate_uint32(start + 1, end + 1, out++); /* timestamp */
      gen_perf_query_accumulate_uint32(start + 3, end + 3, out++); /* clock */

      /* A counters 0..31 are 40bit wide... */
      for (int a = 0; a < 32; a++)
         gen_perf_query_accumulate_uint40(a, start, end, out++);

      /* ...A counters 32..35 are 32bit wide, in dwords 36..39... */
      for (int a = 32; a < 36; a++)
         gen_perf_query_accumulate_uint32(start + 4 + a, end + 4 + a, out++);

      /* ...then 8x 32bit B counters + 8x 32bit C counters in dwords 48..63 */
      for (int dw = 48; dw < 64; dw++)
         gen_perf_query_accumulate_uint32(start + dw, end + dw, out++);
      break;

   default:
      unreachable("Can't accumulate OA counters in unknown format");
   }
}
static bool
inc_n_oa_users(struct brw_context *brw)
{
@ -801,8 +752,6 @@ accumulate_oa_reports(struct brw_context *brw,
goto error;
}
obj->oa.hw_id = start[2];
/* See if we have any periodic reports to accumulate too... */
/* N.B. The oa.samples_head was set when the query began and
@ -856,11 +805,11 @@ accumulate_oa_reports(struct brw_context *brw,
* of OA counters while any other context is active.
*/
if (devinfo->gen >= 8) {
if (in_ctx && report[2] != obj->oa.hw_id) {
if (in_ctx && report[2] != obj->oa.result.hw_id) {
DBG("i915 perf: Switch AWAY (observed by ID change)\n");
in_ctx = false;
out_duration = 0;
} else if (in_ctx == false && report[2] == obj->oa.hw_id) {
} else if (in_ctx == false && report[2] == obj->oa.result.hw_id) {
DBG("i915 perf: Switch TO\n");
in_ctx = true;
@ -877,18 +826,20 @@ accumulate_oa_reports(struct brw_context *brw,
if (out_duration >= 1)
add = false;
} else if (in_ctx) {
assert(report[2] == obj->oa.hw_id);
assert(report[2] == obj->oa.result.hw_id);
DBG("i915 perf: Continuation IN\n");
} else {
assert(report[2] != obj->oa.hw_id);
assert(report[2] != obj->oa.result.hw_id);
DBG("i915 perf: Continuation OUT\n");
add = false;
out_duration++;
}
}
if (add)
add_deltas(brw, obj, last, report);
if (add) {
gen_perf_query_result_accumulate(&obj->oa.result, obj->query,
last, report);
}
last = report;
@ -907,7 +858,8 @@ accumulate_oa_reports(struct brw_context *brw,
end:
add_deltas(brw, obj, last, end);
gen_perf_query_result_accumulate(&obj->oa.result, obj->query,
last, end);
DBG("Marking %d accumulated - results gathered\n", o->Id);
@ -1211,8 +1163,7 @@ brw_begin_perf_query(struct gl_context *ctx,
*/
buf->refcount++;
obj->oa.hw_id = 0xffffffff;
memset(obj->oa.accumulator, 0, sizeof(obj->oa.accumulator));
gen_perf_query_result_clear(&obj->oa.result);
obj->oa.results_accumulated = false;
add_to_unaccumulated_query_list(brw, obj);
@ -1381,62 +1332,15 @@ brw_is_perf_query_ready(struct gl_context *ctx,
return false;
}
/* Extract the slice and unslice frequencies from dword 0 of an OA report
 * and write them, in Hz, to *slice_freq_hz and *unslice_freq_hz.
 */
static void
gen8_read_report_clock_ratios(const uint32_t *report,
                              uint64_t *slice_freq_hz,
                              uint64_t *unslice_freq_hz)
{
   /* The RPT_ID field of the OA reports (dword 0) contains a snapshot of
    * bits coming from the RP_FREQ_NORMAL register, divided this way:
    *
    * RPT_ID[31:25]: RP_FREQ_NORMAL[20:14] (low squashed_slice_clock_frequency)
    * RPT_ID[10:9]:  RP_FREQ_NORMAL[22:21] (high squashed_slice_clock_frequency)
    * RPT_ID[8:0]:   RP_FREQ_NORMAL[31:23] (squashed_unslice_clock_frequency)
    *
    * RP_FREQ_NORMAL[31:23]: Software Unslice Ratio Request
    *                        Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
    *
    * RP_FREQ_NORMAL[22:14]: Software Slice Ratio Request
    *                        Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
    */
   const uint32_t id = report[0];

   uint32_t slice = (id >> 25) & 0x7f;   /* RPT_ID[31:25] -> ratio bits 6:0 */
   slice |= ((id >> 9) & 0x3) << 7;      /* RPT_ID[10:9]  -> ratio bits 8:7 */

   *slice_freq_hz = (uint64_t)slice * 16666667ULL;
   *unslice_freq_hz = (uint64_t)(id & 0x1ff) * 16666667ULL;
}
static void
read_slice_unslice_frequencies(struct brw_context *brw,
struct brw_perf_query_object *obj)
{
const struct gen_device_info *devinfo = &brw->screen->devinfo;
uint32_t *begin_report, *end_report;
uint32_t *begin_report = obj->oa.map, *end_report = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
/* Slice/Unslice frequency is only available in the OA reports when the
* "Disable OA reports due to clock ratio change" field in
* OA_DEBUG_REGISTER is set to 1. This is how the kernel programs this
* global register (see drivers/gpu/drm/i915/i915_perf.c)
*
* Documentation says this should be available on Gen9+ but experimentation
* shows that Gen8 reports similar values, so we enable it there too.
*/
if (devinfo->gen < 8)
return;
begin_report = obj->oa.map;
end_report = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
gen8_read_report_clock_ratios(begin_report,
&obj->oa.slice_frequency[0],
&obj->oa.unslice_frequency[0]);
gen8_read_report_clock_ratios(end_report,
&obj->oa.slice_frequency[1],
&obj->oa.unslice_frequency[1]);
gen_perf_query_result_read_frequencies(&obj->oa.result,
devinfo, begin_report, end_report);
}
static void
@ -1488,13 +1392,15 @@ get_oa_counter_data(struct brw_context *brw,
switch (counter->data_type) {
case GEN_PERF_COUNTER_DATA_TYPE_UINT64:
out_uint64 = (uint64_t *)(data + counter->offset);
*out_uint64 = counter->oa_counter_read_uint64(perf, query,
obj->oa.accumulator);
*out_uint64 =
counter->oa_counter_read_uint64(perf, query,
obj->oa.result.accumulator);
break;
case GEN_PERF_COUNTER_DATA_TYPE_FLOAT:
out_float = (float *)(data + counter->offset);
*out_float = counter->oa_counter_read_float(perf, query,
obj->oa.accumulator);
*out_float =
counter->oa_counter_read_float(perf, query,
obj->oa.result.accumulator);
break;
default:
/* So far we aren't using uint32, double or bool32... */

View File

@ -28,6 +28,8 @@
#include "brw_context.h"
#include "perf/gen_perf.h"
struct gen_perf_query_info;
/*
@ -38,15 +40,6 @@ struct gen_perf_query_info;
#define STATS_BO_END_OFFSET_BYTES (STATS_BO_SIZE / 2)
#define MAX_STAT_COUNTERS (STATS_BO_END_OFFSET_BYTES / 8)
/*
* The largest OA formats we can use include:
* For Haswell:
* 1 timestamp, 45 A counters, 8 B counters and 8 C counters.
* For Gen8+
* 1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters
*/
#define MAX_OA_REPORT_COUNTERS 62
/**
* i965 representation of a performance query object.
*
@ -93,16 +86,6 @@ struct brw_perf_query_object
*/
struct exec_node *samples_head;
/**
* Storage for the final accumulated OA counters.
*/
uint64_t accumulator[MAX_OA_REPORT_COUNTERS];
/**
* Hw ID used by the context on which the query was running.
*/
uint32_t hw_id;
/**
* false while in the unaccumulated_elements list, and set to
* true when the final, end MI_RPC snapshot has been
@ -110,27 +93,15 @@ struct brw_perf_query_object
*/
bool results_accumulated;
/**
* Number of reports accumulated to produce the results.
*/
uint32_t reports_accumulated;
/**
* Frequency of the GT at begin and end of the query.
*/
uint64_t gt_frequency[2];
/**
* Frequency in the slices of the GT at the begin and end of the
* query.
* Accumulated OA results between begin and end of the query.
*/
uint64_t slice_frequency[2];
/**
* Frequency in the unslice of the GT at the begin and end of the
* query.
*/
uint64_t unslice_frequency[2];
struct gen_perf_query_result result;
} oa;
struct {

View File

@ -34,6 +34,7 @@ brw_perf_query_get_mdapi_oa_data(struct brw_context *brw,
uint8_t *data)
{
const struct gen_device_info *devinfo = &brw->screen->devinfo;
const struct gen_perf_query_result *result = &obj->oa.result;
switch (devinfo->gen) {
case 7: {
@ -45,15 +46,15 @@ brw_perf_query_get_mdapi_oa_data(struct brw_context *brw,
assert(devinfo->is_haswell);
for (int i = 0; i < ARRAY_SIZE(mdapi_data->ACounters); i++)
mdapi_data->ACounters[i] = obj->oa.accumulator[1 + i];
mdapi_data->ACounters[i] = result->accumulator[1 + i];
for (int i = 0; i < ARRAY_SIZE(mdapi_data->NOACounters); i++) {
mdapi_data->NOACounters[i] =
obj->oa.accumulator[1 + ARRAY_SIZE(mdapi_data->ACounters) + i];
result->accumulator[1 + ARRAY_SIZE(mdapi_data->ACounters) + i];
}
mdapi_data->ReportsCount = obj->oa.reports_accumulated;
mdapi_data->TotalTime = brw_timebase_scale(brw, obj->oa.accumulator[0]);
mdapi_data->ReportsCount = result->reports_accumulated;
mdapi_data->TotalTime = brw_timebase_scale(brw, result->accumulator[0]);
mdapi_data->CoreFrequency = obj->oa.gt_frequency[1];
mdapi_data->CoreFrequencyChanged = obj->oa.gt_frequency[0] != obj->oa.gt_frequency[1];
return sizeof(*mdapi_data);
@ -65,20 +66,20 @@ brw_perf_query_get_mdapi_oa_data(struct brw_context *brw,
return 0;
for (int i = 0; i < ARRAY_SIZE(mdapi_data->OaCntr); i++)
mdapi_data->OaCntr[i] = obj->oa.accumulator[2 + i];
mdapi_data->OaCntr[i] = result->accumulator[2 + i];
for (int i = 0; i < ARRAY_SIZE(mdapi_data->NoaCntr); i++) {
mdapi_data->NoaCntr[i] =
obj->oa.accumulator[2 + ARRAY_SIZE(mdapi_data->OaCntr) + i];
result->accumulator[2 + ARRAY_SIZE(mdapi_data->OaCntr) + i];
}
mdapi_data->ReportId = obj->oa.hw_id;
mdapi_data->ReportsCount = obj->oa.reports_accumulated;
mdapi_data->TotalTime = brw_timebase_scale(brw, obj->oa.accumulator[0]);
mdapi_data->GPUTicks = obj->oa.accumulator[1];
mdapi_data->ReportId = result->hw_id;
mdapi_data->ReportsCount = result->reports_accumulated;
mdapi_data->TotalTime = brw_timebase_scale(brw, result->accumulator[0]);
mdapi_data->GPUTicks = result->accumulator[1];
mdapi_data->CoreFrequency = obj->oa.gt_frequency[1];
mdapi_data->CoreFrequencyChanged = obj->oa.gt_frequency[0] != obj->oa.gt_frequency[1];
mdapi_data->SliceFrequency = (obj->oa.slice_frequency[0] + obj->oa.slice_frequency[1]) / 2ULL;
mdapi_data->UnsliceFrequency = (obj->oa.unslice_frequency[0] + obj->oa.unslice_frequency[1]) / 2ULL;
mdapi_data->SliceFrequency = (result->slice_frequency[0] + result->slice_frequency[1]) / 2ULL;
mdapi_data->UnsliceFrequency = (result->unslice_frequency[0] + result->unslice_frequency[1]) / 2ULL;
return sizeof(*mdapi_data);
}
@ -91,20 +92,20 @@ brw_perf_query_get_mdapi_oa_data(struct brw_context *brw,
return 0;
for (int i = 0; i < ARRAY_SIZE(mdapi_data->OaCntr); i++)
mdapi_data->OaCntr[i] = obj->oa.accumulator[2 + i];
mdapi_data->OaCntr[i] = result->accumulator[2 + i];
for (int i = 0; i < ARRAY_SIZE(mdapi_data->NoaCntr); i++) {
mdapi_data->NoaCntr[i] =
obj->oa.accumulator[2 + ARRAY_SIZE(mdapi_data->OaCntr) + i];
result->accumulator[2 + ARRAY_SIZE(mdapi_data->OaCntr) + i];
}
mdapi_data->ReportId = obj->oa.hw_id;
mdapi_data->ReportsCount = obj->oa.reports_accumulated;
mdapi_data->TotalTime = brw_timebase_scale(brw, obj->oa.accumulator[0]);
mdapi_data->GPUTicks = obj->oa.accumulator[1];
mdapi_data->ReportId = result->hw_id;
mdapi_data->ReportsCount = result->reports_accumulated;
mdapi_data->TotalTime = brw_timebase_scale(brw, result->accumulator[0]);
mdapi_data->GPUTicks = result->accumulator[1];
mdapi_data->CoreFrequency = obj->oa.gt_frequency[1];
mdapi_data->CoreFrequencyChanged = obj->oa.gt_frequency[0] != obj->oa.gt_frequency[1];
mdapi_data->SliceFrequency = (obj->oa.slice_frequency[0] + obj->oa.slice_frequency[1]) / 2ULL;
mdapi_data->UnsliceFrequency = (obj->oa.unslice_frequency[0] + obj->oa.unslice_frequency[1]) / 2ULL;
mdapi_data->SliceFrequency = (result->slice_frequency[0] + result->slice_frequency[1]) / 2ULL;
mdapi_data->UnsliceFrequency = (result->unslice_frequency[0] + result->unslice_frequency[1]) / 2ULL;
return sizeof(*mdapi_data);
}