intel/perf: prep work to enable new perf counters

These counters are not part of the OA reports and need some additional
scaffolding. They are only available when doing queries, as we need to
emit MI_SRMs to record them.

Equations making use of those counters are not there yet; they will
come in a follow-up commit updating a bunch of oa-*.xml files.

v2: Fix typo

v3: Use PERF_CNT_VALUE_MASK (Marcin)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6518>
This commit is contained in:
Lionel Landwerlin 2019-07-04 20:34:28 +03:00 committed by Marge Bot
parent 969f6efbc2
commit a6e980e9bf
8 changed files with 72 additions and 61 deletions

View File

@ -423,6 +423,7 @@ init_oa_sys_vars(struct gen_perf_config *perf, const struct gen_device_info *dev
perf->sys_vars.gt_max_freq = max_freq_mhz * 1000000;
perf->sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
perf->sys_vars.revision = devinfo->revision;
perf->sys_vars.query_mode = true;
compute_topology_builtins(perf, devinfo);
return true;
@ -1117,6 +1118,18 @@ gen_perf_query_result_read_gt_frequency(struct gen_perf_query_result *result,
result->gt_frequency[1] *= 1000000ULL;
}
/**
 * Store the PERFCNT register deltas of a query into its result.
 *
 * For each of the 2 PERFCNT registers, the difference between the end and
 * start snapshots is written to
 * result->accumulator[query->perfcnt_offset + i].
 *
 * Only the bits selected by PERF_CNT_VALUE_MASK carry the counter value, so
 * both snapshots are masked before subtracting. The difference is masked as
 * well: if the counter wrapped between the two snapshots (masked end smaller
 * than masked start), the unsigned subtraction underflows and only the masked
 * low bits hold the number of elapsed counts.
 *
 * NOTE(review): the wrap handling assumes PERF_CNT_VALUE_MASK is a contiguous
 * low-bit mask -- confirm against its definition.
 */
void
gen_perf_query_result_read_perfcnts(struct gen_perf_query_result *result,
                                    const struct gen_perf_query_info *query,
                                    const uint64_t *start,
                                    const uint64_t *end)
{
   for (uint32_t i = 0; i < 2; i++) {
      const uint64_t start_value = start[i] & PERF_CNT_VALUE_MASK;
      const uint64_t end_value = end[i] & PERF_CNT_VALUE_MASK;

      result->accumulator[query->perfcnt_offset + i] =
         (end_value - start_value) & PERF_CNT_VALUE_MASK;
   }
}
void
gen_perf_query_result_clear(struct gen_perf_query_result *result)
{

View File

@ -108,8 +108,10 @@ struct gen_pipeline_stat {
* 1 timestamp, 45 A counters, 8 B counters and 8 C counters.
* For Gen8+
* 1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters
*
* Plus 2 PERF_CNT registers.
*/
#define MAX_OA_REPORT_COUNTERS 62
#define MAX_OA_REPORT_COUNTERS (62 + 2)
/*
* We currently allocate only one page for pipeline statistics queries. Here
@ -180,10 +182,10 @@ struct gen_perf_query_counter {
union {
uint64_t (*oa_counter_read_uint64)(struct gen_perf_config *perf,
const struct gen_perf_query_info *query,
const uint64_t *accumulator);
const struct gen_perf_query_result *results);
float (*oa_counter_read_float)(struct gen_perf_config *perf,
const struct gen_perf_query_info *query,
const uint64_t *accumulator);
const struct gen_perf_query_result *results);
struct gen_pipeline_stat pipeline_stat;
};
};
@ -231,6 +233,7 @@ struct gen_perf_query_info {
int a_offset;
int b_offset;
int c_offset;
int perfcnt_offset;
struct gen_perf_registers config;
};
@ -282,6 +285,7 @@ struct gen_perf_config {
uint64_t gt_min_freq; /** $GpuMinFrequency */
uint64_t gt_max_freq; /** $GpuMaxFrequency */
uint64_t revision; /** $SkuRevisionId */
bool query_mode; /** $QueryMode */
} sys_vars;
/* OA metric sets, indexed by GUID, as known by Mesa at build time, to
@ -370,6 +374,13 @@ void gen_perf_query_result_read_gt_frequency(struct gen_perf_query_result *resul
const uint32_t start,
const uint32_t end);
/** Store PERFCNT registers values.
*/
void gen_perf_query_result_read_perfcnts(struct gen_perf_query_result *result,
const struct gen_perf_query_info *query,
const uint64_t *start,
const uint64_t *end);
/** Accumulate the delta between 2 OA reports into result for a given query.
*/
void gen_perf_query_result_accumulate(struct gen_perf_query_result *result,

View File

@ -94,7 +94,15 @@ def emit_fsub(tmp_id, args):
def emit_read(tmp_id, args):
type = args[1].lower()
c("uint64_t tmp{0} = accumulator[query->{1}_offset + {2}];".format(tmp_id, type, args[0]))
c("uint64_t tmp{0} = results->accumulator[query->{1}_offset + {2}];".format(tmp_id, type, args[0]))
return tmp_id + 1
def emit_read_reg(tmp_id, args):
    # Emit a C statement loading one PERFCNT slot of the accumulator into a
    # fresh temporary named tmp<tmp_id>.
    #
    # args[0] is the register name; it selects the slot relative to
    # query->perfcnt_offset. Any name other than 'PERFCNT1' / 'PERFCNT2'
    # raises KeyError.
    #
    # Returns the next free temporary id.
    slot = {'PERFCNT1': 0, 'PERFCNT2': 1}[args[0]]
    statement = "uint64_t tmp{0} = results->accumulator[query->perfcnt_offset + {1}];".format(tmp_id, slot)
    c(statement)
    return tmp_id + 1
def emit_uadd(tmp_id, args):
@ -144,6 +152,7 @@ ops["FMAX"] = (2, emit_fmax)
ops["FMUL"] = (2, emit_fmul)
ops["FSUB"] = (2, emit_fsub)
ops["READ"] = (2, emit_read)
ops["READ_REG"] = (1, emit_read_reg)
ops["UADD"] = (2, emit_uadd)
ops["UDIV"] = (2, emit_udiv)
ops["UMUL"] = (2, emit_umul)
@ -193,6 +202,7 @@ hw_vars["$GpuTimestampFrequency"] = "perf->sys_vars.timestamp_frequency"
hw_vars["$GpuMinFrequency"] = "perf->sys_vars.gt_min_freq"
hw_vars["$GpuMaxFrequency"] = "perf->sys_vars.gt_max_freq"
hw_vars["$SkuRevisionId"] = "perf->sys_vars.revision"
hw_vars["$QueryMode"] = "perf->sys_vars.query_mode"
def output_rpn_equation_code(set, counter, equation):
c("/* RPN equation: " + equation + " */")
@ -214,7 +224,7 @@ def output_rpn_equation_code(set, counter, equation):
operand = hw_vars[operand]
elif operand in set.counter_vars:
reference = set.counter_vars[operand]
operand = set.read_funcs[operand[1:]] + "(perf, query, accumulator)"
operand = set.read_funcs[operand[1:]] + "(perf, query, results)"
else:
raise Exception("Failed to resolve variable " + operand + " in equation " + equation + " for " + set.name + " :: " + counter.get('name'));
args.append(operand)
@ -234,7 +244,7 @@ def output_rpn_equation_code(set, counter, equation):
if value in hw_vars:
value = hw_vars[value]
if value in set.counter_vars:
value = set.read_funcs[value[1:]] + "(perf, query, accumulator)"
value = set.read_funcs[value[1:]] + "(perf, query, results)"
c("\nreturn " + value + ";")
@ -288,7 +298,7 @@ def output_counter_read(gen, set, counter):
c(counter.read_sym + "(UNUSED struct gen_perf_config *perf,\n")
c_indent(len(counter.read_sym) + 1)
c("const struct gen_perf_query_info *query,\n")
c("const uint64_t *accumulator)\n")
c("const struct gen_perf_query_result *results)\n")
c_outdent(len(counter.read_sym) + 1)
c("{")
@ -729,19 +739,21 @@ def main():
query->oa_format = I915_OA_FORMAT_A45_B8_C8;
/* Accumulation buffer offsets... */
query->gpu_time_offset = 0;
query->a_offset = 1;
query->b_offset = 46;
query->c_offset = 54;
query->a_offset = query->gpu_time_offset + 1;
query->b_offset = query->a_offset + 45;
query->c_offset = query->b_offset + 8;
query->perfcnt_offset = query->c_offset + 8;
"""))
else:
c(textwrap.dedent("""\
query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8;
/* Accumulation buffer offsets... */
query->gpu_time_offset = 0;
query->gpu_clock_offset = 1;
query->a_offset = 2;
query->b_offset = 38;
query->c_offset = 46;
query->gpu_clock_offset = query->gpu_time_offset + 1;
query->a_offset = query->gpu_clock_offset + 1;
query->b_offset = query->a_offset + 36;
query->c_offset = query->b_offset + 8;
query->perfcnt_offset = query->c_offset + 8;
"""))

View File

@ -54,6 +54,9 @@ gen_perf_query_result_write_mdapi(void *data, uint32_t data_size,
result->accumulator[1 + ARRAY_SIZE(mdapi_data->ACounters) + i];
}
mdapi_data->PerfCounter1 = result->accumulator[query->perfcnt_offset + 0];
mdapi_data->PerfCounter2 = result->accumulator[query->perfcnt_offset + 1];
mdapi_data->ReportsCount = result->reports_accumulated;
mdapi_data->TotalTime =
gen_device_info_timebase_scale(devinfo, result->accumulator[0]);
@ -75,6 +78,9 @@ gen_perf_query_result_write_mdapi(void *data, uint32_t data_size,
result->accumulator[2 + ARRAY_SIZE(mdapi_data->OaCntr) + i];
}
mdapi_data->PerfCounter1 = result->accumulator[query->perfcnt_offset + 0];
mdapi_data->PerfCounter2 = result->accumulator[query->perfcnt_offset + 1];
mdapi_data->ReportId = result->hw_id;
mdapi_data->ReportsCount = result->reports_accumulated;
mdapi_data->TotalTime =
@ -106,6 +112,9 @@ gen_perf_query_result_write_mdapi(void *data, uint32_t data_size,
result->accumulator[2 + ARRAY_SIZE(mdapi_data->OaCntr) + i];
}
mdapi_data->PerfCounter1 = result->accumulator[query->perfcnt_offset + 0];
mdapi_data->PerfCounter2 = result->accumulator[query->perfcnt_offset + 1];
mdapi_data->ReportId = result->hw_id;
mdapi_data->ReportsCount = result->reports_accumulated;
mdapi_data->TotalTime =
@ -354,5 +363,6 @@ gen_perf_register_mdapi_oa_query(struct gen_perf_config *perf,
query->a_offset = copy_query->a_offset;
query->b_offset = copy_query->b_offset;
query->c_offset = copy_query->c_offset;
query->perfcnt_offset = copy_query->perfcnt_offset;
}
}

View File

@ -132,41 +132,6 @@ int gen_perf_query_result_write_mdapi(void *data, uint32_t data_size,
const struct gen_perf_query_info *query,
const struct gen_perf_query_result *result);
/**
 * Write the deltas of the two perf counter snapshots into the
 * PerfCounter1/PerfCounter2 fields of an MDAPI metrics buffer.
 *
 * data is interpreted as a gen8_mdapi_metrics on gen 8 and as a
 * gen9_mdapi_metrics on gen 9 and 11. Nothing is written when data_size is
 * smaller than that structure, or on any other generation.
 *
 * begin_perf_cntrs / end_perf_cntrs each point to 2 raw 64bit register
 * values; they are only read when the buffer is actually written.
 */
static inline void gen_perf_query_mdapi_write_perfcntr(void *data, uint32_t data_size,
                                                       const struct gen_device_info *devinfo,
                                                       const uint64_t *begin_perf_cntrs,
                                                       const uint64_t *end_perf_cntrs)
{
   /* Only bits 0:43 of the 64bit registers contains the value. */
   const uint64_t value_mask = (1ull << 44) - 1;

   if (devinfo->gen == 8) {
      if (data_size < sizeof(struct gen8_mdapi_metrics))
         return;

      struct gen8_mdapi_metrics *metrics = data;
      metrics->PerfCounter1 =
         (end_perf_cntrs[0] & value_mask) - (begin_perf_cntrs[0] & value_mask);
      metrics->PerfCounter2 =
         (end_perf_cntrs[1] & value_mask) - (begin_perf_cntrs[1] & value_mask);
   } else if (devinfo->gen == 9 || devinfo->gen == 11) {
      if (data_size < sizeof(struct gen9_mdapi_metrics))
         return;

      struct gen9_mdapi_metrics *metrics = data;
      metrics->PerfCounter1 =
         (end_perf_cntrs[0] & value_mask) - (begin_perf_cntrs[0] & value_mask);
      metrics->PerfCounter2 =
         (end_perf_cntrs[1] & value_mask) - (begin_perf_cntrs[1] & value_mask);
   }
   /* Any other generation: nothing to write. */
}
static inline void gen_perf_query_mdapi_write_marker(void *data, uint32_t data_size,
const struct gen_device_info *devinfo,
uint64_t value)

View File

@ -1423,13 +1423,13 @@ get_oa_counter_data(struct gen_perf_context *perf_ctx,
out_uint64 = (uint64_t *)(data + counter->offset);
*out_uint64 =
counter->oa_counter_read_uint64(perf_cfg, queryinfo,
query->oa.result.accumulator);
&query->oa.result);
break;
case GEN_PERF_COUNTER_DATA_TYPE_FLOAT:
out_float = (float *)(data + counter->offset);
*out_float =
counter->oa_counter_read_float(perf_cfg, queryinfo,
query->oa.result.accumulator);
&query->oa.result);
break;
default:
/* So far we aren't using uint32, double or bool32... */

View File

@ -421,13 +421,13 @@ anv_perf_write_pass_results(struct gen_perf_config *perf,
results[c].uint64 =
counter_pass->counter->oa_counter_read_uint64(perf,
counter_pass->query,
accumulated_results->accumulator);
accumulated_results);
break;
case GEN_PERF_COUNTER_DATA_TYPE_FLOAT:
results[c].float32 =
counter_pass->counter->oa_counter_read_float(perf,
counter_pass->query,
accumulated_results->accumulator);
accumulated_results);
break;
default:
/* So far we aren't using uint32, double or bool32... */

View File

@ -326,7 +326,7 @@ intel_perf_rpstart_offset(bool end)
return 16 + (end ? sizeof(uint32_t) : 0);
}
#if GEN_GEN >= 8 && GEN_GEN <= 11
#if GEN_IS_HASWELL || (GEN_GEN >= 8 && GEN_GEN <= 11)
static uint32_t
intel_perf_counter(bool end)
{
@ -541,14 +541,14 @@ VkResult genX(GetQueryPoolResults)(
oa_begin, oa_end);
gen_perf_query_result_read_gt_frequency(&result, &device->info,
*rpstat_begin, *rpstat_end);
gen_perf_query_result_write_mdapi(pData, stride,
&device->info,
query, &result);
#if GEN_GEN >= 8 && GEN_GEN <= 11
gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
#if GEN_IS_HASWELL || (GEN_GEN >= 8 && GEN_GEN <= 11)
gen_perf_query_result_read_perfcnts(&result, query,
query_data + intel_perf_counter(false),
query_data + intel_perf_counter(true));
#endif
gen_perf_query_result_write_mdapi(pData, stride,
&device->info,
query, &result);
const uint64_t *marker = query_data + intel_perf_marker_offset();
gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
break;
@ -913,7 +913,7 @@ void genX(CmdBeginQueryIndexedEXT)(
intel_perf_rpstart_offset(false))),
gen_mi_reg32(GENX(RPSTAT0_num)));
#endif
#if GEN_GEN >= 8 && GEN_GEN <= 11
#if GEN_IS_HASWELL || (GEN_GEN >= 8 && GEN_GEN <= 11)
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
intel_perf_counter(false))),
gen_mi_reg64(GENX(PERFCNT1_num)));
@ -1047,7 +1047,7 @@ void genX(CmdEndQueryIndexedEXT)(
uint32_t marker_offset = intel_perf_marker_offset();
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
gen_mi_imm(cmd_buffer->intel_perf_marker));
#if GEN_GEN >= 8 && GEN_GEN <= 11
#if GEN_IS_HASWELL || (GEN_GEN >= 8 && GEN_GEN <= 11)
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
gen_mi_reg64(GENX(PERFCNT1_num)));
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),