nvc0: support MP performance counters on Maxwell

This adds some performance counters/metrics for SM50/SM52.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Tested-by: Pierre Moreau <pierre.morrow@free.fr>
This commit is contained in:
Samuel Pitoiset 2016-11-05 17:23:33 +01:00
parent b9578b683d
commit 561f2208bd
3 changed files with 721 additions and 3 deletions

View File

@ -376,6 +376,22 @@ static const struct nvc0_hw_metric_query_cfg *sm35_hw_metric_queries[] =
&sm35_warp_nonpred_execution_efficiency,
};
/* ==== Compute capability 5.0 (GM107/GM108) ==== */
/* Metric queries advertised for SM50. nvc0_hw_metric_get_queries() returns
 * this table for both GM107_3D_CLASS and GM200_3D_CLASS, and
 * nvc0_hw_metric_get_num_queries() reports its length via ARRAY_SIZE().
 * Every entry reuses a metric configuration named after an earlier
 * generation (sm20/sm30/sm35); no SM50-specific metric configuration is
 * referenced here. */
static const struct nvc0_hw_metric_query_cfg *sm50_hw_metric_queries[] =
{
&sm20_achieved_occupancy,
&sm20_branch_efficiency,
&sm30_inst_issued,
&sm20_inst_per_wrap,
&sm30_inst_replay_overhead,
&sm20_ipc,
&sm30_issued_ipc,
&sm30_issue_slots,
&sm30_issue_slot_utilization,
&sm30_warp_execution_efficiency,
&sm35_warp_nonpred_execution_efficiency,
};
#undef _SM
static inline const struct nvc0_hw_metric_query_cfg **
@ -384,6 +400,9 @@ nvc0_hw_metric_get_queries(struct nvc0_screen *screen)
struct nouveau_device *dev = screen->base.device;
switch (screen->base.class_3d) {
case GM200_3D_CLASS:
case GM107_3D_CLASS:
return sm50_hw_metric_queries;
case NVF0_3D_CLASS:
return sm35_hw_metric_queries;
case NVE4_3D_CLASS:
@ -403,6 +422,9 @@ nvc0_hw_metric_get_num_queries(struct nvc0_screen *screen)
struct nouveau_device *dev = screen->base.device;
switch (screen->base.class_3d) {
case GM200_3D_CLASS:
case GM107_3D_CLASS:
return ARRAY_SIZE(sm50_hw_metric_queries);
case NVF0_3D_CLASS:
return ARRAY_SIZE(sm35_hw_metric_queries);
case NVE4_3D_CLASS:
@ -660,6 +682,8 @@ nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0,
}
switch (screen->base.class_3d) {
case GM200_3D_CLASS:
case GM107_3D_CLASS:
case NVF0_3D_CLASS:
value = sm35_hw_metric_calc_result(hq, res64);
break;
@ -734,7 +758,7 @@ nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
if (id < count) {
if (screen->compute) {
if (screen->base.class_3d <= NVF0_3D_CLASS) {
if (screen->base.class_3d <= GM200_3D_CLASS) {
const struct nvc0_hw_metric_query_cfg **queries =
nvc0_hw_metric_get_queries(screen);
const struct nvc0_hw_metric_cfg *cfg =

View File

@ -37,6 +37,12 @@ static const struct {
const char *name;
const char *desc;
} nvc0_hw_sm_queries[] = {
_Q(ACTIVE_CTAS,
"active_ctas",
"Accumulated number of active blocks per cycle. For every cycle it "
"increments by the number of active blocks in the cycle which can be in "
"the range 0 to 32."),
_Q(ACTIVE_CYCLES,
"active_cycles",
"Number of cycles a multiprocessor has at least one active warp"),
@ -81,6 +87,20 @@ static const struct {
"128 bytes. For each extra cache line access the counter is incremented "
"by 1"),
_Q(GLOBAL_ATOM_CAS,
"global_atom_cas",
"Number of ATOM.CAS instructions executed per warp."),
_Q(GLOBAL_LD,
"global_load",
"Number of executed load instructions where state space is specified as "
"global, increments per warp on a multiprocessor."),
_Q(GLOBAL_ST,
"global_store",
"Number of executed store instructions where state space is specified as "
"global, increments per warp on a multiprocessor."),
_Q(GST_TRANSACTIONS,
"global_store_transaction",
"Number of global store transactions. Increments by 1 per transaction. "
@ -114,6 +134,11 @@ static const struct {
"inst_issued",
"Number of instructions issued including replays"),
_Q(INST_ISSUED0,
"inst_issued0",
"Number of cycles that did not issue any instruction, increments per "
"warp."),
_Q(INST_ISSUED1,
"inst_issued1",
"Number of single instruction issued per cycle"),
@ -260,11 +285,24 @@ static const struct {
"User profiled generic trigger that can be inserted in any place of the "
"code to collect the related information. Increments per warp."),
_Q(SHARED_ATOM,
"shared_atom",
"Number of ATOMS instructions executed per warp."),
_Q(SHARED_ATOM_CAS,
"shared_atom_cas",
"Number of ATOMS.CAS instructions executed per warp."),
_Q(SHARED_LD,
"shared_load",
"Number of executed load instructions where state space is specified as "
"shared, increments per warp on a multiprocessor"),
_Q(SHARED_LD_BANK_CONFLICT,
"shared_load_bank_conflict",
"Number of shared load bank conflict generated when the addresses for "
"two or more shared memory load requests fall in the same memory bank."),
_Q(SHARED_LD_REPLAY,
"shared_load_replay",
"Replays caused due to shared load bank conflict (when the addresses for "
@ -273,11 +311,23 @@ static const struct {
"threads in the warp executing that instruction exceed the number of words "
"that can be loaded in one cycle (256 bytes)"),
_Q(SHARED_LD_TRANSACTIONS,
"shared_ld_transactions",
"Number of transactions for shared load accesses. Maximum transaction "
"size in maxwell is 128 bytes, any warp accessing more that 128 bytes "
"will cause multiple transactions for a shared load instruction. This "
"also includes extra transactions caused by shared bank conflicts."),
_Q(SHARED_ST,
"shared_store",
"Number of executed store instructions where state space is specified as "
"shared, increments per warp on a multiprocessor"),
_Q(SHARED_ST_BANK_CONFLICT,
"shared_store_bank_conflict",
"Number of shared store bank conflict generated when the addresses for "
"two or more shared memory store requests fall in the same memory bank."),
_Q(SHARED_ST_REPLAY,
"shared_store_replay",
"Replays caused due to shared store bank conflict (when the addresses for "
@ -286,6 +336,13 @@ static const struct {
"threads in the warp executing that instruction exceed the number of words "
"that can be stored in one cycle"),
_Q(SHARED_ST_TRANSACTIONS,
"shared_st_transactions",
"Number of transactions for shared store accesses. Maximum transaction "
"size in maxwell is 128 bytes, any warp accessing more that 128 bytes "
"will cause multiple transactions for a shared store instruction. This "
"also includes extra transactions caused by shared bank conflicts."),
_Q(SM_CTA_LAUNCHED,
"sm_cta_launched",
"Number of thread blocks launched on a multiprocessor"),
@ -480,6 +537,54 @@ static const uint64_t nvf0_read_hw_sm_counters_code[] =
0x18000000001c003cULL,
};
/* Pre-assembled GM107 compute program that reads back the MP performance
 * counters; nvc0_hw_sm_get_program() selects it when class_3d >=
 * GM107_3D_CLASS. The trailing comment on each word is its disassembly.
 * Words are laid out in groups of four: one "sched" control word followed
 * by three instructions.
 *
 * Going by the disassembly: $pm0..$pm7 are copied into $r0..$r7, a 64-bit
 * address is formed from c7[0x620]/c7[0x624] plus offsets derived from
 * $virtid, threads for which $p0 is false exit early, and the counter
 * values plus the word loaded from c7[0x628] are written out with global
 * stores.
 * NOTE(review): the meaning of the c7[] slots and of the 0x60/0x10/0x4
 * multipliers is not visible here -- confirm against the code that sets up
 * the program parameters. */
static const uint64_t gm107_read_hw_sm_counters_code[] =
{
0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
0xf0c8000002170008ULL, /* mov $r8 $tidx */
0xf0c800000037000cULL, /* mov $r12 $virtid */
0xf0c8000000470000ULL, /* mov $r0 $pm0 */
0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
0xf0c8000000570001ULL, /* mov $r1 $pm1 */
0xf0c8000000670002ULL, /* mov $r2 $pm2 */
0xf0c8000000770003ULL, /* mov $r3 $pm3 */
0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
0xf0c8000000870004ULL, /* mov $r4 $pm4 */
0xf0c8000000970005ULL, /* mov $r5 $pm5 */
0xf0c8000000a70006ULL, /* mov $r6 $pm6 */
0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
0xf0c8000000b70007ULL, /* mov $r7 $pm7 */
0x5b6403800087ff07ULL, /* isetp eq u32 and $p0 0x1 0x0 $r8 0x1 */
0x4c98079c1887000aULL, /* mov $r10 c7[0x620] 0xf */
0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
0x3800000091470c08ULL, /* bfe u32 $r8 $r12 0x914 */
0x4c98079c1897000bULL, /* mov $r11 c7[0x624] 0xf */
0x3800000020870c09ULL, /* bfe u32 $r9 $r12 0x208 */
0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
0xe30000000008000fULL, /* not $p0 exit */
0x5b6403800097ff0fULL, /* isetp eq u32 and $p1 0x1 0x0 $r9 0x1 */
0x3838000006070808ULL, /* imul u32 u32 $r8 $r8 0x60 */
0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
0x383800000107090cULL, /* imul u32 u32 $r12 $r9 0x10 */
0x383800000047090dULL, /* imul u32 u32 $r13 $r9 0x4 */
0x5c10000000d70809ULL, /* iadd $r9 $r8 $r13 */
0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
0x5c10000000c70808ULL, /* iadd $r8 $r8 $r12 */
0x5c98078000a7000cULL, /* mov $r12 $r10 0xf */
0x5c10800000870a0aULL, /* iadd cc $r10 $r10 $r8 */
0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
0x5c98078000b7000dULL, /* mov $r13 $r11 0xf */
0x5c1008000ff70b0bULL, /* iadd x $r11 $r11 0x0 */
0x5c10800000970c0cULL, /* iadd cc $r12 $r12 $r9 */
0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
0x5c1008000ff70d0dULL, /* iadd x $r13 $r13 0x0 */
0xbfd0000000070a00ULL, /* st e wt b128 g[$r10] $r0 0x1 */
0x4c98079c18a70000ULL, /* mov $r0 c7[0x628] 0xf */
0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
0xbfd0000004010c04ULL, /* $p1 st e wt b128 g[$r12+0x40] $r4 0x1 */
0xbf90000005070c00ULL, /* st e wt b32 g[$r12+0x50] $r0 0x1 */
0xe30000000007000fULL, /* exit */
};
/* For simplicity, we will allocate as many group slots as we allocate counter
* slots. This means that a single counter which wants to source from 2 groups
* will have to be declared as using 2 counter slots. This shouldn't really be
@ -1082,6 +1187,556 @@ static const struct nvc0_hw_sm_query_cfg *sm35_hw_sm_queries[] =
&sm30_warps_launched,
};
/* ==== Compute capability 5.0 (GM107/GM108) ==== */
/* SM50 setup for the "active_ctas" query (accumulated number of active
 * blocks per cycle). One counter, programmed through _CB -- presumably the
 * "domain B" counters; confirm against the macro definition. */
static const struct nvc0_hw_sm_query_cfg
sm50_active_ctas =
{
.type = NVC0_HW_SM_QUERY_ACTIVE_CTAS,
.ctr[0] = _CB(0x003f, B6, 0x01, 0x29062080),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the "active_cycles" query (number of cycles a
 * multiprocessor has at least one active warp). One counter, programmed
 * through _CB. */
static const struct nvc0_hw_sm_query_cfg
sm50_active_cycles =
{
.type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
.ctr[0] = _CB(0x0001, B6, 0x00, 0x00000004),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the ACTIVE_WARPS query. Declared as a single counter slot
 * programmed through _CB.
 * NOTE(review): the enable-mask comment in nve4_hw_sm_begin_query() says
 * active_warps uses 3 counters in domain B -- confirm how that relates to
 * num_counters = 1 and the 0x003f first argument here. */
static const struct nvc0_hw_sm_query_cfg
sm50_active_warps =
{
.type = NVC0_HW_SM_QUERY_ACTIVE_WARPS,
.ctr[0] = _CB(0x003f, B6, 0x00, 0x398a4188),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the ATOM_COUNT query: one counter, programmed through _CA.
 * It has the same third _CA argument (0x14) as the SM50 global_atom_cas,
 * global_ld, global_st and gred_count configurations. */
static const struct nvc0_hw_sm_query_cfg
sm50_atom_count =
{
.type = NVC0_HW_SM_QUERY_ATOM_COUNT,
.ctr[0] = _CA(0x0001, B6, 0x14, 0x00000004),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the BRANCH query: one counter, programmed through _CA.
 * Also listed unchanged in the SM52 query table. */
static const struct nvc0_hw_sm_query_cfg
sm50_branch =
{
.type = NVC0_HW_SM_QUERY_BRANCH,
.ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000010),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the DIVERGENT_BRANCH query: one counter, programmed
 * through _CA with the same third argument (0x1a) as sm50_branch. */
static const struct nvc0_hw_sm_query_cfg
sm50_divergent_branch =
{
.type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
.ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000004),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the "global_atom_cas" query (number of ATOM.CAS
 * instructions executed per warp). One counter, programmed through _CA. */
static const struct nvc0_hw_sm_query_cfg
sm50_global_atom_cas =
{
.type = NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS,
.ctr[0] = _CA(0x0001, B6, 0x14, 0x00000000),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the "global_load" query (executed load instructions whose
 * state space is global, counted per warp). One counter, programmed
 * through _CA. */
static const struct nvc0_hw_sm_query_cfg
sm50_global_ld =
{
.type = NVC0_HW_SM_QUERY_GLOBAL_LD,
.ctr[0] = _CA(0x0001, B6, 0x14, 0x0000000c),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the "global_store" query (executed store instructions
 * whose state space is global, counted per warp). One counter, programmed
 * through _CA. */
static const struct nvc0_hw_sm_query_cfg
sm50_global_st =
{
.type = NVC0_HW_SM_QUERY_GLOBAL_ST,
.ctr[0] = _CA(0x0001, B6, 0x14, 0x00000010),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the GRED_COUNT query: one counter, programmed through
 * _CA. */
static const struct nvc0_hw_sm_query_cfg
sm50_gred_count =
{
.type = NVC0_HW_SM_QUERY_GRED_COUNT,
.ctr[0] = _CA(0x0001, B6, 0x14, 0x00000008),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the INST_EXECUTED query: one counter slot, programmed
 * through _CA. Unlike most SM50 entries its first _CA argument is 0x0003
 * rather than 0x0001. */
static const struct nvc0_hw_sm_query_cfg
sm50_inst_executed =
{
.type = NVC0_HW_SM_QUERY_INST_EXECUTED,
.ctr[0] = _CA(0x0003, B6, 0x02, 0x00000398),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the "inst_issued0" query (number of cycles that did not
 * issue any instruction). One counter, programmed through _CA. */
static const struct nvc0_hw_sm_query_cfg
sm50_inst_issued0 =
{
.type = NVC0_HW_SM_QUERY_INST_ISSUED0,
.ctr[0] = _CA(0x0001, B6, 0x02, 0x0000000c),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the "inst_issued1" query (number of single instruction
 * issued per cycle). One counter, programmed through _CA. */
static const struct nvc0_hw_sm_query_cfg
sm50_inst_issued1 =
{
.type = NVC0_HW_SM_QUERY_INST_ISSUED1,
.ctr[0] = _CA(0x0001, B6, 0x02, 0x00000010),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the INST_ISSUED2 query: one counter, programmed through
 * _CA with the same third argument (0x02) as inst_issued0/inst_issued1. */
static const struct nvc0_hw_sm_query_cfg
sm50_inst_issued2 =
{
.type = NVC0_HW_SM_QUERY_INST_ISSUED2,
.ctr[0] = _CA(0x0001, B6, 0x02, 0x00000014),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the LOCAL_LD query: one counter, programmed through
 * _CA. */
static const struct nvc0_hw_sm_query_cfg
sm50_local_ld =
{
.type = NVC0_HW_SM_QUERY_LOCAL_LD,
.ctr[0] = _CA(0x0001, B6, 0x13, 0x00000004),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the LOCAL_ST query: one counter, programmed through
 * _CA. */
static const struct nvc0_hw_sm_query_cfg
sm50_local_st =
{
.type = NVC0_HW_SM_QUERY_LOCAL_ST,
.ctr[0] = _CA(0x0001, B6, 0x13, 0x00000000),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the NOT_PRED_OFF_INST_EXECUTED query: one counter slot,
 * programmed through _CA. Its first and last _CA arguments (0x003f,
 * 0x29062080) match those of sm50_th_inst_executed; only the third
 * argument differs. */
static const struct nvc0_hw_sm_query_cfg
sm50_not_pred_off_inst_executed =
{
.type = NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED,
.ctr[0] = _CA(0x003f, B6, 0x05, 0x29062080),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the PROF_TRIGGER_0 query: one counter, programmed through
 * _CA. The eight prof_trigger configurations differ only in the last _CA
 * argument, which steps by 4 from 0x00 (this one) to 0x1c. */
static const struct nvc0_hw_sm_query_cfg
sm50_prof_trigger_0 =
{
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
.ctr[0] = _CA(0x0001, B6, 0x00, 0x00000000),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the PROF_TRIGGER_1 query: one counter, programmed through
 * _CA (last argument 0x04). */
static const struct nvc0_hw_sm_query_cfg
sm50_prof_trigger_1 =
{
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
.ctr[0] = _CA(0x0001, B6, 0x00, 0x00000004),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the PROF_TRIGGER_2 query: one counter, programmed through
 * _CA (last argument 0x08). */
static const struct nvc0_hw_sm_query_cfg
sm50_prof_trigger_2 =
{
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
.ctr[0] = _CA(0x0001, B6, 0x00, 0x00000008),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the PROF_TRIGGER_3 query: one counter, programmed through
 * _CA (last argument 0x0c). */
static const struct nvc0_hw_sm_query_cfg
sm50_prof_trigger_3 =
{
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
.ctr[0] = _CA(0x0001, B6, 0x00, 0x0000000c),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the PROF_TRIGGER_4 query: one counter, programmed through
 * _CA (last argument 0x10). */
static const struct nvc0_hw_sm_query_cfg
sm50_prof_trigger_4 =
{
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
.ctr[0] = _CA(0x0001, B6, 0x00, 0x00000010),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the PROF_TRIGGER_5 query: one counter, programmed through
 * _CA (last argument 0x14). */
static const struct nvc0_hw_sm_query_cfg
sm50_prof_trigger_5 =
{
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
.ctr[0] = _CA(0x0001, B6, 0x00, 0x00000014),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the PROF_TRIGGER_6 query: one counter, programmed through
 * _CA (last argument 0x18). */
static const struct nvc0_hw_sm_query_cfg
sm50_prof_trigger_6 =
{
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
.ctr[0] = _CA(0x0001, B6, 0x00, 0x00000018),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the PROF_TRIGGER_7 query: one counter, programmed through
 * _CA (last argument 0x1c). */
static const struct nvc0_hw_sm_query_cfg
sm50_prof_trigger_7 =
{
.type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
.ctr[0] = _CA(0x0001, B6, 0x00, 0x0000001c),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the "shared_atom" query (number of ATOMS instructions
 * executed per warp). One counter, programmed through _CA. */
static const struct nvc0_hw_sm_query_cfg
sm50_shared_atom =
{
.type = NVC0_HW_SM_QUERY_SHARED_ATOM,
.ctr[0] = _CA(0x0001, B6, 0x13, 0x00000014),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the "shared_atom_cas" query (number of ATOMS.CAS
 * instructions executed per warp). One counter, programmed through _CA. */
static const struct nvc0_hw_sm_query_cfg
sm50_shared_atom_cas =
{
.type = NVC0_HW_SM_QUERY_SHARED_ATOM_CAS,
.ctr[0] = _CA(0x0001, B6, 0x13, 0x00000010),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the "shared_load" query (executed load instructions whose
 * state space is shared, counted per warp). One counter, programmed
 * through _CA. */
static const struct nvc0_hw_sm_query_cfg
sm50_shared_ld =
{
.type = NVC0_HW_SM_QUERY_SHARED_LD,
.ctr[0] = _CA(0x0001, B6, 0x13, 0x00000008),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the "shared_load_bank_conflict" query (bank conflicts
 * generated when two or more shared memory load requests hit the same
 * bank). One counter, programmed through _CB. */
static const struct nvc0_hw_sm_query_cfg
sm50_shared_ld_bank_conflict =
{
.type = NVC0_HW_SM_QUERY_SHARED_LD_BANK_CONFLICT,
.ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000000),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the "shared_ld_transactions" query (number of
 * transactions for shared load accesses, including extra ones caused by
 * bank conflicts). One counter, programmed through _CB. */
static const struct nvc0_hw_sm_query_cfg
sm50_shared_ld_transactions =
{
.type = NVC0_HW_SM_QUERY_SHARED_LD_TRANSACTIONS,
.ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000008),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the "shared_store" query (executed store instructions
 * whose state space is shared, counted per warp). One counter, programmed
 * through _CA. */
static const struct nvc0_hw_sm_query_cfg
sm50_shared_st =
{
.type = NVC0_HW_SM_QUERY_SHARED_ST,
.ctr[0] = _CA(0x0001, B6, 0x13, 0x0000000c),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the "shared_store_bank_conflict" query (bank conflicts
 * generated when two or more shared memory store requests hit the same
 * bank). One counter, programmed through _CB. */
static const struct nvc0_hw_sm_query_cfg
sm50_shared_st_bank_conflict =
{
.type = NVC0_HW_SM_QUERY_SHARED_ST_BANK_CONFLICT,
.ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000004),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the "shared_st_transactions" query (number of
 * transactions for shared store accesses, including extra ones caused by
 * bank conflicts). One counter, programmed through _CB. */
static const struct nvc0_hw_sm_query_cfg
sm50_shared_st_transactions =
{
.type = NVC0_HW_SM_QUERY_SHARED_ST_TRANSACTIONS,
.ctr[0] = _CB(0x0001, B6, 0x0e, 0x0000000c),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the "sm_cta_launched" query (number of thread blocks
 * launched on a multiprocessor). One counter, programmed through _CB. */
static const struct nvc0_hw_sm_query_cfg
sm50_sm_cta_launched =
{
.type = NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED,
.ctr[0] = _CB(0x0001, B6, 0x01, 0x00000018),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the TH_INST_EXECUTED query: one counter slot, programmed
 * through _CA. Its first and last _CA arguments (0x003f, 0x29062080) match
 * those of sm50_not_pred_off_inst_executed; only the third argument
 * differs. */
static const struct nvc0_hw_sm_query_cfg
sm50_th_inst_executed =
{
.type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED,
.ctr[0] = _CA(0x003f, B6, 0x04, 0x29062080),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM50 setup for the WARPS_LAUNCHED query: one counter, programmed through
 * _CA. */
static const struct nvc0_hw_sm_query_cfg
sm50_warps_launched =
{
.type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
.ctr[0] = _CA(0x0001, B6, 0x02, 0x00000008),
.num_counters = 1,
.norm = { 1, 1 },
};
/* Table of the 36 SM queries available on SM50. nvc0_hw_sm_get_queries()
 * returns it for GM107_3D_CLASS and nvc0_hw_sm_get_num_queries() reports
 * its length via ARRAY_SIZE(), so entries may be appended or removed
 * without touching a separate count. */
static const struct nvc0_hw_sm_query_cfg *sm50_hw_sm_queries[] =
{
&sm50_active_ctas,
&sm50_active_cycles,
&sm50_active_warps,
&sm50_atom_count,
&sm50_branch,
&sm50_divergent_branch,
&sm50_global_atom_cas,
&sm50_global_ld,
&sm50_global_st,
&sm50_gred_count,
&sm50_inst_executed,
&sm50_inst_issued0,
&sm50_inst_issued1,
&sm50_inst_issued2,
&sm50_local_ld,
&sm50_local_st,
&sm50_not_pred_off_inst_executed,
&sm50_prof_trigger_0,
&sm50_prof_trigger_1,
&sm50_prof_trigger_2,
&sm50_prof_trigger_3,
&sm50_prof_trigger_4,
&sm50_prof_trigger_5,
&sm50_prof_trigger_6,
&sm50_prof_trigger_7,
&sm50_shared_atom,
&sm50_shared_atom_cas,
&sm50_shared_ld,
&sm50_shared_ld_bank_conflict,
&sm50_shared_ld_transactions,
&sm50_shared_st,
&sm50_shared_st_bank_conflict,
&sm50_shared_st_transactions,
&sm50_sm_cta_launched,
&sm50_th_inst_executed,
&sm50_warps_launched,
};
/* ==== Compute capability 5.2 (GM200/GM204/GM206) ==== */
/* SM52 variant of the ATOM_COUNT query. Differs from sm50_atom_count only
 * in the last two _CA arguments (0x0a, 0x1c instead of 0x14, 0x04). */
static const struct nvc0_hw_sm_query_cfg
sm52_atom_count =
{
.type = NVC0_HW_SM_QUERY_ATOM_COUNT,
.ctr[0] = _CA(0x0001, B6, 0x0a, 0x0000001c),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM52 variant of the GLOBAL_ATOM_CAS query. Differs from
 * sm50_global_atom_cas only in the last two _CA arguments (0x0a, 0x18
 * instead of 0x14, 0x00). */
static const struct nvc0_hw_sm_query_cfg
sm52_global_atom_cas =
{
.type = NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS,
.ctr[0] = _CA(0x0001, B6, 0x0a, 0x00000018),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM52 variant of the GLOBAL_LD query. Differs from sm50_global_ld only in
 * the last two _CA arguments (0x0b, 0x18 instead of 0x14, 0x0c). */
static const struct nvc0_hw_sm_query_cfg
sm52_global_ld =
{
.type = NVC0_HW_SM_QUERY_GLOBAL_LD,
.ctr[0] = _CA(0x0001, B6, 0x0b, 0x00000018),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM52 variant of the GLOBAL_ST query. Differs from sm50_global_st only in
 * the last two _CA arguments (0x0b, 0x1c instead of 0x14, 0x10). */
static const struct nvc0_hw_sm_query_cfg
sm52_global_st =
{
.type = NVC0_HW_SM_QUERY_GLOBAL_ST,
.ctr[0] = _CA(0x0001, B6, 0x0b, 0x0000001c),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM52 variant of the GRED_COUNT query. Differs from sm50_gred_count only
 * in the last two _CA arguments (0x0f, 0x18 instead of 0x14, 0x08). */
static const struct nvc0_hw_sm_query_cfg
sm52_gred_count =
{
.type = NVC0_HW_SM_QUERY_GRED_COUNT,
.ctr[0] = _CA(0x0001, B6, 0x0f, 0x00000018),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM52 variant of the INST_EXECUTED query. Keeps the 0x0003 first argument
 * of sm50_inst_executed and differs only in the last two _CA arguments
 * (0x03, 0x20c instead of 0x02, 0x398). */
static const struct nvc0_hw_sm_query_cfg
sm52_inst_executed =
{
.type = NVC0_HW_SM_QUERY_INST_EXECUTED,
.ctr[0] = _CA(0x0003, B6, 0x03, 0x0000020c),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM52 variant of the INST_ISSUED0 query. Differs from sm50_inst_issued0
 * only in the last two _CA arguments (0x03, 0x00 instead of 0x02, 0x0c). */
static const struct nvc0_hw_sm_query_cfg
sm52_inst_issued0 =
{
.type = NVC0_HW_SM_QUERY_INST_ISSUED0,
.ctr[0] = _CA(0x0001, B6, 0x03, 0x00000000),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM52 variant of the INST_ISSUED1 query. Differs from sm50_inst_issued1
 * only in the last two _CA arguments (0x03, 0x04 instead of 0x02, 0x10). */
static const struct nvc0_hw_sm_query_cfg
sm52_inst_issued1 =
{
.type = NVC0_HW_SM_QUERY_INST_ISSUED1,
.ctr[0] = _CA(0x0001, B6, 0x03, 0x00000004),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM52 variant of the INST_ISSUED2 query. Differs from sm50_inst_issued2
 * only in the last two _CA arguments (0x03, 0x08 instead of 0x02, 0x14). */
static const struct nvc0_hw_sm_query_cfg
sm52_inst_issued2 =
{
.type = NVC0_HW_SM_QUERY_INST_ISSUED2,
.ctr[0] = _CA(0x0001, B6, 0x03, 0x00000008),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM52 variant of the LOCAL_LD query. Differs from sm50_local_ld only in
 * the last two _CA arguments (0x06, 0x1c instead of 0x13, 0x04). */
static const struct nvc0_hw_sm_query_cfg
sm52_local_ld =
{
.type = NVC0_HW_SM_QUERY_LOCAL_LD,
.ctr[0] = _CA(0x0001, B6, 0x06, 0x0000001c),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM52 variant of the LOCAL_ST query. Differs from sm50_local_st only in
 * the last two _CA arguments (0x06, 0x18 instead of 0x13, 0x00). */
static const struct nvc0_hw_sm_query_cfg
sm52_local_st =
{
.type = NVC0_HW_SM_QUERY_LOCAL_ST,
.ctr[0] = _CA(0x0001, B6, 0x06, 0x00000018),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM52 variant of the SHARED_ATOM query. Differs from sm50_shared_atom
 * only in the last two _CA arguments (0x08, 0x1c instead of 0x13, 0x14). */
static const struct nvc0_hw_sm_query_cfg
sm52_shared_atom =
{
.type = NVC0_HW_SM_QUERY_SHARED_ATOM,
.ctr[0] = _CA(0x0001, B6, 0x08, 0x0000001c),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM52 variant of the SHARED_ATOM_CAS query. Differs from
 * sm50_shared_atom_cas only in the last two _CA arguments (0x08, 0x18
 * instead of 0x13, 0x10). */
static const struct nvc0_hw_sm_query_cfg
sm52_shared_atom_cas =
{
.type = NVC0_HW_SM_QUERY_SHARED_ATOM_CAS,
.ctr[0] = _CA(0x0001, B6, 0x08, 0x00000018),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM52 variant of the SHARED_LD query. Differs from sm50_shared_ld only in
 * the last two _CA arguments (0x07, 0x18 instead of 0x13, 0x08). */
static const struct nvc0_hw_sm_query_cfg
sm52_shared_ld =
{
.type = NVC0_HW_SM_QUERY_SHARED_LD,
.ctr[0] = _CA(0x0001, B6, 0x07, 0x00000018),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM52 variant of the SHARED_ST query. Differs from sm50_shared_st only in
 * the last two _CA arguments (0x07, 0x1c instead of 0x13, 0x0c). */
static const struct nvc0_hw_sm_query_cfg
sm52_shared_st =
{
.type = NVC0_HW_SM_QUERY_SHARED_ST,
.ctr[0] = _CA(0x0001, B6, 0x07, 0x0000001c),
.num_counters = 1,
.norm = { 1, 1 },
};
/* SM52 variant of the WARPS_LAUNCHED query. Differs from
 * sm50_warps_launched only in the last _CA argument (0x1c instead of
 * 0x08). */
static const struct nvc0_hw_sm_query_cfg
sm52_warps_launched =
{
.type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
.ctr[0] = _CA(0x0001, B6, 0x02, 0x0000001c),
.num_counters = 1,
.norm = { 1, 1 },
};
/* Table of the 36 SM queries available on SM52. nvc0_hw_sm_get_queries()
 * returns it for GM200_3D_CLASS. It lists the same queries in the same
 * order as sm50_hw_sm_queries; an sm52_* configuration replaces the sm50_*
 * one wherever an SM52-specific variant is defined, and the remaining
 * entries point at the shared sm50_* configurations. */
static const struct nvc0_hw_sm_query_cfg *sm52_hw_sm_queries[] =
{
&sm50_active_ctas,
&sm50_active_cycles,
&sm50_active_warps,
&sm52_atom_count,
&sm50_branch,
&sm50_divergent_branch,
&sm52_global_atom_cas,
&sm52_global_ld,
&sm52_global_st,
&sm52_gred_count,
&sm52_inst_executed,
&sm52_inst_issued0,
&sm52_inst_issued1,
&sm52_inst_issued2,
&sm52_local_ld,
&sm52_local_st,
&sm50_not_pred_off_inst_executed,
&sm50_prof_trigger_0,
&sm50_prof_trigger_1,
&sm50_prof_trigger_2,
&sm50_prof_trigger_3,
&sm50_prof_trigger_4,
&sm50_prof_trigger_5,
&sm50_prof_trigger_6,
&sm50_prof_trigger_7,
&sm52_shared_atom,
&sm52_shared_atom_cas,
&sm52_shared_ld,
&sm50_shared_ld_bank_conflict,
&sm50_shared_ld_transactions,
&sm52_shared_st,
&sm50_shared_st_bank_conflict,
&sm50_shared_st_transactions,
&sm50_sm_cta_launched,
&sm50_th_inst_executed,
&sm52_warps_launched,
};
#undef _Q
#undef _CA
#undef _CB
@ -1580,6 +2235,10 @@ nvc0_hw_sm_get_queries(struct nvc0_screen *screen)
struct nouveau_device *dev = screen->base.device;
switch (screen->base.class_3d) {
case GM200_3D_CLASS:
return sm52_hw_sm_queries;
case GM107_3D_CLASS:
return sm50_hw_sm_queries;
case NVF0_3D_CLASS:
return sm35_hw_sm_queries;
case NVE4_3D_CLASS:
@ -1599,6 +2258,10 @@ nvc0_hw_sm_get_num_queries(struct nvc0_screen *screen)
struct nouveau_device *dev = screen->base.device;
switch (screen->base.class_3d) {
case GM200_3D_CLASS:
return ARRAY_SIZE(sm52_hw_sm_queries);
case GM107_3D_CLASS:
return ARRAY_SIZE(sm50_hw_sm_queries);
case NVF0_3D_CLASS:
return ARRAY_SIZE(sm35_hw_sm_queries);
case NVE4_3D_CLASS:
@ -1710,6 +2373,18 @@ nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
BEGIN_NVC0(push, NVE4_CP(MP_PM_SET(c)), 1);
PUSH_DATA (push, 0);
}
if (screen->base.class_3d >= GM107_3D_CLASS) {
/* Enable mask for counters: an 8-bit value where bits 0:3 are for domain A
* and bits 4:7 are for domain B. For example, the mask for active_warps
* should be 0x70 because it uses 3 counters in domain B. However, let's
* always enable all counters because we don't want to track which ones are
* enabled, and this allows monitoring multiple queries at the same
* time. */
BEGIN_NVC0(push, SUBC_CP(0x33e0), 1);
PUSH_DATA (push, 0xff);
}
return true;
}
@ -1795,6 +2470,11 @@ nvc0_hw_sm_get_program(struct nvc0_screen *screen)
prog->translated = true;
prog->parm_size = 12;
if (screen->base.class_3d >= GM107_3D_CLASS) {
prog->code = (uint32_t *)gm107_read_hw_sm_counters_code;
prog->code_size = sizeof(gm107_read_hw_sm_counters_code);
prog->num_gprs = 14;
} else
if (screen->base.class_3d == NVE4_3D_CLASS ||
screen->base.class_3d == NVF0_3D_CLASS) {
if (screen->base.class_3d == NVE4_3D_CLASS) {
@ -1885,6 +2565,9 @@ nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
}
}
if (screen->base.class_3d >= GM107_3D_CLASS)
IMMED_NVC0(push, SUBC_CP(0x33e0), 0);
BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
hq->bo);
@ -2121,7 +2804,7 @@ nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
if (id < count) {
if (screen->compute) {
if (screen->base.class_3d <= NVF0_3D_CLASS) {
if (screen->base.class_3d <= GM200_3D_CLASS) {
const struct nvc0_hw_sm_query_cfg **queries =
nvc0_hw_sm_get_queries(screen);

View File

@ -21,7 +21,8 @@ nvc0_hw_sm_query(struct nvc0_hw_query *hq)
#define NVC0_HW_SM_QUERY_LAST NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1)
enum nvc0_hw_sm_queries
{
NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0,
NVC0_HW_SM_QUERY_ACTIVE_CTAS = 0,
NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
NVC0_HW_SM_QUERY_ACTIVE_WARPS,
NVC0_HW_SM_QUERY_ATOM_CAS_COUNT,
NVC0_HW_SM_QUERY_ATOM_COUNT,
@ -29,12 +30,16 @@ enum nvc0_hw_sm_queries
NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
NVC0_HW_SM_QUERY_GLD_REQUEST,
NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY,
NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS,
NVC0_HW_SM_QUERY_GLOBAL_LD,
NVC0_HW_SM_QUERY_GLOBAL_ST,
NVC0_HW_SM_QUERY_GST_TRANSACTIONS,
NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY,
NVC0_HW_SM_QUERY_GRED_COUNT,
NVC0_HW_SM_QUERY_GST_REQUEST,
NVC0_HW_SM_QUERY_INST_EXECUTED,
NVC0_HW_SM_QUERY_INST_ISSUED,
NVC0_HW_SM_QUERY_INST_ISSUED0,
NVC0_HW_SM_QUERY_INST_ISSUED1,
NVC0_HW_SM_QUERY_INST_ISSUED2,
NVC0_HW_SM_QUERY_INST_ISSUED1_0,
@ -64,10 +69,16 @@ enum nvc0_hw_sm_queries
NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
NVC0_HW_SM_QUERY_SHARED_ATOM,
NVC0_HW_SM_QUERY_SHARED_ATOM_CAS,
NVC0_HW_SM_QUERY_SHARED_LD,
NVC0_HW_SM_QUERY_SHARED_LD_BANK_CONFLICT,
NVC0_HW_SM_QUERY_SHARED_LD_REPLAY,
NVC0_HW_SM_QUERY_SHARED_LD_TRANSACTIONS,
NVC0_HW_SM_QUERY_SHARED_ST,
NVC0_HW_SM_QUERY_SHARED_ST_BANK_CONFLICT,
NVC0_HW_SM_QUERY_SHARED_ST_REPLAY,
NVC0_HW_SM_QUERY_SHARED_ST_TRANSACTIONS,
NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED,
NVC0_HW_SM_QUERY_THREADS_LAUNCHED,
NVC0_HW_SM_QUERY_TH_INST_EXECUTED,