pps: Intel pps driver
Add the Intel pps driver using functionalities provided by libintel_perf. v2: Fix build with perfetto not enabled. v3: Open perf stream with no filtering. v4: Drop usage of inc/dec_n_users. v5: Isolate intel_perf in its own class. Signed-off-by: Antonio Caggiano <antonio.caggiano@collabora.com> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10216>
This commit is contained in:
parent
5d95aa3964
commit
92180a4376
|
@ -0,0 +1,21 @@
|
|||
BasedOnStyle: WebKit
|
||||
AlignTrailingComments: 'true'
|
||||
AllowAllParametersOfDeclarationOnNextLine: 'false'
|
||||
AllowShortFunctionsOnASingleLine: None
|
||||
AlwaysBreakBeforeMultilineStrings: 'true'
|
||||
BinPackArguments: 'false'
|
||||
BinPackParameters: 'false'
|
||||
BreakBeforeBinaryOperators: None
|
||||
BreakBeforeBraces: Linux
|
||||
ColumnLimit: '100'
|
||||
Cpp11BracedListStyle: 'true'
|
||||
KeepEmptyLinesAtTheStartOfBlocks: 'false'
|
||||
NamespaceIndentation: None
|
||||
PointerAlignment: Right
|
||||
SortIncludes: 'true'
|
||||
SpaceAfterTemplateKeyword: 'false'
|
||||
Standard: Cpp11
|
||||
TabWidth: '3'
|
||||
IndentWidth: '3'
|
||||
ConstructorInitializerIndentWidth: '3'
|
||||
ContinuationIndentWidth: '3'
|
|
@ -0,0 +1,397 @@
|
|||
/*
|
||||
* Copyright © 2020-2021 Collabora, Ltd.
|
||||
* Author: Antonio Caggiano <antonio.caggiano@collabora.com>
|
||||
* Author: Corentin Noël <corentin.noel@collabora.com>
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "intel_pps_driver.h"
|
||||
|
||||
#include <dirent.h>
|
||||
#include <fcntl.h>
|
||||
#include <math.h>
|
||||
#include <poll.h>
|
||||
#include <strings.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <i915_drm.h>
|
||||
#include <intel/perf/intel_perf_query.h>
|
||||
|
||||
#include <pps/pps.h>
|
||||
#include <pps/pps_algorithm.h>
|
||||
|
||||
#include "intel_pps_perf.h"
|
||||
|
||||
namespace pps
|
||||
{
|
||||
/// @return The shortest sampling period accepted by this driver, in nanoseconds
uint64_t IntelDriver::get_min_sampling_period_ns()
{
   // Half a millisecond
   constexpr uint64_t min_period_ns = 500000;
   return min_period_ns;
}
|
||||
|
||||
void IntelDriver::enable_counter(uint32_t counter_id)
|
||||
{
|
||||
auto &counter = counters[counter_id];
|
||||
auto &group = groups[counter.group];
|
||||
if (perf->query) {
|
||||
if (perf->query->symbol_name != group.name) {
|
||||
PPS_LOG_ERROR(
|
||||
"Unable to enable metrics from different sets: %u "
|
||||
"belongs to %s but %s is currently in use.",
|
||||
counter_id,
|
||||
perf->query->symbol_name,
|
||||
group.name.c_str());
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
enabled_counters.emplace_back(counter);
|
||||
if (!perf->query) {
|
||||
perf->query = perf->find_query_by_name(group.name);
|
||||
}
|
||||
}
|
||||
|
||||
void IntelDriver::enable_all_counters()
|
||||
{
|
||||
// We can only enable one metric set at a time so at least enable one.
|
||||
for (auto &group : groups) {
|
||||
if (group.name == "RenderBasic") {
|
||||
for (uint32_t counter_id : group.counters) {
|
||||
auto &counter = counters[counter_id];
|
||||
enabled_counters.emplace_back(counter);
|
||||
}
|
||||
|
||||
perf->query = perf->find_query_by_name(group.name);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// @return Nanoseconds elapsed going from `begin` to `end`
static uint64_t timespec_diff(timespec *begin, timespec *end)
{
   // Whole seconds first, then the nanosecond parts of both ends
   const uint64_t whole_seconds_ns = 1000000000ull * (end->tv_sec - begin->tv_sec);
   const uint64_t up_to_end = whole_seconds_ns + end->tv_nsec;
   return up_to_end - begin->tv_nsec;
}
|
||||
|
||||
/// @brief This function tries to correlate CPU time with GPU time
|
||||
std::optional<TimestampCorrelation> IntelDriver::query_correlation_timestamps() const
|
||||
{
|
||||
TimestampCorrelation corr = {};
|
||||
|
||||
clock_t correlation_clock_id = CLOCK_BOOTTIME;
|
||||
|
||||
drm_i915_reg_read reg_read = {};
|
||||
const uint64_t render_ring_timestamp = 0x2358;
|
||||
reg_read.offset = render_ring_timestamp | I915_REG_READ_8B_WA;
|
||||
|
||||
constexpr size_t attempt_count = 3;
|
||||
struct {
|
||||
timespec cpu_ts_begin;
|
||||
timespec cpu_ts_end;
|
||||
uint64_t gpu_ts;
|
||||
} attempts[attempt_count] = {};
|
||||
|
||||
uint32_t best = 0;
|
||||
|
||||
// Gather 3 correlations
|
||||
for (uint32_t i = 0; i < attempt_count; i++) {
|
||||
clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_begin);
|
||||
if (perf_ioctl(drm_device.fd, DRM_IOCTL_I915_REG_READ, ®_read) < 0) {
|
||||
return std::nullopt;
|
||||
}
|
||||
clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_end);
|
||||
|
||||
attempts[i].gpu_ts = reg_read.val;
|
||||
}
|
||||
|
||||
// Now select the best
|
||||
for (uint32_t i = 1; i < attempt_count; i++) {
|
||||
if (timespec_diff(&attempts[i].cpu_ts_begin, &attempts[i].cpu_ts_end) <
|
||||
timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end)) {
|
||||
best = i;
|
||||
}
|
||||
}
|
||||
|
||||
corr.cpu_timestamp =
|
||||
(attempts[best].cpu_ts_begin.tv_sec * 1000000000ull + attempts[best].cpu_ts_begin.tv_nsec) +
|
||||
timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end) / 2;
|
||||
corr.gpu_timestamp = attempts[best].gpu_ts;
|
||||
|
||||
return corr;
|
||||
}
|
||||
|
||||
void IntelDriver::get_new_correlation()
|
||||
{
|
||||
// Rotate left correlations by one position so to make space at the end
|
||||
std::rotate(correlations.begin(), correlations.begin() + 1, correlations.end());
|
||||
|
||||
// Then we overwrite the last correlation with a new one
|
||||
if (auto corr = query_correlation_timestamps()) {
|
||||
correlations.back() = *corr;
|
||||
} else {
|
||||
PPS_LOG_FATAL("Failed to get correlation timestamps");
|
||||
}
|
||||
}
|
||||
|
||||
bool IntelDriver::init_perfcnt()
|
||||
{
|
||||
assert(!perf && "Intel perf should not be initialized at this point");
|
||||
|
||||
perf = std::make_unique<IntelPerf>(drm_device.fd);
|
||||
|
||||
for (auto &query : perf->get_queries()) {
|
||||
// Create group
|
||||
CounterGroup group = {};
|
||||
group.id = groups.size();
|
||||
group.name = query->symbol_name;
|
||||
|
||||
for (int i = 0; i < query->n_counters; ++i) {
|
||||
intel_perf_query_counter &counter = query->counters[i];
|
||||
|
||||
// Create counter
|
||||
Counter counter_desc = {};
|
||||
counter_desc.id = counters.size();
|
||||
counter_desc.name = counter.symbol_name;
|
||||
counter_desc.group = group.id;
|
||||
counter_desc.getter = [counter, query, this](
|
||||
const Counter &c, const Driver &dri) -> Counter::Value {
|
||||
switch (counter.data_type) {
|
||||
case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
|
||||
case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
|
||||
case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
|
||||
return (int64_t)counter.oa_counter_read_uint64(perf->cfg, query, &result);
|
||||
break;
|
||||
case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
|
||||
case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
|
||||
return counter.oa_counter_read_float(perf->cfg, query, &result);
|
||||
break;
|
||||
}
|
||||
|
||||
return {};
|
||||
};
|
||||
|
||||
// Add counter id to the group
|
||||
group.counters.emplace_back(counter_desc.id);
|
||||
|
||||
// Store counter
|
||||
counters.emplace_back(std::move(counter_desc));
|
||||
}
|
||||
|
||||
// Store group
|
||||
groups.emplace_back(std::move(group));
|
||||
}
|
||||
|
||||
assert(groups.size() && "Failed to query groups");
|
||||
assert(counters.size() && "Failed to query counters");
|
||||
|
||||
// Clear accumulations
|
||||
intel_perf_query_result_clear(&result);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void IntelDriver::enable_perfcnt(uint64_t sampling_period_ns)
|
||||
{
|
||||
this->sampling_period_ns = sampling_period_ns;
|
||||
|
||||
// Fill correlations with an initial one
|
||||
if (auto corr = query_correlation_timestamps()) {
|
||||
correlations.fill(*corr);
|
||||
} else {
|
||||
PPS_LOG_FATAL("Failed to get correlation timestamps");
|
||||
}
|
||||
|
||||
if (!perf->open(sampling_period_ns)) {
|
||||
PPS_LOG_FATAL("Failed to open intel perf");
|
||||
}
|
||||
}
|
||||
|
||||
/// @brief Transforms the GPU timestop into a CPU timestamp equivalent
|
||||
uint64_t IntelDriver::correlate_gpu_timestamp(const uint32_t gpu_ts)
|
||||
{
|
||||
auto &corr_a = correlations[0];
|
||||
auto &corr_b = correlations[correlations.size() - 1];
|
||||
|
||||
// A correlation timestamp has 36 bits, so get the first 32 to make it work with gpu_ts
|
||||
uint64_t mask = 0xffffffff;
|
||||
uint32_t corr_a_gpu_ts = corr_a.gpu_timestamp & mask;
|
||||
uint32_t corr_b_gpu_ts = corr_b.gpu_timestamp & mask;
|
||||
|
||||
// Make sure it is within the interval [a,b)
|
||||
assert(gpu_ts >= corr_a_gpu_ts && "GPU TS < Corr a");
|
||||
assert(gpu_ts < corr_b_gpu_ts && "GPU TS >= Corr b");
|
||||
|
||||
uint32_t gpu_delta = gpu_ts - corr_a_gpu_ts;
|
||||
// Factor to convert gpu time to cpu time
|
||||
double gpu_to_cpu = (corr_b.cpu_timestamp - corr_a.cpu_timestamp) /
|
||||
double(corr_b.gpu_timestamp - corr_a.gpu_timestamp);
|
||||
uint64_t cpu_delta = gpu_delta * gpu_to_cpu;
|
||||
return corr_a.cpu_timestamp + cpu_delta;
|
||||
}
|
||||
|
||||
void IntelDriver::disable_perfcnt()
|
||||
{
|
||||
perf = nullptr;
|
||||
groups.clear();
|
||||
counters.clear();
|
||||
enabled_counters.clear();
|
||||
}
|
||||
|
||||
/// Leading fields of the report which follows a perf record header
struct Report {
   uint32_t version;

   /// 32 bit GPU timestamp of the report
   uint32_t timestamp;

   uint32_t id;
};
|
||||
|
||||
/// @brief Some perf record durations can be really short
/// @param duration Time elapsed since the previously accepted record, in nanoseconds
/// @param sampling_period Requested sampling period, in nanoseconds
/// @return True if the duration is at least close to the sampling period
static bool close_enough(uint64_t duration, uint64_t sampling_period)
{
   // How much shorter than the sampling period a duration may be, in nanoseconds
   constexpr uint64_t tolerance_ns = 100000;

   // With a period below the tolerance the subtraction would wrap around and
   // reject everything, while any duration is in fact close enough
   if (sampling_period < tolerance_ns) {
      return true;
   }

   return duration > sampling_period - tolerance_ns;
}
|
||||
|
||||
/// @brief Transforms the raw data received in from the driver into records
|
||||
std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_t> &data,
|
||||
const size_t byte_count)
|
||||
{
|
||||
std::vector<PerfRecord> records;
|
||||
records.reserve(128);
|
||||
|
||||
PerfRecord record;
|
||||
record.reserve(512);
|
||||
|
||||
const uint8_t *iter = data.data();
|
||||
const uint8_t *end = iter + byte_count;
|
||||
|
||||
uint64_t prev_cpu_timestamp = last_cpu_timestamp;
|
||||
|
||||
while (iter < end) {
|
||||
// Iterate a record at a time
|
||||
auto header = reinterpret_cast<const drm_i915_perf_record_header *>(iter);
|
||||
|
||||
if (header->type == DRM_I915_PERF_RECORD_SAMPLE) {
|
||||
// Report is next to the header
|
||||
auto report = reinterpret_cast<const Report *>(header + 1);
|
||||
auto cpu_timestamp = correlate_gpu_timestamp(report->timestamp);
|
||||
auto duration = cpu_timestamp - prev_cpu_timestamp;
|
||||
|
||||
// Skip perf-records that are too short by checking
|
||||
// the distance between last report and this one
|
||||
if (close_enough(duration, sampling_period_ns)) {
|
||||
prev_cpu_timestamp = cpu_timestamp;
|
||||
|
||||
// Add the new record to the list
|
||||
record.resize(header->size); // Possibly 264?
|
||||
memcpy(record.data(), iter, header->size);
|
||||
records.emplace_back(record);
|
||||
}
|
||||
}
|
||||
|
||||
// Go to the next record
|
||||
iter += header->size;
|
||||
}
|
||||
|
||||
return records;
|
||||
}
|
||||
|
||||
/// @brief Read all the available data from the metric set currently in use
|
||||
void IntelDriver::read_data_from_metric_set()
|
||||
{
|
||||
assert(metric_buffer.size() >= 1024 && "Metric buffer should have space for reading");
|
||||
|
||||
ssize_t bytes_read = 0;
|
||||
while ((bytes_read = perf->read_oa_stream(metric_buffer.data() + total_bytes_read,
|
||||
metric_buffer.size() - total_bytes_read)) > 0 ||
|
||||
errno == EINTR) {
|
||||
total_bytes_read += std::max(ssize_t(0), bytes_read);
|
||||
|
||||
// Increase size of the buffer for the next read
|
||||
if (metric_buffer.size() / 2 < total_bytes_read) {
|
||||
metric_buffer.resize(metric_buffer.size() * 2);
|
||||
}
|
||||
}
|
||||
|
||||
assert(total_bytes_read < metric_buffer.size() && "Buffer not big enough");
|
||||
}
|
||||
|
||||
bool IntelDriver::dump_perfcnt()
|
||||
{
|
||||
if (!perf->oa_stream_ready()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
read_data_from_metric_set();
|
||||
|
||||
get_new_correlation();
|
||||
|
||||
auto new_records = parse_perf_records(metric_buffer, total_bytes_read);
|
||||
if (new_records.empty()) {
|
||||
PPS_LOG("No new records");
|
||||
// No new records from the GPU yet
|
||||
return false;
|
||||
} else {
|
||||
PPS_LOG("Records parsed bytes: %lu", total_bytes_read);
|
||||
// Records are parsed correctly, so we can reset the
|
||||
// number of bytes read so far from the metric set
|
||||
total_bytes_read = 0;
|
||||
}
|
||||
|
||||
APPEND(records, new_records);
|
||||
|
||||
if (records.size() < 2) {
|
||||
// Not enough records to accumulate
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
uint32_t IntelDriver::gpu_next()
|
||||
{
|
||||
if (records.size() < 2) {
|
||||
// Not enough records to accumulate
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Get first and second
|
||||
auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data());
|
||||
auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data());
|
||||
|
||||
intel_perf_query_result_accumulate_fields(&result,
|
||||
&perf->query.value(),
|
||||
&perf->devinfo,
|
||||
record_a + 1,
|
||||
record_b + 1,
|
||||
false /* no_oa_accumulate */);
|
||||
|
||||
// Get last timestamp
|
||||
auto report_b = reinterpret_cast<const Report *>(record_b + 1);
|
||||
auto gpu_timestamp = report_b->timestamp;
|
||||
|
||||
// Consume first record
|
||||
records.erase(std::begin(records), std::begin(records) + 1);
|
||||
|
||||
return gpu_timestamp;
|
||||
}
|
||||
|
||||
uint64_t IntelDriver::cpu_next()
|
||||
{
|
||||
if (auto gpu_timestamp = gpu_next()) {
|
||||
auto cpu_timestamp = correlate_gpu_timestamp(gpu_timestamp);
|
||||
|
||||
last_cpu_timestamp = cpu_timestamp;
|
||||
return cpu_timestamp;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t IntelDriver::next()
|
||||
{
|
||||
// Reset accumulation
|
||||
intel_perf_query_result_clear(&result);
|
||||
return cpu_next();
|
||||
}
|
||||
|
||||
} // namespace pps
|
|
@ -0,0 +1,96 @@
|
|||
/*
|
||||
* Copyright © 2020-2021 Collabora, Ltd.
|
||||
* Author: Antonio Caggiano <antonio.caggiano@collabora.com>
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <pps/pps_driver.h>
|
||||
|
||||
#include "intel_pps_perf.h"
|
||||
|
||||
namespace pps
|
||||
{
|
||||
/// A CPU timestamp paired with the GPU timestamp sampled around the same time.
struct TimestampCorrelation {
   /// CPU time in nanoseconds, in CLOCK_BOOTTIME
   uint64_t cpu_timestamp;

   /// Engine timestamp associated with the OA unit
   uint64_t gpu_timestamp;
};
|
||||
|
||||
/// @brief Variable length sequence of bytes generated by Intel Observation Architecture (OA)
|
||||
using PerfRecord = std::vector<uint8_t>;
|
||||
|
||||
/// @brief PPS Driver implementation for Intel graphics devices.
|
||||
/// When sampling it may collect multiple perf-records at once. Each perf-record holds multiple
|
||||
/// counter values. Those values are continuously incremented by the GPU. In order to get a delta,
|
||||
/// the driver computes an _accumulation_ (`last_perf_record - previous_perf_record`).
|
||||
/// For optimization purposes, it might ignore some perf-records, considering only those
|
||||
/// perf-records close to the boundary of the sampling period range.
|
||||
class IntelDriver : public Driver
|
||||
{
|
||||
public:
|
||||
std::optional<TimestampCorrelation> query_correlation_timestamps() const;
|
||||
void get_new_correlation();
|
||||
|
||||
/// @brief OA reports only have the lower 32 bits of the timestamp
|
||||
/// register, while correlation data has the whole 36 bits.
|
||||
/// @param gpu_ts a 32 bit OA report GPU timestamp
|
||||
/// @return The CPU timestamp relative to the argument
|
||||
uint64_t correlate_gpu_timestamp(uint32_t gpu_ts);
|
||||
|
||||
uint64_t get_min_sampling_period_ns() override;
|
||||
bool init_perfcnt() override;
|
||||
void enable_counter(uint32_t counter_id) override;
|
||||
void enable_all_counters() override;
|
||||
void enable_perfcnt(uint64_t sampling_period_ns) override;
|
||||
void disable_perfcnt() override;
|
||||
bool dump_perfcnt() override;
|
||||
uint64_t next() override;
|
||||
|
||||
/// @brief Requests the next perf sample
|
||||
/// @return The sample GPU timestamp
|
||||
uint32_t gpu_next();
|
||||
|
||||
/// @brief Requests the next perf sample accumulating those which
|
||||
/// which duration is shorter than the requested sampling period
|
||||
/// @return The sample CPU timestamp
|
||||
uint64_t cpu_next();
|
||||
|
||||
/// @param data Buffer of bytes to parse
|
||||
/// @param byte_count Number of bytes to parse
|
||||
/// @return A list of perf records parsed from raw data passed as input
|
||||
std::vector<PerfRecord> parse_perf_records(const std::vector<uint8_t> &data, size_t byte_count);
|
||||
|
||||
/// @brief Reads data from the GPU metric set
|
||||
void read_data_from_metric_set();
|
||||
|
||||
/// Sampling period in nanoseconds requested by the datasource
|
||||
uint64_t sampling_period_ns = 0;
|
||||
|
||||
/// Keep track of the timestamp of the last sample generated
|
||||
uint64_t last_cpu_timestamp = 0;
|
||||
|
||||
/// This is used to correlate CPU and GPU timestamps
|
||||
std::array<TimestampCorrelation, 64> correlations;
|
||||
|
||||
/// Data buffer used to store data read from the metric set
|
||||
std::vector<uint8_t> metric_buffer = std::vector<uint8_t>(1024, 0);
|
||||
/// Number of bytes read so far still un-parsed.
|
||||
/// Reset once bytes from the metric buffer are parsed to perf records
|
||||
size_t total_bytes_read = 0;
|
||||
|
||||
/// List of OA perf records read so far
|
||||
std::vector<PerfRecord> records;
|
||||
|
||||
std::unique_ptr<IntelPerf> perf;
|
||||
|
||||
// Accumulations are stored here
|
||||
struct intel_perf_query_result result = {};
|
||||
};
|
||||
|
||||
} // namespace pps
|
|
@ -0,0 +1,194 @@
|
|||
/*
|
||||
* Copyright © 2021 Collabora, Ltd.
|
||||
* Author: Antonio Caggiano <antonio.caggiano@collabora.com>
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "intel_pps_perf.h"
|
||||
|
||||
#include <i915_drm.h>
|
||||
#include <math.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <util/ralloc.h>
|
||||
#include <utility>
|
||||
|
||||
#include <pps/pps.h>
|
||||
#include <pps/pps_device.h>
|
||||
|
||||
namespace pps
|
||||
{
|
||||
/// @brief ioctl wrapper which retries the call while it fails with EINTR or EAGAIN
/// @return The value returned by the last ioctl call
int perf_ioctl(int fd, unsigned long request, void *arg)
{
   for (;;) {
      const int ret = ioctl(fd, request, arg);

      const bool retry = ret == -1 && (errno == EINTR || errno == EAGAIN);
      if (!retry) {
         return ret;
      }
   }
}
|
||||
|
||||
IntelPerf::IntelPerf(const int drm_fd)
|
||||
: drm_fd {drm_fd}
|
||||
, ralloc_ctx {ralloc_context(nullptr)}
|
||||
, ralloc_cfg {ralloc_context(nullptr)}
|
||||
, cfg {intel_perf_new(ralloc_cfg)}
|
||||
{
|
||||
assert(drm_fd >= 0 && "DRM fd is not valid");
|
||||
|
||||
if (!intel_get_device_info_from_fd(drm_fd, &devinfo)) {
|
||||
PPS_LOG_FATAL("Failed to get devinfo");
|
||||
}
|
||||
|
||||
intel_perf_init_metrics(cfg,
|
||||
&devinfo,
|
||||
drm_fd,
|
||||
false, // no pipeline statistics
|
||||
false // no register snapshots
|
||||
);
|
||||
|
||||
// Enable RenderBasic counters
|
||||
auto query_name = "RenderBasic";
|
||||
query = find_query_by_name(query_name);
|
||||
if (!query) {
|
||||
PPS_LOG_FATAL("Failed to find %s query", query_name);
|
||||
}
|
||||
}
|
||||
|
||||
IntelPerf::IntelPerf(IntelPerf &&o)
|
||||
: drm_fd {o.drm_fd}
|
||||
, ralloc_ctx {o.ralloc_ctx}
|
||||
, ralloc_cfg {o.ralloc_cfg}
|
||||
, ctx {o.ctx}
|
||||
, cfg {o.cfg}
|
||||
, devinfo {std::move(o.devinfo)}
|
||||
, query {std::move(o.query)}
|
||||
{
|
||||
o.drm_fd = -1;
|
||||
o.ralloc_ctx = nullptr;
|
||||
o.ralloc_cfg = nullptr;
|
||||
o.ctx = nullptr;
|
||||
o.cfg = nullptr;
|
||||
}
|
||||
|
||||
/// @brief Move assignment implemented as a member-wise swap
/// What this object held ends up in `o`, to be released by its destructor.
IntelPerf &IntelPerf::operator=(IntelPerf &&o) noexcept
{
   // Plain handles
   std::swap(drm_fd, o.drm_fd);
   std::swap(ctx, o.ctx);
   std::swap(cfg, o.cfg);

   // Memory contexts
   std::swap(ralloc_ctx, o.ralloc_ctx);
   std::swap(ralloc_cfg, o.ralloc_cfg);

   // Device and query descriptions
   std::swap(devinfo, o.devinfo);
   std::swap(query, o.query);

   return *this;
}
|
||||
|
||||
/// @brief Closes the perf context and frees both memory contexts
IntelPerf::~IntelPerf()
{
   close();

   // Null for an object which has been moved from
   for (void *mem_ctx : {ralloc_ctx, ralloc_cfg}) {
      if (mem_ctx) {
         ralloc_free(mem_ctx);
      }
   }
}
|
||||
|
||||
/// @return A query info, which is something like a group of counters
|
||||
std::optional<struct intel_perf_query_info> IntelPerf::find_query_by_name(
|
||||
const std::string &name) const
|
||||
{
|
||||
for (int i = 0; i < cfg->n_queries; ++i) {
|
||||
struct intel_perf_query_info query = cfg->queries[i];
|
||||
if (name == query.symbol_name) {
|
||||
return query;
|
||||
}
|
||||
}
|
||||
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::vector<struct intel_perf_query_info *> IntelPerf::get_queries() const
|
||||
{
|
||||
assert(cfg && "Intel perf config should be valid");
|
||||
assert(cfg->n_queries && "Intel perf queries not initialized");
|
||||
|
||||
std::vector<struct intel_perf_query_info *> queries = {};
|
||||
|
||||
for (int i = 0; i < cfg->n_queries; ++i) {
|
||||
struct intel_perf_query_info *query = &cfg->queries[i];
|
||||
// Skip invalid queries
|
||||
if (query && query->symbol_name) {
|
||||
queries.push_back(query);
|
||||
}
|
||||
}
|
||||
|
||||
return queries;
|
||||
}
|
||||
|
||||
static uint64_t query_timestamp_frequency(const int drm_fd)
|
||||
{
|
||||
int timestamp_frequency;
|
||||
|
||||
drm_i915_getparam_t gp = {};
|
||||
gp.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY;
|
||||
gp.value = ×tamp_frequency;
|
||||
if (perf_ioctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp) == 0) {
|
||||
return timestamp_frequency;
|
||||
}
|
||||
|
||||
PPS_LOG_ERROR("Unable to query timestamp frequency from i915, guessing values...");
|
||||
return 12000000;
|
||||
}
|
||||
|
||||
// The period_exponent gives a sampling period as follows:
|
||||
// sample_period = timestamp_period * 2^(period_exponent + 1)
|
||||
// where timestamp_period is 80ns for Haswell+
|
||||
static uint32_t get_oa_exponent(const int drm_fd, const uint64_t sampling_period_ns)
|
||||
{
|
||||
uint64_t timestamp_frequency = query_timestamp_frequency(drm_fd);
|
||||
return static_cast<uint32_t>(log2(sampling_period_ns * timestamp_frequency / 1000000000ull)) - 1;
|
||||
}
|
||||
|
||||
bool IntelPerf::open(const uint64_t sampling_period_ns)
|
||||
{
|
||||
assert(!ctx && "Perf context should not be initialized at this point");
|
||||
|
||||
ctx = intel_perf_new_context(ralloc_ctx);
|
||||
intel_perf_init_context(ctx, cfg, nullptr, nullptr, nullptr, &devinfo, 0, drm_fd);
|
||||
|
||||
auto oa_exponent = get_oa_exponent(drm_fd, sampling_period_ns);
|
||||
|
||||
return intel_perf_open(ctx,
|
||||
query->oa_metrics_set_id,
|
||||
query->oa_format,
|
||||
oa_exponent,
|
||||
drm_fd,
|
||||
INTEL_PERF_INVALID_CTX_ID,
|
||||
true /* enable stream immediately */);
|
||||
}
|
||||
|
||||
void IntelPerf::close()
|
||||
{
|
||||
if (ctx) {
|
||||
intel_perf_close(ctx, nullptr);
|
||||
ctx = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
bool IntelPerf::oa_stream_ready() const
|
||||
{
|
||||
assert(ctx && "Perf context was not open");
|
||||
return intel_perf_oa_stream_ready(ctx);
|
||||
}
|
||||
|
||||
ssize_t IntelPerf::read_oa_stream(void *buf, size_t bytes) const
|
||||
{
|
||||
assert(ctx && "Perf context was not open");
|
||||
return intel_perf_read_oa_stream(ctx, buf, bytes);
|
||||
}
|
||||
|
||||
} // namespace pps
|
|
@ -0,0 +1,57 @@
|
|||
/*
|
||||
* Copyright © 2021 Collabora, Ltd.
|
||||
* Author: Antonio Caggiano <antonio.caggiano@collabora.com>
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <optional>
|
||||
#include <vector>
|
||||
|
||||
#include <intel/dev/intel_device_info.h>
|
||||
#include <intel/perf/intel_perf.h>
|
||||
#include <intel/perf/intel_perf_query.h>
|
||||
|
||||
namespace pps
|
||||
{
|
||||
int perf_ioctl(int fd, unsigned long request, void *arg);
|
||||
|
||||
class IntelPerf
|
||||
{
|
||||
public:
|
||||
IntelPerf(int drm_fd);
|
||||
|
||||
IntelPerf(const IntelPerf &) = delete;
|
||||
IntelPerf &operator=(const IntelPerf &) = delete;
|
||||
|
||||
IntelPerf(IntelPerf &&);
|
||||
IntelPerf &operator=(IntelPerf &&) noexcept;
|
||||
|
||||
~IntelPerf();
|
||||
|
||||
std::optional<struct intel_perf_query_info> find_query_by_name(const std::string &name) const;
|
||||
|
||||
std::vector<struct intel_perf_query_info*> get_queries() const;
|
||||
|
||||
bool open(uint64_t sampling_period_ns);
|
||||
void close();
|
||||
|
||||
bool oa_stream_ready() const;
|
||||
ssize_t read_oa_stream(void *buf, size_t bytes) const;
|
||||
|
||||
int drm_fd = -1;
|
||||
|
||||
void *ralloc_ctx = nullptr;
|
||||
void *ralloc_cfg = nullptr;
|
||||
|
||||
struct intel_perf_context *ctx = nullptr;
|
||||
struct intel_perf_config *cfg = nullptr;
|
||||
|
||||
struct intel_device_info devinfo = {};
|
||||
|
||||
std::optional<struct intel_perf_query_info> query = std::nullopt;
|
||||
};
|
||||
|
||||
} // namespace pps
|
|
@ -0,0 +1,30 @@
|
|||
# Copyright © 2020-2021 Collabora, Ltd.
|
||||
# Author: Antonio Caggiano <antonio.caggiano@collabora.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
# Sources of the Intel pps driver
pps_intel_sources = [
  'intel_pps_perf.cc',
  'intel_pps_driver.cc',
]

# The driver is built as a static library
pps_intel_lib = static_library(
  'pps-intel',
  sources: pps_intel_sources,
  include_directories: [inc_tool, inc_src, inc_include],
  link_with: [libintel_perf, libintel_dev],
  dependencies: [dep_perfetto, dep_libdrm, idep_mesautil],
  cpp_args: '-std=c++17'
)

# Define passed to whoever uses the dependency below
compile_args_pps_intel = ['-DPPS_INTEL']

pps_intel_dep = declare_dependency(
  link_with: pps_intel_lib,
  include_directories: [inc_tool, inc_include],
  compile_args: compile_args_pps_intel,
)

# Register the Intel datasource
pps_datasources += pps_intel_dep
with_datasources += 'intel'
|
|
@ -37,3 +37,6 @@ endif
|
|||
if with_intel_vk
|
||||
subdir('vulkan')
|
||||
endif
|
||||
if with_perfetto and (with_datasources.contains('intel') or with_datasources.contains('auto'))
|
||||
subdir('ds')
|
||||
endif
|
||||
|
|
|
@ -17,6 +17,10 @@
|
|||
#include "freedreno/ds/fd_pps_driver.h"
|
||||
#endif // PPS_FREEDRENO
|
||||
|
||||
#ifdef PPS_INTEL
|
||||
#include "intel/ds/intel_pps_driver.h"
|
||||
#endif // PPS_INTEL
|
||||
|
||||
#include "pps.h"
|
||||
#include "pps_algorithm.h"
|
||||
|
||||
|
@ -30,6 +34,10 @@ std::unordered_map<std::string, std::unique_ptr<Driver>> create_supported_driver
|
|||
map.emplace("msm", std::make_unique<FreedrenoDriver>());
|
||||
#endif // PPS_FREEDRENO
|
||||
|
||||
#ifdef PPS_INTEL
|
||||
map.emplace("i915", std::make_unique<IntelDriver>());
|
||||
#endif // PPS_INTEL
|
||||
|
||||
return map;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue