pps: Intel pps driver

Add the Intel pps driver using functionalities provided by libintel_perf. v2: Fix build with perfetto not enabled. v3: Open perf stream with no filtering. v4: Drop usage of inc/dec_n_users. v5: Isolate intel_perf in its own class. Signed-off-by: Antonio Caggiano <antonio.caggiano@collabora.com> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10216>
2021-03-24 17:51:13 +01:00 · 2021-03-24 17:51:13 +01:00 · 92180a4376
parent 5d95aa3964
commit 92180a4376
8 changed files with 806 additions and 0 deletions
--- a/src/intel/ds/.clang-format
+++ b/src/intel/ds/.clang-format
@ -0,0 +1,21 @@
+# clang-format rules for src/intel/ds: WebKit base style with the overrides below
+# (3-column indentation, 100-column limit, Linux brace breaking).
+BasedOnStyle: WebKit
+AlignTrailingComments: 'true'
+AllowAllParametersOfDeclarationOnNextLine: 'false'
+AllowShortFunctionsOnASingleLine: None
+AlwaysBreakBeforeMultilineStrings: 'true'
+BinPackArguments: 'false'
+BinPackParameters: 'false'
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Linux
+ColumnLimit: '100'
+Cpp11BracedListStyle: 'true'
+KeepEmptyLinesAtTheStartOfBlocks: 'false'
+NamespaceIndentation: None
+PointerAlignment: Right
+SortIncludes: 'true'
+SpaceAfterTemplateKeyword: 'false'
+Standard: Cpp11
+TabWidth: '3'
+IndentWidth: '3'
+ConstructorInitializerIndentWidth: '3'
+ContinuationIndentWidth: '3'
--- a/src/intel/ds/intel_pps_driver.cc
+++ b/src/intel/ds/intel_pps_driver.cc
@ -0,0 +1,397 @@
+/*
+ * Copyright © 2020-2021 Collabora, Ltd.
+ * Author: Antonio Caggiano <antonio.caggiano@collabora.com>
+ * Author: Corentin Noël <corentin.noel@collabora.com>
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "intel_pps_driver.h"
+
+#include <dirent.h>
+#include <fcntl.h>
+#include <math.h>
+#include <poll.h>
+#include <strings.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+#include <i915_drm.h>
+#include <intel/perf/intel_perf_query.h>
+
+#include <pps/pps.h>
+#include <pps/pps_algorithm.h>
+
+#include "intel_pps_perf.h"
+
+namespace pps
+{
+/// @brief Shortest sampling period this driver accepts
+/// @return The period in nanoseconds
+uint64_t IntelDriver::get_min_sampling_period_ns()
+{
+   constexpr uint64_t min_period_ns = 500000;
+   return min_period_ns;
+}
+
+/// @brief Enables a single counter, selecting its metric set if none is selected yet
+/// A counter whose group differs from the metric set currently selected in `perf`
+/// is rejected with an error log and left disabled.
+/// @param counter_id Index of the counter in the `counters` list
+void IntelDriver::enable_counter(uint32_t counter_id)
+{
+   auto &counter = counters[counter_id];
+   auto &group = groups[counter.group];
+   if (perf->query) {
+      if (perf->query->symbol_name != group.name) {
+         // Argument order follows the message: first the set the counter
+         // belongs to, then the set which is currently in use
+         PPS_LOG_ERROR(
+            "Unable to enable metrics from different sets: %u "
+            "belongs to %s but %s is currently in use.",
+            counter_id,
+            group.name.c_str(),
+            perf->query->symbol_name);
+         return;
+      }
+   }
+
+   enabled_counters.emplace_back(counter);
+   if (!perf->query) {
+      perf->query = perf->find_query_by_name(group.name);
+   }
+}
+
+/// @brief Enables every counter of the "RenderBasic" group
+/// Only one metric set can be enabled at a time, so a single group is picked
+/// and selected as the query in use. Nothing happens if that group is missing.
+void IntelDriver::enable_all_counters()
+{
+   for (auto &group : groups) {
+      if (group.name != "RenderBasic") {
+         continue;
+      }
+
+      for (const uint32_t id : group.counters) {
+         enabled_counters.emplace_back(counters[id]);
+      }
+
+      perf->query = perf->find_query_by_name(group.name);
+      return;
+   }
+}
+
+/// @brief Elapsed time between two timespecs
+/// @return `end - begin` expressed in nanoseconds
+static uint64_t timespec_diff(timespec *begin, timespec *end)
+{
+   const uint64_t whole_seconds_ns = 1000000000ull * (end->tv_sec - begin->tv_sec);
+   return whole_seconds_ns + end->tv_nsec - begin->tv_nsec;
+}
+
+/// @brief This function tries to correlate CPU time with GPU time
+/// The GPU timestamp register is read several times, each read being bracketed by
+/// two CPU clock reads. The attempt with the shortest CPU bracket is kept and its
+/// CPU timestamp is taken at the middle of the bracket.
+/// @return The CPU/GPU timestamp pair, or std::nullopt if the register read fails
+std::optional<TimestampCorrelation> IntelDriver::query_correlation_timestamps() const
+{
+   TimestampCorrelation corr = {};
+
+   // clock_gettime() takes a clockid_t (clock_t is the return type of clock())
+   const clockid_t correlation_clock_id = CLOCK_BOOTTIME;
+
+   drm_i915_reg_read reg_read = {};
+   const uint64_t render_ring_timestamp = 0x2358;
+   reg_read.offset = render_ring_timestamp | I915_REG_READ_8B_WA;
+
+   constexpr size_t attempt_count = 3;
+   struct {
+      timespec cpu_ts_begin;
+      timespec cpu_ts_end;
+      uint64_t gpu_ts;
+   } attempts[attempt_count] = {};
+
+   uint32_t best = 0;
+
+   // Gather 3 correlations
+   for (uint32_t i = 0; i < attempt_count; i++) {
+      clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_begin);
+      if (perf_ioctl(drm_device.fd, DRM_IOCTL_I915_REG_READ, &reg_read) < 0) {
+         return std::nullopt;
+      }
+      clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_end);
+
+      attempts[i].gpu_ts = reg_read.val;
+   }
+
+   // Now select the best, i.e. the one whose register read took the least CPU time
+   for (uint32_t i = 1; i < attempt_count; i++) {
+      if (timespec_diff(&attempts[i].cpu_ts_begin, &attempts[i].cpu_ts_end) <
+         timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end)) {
+         best = i;
+      }
+   }
+
+   // CPU timestamp in nanoseconds at the middle of the selected bracket
+   corr.cpu_timestamp =
+      (attempts[best].cpu_ts_begin.tv_sec * 1000000000ull + attempts[best].cpu_ts_begin.tv_nsec) +
+      timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end) / 2;
+   corr.gpu_timestamp = attempts[best].gpu_ts;
+
+   return corr;
+}
+
+/// @brief Pushes a fresh CPU/GPU correlation at the end of the correlation window
+/// The oldest correlation is dropped. The window is only modified once the new
+/// correlation has been obtained, so a failed query never leaves the oldest entry
+/// sitting in the newest slot.
+void IntelDriver::get_new_correlation()
+{
+   auto corr = query_correlation_timestamps();
+   if (!corr) {
+      PPS_LOG_FATAL("Failed to get correlation timestamps");
+      return;
+   }
+
+   // Rotate left correlations by one position so to make space at the end
+   std::rotate(correlations.begin(), correlations.begin() + 1, correlations.end());
+
+   // Then we overwrite the last correlation with the new one
+   correlations.back() = *corr;
+}
+
+/// @brief Creates the perf interface and publishes its queries as counter groups
+/// Each query returned by IntelPerf::get_queries() becomes a CounterGroup, and each
+/// counter of that query becomes a Counter whose getter reads the accumulated `result`.
+/// @return Always true
+bool IntelDriver::init_perfcnt()
+{
+   assert(!perf && "Intel perf should not be initialized at this point");
+
+   perf = std::make_unique<IntelPerf>(drm_device.fd);
+
+   for (auto &query : perf->get_queries()) {
+      CounterGroup group = {};
+      group.id = groups.size();
+      group.name = query->symbol_name;
+
+      for (int index = 0; index < query->n_counters; ++index) {
+         intel_perf_query_counter &counter = query->counters[index];
+
+         Counter desc = {};
+         desc.id = counters.size();
+         desc.name = counter.symbol_name;
+         desc.group = group.id;
+         // The getter owns a copy of the counter description and reads from
+         // the accumulation stored in this driver
+         desc.getter = [counter, query, this](
+                          const Counter &c, const Driver &dri) -> Counter::Value {
+            switch (counter.data_type) {
+            case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
+            case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
+            case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
+               return static_cast<int64_t>(
+                  counter.oa_counter_read_uint64(perf->cfg, query, &result));
+            case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
+            case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
+               return counter.oa_counter_read_float(perf->cfg, query, &result);
+            }
+
+            // Unknown data type
+            return {};
+         };
+
+         // The group refers to its counters by id, the driver owns them
+         group.counters.emplace_back(desc.id);
+         counters.emplace_back(std::move(desc));
+      }
+
+      groups.emplace_back(std::move(group));
+   }
+
+   assert(groups.size() && "Failed to query groups");
+   assert(counters.size() && "Failed to query counters");
+
+   // Start from an empty accumulation
+   intel_perf_query_result_clear(&result);
+
+   return true;
+}
+
+/// @brief Starts sampling with the requested period
+/// The correlation window is seeded with a single initial correlation, then the
+/// perf stream is opened.
+/// @param sampling_period_ns Sampling period in nanoseconds
+void IntelDriver::enable_perfcnt(uint64_t sampling_period_ns)
+{
+   this->sampling_period_ns = sampling_period_ns;
+
+   // Every slot of the window starts with the same correlation
+   const auto initial_correlation = query_correlation_timestamps();
+   if (initial_correlation) {
+      correlations.fill(*initial_correlation);
+   } else {
+      PPS_LOG_FATAL("Failed to get correlation timestamps");
+   }
+
+   const bool stream_opened = perf->open(sampling_period_ns);
+   if (!stream_opened) {
+      PPS_LOG_FATAL("Failed to open intel perf");
+   }
+}
+
+/// @brief Transforms the GPU timestamp into a CPU timestamp equivalent
+/// The conversion interpolates between the oldest and the newest correlation of
+/// the window.
+/// @param gpu_ts A 32 bit GPU timestamp
+/// @return The CPU timestamp corresponding to gpu_ts. When the window spans no GPU
+/// time at all, the CPU timestamp of the oldest correlation is returned.
+uint64_t IntelDriver::correlate_gpu_timestamp(const uint32_t gpu_ts)
+{
+   auto &corr_a = correlations[0];
+   auto &corr_b = correlations[correlations.size() - 1];
+
+   // A correlation timestamp has 36 bits, so get the first 32 to make it work with gpu_ts
+   uint64_t mask = 0xffffffff;
+   uint32_t corr_a_gpu_ts = corr_a.gpu_timestamp & mask;
+   uint32_t corr_b_gpu_ts = corr_b.gpu_timestamp & mask;
+
+   // Make sure it is within the interval [a,b)
+   assert(gpu_ts >= corr_a_gpu_ts && "GPU TS < Corr a");
+   assert(gpu_ts < corr_b_gpu_ts && "GPU TS >= Corr b");
+
+   // With assertions compiled out, an empty window (both ends holding the same
+   // correlation) would divide by zero and convert a NaN to an integer
+   const uint64_t gpu_interval = corr_b.gpu_timestamp - corr_a.gpu_timestamp;
+   if (gpu_interval == 0) {
+      return corr_a.cpu_timestamp;
+   }
+
+   uint32_t gpu_delta = gpu_ts - corr_a_gpu_ts;
+   // Factor to convert gpu time to cpu time
+   double gpu_to_cpu = (corr_b.cpu_timestamp - corr_a.cpu_timestamp) / double(gpu_interval);
+   uint64_t cpu_delta = gpu_delta * gpu_to_cpu;
+   return corr_a.cpu_timestamp + cpu_delta;
+}
+
+/// @brief Stops sampling and forgets everything init_perfcnt() and the enable calls set up
+void IntelDriver::disable_perfcnt()
+{
+   // Destroys the perf interface, if any
+   perf.reset();
+
+   groups.clear();
+   counters.clear();
+   enabled_counters.clear();
+}
+
+/// @brief Leading fields of the report which follows a perf record header
+/// Only `timestamp` is read by this file; it is used as a 32 bit GPU timestamp.
+struct Report {
+   // NOTE(review): not read in this file, presumably a report format version — confirm
+   uint32_t version;
+   // GPU timestamp of the report
+   uint32_t timestamp;
+   // NOTE(review): not read in this file, meaning to be confirmed against the OA report layout
+   uint32_t id;
+};
+
+/// @brief Some perf record durations can be really short
+/// @param duration Time elapsed since the previous accepted record, in nanoseconds
+/// @param sampling_period Requested sampling period, in nanoseconds
+/// @return True if the duration is at least close to the sampling period, i.e. it
+/// is no more than 100000 ns shorter than it
+static bool close_enough(uint64_t duration, uint64_t sampling_period)
+{
+   constexpr uint64_t tolerance = 100000;
+
+   // A period shorter than the tolerance would underflow the subtraction
+   // below; any duration is close enough to such a period
+   if (sampling_period < tolerance) {
+      return true;
+   }
+
+   return duration > sampling_period - tolerance;
+}
+
+/// @brief Transforms the raw data received in from the driver into records
+/// Only sample records are kept, and only those whose distance from the previously
+/// accepted record is close to the sampling period. Parsing stops at the first
+/// record which does not fit in the remaining bytes.
+/// @param data Buffer of bytes to parse
+/// @param byte_count Number of bytes to parse, clamped to the size of data
+/// @return The accepted records, each one holding its header followed by its payload
+std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_t> &data,
+   const size_t byte_count)
+{
+   std::vector<PerfRecord> records;
+   records.reserve(128);
+
+   PerfRecord record;
+   record.reserve(512);
+
+   // Never walk past the end of the buffer
+   const size_t available = byte_count < data.size() ? byte_count : data.size();
+   const uint8_t *iter = data.data();
+   const uint8_t *end = iter + available;
+
+   uint64_t prev_cpu_timestamp = last_cpu_timestamp;
+
+   while (iter < end) {
+      const size_t remaining = end - iter;
+      if (remaining < sizeof(drm_i915_perf_record_header)) {
+         PPS_LOG_ERROR("Truncated perf record header");
+         break;
+      }
+
+      // Iterate a record at a time
+      auto header = reinterpret_cast<const drm_i915_perf_record_header *>(iter);
+
+      // A size smaller than the header would stall (or rewind) the iteration,
+      // a size bigger than what is left would read out of bounds
+      if (header->size < sizeof(*header) || header->size > remaining) {
+         PPS_LOG_ERROR("Malformed perf record");
+         break;
+      }
+
+      const bool has_report = header->size >= sizeof(*header) + sizeof(Report);
+      if (header->type == DRM_I915_PERF_RECORD_SAMPLE && has_report) {
+         // Report is next to the header
+         auto report = reinterpret_cast<const Report *>(header + 1);
+         auto cpu_timestamp = correlate_gpu_timestamp(report->timestamp);
+         auto duration = cpu_timestamp - prev_cpu_timestamp;
+
+         // Skip perf-records that are too short by checking
+         // the distance between last report and this one
+         if (close_enough(duration, sampling_period_ns)) {
+            prev_cpu_timestamp = cpu_timestamp;
+
+            // Add the new record to the list
+            record.resize(header->size); // Possibly 264?
+            memcpy(record.data(), iter, header->size);
+            records.emplace_back(record);
+         }
+      }
+
+      // Go to the next record
+      iter += header->size;
+   }
+
+   return records;
+}
+
+/// @brief Read all the available data from the metric set currently in use
+/// Data is appended to `metric_buffer` after the `total_bytes_read` bytes already
+/// there. The buffer is doubled whenever it gets more than half full, so that the
+/// next read always has room.
+void IntelDriver::read_data_from_metric_set()
+{
+   assert(metric_buffer.size() >= 1024 && "Metric buffer should have space for reading");
+
+   while (true) {
+      const ssize_t bytes_read = perf->read_oa_stream(
+         metric_buffer.data() + total_bytes_read, metric_buffer.size() - total_bytes_read);
+
+      if (bytes_read < 0 && errno == EINTR) {
+         // Interrupted before reading anything, try again
+         continue;
+      }
+
+      // errno is only meaningful when the read failed: a stale EINTR must not
+      // keep the loop alive once the stream has nothing more to give
+      if (bytes_read <= 0) {
+         break;
+      }
+
+      total_bytes_read += static_cast<size_t>(bytes_read);
+
+      // Increase size of the buffer for the next read
+      if (metric_buffer.size() / 2 < total_bytes_read) {
+         metric_buffer.resize(metric_buffer.size() * 2);
+      }
+   }
+
+   assert(total_bytes_read < metric_buffer.size() && "Buffer not big enough");
+}
+
+/// @brief Collects the records produced by the GPU since the last call
+/// Reads the perf stream, refreshes the correlation window, then parses the bytes
+/// read so far and appends the accepted records to `records`.
+/// @return True when at least two records are available for accumulation
+bool IntelDriver::dump_perfcnt()
+{
+   if (!perf->oa_stream_ready()) {
+      return false;
+   }
+
+   read_data_from_metric_set();
+
+   get_new_correlation();
+
+   auto new_records = parse_perf_records(metric_buffer, total_bytes_read);
+   if (new_records.empty()) {
+      PPS_LOG("No new records");
+      // No new records from the GPU yet
+      return false;
+   } else {
+      // total_bytes_read is a size_t, which %zu prints portably
+      PPS_LOG("Records parsed bytes: %zu", total_bytes_read);
+      // Records are parsed correctly, so we can reset the
+      // number of bytes read so far from the metric set
+      total_bytes_read = 0;
+   }
+
+   APPEND(records, new_records);
+
+   if (records.size() < 2) {
+      // Not enough records to accumulate
+      return false;
+   }
+
+   return true;
+}
+
+/// @brief Accumulates the two oldest records into `result` and consumes the oldest one
+/// @return The GPU timestamp of the second record, or 0 when fewer than two
+/// records are available
+uint32_t IntelDriver::gpu_next()
+{
+   const bool can_accumulate = records.size() >= 2;
+   if (!can_accumulate) {
+      return 0;
+   }
+
+   // Each record starts with its header, the report follows right after
+   const auto *older = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data());
+   const auto *newer = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data());
+
+   intel_perf_query_result_accumulate_fields(&result,
+      &perf->query.value(),
+      &perf->devinfo,
+      older + 1,
+      newer + 1,
+      false /* no_oa_accumulate */);
+
+   // Read the timestamp of the newer report before touching the list
+   const uint32_t newer_timestamp = reinterpret_cast<const Report *>(newer + 1)->timestamp;
+
+   // The older record is not needed anymore
+   records.erase(records.begin());
+
+   return newer_timestamp;
+}
+
+/// @brief Requests the next sample and converts its GPU timestamp to CPU time
+/// The resulting timestamp is remembered in `last_cpu_timestamp`.
+/// @return The sample CPU timestamp, or 0 when gpu_next() returned 0
+uint64_t IntelDriver::cpu_next()
+{
+   const uint32_t gpu_timestamp = gpu_next();
+   if (!gpu_timestamp) {
+      return 0;
+   }
+
+   last_cpu_timestamp = correlate_gpu_timestamp(gpu_timestamp);
+   return last_cpu_timestamp;
+}
+
+/// @brief Clears the previous accumulation and produces the next sample
+/// @return The CPU timestamp returned by cpu_next()
+uint64_t IntelDriver::next()
+{
+   // Each sample starts from an empty accumulation
+   intel_perf_query_result_clear(&result);
+
+   const uint64_t sample_timestamp = cpu_next();
+   return sample_timestamp;
+}
+
+} // namespace pps
--- a/src/intel/ds/intel_pps_driver.h
+++ b/src/intel/ds/intel_pps_driver.h
@ -0,0 +1,96 @@
+/*
+ * Copyright © 2020-2021 Collabora, Ltd.
+ * Author: Antonio Caggiano <antonio.caggiano@collabora.com>
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#pragma once
+
+#include <pps/pps_driver.h>
+
+#include "intel_pps_perf.h"
+
+namespace pps
+{
+/// Timestamp correlation between CPU/GPU.
+struct TimestampCorrelation {
+   /// In CLOCK_BOOTTIME, expressed in nanoseconds
+   uint64_t cpu_timestamp;
+
+   /// Engine timestamp associated with the OA unit
+   uint64_t gpu_timestamp;
+};
+
+/// @brief Variable length sequence of bytes generated by Intel Observation Architecture (OA)
+using PerfRecord = std::vector<uint8_t>;
+
+/// @brief PPS Driver implementation for Intel graphics devices.
+/// When sampling it may collect multiple perf-records at once. Each perf-record holds multiple
+/// counter values. Those values are continuously incremented by the GPU. In order to get a delta,
+/// the driver computes an _accumulation_ (`last_perf_record - previous_perf_record`).
+/// For optimization purposes, it might ignore some perf-records, considering only those
+/// perf-records close to the boundary of the sampling period range.
+class IntelDriver : public Driver
+{
+   public:
+   /// @brief Reads a GPU timestamp together with the CPU time at which it was read
+   /// @return The correlation, or std::nullopt if the GPU timestamp cannot be read
+   std::optional<TimestampCorrelation> query_correlation_timestamps() const;
+
+   /// @brief Drops the oldest correlation and appends a freshly queried one
+   void get_new_correlation();
+
+   /// @brief OA reports only have the lower 32 bits of the timestamp
+   /// register, while correlation data has the whole 36 bits.
+   /// @param gpu_ts a 32 bit OA report GPU timestamp
+   /// @return The CPU timestamp relative to the argument
+   uint64_t correlate_gpu_timestamp(uint32_t gpu_ts);
+
+   uint64_t get_min_sampling_period_ns() override;
+   bool init_perfcnt() override;
+   void enable_counter(uint32_t counter_id) override;
+   void enable_all_counters() override;
+   void enable_perfcnt(uint64_t sampling_period_ns) override;
+   void disable_perfcnt() override;
+   bool dump_perfcnt() override;
+   uint64_t next() override;
+
+   /// @brief Requests the next perf sample
+   /// @return The sample GPU timestamp, or 0 when fewer than two records are available
+   uint32_t gpu_next();
+
+   /// @brief Requests the next perf sample and converts its GPU timestamp
+   /// to CPU time, remembering it as the last CPU timestamp
+   /// @return The sample CPU timestamp, or 0 when no sample is available
+   uint64_t cpu_next();
+
+   /// @param data Buffer of bytes to parse
+   /// @param byte_count Number of bytes to parse
+   /// @return A list of perf records parsed from raw data passed as input
+   std::vector<PerfRecord> parse_perf_records(const std::vector<uint8_t> &data, size_t byte_count);
+
+   /// @brief Reads data from the GPU metric set
+   void read_data_from_metric_set();
+
+   /// Sampling period in nanoseconds requested by the datasource
+   uint64_t sampling_period_ns = 0;
+
+   /// Keep track of the timestamp of the last sample generated
+   uint64_t last_cpu_timestamp = 0;
+
+   /// This is used to correlate CPU and GPU timestamps.
+   /// The first element is the oldest correlation, the last one the newest.
+   std::array<TimestampCorrelation, 64> correlations;
+
+   /// Data buffer used to store data read from the metric set
+   std::vector<uint8_t> metric_buffer = std::vector<uint8_t>(1024, 0);
+   /// Number of bytes read so far still un-parsed.
+   /// Reset once bytes from the metric buffer are parsed to perf records
+   size_t total_bytes_read = 0;
+
+   /// List of OA perf records read so far
+   std::vector<PerfRecord> records;
+
+   /// Interface to intel_perf, created by init_perfcnt() and destroyed by disable_perfcnt()
+   std::unique_ptr<IntelPerf> perf;
+
+   // Accumulations are stored here
+   struct intel_perf_query_result result = {};
+};
+
+} // namespace pps
--- a/src/intel/ds/intel_pps_perf.cc
+++ b/src/intel/ds/intel_pps_perf.cc
@ -0,0 +1,194 @@
+/*
+ * Copyright © 2021 Collabora, Ltd.
+ * Author: Antonio Caggiano <antonio.caggiano@collabora.com>
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "intel_pps_perf.h"
+
+#include <i915_drm.h>
+#include <math.h>
+#include <sys/ioctl.h>
+#include <util/ralloc.h>
+#include <utility>
+
+#include <pps/pps.h>
+#include <pps/pps_device.h>
+
+namespace pps
+{
+/// @brief ioctl wrapper which retries while the call fails with EINTR or EAGAIN
+/// @return The value returned by the last ioctl call
+int perf_ioctl(int fd, unsigned long request, void *arg)
+{
+   for (;;) {
+      const int ret = ioctl(fd, request, arg);
+      const bool should_retry = ret == -1 && (errno == EINTR || errno == EAGAIN);
+      if (!should_retry) {
+         return ret;
+      }
+   }
+}
+
+/// @brief Creates the perf config for a DRM device and selects the RenderBasic query
+/// @param drm_fd File descriptor of the DRM device, which must be valid
+IntelPerf::IntelPerf(const int drm_fd)
+   : drm_fd {drm_fd}
+   , ralloc_ctx {ralloc_context(nullptr)}
+   , ralloc_cfg {ralloc_context(nullptr)}
+   , cfg {intel_perf_new(ralloc_cfg)}
+{
+   assert(drm_fd >= 0 && "DRM fd is not valid");
+
+   const bool has_devinfo = intel_get_device_info_from_fd(drm_fd, &devinfo);
+   if (!has_devinfo) {
+      PPS_LOG_FATAL("Failed to get devinfo");
+   }
+
+   const bool pipeline_statistics = false;
+   const bool register_snapshots = false;
+   intel_perf_init_metrics(cfg, &devinfo, drm_fd, pipeline_statistics, register_snapshots);
+
+   // RenderBasic is the query selected by default
+   const char *default_query_name = "RenderBasic";
+   query = find_query_by_name(default_query_name);
+   if (!query) {
+      PPS_LOG_FATAL("Failed to find %s query", default_query_name);
+   }
+}
+
+/// @brief Takes over the resources of another instance
+/// The source is left without file descriptor, allocation contexts, perf context
+/// and config, so that its destructor releases nothing.
+IntelPerf::IntelPerf(IntelPerf &&o)
+   : drm_fd {std::exchange(o.drm_fd, -1)}
+   , ralloc_ctx {std::exchange(o.ralloc_ctx, nullptr)}
+   , ralloc_cfg {std::exchange(o.ralloc_cfg, nullptr)}
+   , ctx {std::exchange(o.ctx, nullptr)}
+   , cfg {std::exchange(o.cfg, nullptr)}
+   , devinfo {std::move(o.devinfo)}
+   , query {std::move(o.query)}
+{
+}
+
+/// @brief Move assignment implemented as a member-wise swap
+/// The resources previously held by this instance end up in `o`.
+IntelPerf &IntelPerf::operator=(IntelPerf &&o) noexcept
+{
+   // Device and allocation contexts
+   std::swap(drm_fd, o.drm_fd);
+   std::swap(ralloc_ctx, o.ralloc_ctx);
+   std::swap(ralloc_cfg, o.ralloc_cfg);
+
+   // Perf state
+   std::swap(ctx, o.ctx);
+   std::swap(cfg, o.cfg);
+   std::swap(devinfo, o.devinfo);
+   query.swap(o.query);
+
+   return *this;
+}
+
+/// @brief Closes the perf context and frees the allocation contexts still owned
+IntelPerf::~IntelPerf()
+{
+   close();
+
+   // Contexts are null when they have been moved away
+   for (void *mem_ctx : {ralloc_ctx, ralloc_cfg}) {
+      if (mem_ctx) {
+         ralloc_free(mem_ctx);
+      }
+   }
+}
+
+/// @brief Looks for the query with the given symbol name
+/// @param name Symbol name of the query, e.g. "RenderBasic"
+/// @return A query info, which is something like a group of counters,
+/// or std::nullopt if no query has that name
+std::optional<struct intel_perf_query_info> IntelPerf::find_query_by_name(
+   const std::string &name) const
+{
+   for (int i = 0; i < cfg->n_queries; ++i) {
+      // Inspect the query in place, a copy is only made for the one returned
+      const struct intel_perf_query_info &candidate = cfg->queries[i];
+
+      // Invalid queries have no name, and comparing a std::string
+      // with a null C string is undefined behavior
+      if (!candidate.symbol_name) {
+         continue;
+      }
+
+      if (name == candidate.symbol_name) {
+         return candidate;
+      }
+   }
+
+   return std::nullopt;
+}
+
+/// @brief Lists the valid queries of the perf config
+/// @return Pointers to the queries which have a symbol name
+std::vector<struct intel_perf_query_info *> IntelPerf::get_queries() const
+{
+   assert(cfg && "Intel perf config should be valid");
+   assert(cfg->n_queries && "Intel perf queries not initialized");
+
+   std::vector<struct intel_perf_query_info *> valid_queries;
+
+   for (int index = 0; index < cfg->n_queries; ++index) {
+      struct intel_perf_query_info *candidate = &cfg->queries[index];
+
+      // Invalid queries are left out
+      if (!candidate || !candidate->symbol_name) {
+         continue;
+      }
+
+      valid_queries.push_back(candidate);
+   }
+
+   return valid_queries;
+}
+
+/// @brief Asks i915 for the command streamer timestamp frequency
+/// @return The frequency reported by the kernel, or 12000000 if the query fails
+static uint64_t query_timestamp_frequency(const int drm_fd)
+{
+   int frequency;
+
+   drm_i915_getparam_t getparam = {};
+   getparam.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY;
+   getparam.value = &frequency;
+
+   const int ret = perf_ioctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &getparam);
+   if (ret != 0) {
+      PPS_LOG_ERROR("Unable to query timestamp frequency from i915, guessing values...");
+      return 12000000;
+   }
+
+   return frequency;
+}
+
+// The period_exponent gives a sampling period as follows:
+// sample_period = timestamp_period * 2^(period_exponent + 1)
+// where timestamp_period is 80ns for Haswell+
+/// @param drm_fd DRM device used to query the timestamp frequency
+/// @param sampling_period_ns Requested sampling period in nanoseconds
+/// @return The OA exponent for the requested period, 0 being the smallest one
+static uint32_t get_oa_exponent(const int drm_fd, const uint64_t sampling_period_ns)
+{
+   const uint64_t timestamp_frequency = query_timestamp_frequency(drm_fd);
+
+   // Sampling period expressed in timestamp ticks
+   const uint64_t period_ticks = sampling_period_ns * timestamp_frequency / 1000000000ull;
+
+   // log2(0) is -inf and log2(1) - 1 wraps around: periods shorter than
+   // two ticks get the smallest exponent
+   if (period_ticks < 2) {
+      return 0;
+   }
+
+   return static_cast<uint32_t>(log2(period_ticks)) - 1;
+}
+
+/// @brief Creates the perf context and opens the OA stream for the selected query
+/// @param sampling_period_ns Requested sampling period in nanoseconds
+/// @return True if the stream was opened, false if no query is selected or
+/// intel_perf_open fails
+bool IntelPerf::open(const uint64_t sampling_period_ns)
+{
+   assert(!ctx && "Perf context should not be initialized at this point");
+
+   // The query is dereferenced below, and it may have been reset by a lookup
+   // which found nothing
+   if (!query) {
+      PPS_LOG_ERROR("No query selected, unable to open intel perf");
+      return false;
+   }
+
+   ctx = intel_perf_new_context(ralloc_ctx);
+   intel_perf_init_context(ctx, cfg, nullptr, nullptr, nullptr, &devinfo, 0, drm_fd);
+
+   auto oa_exponent = get_oa_exponent(drm_fd, sampling_period_ns);
+
+   return intel_perf_open(ctx,
+      query->oa_metrics_set_id,
+      query->oa_format,
+      oa_exponent,
+      drm_fd,
+      INTEL_PERF_INVALID_CTX_ID,
+      true /* enable stream immediately */);
+}
+
+/// @brief Closes the perf context, if one is open
+void IntelPerf::close()
+{
+   if (!ctx) {
+      return;
+   }
+
+   intel_perf_close(ctx, nullptr);
+   ctx = nullptr;
+}
+
+/// @brief Forwards to intel_perf_oa_stream_ready() on the open perf context
+/// @return The value reported by intel_perf_oa_stream_ready()
+bool IntelPerf::oa_stream_ready() const
+{
+   assert(ctx && "Perf context was not open");
+
+   const bool ready = intel_perf_oa_stream_ready(ctx);
+   return ready;
+}
+
+/// @brief Forwards to intel_perf_read_oa_stream() on the open perf context
+/// @param buf Destination buffer
+/// @param bytes Size of the destination buffer
+/// @return The value returned by intel_perf_read_oa_stream()
+ssize_t IntelPerf::read_oa_stream(void *buf, size_t bytes) const
+{
+   assert(ctx && "Perf context was not open");
+
+   const ssize_t bytes_read = intel_perf_read_oa_stream(ctx, buf, bytes);
+   return bytes_read;
+}
+
+} // namespace pps
--- a/src/intel/ds/intel_pps_perf.h
+++ b/src/intel/ds/intel_pps_perf.h
@ -0,0 +1,57 @@
+/*
+ * Copyright © 2021 Collabora, Ltd.
+ * Author: Antonio Caggiano <antonio.caggiano@collabora.com>
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#pragma once
+
+#include <optional>
+#include <vector>
+
+#include <intel/dev/intel_device_info.h>
+#include <intel/perf/intel_perf.h>
+#include <intel/perf/intel_perf_query.h>
+
+namespace pps
+{
+int perf_ioctl(int fd, unsigned long request, void *arg);
+
+/// @brief Owns the intel_perf config and context used to sample a DRM device
+/// Instances can be moved but not copied.
+class IntelPerf
+{
+   public:
+   /// @param drm_fd File descriptor of the DRM device, which must be valid
+   IntelPerf(int drm_fd);
+
+   IntelPerf(const IntelPerf &) = delete;
+   IntelPerf &operator=(const IntelPerf &) = delete;
+
+   IntelPerf(IntelPerf &&);
+   IntelPerf &operator=(IntelPerf &&) noexcept;
+
+   ~IntelPerf();
+
+   /// @return The query with the given symbol name, or std::nullopt if there is none
+   std::optional<struct intel_perf_query_info> find_query_by_name(const std::string &name) const;
+
+   /// @return The queries of the config which have a symbol name
+   std::vector<struct intel_perf_query_info*> get_queries() const;
+
+   /// @brief Creates the perf context and opens the stream for the selected query
+   /// @return The result of intel_perf_open()
+   bool open(uint64_t sampling_period_ns);
+   /// @brief Closes the perf context, if one is open
+   void close();
+
+   /// Both require the perf context to be open
+   bool oa_stream_ready() const;
+   ssize_t read_oa_stream(void *buf, size_t bytes) const;
+
+   /// DRM device file descriptor, -1 when moved away
+   int drm_fd = -1;
+
+   /// ralloc context the perf context is allocated from
+   void *ralloc_ctx = nullptr;
+   /// ralloc context the perf config is allocated from
+   void *ralloc_cfg = nullptr;
+
+   /// Perf context, null until open() is called and after close()
+   struct intel_perf_context *ctx = nullptr;
+   /// Perf config, created by the constructor
+   struct intel_perf_config *cfg = nullptr;
+
+   struct intel_device_info devinfo = {};
+
+   /// Query currently selected, set to RenderBasic by the constructor
+   std::optional<struct intel_perf_query_info> query = std::nullopt;
+};
+
+} // namespace pps
--- a/src/intel/ds/meson.build
+++ b/src/intel/ds/meson.build
@ -0,0 +1,30 @@
+# Copyright © 2020-2021 Collabora, Ltd.
+# Author: Antonio Caggiano <antonio.caggiano@collabora.com>
+#
+# SPDX-License-Identifier: MIT
+
+# Sources of the Intel pps datasource
+pps_intel_sources = [
+  'intel_pps_perf.cc',
+  'intel_pps_driver.cc',
+]
+
+
+# Static library built as C++17, linked against the Intel perf and dev libraries
+pps_intel_lib = static_library(
+  'pps-intel',
+  sources: pps_intel_sources,
+  include_directories: [inc_tool, inc_src, inc_include],
+  link_with: [libintel_perf, libintel_dev],
+  dependencies: [dep_perfetto, dep_libdrm, idep_mesautil],
+  cpp_args: '-std=c++17'
+)
+
+# PPS_INTEL is defined for the users of this dependency
+compile_args_pps_intel = ['-DPPS_INTEL']
+
+pps_intel_dep = declare_dependency(
+  link_with: pps_intel_lib,
+  include_directories: [inc_tool, inc_include],
+  compile_args: compile_args_pps_intel,
+)
+
+# Register the datasource
+pps_datasources += pps_intel_dep
+with_datasources += 'intel'
--- a/src/intel/meson.build
+++ b/src/intel/meson.build
@ -37,3 +37,6 @@ endif
 if with_intel_vk
  subdir('vulkan')
 endif
+if with_perfetto and (with_datasources.contains('intel') or with_datasources.contains('auto'))
+  subdir('ds')
+endif
--- a/src/tool/pps/pps_driver.cc
+++ b/src/tool/pps/pps_driver.cc
@ -17,6 +17,10 @@
 #include "freedreno/ds/fd_pps_driver.h"
 #endif // PPS_FREEDRENO

+#ifdef PPS_INTEL
+#include "intel/ds/intel_pps_driver.h"
+#endif // PPS_INTEL
+
 #include "pps.h"
 #include "pps_algorithm.h"

@ -30,6 +34,10 @@ std::unordered_map<std::string, std::unique_ptr<Driver>> create_supported_driver
   map.emplace("msm", std::make_unique<FreedrenoDriver>());
 #endif // PPS_FREEDRENO

+#ifdef PPS_INTEL
+   map.emplace("i915", std::make_unique<IntelDriver>());
+#endif // PPS_INTEL
+
   return map;
 }