/*
* Copyright © 2022 Igalia S.L.
* SPDX-License-Identifier: MIT
*/
#include <assert.h>
#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <inttypes.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#if FD_REPLAY_KGSL
#include "../vulkan/msm_kgsl.h"
#elif FD_REPLAY_MSM
#include <xf86drm.h>
#include "drm-uapi/msm_drm.h"
#elif FD_REPLAY_WSL
#define __KERNEL__
#include "drm-uapi/d3dkmthk.h"
#endif
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include "util/os_time.h"
#include "util/rb_tree.h"
#include "util/u_vector.h"
#include "util/vma.h"
#include "buffers.h"
#include "cffdec.h"
#include "io.h"
#include "redump.h"
#include "rdutil.h"
/**
* Replay command stream obtained from:
* - /sys/kernel/debug/dri/0/rd
* - /sys/kernel/debug/dri/0/hangrd
* !!! Command stream capture should be done with ALL buffers:
* - echo 1 > /sys/module/msm/parameters/rd_full
*
* Requires kernel with MSM_INFO_SET_IOVA support.
* In case userspace IOVAs are not supported, like on KGSL, we have to
 * pre-allocate a single buffer and hope it is always allocated starting
* from the same address.
*
* TODO: Misrendering, would require marking framebuffer images
* at each renderpass in order to fetch and decode them.
*
* Code from Freedreno/Turnip is not re-used here since the relevant
* pieces may introduce additional allocations which cannot be allowed
* during the replay.
*
* For how-to see freedreno.rst
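 *
 * Example (hypothetical capture name), replaying the first three submits:
 *   replay --first=0 --last=2 trace.rd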
*/
static const char *exename = NULL;
static const uint64_t FAKE_ADDRESS_SPACE_SIZE = 1024 * 1024 * 1024;
static int handle_file(const char *filename, uint32_t first_submit,
uint32_t last_submit, uint32_t submit_to_override,
uint64_t base_addr, const char *cmdstreamgen);
static void
print_usage(const char *name)
{
/* clang-format off */
fprintf(stderr, "Usage:\n\n"
"\t%s [OPTSIONS]... FILE...\n\n"
"Options:\n"
"\t-e, --exe=NAME - only use cmdstream from named process\n"
"\t-o --override=submit - № of the submit to override\n"
"\t-g --generator=path - executable which generate cmdstream for override\n"
"\t-f --first=submit - first submit № to replay\n"
"\t-l --last=submit - last submit № to replay\n"
"\t-a --address=address - base iova address on WSL\n"
"\t-h, --help - show this message\n"
, name);
/* clang-format on */
exit(2);
}
/* clang-format off */
static const struct option opts[] = {
{ "exe", required_argument, 0, 'e' },
{ "override", required_argument, 0, 'o' },
{ "generator", required_argument, 0, 'g' },
{ "first", required_argument, 0, 'f' },
{ "last", required_argument, 0, 'l' },
{ "address", required_argument, 0, 'a' },
{ "help", no_argument, 0, 'h' },
};
/* clang-format on */
int
main(int argc, char **argv)
{
int ret = -1;
int c;
uint32_t submit_to_override = -1;
uint32_t first_submit = 0;
uint32_t last_submit = -1;
uint64_t base_addr = 0;
const char *cmdstreamgen = NULL;
while ((c = getopt_long(argc, argv, "e:o:g:f:l:a:h", opts, NULL)) != -1) {
switch (c) {
case 0:
/* option that set a flag, nothing to do */
break;
case 'e':
exename = optarg;
break;
case 'o':
submit_to_override = strtoul(optarg, NULL, 0);
break;
case 'g':
cmdstreamgen = optarg;
break;
case 'f':
first_submit = strtoul(optarg, NULL, 0);
break;
case 'l':
last_submit = strtoul(optarg, NULL, 0);
break;
case 'a':
base_addr = strtoull(optarg, NULL, 0);
break;
case 'h':
default:
print_usage(argv[0]);
}
}
while (optind < argc) {
ret = handle_file(argv[optind], first_submit, last_submit,
submit_to_override, base_addr, cmdstreamgen);
if (ret) {
fprintf(stderr, "error reading: %s\n", argv[optind]);
fprintf(stderr, "continuing..\n");
}
optind++;
}
if (ret)
print_usage(argv[0]);
return ret;
}
struct buffer {
struct rb_node node;
uint32_t gem_handle;
uint64_t size;
uint64_t iova;
void *map;
bool used;
uint32_t flags;
};
struct cmdstream {
uint64_t iova;
uint64_t size;
};
struct wrbuf {
uint64_t iova;
uint64_t size;
char* name;
};
struct device {
int fd;
struct rb_tree buffers;
struct util_vma_heap vma;
struct u_vector cmdstreams;
uint64_t shader_log_iova;
uint64_t cp_log_iova;
bool has_set_iova;
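/* Backing storage for the fake address space that is used when
 * userspace-assigned iovas are unavailable (KGSL, WSL, or MSM without
 * MSM_INFO_SET_IOVA).
 */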
uint32_t va_id;
void *va_map;
uint64_t va_iova;
struct u_vector wrbufs;
#ifdef FD_REPLAY_KGSL
uint32_t context_id;
#endif
#ifdef FD_REPLAY_WSL
struct d3dkmthandle device;
struct d3dkmthandle context;
/* We don't know a good way to wait for a submission to complete on WSL at
 * the moment, so we use our own fences.
 */
uint64_t fence_iova;
uint64_t fence_ib_iova;
volatile uint32_t *fence;
uint32_t *fence_ib;
#endif
};
void buffer_mem_free(struct device *dev, struct buffer *buf);
static int
rb_buffer_insert_cmp(const struct rb_node *n1, const struct rb_node *n2)
{
const struct buffer *buf1 = (const struct buffer *)n1;
const struct buffer *buf2 = (const struct buffer *)n2;
/* Note that gpuaddr comparisons can overflow an int: */
if (buf1->iova > buf2->iova)
return 1;
else if (buf1->iova < buf2->iova)
return -1;
return 0;
}
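/* Lookups treat each buffer as the half-open interval [iova, iova + size),
 * so a search for any address inside a buffer finds that buffer.
 */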
static int
rb_buffer_search_cmp(const struct rb_node *node, const void *addrptr)
{
const struct buffer *buf = (const struct buffer *)node;
uint64_t iova = *(uint64_t *)addrptr;
if (buf->iova + buf->size <= iova)
return -1;
else if (buf->iova > iova)
return 1;
return 0;
}
static struct buffer *
device_get_buffer(struct device *dev, uint64_t iova)
{
if (iova == 0)
return NULL;
return (struct buffer *)rb_tree_search(&dev->buffers, &iova,
rb_buffer_search_cmp);
}
static void
device_mark_buffers(struct device *dev)
{
rb_tree_foreach_safe (struct buffer, buf, &dev->buffers, node) {
buf->used = false;
}
}
static void
device_free_buffers(struct device *dev)
{
rb_tree_foreach_safe (struct buffer, buf, &dev->buffers, node) {
buffer_mem_free(dev, buf);
rb_tree_remove(&dev->buffers, &buf->node);
free(buf);
}
}
static void
device_print_shader_log(struct device *dev)
{
struct shader_log {
uint64_t cur_iova;
union {
uint32_t entries_u32[0];
float entries_float[0];
};
};
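/* The shader-side log starts with a cursor holding the iova one past the
 * last entry written; 4-byte entries follow. The entry count is recovered
 * from how far cur_iova has advanced past the header.
 */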
if (dev->shader_log_iova != 0)
{
struct buffer *buf = device_get_buffer(dev, dev->shader_log_iova);
if (buf) {
struct shader_log *log = buf->map + (dev->shader_log_iova - buf->iova);
uint32_t count = (log->cur_iova - dev->shader_log_iova -
offsetof(struct shader_log, entries_u32)) / 4;
printf("Shader Log Entries: %u\n", count);
for (uint32_t i = 0; i < count; i++) {
printf("[%u] %08x %.4f\n", i, log->entries_u32[i],
log->entries_float[i]);
}
printf("========================================\n");
}
}
}
static void
device_print_cp_log(struct device *dev)
{
struct cp_log {
uint64_t cur_iova;
uint64_t tmp;
uint64_t first_entry_size;
};
struct cp_log_entry {
uint64_t size;
uint32_t data[0];
};
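/* CP log entries are (u64 size, data[]) records laid out back to back,
 * terminated by a record with size == 0.
 */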
if (dev->cp_log_iova == 0)
return;
struct buffer *buf = device_get_buffer(dev, dev->cp_log_iova);
if (!buf)
return;
struct cp_log *log = buf->map + (dev->cp_log_iova - buf->iova);
if (log->first_entry_size == 0)
return;
struct cp_log_entry *log_entry =
buf->map + offsetof(struct cp_log, first_entry_size);
uint32_t idx = 0;
while (log_entry->size != 0) {
printf("\nCP Log [%u]:\n", idx++);
uint32_t dwords = log_entry->size / 4;
for (uint32_t i = 0; i < dwords; i++) {
if (i % 8 == 0)
printf("\t");
printf("%08x ", log_entry->data[i]);
if (i % 8 == 7)
printf("\n");
}
printf("\n");
log_entry = (void *)log_entry + log_entry->size +
offsetof(struct cp_log_entry, data);
}
}
static void
device_dump_wrbuf(struct device *dev)
{
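/* Dump every buffer range requested via RD_WRBUFFER into
 * "<exename>/buffers/<name>" for later inspection.
 */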
if (!u_vector_length(&dev->wrbufs))
return;
char buffer_dir[256];
snprintf(buffer_dir, sizeof(buffer_dir), "%s/buffers", exename);
rmdir(buffer_dir);
mkdir(buffer_dir, 0777);
struct wrbuf *wrbuf;
u_vector_foreach(wrbuf, &dev->wrbufs) {
char buffer_path[256];
snprintf(buffer_path, sizeof(buffer_path), "%s/%s", buffer_dir, wrbuf->name);
FILE *f = fopen(buffer_path, "wb");
if (!f) {
   fprintf(stderr, "Error opening %s\n", buffer_path);
   continue;
}
struct buffer *buf = device_get_buffer(dev, wrbuf->iova);
if (!buf) {
fprintf(stderr, "Error getting buffer for %s\n", buffer_path);
goto end_it;
}
const void *buffer = buf->map + (wrbuf->iova - buf->iova);
fwrite(buffer, wrbuf->size, 1, f);
end_it:
fclose(f);
}
}
#if FD_REPLAY_MSM
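/* Turn a relative timeout in nanoseconds into the absolute CLOCK_MONOTONIC
 * time that DRM_MSM_WAIT_FENCE expects.
 */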
static inline void
get_abs_timeout(struct drm_msm_timespec *tv, uint64_t ns)
{
struct timespec t;
clock_gettime(CLOCK_MONOTONIC, &t);
tv->tv_sec = t.tv_sec + ns / 1000000000;
tv->tv_nsec = t.tv_nsec + ns % 1000000000;
}
static struct device *
device_create(uint64_t base_addr)
{
struct device *dev = calloc(sizeof(struct device), 1);
dev->fd = drmOpenWithType("msm", NULL, DRM_NODE_RENDER);
if (dev->fd < 0) {
errx(1, "Cannot open MSM fd!");
}
uint64_t va_start, va_size;
struct drm_msm_param req = {
.pipe = MSM_PIPE_3D0,
.param = MSM_PARAM_VA_START,
};
int ret = drmCommandWriteRead(dev->fd, DRM_MSM_GET_PARAM, &req, sizeof(req));
va_start = req.value;
if (!ret) {
req.param = MSM_PARAM_VA_SIZE;
ret = drmCommandWriteRead(dev->fd, DRM_MSM_GET_PARAM, &req, sizeof(req));
va_size = req.value;
dev->has_set_iova = true;
}
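/* Fallback for kernels without MSM_INFO_SET_IOVA: allocate one large BO up
 * front and carve all replayed buffers out of it, hoping that the captured
 * addresses fall inside its iova range (see the comment at the top).
 */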
if (ret) {
printf("MSM_INFO_SET_IOVA is not supported!\n");
struct drm_msm_gem_new req_new = {.size = FAKE_ADDRESS_SPACE_SIZE, .flags = MSM_BO_CACHED_COHERENT};
drmCommandWriteRead(dev->fd, DRM_MSM_GEM_NEW, &req_new, sizeof(req_new));
dev->va_id = req_new.handle;
struct drm_msm_gem_info req_info = {
.handle = req_new.handle,
.info = MSM_INFO_GET_IOVA,
};
drmCommandWriteRead(dev->fd,
DRM_MSM_GEM_INFO, &req_info, sizeof(req_info));
dev->va_iova = req_info.value;
struct drm_msm_gem_info req_offset = {
.handle = req_new.handle,
.info = MSM_INFO_GET_OFFSET,
};
drmCommandWriteRead(dev->fd, DRM_MSM_GEM_INFO, &req_offset, sizeof(req_offset));
dev->va_map = mmap(0, FAKE_ADDRESS_SPACE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
dev->fd, req_offset.value);
if (dev->va_map == MAP_FAILED) {
err(1, "mmap failure");
}
va_start = dev->va_iova;
va_size = FAKE_ADDRESS_SPACE_SIZE;
printf("Allocated iova %" PRIx64 "\n", dev->va_iova);
}
rb_tree_init(&dev->buffers);
util_vma_heap_init(&dev->vma, va_start, ROUND_DOWN_TO(va_size, 4096));
u_vector_init(&dev->cmdstreams, 8, sizeof(struct cmdstream));
u_vector_init(&dev->wrbufs, 8, sizeof(struct wrbuf));
return dev;
}
static void
device_submit_cmdstreams(struct device *dev)
{
if (!u_vector_length(&dev->cmdstreams)) {
device_free_buffers(dev);
return;
}
struct drm_msm_gem_submit_cmd cmds[u_vector_length(&dev->cmdstreams)];
uint32_t idx = 0;
struct cmdstream *cmd;
u_vector_foreach(cmd, &dev->cmdstreams) {
struct buffer *cmdstream_buf = device_get_buffer(dev, cmd->iova);
uint32_t bo_idx = 0;
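/* The BO list passed to the kernel is built below by walking the same
 * rb-tree in the same order, so the position of the cmdstream buffer in
 * this walk is its index in that list.
 */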
rb_tree_foreach (struct buffer, buf, &dev->buffers, node) {
if (buf == cmdstream_buf)
break;
bo_idx++;
}
if (cmdstream_buf)
cmdstream_buf->flags = MSM_SUBMIT_BO_DUMP;
struct drm_msm_gem_submit_cmd *submit_cmd = &cmds[idx];
submit_cmd->type = MSM_SUBMIT_CMD_BUF;
submit_cmd->submit_idx = bo_idx;
if (dev->has_set_iova) {
submit_cmd->submit_offset = cmd->iova - cmdstream_buf->iova;
} else {
submit_cmd->submit_offset = cmd->iova - dev->va_iova;
}
submit_cmd->size = cmd->size;
submit_cmd->pad = 0;
submit_cmd->nr_relocs = 0;
submit_cmd->relocs = 0;
idx++;
}
uint32_t bo_count = 0;
rb_tree_foreach (struct buffer, buf, &dev->buffers, node) {
if (buf)
bo_count++;
}
if (!dev->has_set_iova) {
bo_count = 1;
}
struct drm_msm_gem_submit_bo *bo_list =
calloc(sizeof(struct drm_msm_gem_submit_bo), bo_count);
if (dev->has_set_iova) {
uint32_t bo_idx = 0;
rb_tree_foreach (struct buffer, buf, &dev->buffers, node) {
struct drm_msm_gem_submit_bo *submit_bo = &bo_list[bo_idx++];
submit_bo->handle = buf->gem_handle;
submit_bo->flags =
buf->flags | MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE;
submit_bo->presumed = buf->iova;
buf->flags = 0;
}
} else {
bo_list[0].handle = dev->va_id;
bo_list[0].flags =
MSM_SUBMIT_BO_DUMP | MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE;
bo_list[0].presumed = dev->va_iova;
}
struct drm_msm_gem_submit submit_req = {
.flags = MSM_PIPE_3D0,
.queueid = 0,
.bos = (uint64_t)(uintptr_t)bo_list,
.nr_bos = bo_count,
.cmds = (uint64_t)(uintptr_t)cmds,
.nr_cmds = u_vector_length(&dev->cmdstreams),
.in_syncobjs = 0,
.out_syncobjs = 0,
.nr_in_syncobjs = 0,
.nr_out_syncobjs = 0,
.syncobj_stride = sizeof(struct drm_msm_gem_submit_syncobj),
};
int ret = drmCommandWriteRead(dev->fd, DRM_MSM_GEM_SUBMIT, &submit_req,
sizeof(submit_req));
if (ret) {
err(1, "DRM_MSM_GEM_SUBMIT failure %d", ret);
}
/* Wait for the submission to complete in order to be sure that freeing
 * buffers will also free their VMAs in the kernel, so that new
 * allocations won't clash with old ones.
 */
struct drm_msm_wait_fence wait_req = {
.fence = submit_req.fence,
.queueid = 0,
};
get_abs_timeout(&wait_req.timeout, 1000000000);
ret =
drmCommandWrite(dev->fd, DRM_MSM_WAIT_FENCE, &wait_req, sizeof(wait_req));
if (ret && (ret != -ETIMEDOUT)) {
err(1, "DRM_MSM_WAIT_FENCE failure %d", ret);
}
u_vector_finish(&dev->cmdstreams);
u_vector_init(&dev->cmdstreams, 8, sizeof(struct cmdstream));
device_print_shader_log(dev);
device_print_cp_log(dev);
device_dump_wrbuf(dev);
device_free_buffers(dev);
}
static void
buffer_mem_alloc(struct device *dev, struct buffer *buf)
{
util_vma_heap_alloc_addr(&dev->vma, buf->iova, buf->size);
if (!dev->has_set_iova) {
uint64_t offset = buf->iova - dev->va_iova;
assert(offset < FAKE_ADDRESS_SPACE_SIZE && (offset + buf->size) <= FAKE_ADDRESS_SPACE_SIZE);
buf->map = ((uint8_t*)dev->va_map) + offset;
return;
}
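/* With MSM_INFO_SET_IOVA: create a GEM BO, pin it at the iova recorded in
 * the capture, then mmap it through the DRM fd for CPU access.
 */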
{
struct drm_msm_gem_new req = {.size = buf->size, .flags = MSM_BO_WC};
int ret =
drmCommandWriteRead(dev->fd, DRM_MSM_GEM_NEW, &req, sizeof(req));
if (ret) {
err(1, "DRM_MSM_GEM_NEW failure %d", ret);
}
buf->gem_handle = req.handle;
}
{
struct drm_msm_gem_info req = {
.handle = buf->gem_handle,
.info = MSM_INFO_SET_IOVA,
.value = buf->iova,
};
int ret =
drmCommandWriteRead(dev->fd, DRM_MSM_GEM_INFO, &req, sizeof(req));
if (ret) {
err(1, "MSM_INFO_SET_IOVA failure %d", ret);
}
}
{
struct drm_msm_gem_info req = {
.handle = buf->gem_handle,
.info = MSM_INFO_GET_OFFSET,
};
int ret =
drmCommandWriteRead(dev->fd, DRM_MSM_GEM_INFO, &req, sizeof(req));
if (ret) {
err(1, "MSM_INFO_GET_OFFSET failure %d", ret);
}
void *map = mmap(0, buf->size, PROT_READ | PROT_WRITE, MAP_SHARED,
dev->fd, req.value);
if (map == MAP_FAILED) {
err(1, "mmap failure");
}
buf->map = map;
}
}
void
buffer_mem_free(struct device *dev, struct buffer *buf)
{
if (dev->has_set_iova) {
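/* Unbind the address first (MSM_INFO_SET_IOVA with value 0) so the kernel
 * releases the VMA and later allocations can reuse it.
 */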
munmap(buf->map, buf->size);
struct drm_msm_gem_info req_iova = {
.handle = buf->gem_handle,
.info = MSM_INFO_SET_IOVA,
.value = 0,
};
int ret = drmCommandWriteRead(dev->fd, DRM_MSM_GEM_INFO, &req_iova,
sizeof(req_iova));
if (ret < 0) {
err(1, "MSM_INFO_SET_IOVA(0) failed! %d", ret);
return;
}
struct drm_gem_close req = {
.handle = buf->gem_handle,
};
drmIoctl(dev->fd, DRM_IOCTL_GEM_CLOSE, &req);
}
util_vma_heap_free(&dev->vma, buf->iova, buf->size);
}
#elif FD_REPLAY_KGSL
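/* Restart ioctls that fail with EINTR/EAGAIN so that signals don't abort
 * the replay mid-submission.
 */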
static int
safe_ioctl(int fd, unsigned long request, void *arg)
{
int ret;
do {
ret = ioctl(fd, request, arg);
} while (ret == -1 && (errno == EINTR || errno == EAGAIN));
return ret;
}
static struct device *
device_create(uint64_t base_addr)
{
struct device *dev = calloc(sizeof(struct device), 1);
static const char path[] = "/dev/kgsl-3d0";
dev->fd = open(path, O_RDWR | O_CLOEXEC);
if (dev->fd < 0) {
errx(1, "Cannot open KGSL fd!");
}
struct kgsl_gpumem_alloc_id req = {
.size = FAKE_ADDRESS_SPACE_SIZE,
.flags = KGSL_MEMFLAGS_IOCOHERENT,
};
int ret = safe_ioctl(dev->fd, IOCTL_KGSL_GPUMEM_ALLOC_ID, &req);
if (ret) {
err(1, "IOCTL_KGSL_GPUMEM_ALLOC_ID failure");
}
dev->va_id = req.id;
dev->va_iova = req.gpuaddr;
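/* KGSL maps GPU BOs through the device fd; the mmap offset encodes the
 * allocation id shifted by the page shift (4 KiB pages).
 */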
dev->va_map = mmap(0, FAKE_ADDRESS_SPACE_SIZE, PROT_READ | PROT_WRITE,
MAP_SHARED, dev->fd, req.id << 12);
rb_tree_init(&dev->buffers);
util_vma_heap_init(&dev->vma, req.gpuaddr, ROUND_DOWN_TO(FAKE_ADDRESS_SPACE_SIZE, 4096));
u_vector_init(&dev->cmdstreams, 8, sizeof(struct cmdstream));
struct kgsl_drawctxt_create drawctxt_req = {
.flags = KGSL_CONTEXT_SAVE_GMEM |
KGSL_CONTEXT_NO_GMEM_ALLOC |
KGSL_CONTEXT_PREAMBLE,
};
ret = safe_ioctl(dev->fd, IOCTL_KGSL_DRAWCTXT_CREATE, &drawctxt_req);
if (ret) {
err(1, "IOCTL_KGSL_DRAWCTXT_CREATE failure");
}
printf("Allocated iova %" PRIx64 "\n", dev->va_iova);
dev->context_id = drawctxt_req.drawctxt_id;
return dev;
}
static void
device_submit_cmdstreams(struct device *dev)
{
if (!u_vector_length(&dev->cmdstreams)) {
device_free_buffers(dev);
return;
}
struct kgsl_command_object cmds[u_vector_length(&dev->cmdstreams)];
uint32_t idx = 0;
struct cmdstream *cmd;
u_vector_foreach(cmd, &dev->cmdstreams) {
struct kgsl_command_object *submit_cmd = &cmds[idx++];
submit_cmd->gpuaddr = cmd->iova;
submit_cmd->size = cmd->size;
submit_cmd->flags = KGSL_CMDLIST_IB;
submit_cmd->id = dev->va_id;
}
struct kgsl_gpu_command submit_req = {
.flags = KGSL_CMDBATCH_SUBMIT_IB_LIST,
.cmdlist = (uintptr_t) &cmds,
.cmdsize = sizeof(struct kgsl_command_object),
.numcmds = u_vector_length(&dev->cmdstreams),
.numsyncs = 0,
.context_id = dev->context_id,
};
int ret = safe_ioctl(dev->fd, IOCTL_KGSL_GPU_COMMAND, &submit_req);
if (ret) {
err(1, "IOCTL_KGSL_GPU_COMMAND failure %d", ret);
}
struct kgsl_device_waittimestamp_ctxtid wait = {
.context_id = dev->context_id,
.timestamp = submit_req.timestamp,
.timeout = 3000,
};
ret = safe_ioctl(dev->fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait);
if (ret) {
err(1, "IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID failure %d", ret);
}
u_vector_finish(&dev->cmdstreams);
u_vector_init(&dev->cmdstreams, 8, sizeof(struct cmdstream));
device_print_shader_log(dev);
device_print_cp_log(dev);
device_dump_wrbuf(dev);
device_free_buffers(dev);
}
static void
buffer_mem_alloc(struct device *dev, struct buffer *buf)
{
util_vma_heap_alloc_addr(&dev->vma, buf->iova, buf->size);
buf->map = ((uint8_t*)dev->va_map) + (buf->iova - dev->va_iova);
}
void
buffer_mem_free(struct device *dev, struct buffer *buf)
{
util_vma_heap_free(&dev->vma, buf->iova, buf->size);
}
#else
static int
safe_ioctl(int fd, unsigned long request, void *arg)
{
int ret;
do {
ret = ioctl(fd, request, arg);
} while (ret == -1 && (errno == EINTR || errno == EAGAIN));
return ret;
}
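/* The private driver data blobs below mirror what the downstream KMD
 * expects. Their layout was presumably recovered from captured
 * submissions; the unk* fields carry fixed magic values (noted in the
 * trailing comments) whose meaning is unknown.
 */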
struct alloc_priv_info {
__u32 struct_size;
char _pad0[4];
__u32 unk0; // 1
char _pad1[4];
__u64 size;
__u32 alignment;
char _pad2[20];
__u64 allocated_size;
__u32 unk1; // 1
char _pad4[8]; /* offset: 60*/
__u32 unk2; // 61
char _pad5[76];
__u32 unk3; /* offset: 148 */ // 1
char _pad6[8];
__u32 unk4; /* offset: 160 */ // 1
char _pad7[44];
__u32 unk5; /* offset: 208 */ // 3
char _pad8[16];
__u32 size_2; /* offset: 228 */
__u32 unk6; // 1
__u32 size_3;
__u32 size_4;
__u32 unk7; /* offset: 244 */ // 1
char _pad9[56];
};
static_assert(sizeof(struct alloc_priv_info) == 304);
static_assert(offsetof(struct alloc_priv_info, unk1) == 56);
static_assert(offsetof(struct alloc_priv_info, unk3) == 148);
static_assert(offsetof(struct alloc_priv_info, unk5) == 208);
struct submit_priv_ib_info {
char _pad5[4];
__u32 size_dwords;
__u64 iova;
char _pad6[8];
} __attribute__((packed));
struct submit_priv_data {
__u32 magic0;
char _pad0[4];
__u32 struct_size;
char _pad1[4];
/* It seems that priv data can contain several sub-blocks; cmdbuf is one of
 * them, followed by another 8-byte struct without anything useful in it.
 * That second block doesn't seem important for replaying.
 */
__u32 datas_count;
char _pad2[32];
struct {
__u32 magic1;
__u32 data_size;
struct {
__u32 unk1;
__u32 cmdbuf_size;
char _pad3[32];
__u32 ib_count;
char _pad4[36];
struct submit_priv_ib_info ibs[];
} cmdbuf;
} data0;
// unsigned char magic2[8];
} __attribute__((packed));
static_assert(offsetof(struct submit_priv_data, data0) == 0x34);
static_assert(offsetof(struct submit_priv_data, data0.cmdbuf.ibs) == 0x8c);
static struct device *
device_create(uint64_t base_addr)
{
struct device *dev = calloc(sizeof(struct device), 1);
static const char path[] = "/dev/dxg";
dev->fd = open(path, O_RDWR | O_CLOEXEC);
if (dev->fd < 0) {
errx(1, "Cannot open /dev/dxg fd");
}
struct d3dkmt_adapterinfo adapters[1];
struct d3dkmt_enumadapters3 enum_adapters = {
.adapter_count = 1,
.adapters = adapters,
};
int ret = safe_ioctl(dev->fd, LX_DXENUMADAPTERS3, &enum_adapters);
if (ret) {
errx(1, "LX_DXENUMADAPTERS3 failure");
}
if (enum_adapters.adapter_count == 0) {
errx(1, "No adapters found");
}
struct winluid adapter_luid = enum_adapters.adapters[0].adapter_luid;
struct d3dkmt_openadapterfromluid open_adapter = {
.adapter_luid = adapter_luid,
};
ret = safe_ioctl(dev->fd, LX_DXOPENADAPTERFROMLUID, &open_adapter);
if (ret) {
errx(1, "LX_DXOPENADAPTERFROMLUID failure");
}
struct d3dkmthandle adapter = open_adapter.adapter_handle;
struct d3dkmt_createdevice create_device = {
.adapter = adapter,
};
ret = safe_ioctl(dev->fd, LX_DXCREATEDEVICE, &create_device);
if (ret) {
errx(1, "LX_DXCREATEDEVICE failure");
}
struct d3dkmthandle device = create_device.device;
dev->device = device;
unsigned char create_context_priv_data[] = {
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00,
0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1c, 0x0c, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
};
struct d3dkmt_createcontextvirtual create_context = {
.device = device,
.node_ordinal = 0,
.engine_affinity = 1,
.priv_drv_data = create_context_priv_data,
.priv_drv_data_size = sizeof(create_context_priv_data),
.client_hint = 16,
};
ret = safe_ioctl(dev->fd, LX_DXCREATECONTEXTVIRTUAL, &create_context);
if (ret) {
errx(1, "LX_DXCREATECONTEXTVIRTUAL failure");
}
dev->context = create_context.context;
struct d3dkmt_createpagingqueue create_paging_queue = {
.device = device,
.priority = _D3DDDI_PAGINGQUEUE_PRIORITY_NORMAL,
.physical_adapter_index = 0,
};
ret = safe_ioctl(dev->fd, LX_DXCREATEPAGINGQUEUE, &create_paging_queue);
if (ret) {
errx(1, "LX_DXCREATEPAGINGQUEUE failure");
}
struct d3dkmthandle paging_queue = create_paging_queue.paging_queue;
uint32_t alloc_size = FAKE_ADDRESS_SPACE_SIZE;
struct alloc_priv_info priv_alloc_info = {
.struct_size = sizeof(struct alloc_priv_info),
.unk0 = 1,
.size = alloc_size,
.alignment = 4096,
.unk1 = 1,
.unk2 = 61,
.unk3 = 1,
.unk4 = 1,
.unk5 = 3,
.size_2 = alloc_size,
.unk6 = 1,
.size_3 = alloc_size,
.size_4 = alloc_size,
.unk7 = 1,
};
struct d3dddi_allocationinfo2 alloc_info = {
.priv_drv_data = &priv_alloc_info,
.priv_drv_data_size = sizeof(struct alloc_priv_info),
};
struct d3dkmt_createallocation create_allocation = {
.device = device,
.alloc_count = 1,
.allocation_info = &alloc_info,
};
ret = safe_ioctl(dev->fd, LX_DXCREATEALLOCATION, &create_allocation);
if (ret) {
errx(1, "LX_DXCREATEALLOCATION failure");
}
assert(priv_alloc_info.allocated_size == alloc_size);
struct d3dddi_mapgpuvirtualaddress map_virtual_address = {
.paging_queue = paging_queue,
.base_address = base_addr,
.maximum_address = UINT64_MAX,
.allocation = create_allocation.allocation_info[0].allocation,
.size_in_pages = MAX2(alloc_size / 4096, 1),
.protection = {
.write = 1,
.execute = 1,
},
};
ret = safe_ioctl(dev->fd, LX_DXMAPGPUVIRTUALADDRESS, &map_virtual_address);
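/* 259 is NTSTATUS STATUS_PENDING (0x103): mapping and residency requests
 * complete asynchronously on the paging queue, so "pending" is the
 * expected result of these ioctls.
 */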
if (ret != 259) {
errx(1, "LX_DXMAPGPUVIRTUALADDRESS failure");
}
__u32 priority = 0;
struct d3dddi_makeresident make_resident = {
.paging_queue = paging_queue,
.alloc_count = 1,
.allocation_list = &create_allocation.allocation_info[0].allocation,
.priority_list = &priority,
};
ret = safe_ioctl(dev->fd, LX_DXMAKERESIDENT, &make_resident);
if (ret != 259) {
errx(1, "LX_DXMAKERESIDENT failure");
}
struct d3dkmt_lock2 lock = {
.device = device,
.allocation = create_allocation.allocation_info[0].allocation,
};
ret = safe_ioctl(dev->fd, LX_DXLOCK2, &lock);
if (ret) {
errx(1, "LX_DXLOCK2 failure");
}
dev->va_iova = map_virtual_address.virtual_address;
dev->va_map = lock.data;
rb_tree_init(&dev->buffers);
util_vma_heap_init(&dev->vma, dev->va_iova, ROUND_DOWN_TO(alloc_size, 4096));
u_vector_init(&dev->cmdstreams, 8, sizeof(struct cmdstream));
printf("Allocated iova at 0x%" PRIx64 "\n", dev->va_iova);
uint64_t hole_size = 4096;
dev->vma.alloc_high = true;
dev->fence_iova = util_vma_heap_alloc(&dev->vma, hole_size, 4096);
dev->fence_ib_iova = dev->fence_iova + 8;
dev->fence = (uint32_t *) ((uint8_t*)dev->va_map + (dev->fence_iova - dev->va_iova));
dev->fence_ib = (uint32_t *) ((uint8_t*)dev->va_map + (dev->fence_ib_iova - dev->va_iova));
dev->vma.alloc_high = false;
return dev;
}
static void
device_submit_cmdstreams(struct device *dev)
{
if (!u_vector_length(&dev->cmdstreams)) {
device_free_buffers(dev);
return;
}
uint32_t cmdstream_count = u_vector_length(&dev->cmdstreams) + 1;
uint32_t priv_data_size =
sizeof(struct submit_priv_data) +
cmdstream_count * sizeof(struct submit_priv_ib_info);
struct submit_priv_data *priv_data = calloc(1, priv_data_size);
priv_data->magic0 = 0xccaabbee;
priv_data->struct_size = priv_data_size;
priv_data->datas_count = 1;
priv_data->data0.magic1 = 0xfadcab02;
priv_data->data0.data_size =
sizeof(priv_data->data0) +
cmdstream_count * sizeof(struct submit_priv_ib_info);
priv_data->data0.cmdbuf.unk1 = 0xcccc0001;
priv_data->data0.cmdbuf.cmdbuf_size = sizeof(priv_data->data0.cmdbuf) +
cmdstream_count * sizeof(struct submit_priv_ib_info);
priv_data->data0.cmdbuf.ib_count = cmdstream_count;
struct cmdstream *cmd;
uint32_t idx = 0;
u_vector_foreach(cmd, &dev->cmdstreams) {
priv_data->data0.cmdbuf.ibs[idx].size_dwords = cmd->size / 4;
priv_data->data0.cmdbuf.ibs[idx].iova = cmd->iova;
idx++;
}
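/* Append one extra IB that CP_MEM_WRITEs a marker into fence_iova; the CPU
 * polls that location below to detect completion of the submission.
 */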
priv_data->data0.cmdbuf.ibs[idx].size_dwords = 4;
priv_data->data0.cmdbuf.ibs[idx].iova = dev->fence_ib_iova;
*dev->fence = 0x00000000;
dev->fence_ib[0] = pm4_pkt7_hdr(0x3d, 3); // CP_MEM_WRITE
dev->fence_ib[1] = dev->fence_iova;
dev->fence_ib[2] = dev->fence_iova >> 32;
dev->fence_ib[3] = 0xababfcfc;
// Fill second (empty) data block
// uint32_t *magic_end = (uint32_t *)(((char *) priv_data) + priv_data_size - 8);
// magic_end[0] = 0xfadcab00;
// magic_end[1] = 0x00000008;
struct d3dkmt_submitcommand submission = {
.command_buffer = priv_data->data0.cmdbuf.ibs[0].iova,
.command_length = priv_data->data0.cmdbuf.ibs[0].size_dwords * sizeof(uint32_t),
.broadcast_context_count = 1,
.broadcast_context[0] = dev->context,
.priv_drv_data_size = priv_data_size,
.priv_drv_data = priv_data,
};
int ret = safe_ioctl(dev->fd, LX_DXSUBMITCOMMAND, &submission);
if (ret) {
errx(1, "LX_DXSUBMITCOMMAND failure");
}
free(priv_data);
u_vector_finish(&dev->cmdstreams);
u_vector_init(&dev->cmdstreams, 8, sizeof(struct cmdstream));
// TODO: better way to wait
for (unsigned i = 0; i < 1000; i++) {
usleep(1000);
if (*dev->fence != 0)
break;
}
if (*dev->fence == 0) {
errx(1, "Waiting for submission failed! GPU faulted or kernel did not execute this submission.");
}
device_print_shader_log(dev);
device_print_cp_log(dev);
device_dump_wrbuf(dev);
device_free_buffers(dev);
}
static void
buffer_mem_alloc(struct device *dev, struct buffer *buf)
{
util_vma_heap_alloc_addr(&dev->vma, buf->iova, buf->size);
buf->map = ((uint8_t*)dev->va_map) + (buf->iova - dev->va_iova);
}
void
buffer_mem_free(struct device *dev, struct buffer *buf)
{
util_vma_heap_free(&dev->vma, buf->iova, buf->size);
}
#endif
static void
upload_buffer(struct device *dev, uint64_t iova, unsigned int size,
void *hostptr)
{
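/* Create (or resize) the buffer backing this iova range and copy the
 * captured contents into it.
 */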
struct buffer *buf = device_get_buffer(dev, iova);
if (!buf) {
buf = calloc(sizeof(struct buffer), 1);
buf->iova = iova;
buf->size = size;
rb_tree_insert(&dev->buffers, &buf->node, rb_buffer_insert_cmp);
buffer_mem_alloc(dev, buf);
} else if (buf->size != size) {
buffer_mem_free(dev, buf);
buf->size = size;
buffer_mem_alloc(dev, buf);
}
memcpy(buf->map, hostptr, size);
buf->used = true;
}
static int
override_cmdstream(struct device *dev, struct cmdstream *cs,
const char *cmdstreamgen)
{
#if FD_REPLAY_KGSL
static const char *tmpfilename = "/sdcard/Download/cmdstream_override.rd";
#elif FD_REPLAY_MSM || FD_REPLAY_WSL
static const char *tmpfilename = "/tmp/cmdstream_override.rd";
#endif
/* Find a free range for the new cmdstreams and the resources we will use
 * when overriding the existing cmdstream.
 */
/* TODO: should the size be configurable? */
uint64_t hole_size = 32 * 1024 * 1024;
dev->vma.alloc_high = true;
uint64_t hole_iova = util_vma_heap_alloc(&dev->vma, hole_size, 4096);
dev->vma.alloc_high = false;
util_vma_heap_free(&dev->vma, hole_iova, hole_size);
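/* The hole is freed right away: the generated capture's buffers live at
 * addresses inside it, and upload_buffer() re-allocates those exact
 * addresses when the override stream is parsed below.
 */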
char cmd[2048];
snprintf(cmd, sizeof(cmd),
"%s --vastart=%" PRIu64 " --vasize=%" PRIu64 " %s", cmdstreamgen,
hole_iova, hole_size, tmpfilename);
printf("generating cmdstream '%s'\n", cmd);
int ret = system(cmd);
if (ret) {
fprintf(stderr, "Error executing %s\n", cmd);
return -1;
}
struct io *io;
struct rd_parsed_section ps = {0};
io = io_open(tmpfilename);
if (!io) {
fprintf(stderr, "could not open: %s\n", tmpfilename);
return -1;
}
struct {
unsigned int len;
uint64_t gpuaddr;
} gpuaddr = {0};
while (parse_rd_section(io, &ps)) {
switch (ps.type) {
case RD_GPUADDR:
parse_addr(ps.buf, ps.sz, &gpuaddr.len, &gpuaddr.gpuaddr);
break;
case RD_BUFFER_CONTENTS:
upload_buffer(dev, gpuaddr.gpuaddr, gpuaddr.len, ps.buf);
ps.buf = NULL;
break;
case RD_CMDSTREAM_ADDR: {
unsigned int sizedwords;
uint64_t gpuaddr;
parse_addr(ps.buf, ps.sz, &sizedwords, &gpuaddr);
printf("override cmdstream: %d dwords\n", sizedwords);
cs->iova = gpuaddr;
cs->size = sizedwords * sizeof(uint32_t);
break;
}
case RD_SHADER_LOG_BUFFER: {
unsigned int sizedwords;
parse_addr(ps.buf, ps.sz, &sizedwords, &dev->shader_log_iova);
break;
}
case RD_CP_LOG_BUFFER: {
unsigned int sizedwords;
parse_addr(ps.buf, ps.sz, &sizedwords, &dev->cp_log_iova);
break;
}
case RD_WRBUFFER: {
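/* Payload layout: three u64s (iova, size, name size) followed by the name
 * bytes themselves.
 */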
struct wrbuf *wrbuf = u_vector_add(&dev->wrbufs);
uint64_t *p = (uint64_t *)ps.buf;
wrbuf->iova = p[0];
wrbuf->size = p[1];
wrbuf->name = calloc(1, p[2]);
memcpy(wrbuf->name, (char *)ps.buf + 3 * sizeof(uint64_t), p[2]);
break;
}
default:
break;
}
}
io_close(io);
if (ps.ret < 0) {
fprintf(stderr, "corrupt file %s\n", tmpfilename);
}
return ps.ret;
}
static int
handle_file(const char *filename, uint32_t first_submit, uint32_t last_submit,
uint32_t submit_to_override, uint64_t base_addr, const char *cmdstreamgen)
{
struct io *io;
int submit = 0;
bool skip = false;
bool need_submit = false;
struct rd_parsed_section ps = {0};
printf("Reading %s...\n", filename);
if (!strcmp(filename, "-"))
io = io_openfd(0);
else
io = io_open(filename);
if (!io) {
fprintf(stderr, "could not open: %s\n", filename);
return -1;
}
struct device *dev = device_create(base_addr);
struct {
unsigned int len;
uint64_t gpuaddr;
} gpuaddr = {0};
while (parse_rd_section(io, &ps)) {
switch (ps.type) {
case RD_TEST:
case RD_VERT_SHADER:
case RD_FRAG_SHADER:
/* no-op */
break;
case RD_CMD:
skip = false;
if (exename) {
skip |= (strstr(ps.buf, exename) != ps.buf);
} else {
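/* No --exe given: skip cmdstreams from processes known to submit unrelated
 * work (perf tools, browsers, compositors, X clients).
 */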
skip |= (strstr(ps.buf, "fdperf") == ps.buf);
skip |= (strstr(ps.buf, "chrome") == ps.buf);
skip |= (strstr(ps.buf, "surfaceflinger") == ps.buf);
skip |= ((char *)ps.buf)[0] == 'X';
}
break;
case RD_GPUADDR:
if (need_submit) {
need_submit = false;
device_submit_cmdstreams(dev);
}
parse_addr(ps.buf, ps.sz, &gpuaddr.len, &gpuaddr.gpuaddr);
break;
case RD_BUFFER_CONTENTS:
/* TODO: skip buffer uploading and even reading if this buffer
* is used for submit outside of [first_submit, last_submit]
* range. A set of buffers is shared between several cmdstreams,
* so we'd have to find starting from which RD_CMD to upload
* the buffers.
*/
upload_buffer(dev, gpuaddr.gpuaddr, gpuaddr.len, ps.buf);
break;
case RD_CMDSTREAM_ADDR: {
unsigned int sizedwords;
uint64_t gpuaddr;
parse_addr(ps.buf, ps.sz, &sizedwords, &gpuaddr);
bool add_submit = !skip && (submit >= first_submit) && (submit <= last_submit);
printf("%scmdstream %d: %d dwords\n", add_submit ? "" : "skipped ",
submit, sizedwords);
if (add_submit) {
struct cmdstream *cs = u_vector_add(&dev->cmdstreams);
if (submit == submit_to_override) {
if (override_cmdstream(dev, cs, cmdstreamgen) < 0)
break;
} else {
cs->iova = gpuaddr;
cs->size = sizedwords * sizeof(uint32_t);
}
}
need_submit = true;
submit++;
break;
}
case RD_GPU_ID: {
uint32_t gpu_id = parse_gpu_id(ps.buf);
if (gpu_id)
printf("gpuid: %d\n", gpu_id);
break;
}
case RD_CHIP_ID: {
uint64_t chip_id = parse_chip_id(ps.buf);
printf("chip_id: 0x%" PRIx64 "\n", chip_id);
break;
}
default:
break;
}
}
if (need_submit)
device_submit_cmdstreams(dev);
close(dev->fd);
io_close(io);
fflush(stdout);
if (ps.ret < 0) {
printf("corrupt file\n");
}
return 0;
}