freedreno/replay: Add WSL backend for Windows

d3dkmthk.h contains only non driver specific structs, private
data structs are not defined anywhere public but contain
important information for memory allocation and submissions.
Luckily only small parts of these structs are relevant for what
we want to do and most of them are not changed between calls.

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26101>
This commit is contained in:
Danylo Piliaiev 2023-11-02 13:53:03 +01:00 committed by Marge Bot
parent 16de8e99cf
commit 480ec5b1d4
4 changed files with 2197 additions and 16 deletions

1794
include/drm-uapi/d3dkmthk.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -233,7 +233,7 @@ option(
'freedreno-kmds',
type : 'array',
value : ['msm'],
choices : ['msm', 'kgsl', 'virtio'],
choices : ['msm', 'kgsl', 'virtio', 'wsl'],
description : 'List of kernel-mode drivers to enable for freedreno ' +
'gallium and vulkan driver',
)

View File

@ -71,7 +71,7 @@ if dep_libarchive.found()
)
foreach kmd : freedreno_kmds
if not (kmd in ['kgsl', 'msm'])
if not (kmd in ['kgsl', 'msm', 'wsl'])
warning('replay not supported for ' + kmd + ' KMD')
continue
endif
@ -88,7 +88,10 @@ if dep_libarchive.found()
if kmd == 'kgsl'
replay_flags += '-DFD_REPLAY_KGSL'
elif kmd == 'msm'
replay_flags += '-DFD_REPLAY_MSM'
replay_deps += dep_libdrm
elif kmd == 'wsl'
replay_flags += '-DFD_REPLAY_WSL'
endif
replay = executable(

View File

@ -18,11 +18,14 @@
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#if !FD_REPLAY_KGSL
#if FD_REPLAY_KGSL
#include "../vulkan/msm_kgsl.h"
#elif FD_REPLAY_MSM
#include <xf86drm.h>
#include "drm-uapi/msm_drm.h"
#else
#include "../vulkan/msm_kgsl.h"
#elif FD_REPLAY_WSL
#define __KERNEL__
#include "drm-uapi/d3dkmthk.h"
#endif
#include <sys/ioctl.h>
@ -69,7 +72,7 @@ static const uint64_t FAKE_ADDRESS_SPACE_SIZE = 1024 * 1024 * 1024;
static int handle_file(const char *filename, uint32_t first_submit,
uint32_t last_submit, uint32_t submit_to_override,
const char *cmdstreamgen);
uint64_t base_addr, const char *cmdstreamgen);
static void
print_usage(const char *name)
@ -83,6 +86,7 @@ print_usage(const char *name)
"\t-g --generator=path - executable which generate cmdstream for override\n"
"\t-f --first=submit - first submit № to replay\n"
"\t-l --last=submit - last submit № to replay\n"
"\t-a --address=address - base iova address on WSL\n"
"\t-h, --help - show this message\n"
, name);
/* clang-format on */
@ -96,6 +100,7 @@ static const struct option opts[] = {
{ "generator", required_argument, 0, 'g' },
{ "first", required_argument, 0, 'f' },
{ "last", required_argument, 0, 'l' },
{ "address", required_argument, 0, 'a' },
{ "help", no_argument, 0, 'h' },
};
/* clang-format on */
@ -109,9 +114,10 @@ main(int argc, char **argv)
uint32_t submit_to_override = -1;
uint32_t first_submit = 0;
uint32_t last_submit = -1;
uint64_t base_addr = 0;
const char *cmdstreamgen = NULL;
while ((c = getopt_long(argc, argv, "e:o:g:f:l:h", opts, NULL)) != -1) {
while ((c = getopt_long(argc, argv, "e:o:g:f:l:a:h", opts, NULL)) != -1) {
switch (c) {
case 0:
/* option that set a flag, nothing to do */
@ -131,6 +137,9 @@ main(int argc, char **argv)
case 'l':
last_submit = strtoul(optarg, NULL, 0);
break;
case 'a':
base_addr = strtoull(optarg, NULL, 0);
break;
case 'h':
default:
print_usage(argv[0]);
@ -139,7 +148,7 @@ main(int argc, char **argv)
while (optind < argc) {
ret = handle_file(argv[optind], first_submit, last_submit,
submit_to_override, cmdstreamgen);
submit_to_override, base_addr, cmdstreamgen);
if (ret) {
fprintf(stderr, "error reading: %s\n", argv[optind]);
fprintf(stderr, "continuing..\n");
@ -193,11 +202,24 @@ struct device {
void *va_map;
uint64_t va_iova;
struct u_vector wrbufs;
#ifdef FD_REPLAY_KGSL
uint32_t context_id;
#endif
struct u_vector wrbufs;
#ifdef FD_REPLAY_WSL
struct d3dkmthandle device;
struct d3dkmthandle context;
/* We don't know at the moment a good way to wait for submission to complete
* on WSL, so we could use our own fences.
*/
uint64_t fence_iova;
uint64_t fence_ib_iova;
volatile uint32_t *fence;
uint32_t *fence_ib;
#endif
};
void buffer_mem_free(struct device *dev, struct buffer *buf);
@ -365,7 +387,7 @@ device_dump_wrbuf(struct device *dev)
}
}
#if !FD_REPLAY_KGSL
#if FD_REPLAY_MSM
static inline void
get_abs_timeout(struct drm_msm_timespec *tv, uint64_t ns)
{
@ -376,7 +398,7 @@ get_abs_timeout(struct drm_msm_timespec *tv, uint64_t ns)
}
static struct device *
device_create()
device_create(uint64_t base_addr)
{
struct device *dev = calloc(sizeof(struct device), 1);
@ -656,7 +678,7 @@ buffer_mem_free(struct device *dev, struct buffer *buf)
util_vma_heap_free(&dev->vma, buf->iova, buf->size);
}
#else
#elif FD_REPLAY_KGSL
static int
safe_ioctl(int fd, unsigned long request, void *arg)
{
@ -670,7 +692,7 @@ safe_ioctl(int fd, unsigned long request, void *arg)
}
static struct device *
device_create()
device_create(uint64_t base_addr)
{
struct device *dev = calloc(sizeof(struct device), 1);
@ -789,6 +811,368 @@ buffer_mem_free(struct device *dev, struct buffer *buf)
{
util_vma_heap_free(&dev->vma, buf->iova, buf->size);
}
#else
static int
safe_ioctl(int fd, unsigned long request, void *arg)
{
int ret;
do {
ret = ioctl(fd, request, arg);
} while (ret == -1 && (errno == EINTR || errno == EAGAIN));
return ret;
}
struct alloc_priv_info {
__u32 struct_size;
char _pad0[4];
__u32 unk0; // 1
char _pad1[4];
__u64 size;
__u32 alignment;
char _pad2[20];
__u64 allocated_size;
__u32 unk1; // 1
char _pad4[8]; /* offset: 60*/
__u32 unk2; // 61
char _pad5[76];
__u32 unk3; /* offset: 148 */ // 1
char _pad6[8];
__u32 unk4; /* offset: 160 */ // 1
char _pad7[44];
__u32 unk5; /* offset: 208 */ // 3
char _pad8[16];
__u32 size_2; /* offset: 228 */
__u32 unk6; // 1
__u32 size_3;
__u32 size_4;
__u32 unk7; /* offset: 244 */ // 1
char _pad9[56];
};
static_assert(sizeof(struct alloc_priv_info) == 304);
static_assert(offsetof(struct alloc_priv_info, unk1) == 56);
static_assert(offsetof(struct alloc_priv_info, unk3) == 148);
static_assert(offsetof(struct alloc_priv_info, unk5) == 208);
struct submit_priv_ib_info {
char _pad5[4];
__u32 size_dwords;
__u64 iova;
char _pad6[8];
} __attribute__((packed));
struct submit_priv_data {
__u32 magic0;
char _pad0[4];
__u32 struct_size;
char _pad1[4];
/* It seems that priv data can have several sub-datas
* cmdbuf is one of them, after it there is another 8 byte struct
* without anything useful in it. That second data doesn't seem
* important for replaying.
*/
__u32 datas_count;
char _pad2[32];
struct {
__u32 magic1;
__u32 data_size;
struct {
__u32 unk1;
__u32 cmdbuf_size;
char _pad3[32];
__u32 ib_count;
char _pad4[36];
struct submit_priv_ib_info ibs[];
} cmdbuf;
} data0;
// unsigned char magic2[8];
} __attribute__((packed));
static_assert(offsetof(struct submit_priv_data, data0) == 0x34);
static_assert(offsetof(struct submit_priv_data, data0.cmdbuf.ibs) == 0x8c);
static struct device *
device_create(uint64_t base_addr)
{
struct device *dev = calloc(sizeof(struct device), 1);
static const char path[] = "/dev/dxg";
dev->fd = open(path, O_RDWR | O_CLOEXEC);
if (dev->fd < 0) {
errx(1, "Cannot open /dev/dxg fd");
}
struct d3dkmt_adapterinfo adapters[1];
struct d3dkmt_enumadapters3 enum_adapters = {
.adapter_count = 1,
.adapters = adapters,
};
int ret = safe_ioctl(dev->fd, LX_DXENUMADAPTERS3, &enum_adapters);
if (ret) {
errx(1, "LX_DXENUMADAPTERS3 failure");
}
if (enum_adapters.adapter_count == 0) {
errx(1, "No adapters found");
}
struct winluid adapter_luid = enum_adapters.adapters[0].adapter_luid;
struct d3dkmt_openadapterfromluid open_adapter = {
.adapter_luid = adapter_luid,
};
ret = safe_ioctl(dev->fd, LX_DXOPENADAPTERFROMLUID, &open_adapter);
if (ret) {
errx(1, "LX_DXOPENADAPTERFROMLUID failure");
}
struct d3dkmthandle adapter = open_adapter.adapter_handle;
struct d3dkmt_createdevice create_device = {
.adapter = adapter,
};
ret = safe_ioctl(dev->fd, LX_DXCREATEDEVICE, &create_device);
if (ret) {
errx(1, "LX_DXCREATEDEVICE failure");
}
struct d3dkmthandle device = create_device.device;
dev->device = device;
unsigned char create_context_priv_data[] = {
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00,
0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1c, 0x0c, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
};
struct d3dkmt_createcontextvirtual create_context = {
.device = device,
.node_ordinal = 0,
.engine_affinity = 1,
.priv_drv_data = create_context_priv_data,
.priv_drv_data_size = sizeof(create_context_priv_data),
.client_hint = 16,
};
ret = safe_ioctl(dev->fd, LX_DXCREATECONTEXTVIRTUAL, &create_context);
if (ret) {
errx(1, "LX_DXCREATECONTEXTVIRTUAL failure");
}
dev->context = create_context.context;
struct d3dkmt_createpagingqueue create_paging_queue = {
.device = device,
.priority = _D3DDDI_PAGINGQUEUE_PRIORITY_NORMAL,
.physical_adapter_index = 0,
};
ret = safe_ioctl(dev->fd, LX_DXCREATEPAGINGQUEUE, &create_paging_queue);
if (ret) {
errx(1, "LX_DXCREATEPAGINGQUEUE failure");
}
struct d3dkmthandle paging_queue = create_paging_queue.paging_queue;
uint32_t alloc_size = FAKE_ADDRESS_SPACE_SIZE;
struct alloc_priv_info priv_alloc_info = {
.struct_size = sizeof(struct alloc_priv_info),
.unk0 = 1,
.size = alloc_size,
.alignment = 4096,
.unk1 = 1,
.unk2 = 61,
.unk3 = 1,
.unk4 = 1,
.unk5 = 3,
.size_2 = alloc_size,
.unk6 = 1,
.size_3 = alloc_size,
.size_4 = alloc_size,
.unk7 = 1,
};
struct d3dddi_allocationinfo2 alloc_info = {
.priv_drv_data = &priv_alloc_info,
.priv_drv_data_size = sizeof(struct alloc_priv_info),
};
struct d3dkmt_createallocation create_allocation = {
.device = device,
.alloc_count = 1,
.allocation_info = &alloc_info,
};
ret = safe_ioctl(dev->fd, LX_DXCREATEALLOCATION, &create_allocation);
if (ret) {
errx(1, "LX_DXCREATEALLOCATION failure");
}
assert(priv_alloc_info.allocated_size == alloc_size);
struct d3dddi_mapgpuvirtualaddress map_virtual_address = {
.paging_queue = paging_queue,
.base_address = base_addr,
.maximum_address = 18446744073709551615ull,
.allocation = create_allocation.allocation_info[0].allocation,
.size_in_pages = MAX2(alloc_size / 4096, 1),
.protection = {
.write = 1,
.execute = 1,
},
};
ret = safe_ioctl(dev->fd, LX_DXMAPGPUVIRTUALADDRESS, &map_virtual_address);
if (ret != 259) {
errx(1, "LX_DXMAPGPUVIRTUALADDRESS failure");
}
__u32 priority = 0;
struct d3dddi_makeresident make_resident = {
.paging_queue = paging_queue,
.alloc_count = 1,
.allocation_list = &create_allocation.allocation_info[0].allocation,
.priority_list = &priority,
};
ret = safe_ioctl(dev->fd, LX_DXMAKERESIDENT, &make_resident);
if (ret != 259) {
errx(1, "LX_DXMAKERESIDENT failure");
}
struct d3dkmt_lock2 lock = {
.device = device,
.allocation = create_allocation.allocation_info[0].allocation,
};
ret = safe_ioctl(dev->fd, LX_DXLOCK2, &lock);
if (ret) {
errx(1, "LX_DXLOCK2 failure");
}
dev->va_iova = map_virtual_address.virtual_address;
dev->va_map = lock.data;
rb_tree_init(&dev->buffers);
util_vma_heap_init(&dev->vma, dev->va_iova, ROUND_DOWN_TO(alloc_size, 4096));
u_vector_init(&dev->cmdstreams, 8, sizeof(struct cmdstream));
printf("Allocated iova at 0x%" PRIx64 "\n", dev->va_iova);
uint64_t hole_size = 4096;
dev->vma.alloc_high = true;
dev->fence_iova = util_vma_heap_alloc(&dev->vma, hole_size, 4096);
dev->fence_ib_iova = dev->fence_iova + 8;
dev->fence = (uint32_t *) ((uint8_t*)dev->va_map + (dev->fence_iova - dev->va_iova));
dev->fence_ib = (uint32_t *) ((uint8_t*)dev->va_map + (dev->fence_ib_iova - dev->va_iova));
dev->vma.alloc_high = false;
return dev;
}
static void
device_submit_cmdstreams(struct device *dev)
{
if (!u_vector_length(&dev->cmdstreams)) {
device_free_buffers(dev);
return;
}
uint32_t cmdstream_count = u_vector_length(&dev->cmdstreams) + 1;
uint32_t priv_data_size =
sizeof(struct submit_priv_data) +
cmdstream_count * sizeof(struct submit_priv_ib_info);
struct submit_priv_data *priv_data = calloc(1, priv_data_size);
priv_data->magic0 = 0xccaabbee;
priv_data->struct_size = priv_data_size;
priv_data->datas_count = 1;
priv_data->data0.magic1 = 0xfadcab02;
priv_data->data0.data_size =
sizeof(priv_data->data0) +
cmdstream_count * sizeof(struct submit_priv_ib_info);
priv_data->data0.cmdbuf.unk1 = 0xcccc0001;
priv_data->data0.cmdbuf.cmdbuf_size = sizeof(priv_data->data0.cmdbuf) +
cmdstream_count * sizeof(struct submit_priv_ib_info);
priv_data->data0.cmdbuf.ib_count = cmdstream_count;
struct cmdstream *cmd;
uint32_t idx = 0;
u_vector_foreach(cmd, &dev->cmdstreams) {
priv_data->data0.cmdbuf.ibs[idx].size_dwords = cmd->size / 4;
priv_data->data0.cmdbuf.ibs[idx].iova = cmd->iova;
idx++;
}
priv_data->data0.cmdbuf.ibs[idx].size_dwords = 4;
priv_data->data0.cmdbuf.ibs[idx].iova = dev->fence_ib_iova;
*dev->fence = 0x00000000;
dev->fence_ib[0] = pm4_pkt7_hdr(0x3d, 3); // CP_MEM_WRITE
dev->fence_ib[1] = dev->fence_iova;
dev->fence_ib[2] = dev->fence_iova >> 32;
dev->fence_ib[3] = 0xababfcfc;
// Fill second (empty) data block
// uint32_t *magic_end = (uint32_t *)(((char *) priv_data) + priv_data_size - 8);
// magic_end[0] = 0xfadcab00;
// magic_end[1] = 0x00000008;
struct d3dkmt_submitcommand submission = {
.command_buffer = priv_data->data0.cmdbuf.ibs[0].iova,
.command_length = priv_data->data0.cmdbuf.ibs[0].size_dwords * sizeof(uint32_t),
.broadcast_context_count = 1,
.broadcast_context[0] = dev->context,
.priv_drv_data_size = priv_data_size,
.priv_drv_data = priv_data,
};
int ret = safe_ioctl(dev->fd, LX_DXSUBMITCOMMAND, &submission);
if (ret) {
errx(1, "LX_DXSUBMITCOMMAND failure");
}
free(priv_data);
u_vector_finish(&dev->cmdstreams);
u_vector_init(&dev->cmdstreams, 8, sizeof(struct cmdstream));
// TODO: better way to wait
for (unsigned i = 0; i < 1000; i++) {
usleep(1000);
if (*dev->fence != 0)
break;
}
if (*dev->fence == 0) {
errx(1, "Waiting for submission failed! GPU faulted or kernel did not execute this submission.");
}
device_print_shader_log(dev);
device_print_cp_log(dev);
device_dump_wrbuf(dev);
device_free_buffers(dev);
}
static void
buffer_mem_alloc(struct device *dev, struct buffer *buf)
{
util_vma_heap_alloc_addr(&dev->vma, buf->iova, buf->size);
buf->map = ((uint8_t*)dev->va_map) + (buf->iova - dev->va_iova);
}
void
buffer_mem_free(struct device *dev, struct buffer *buf)
{
util_vma_heap_free(&dev->vma, buf->iova, buf->size);
}
#endif
static void
@ -822,7 +1206,7 @@ override_cmdstream(struct device *dev, struct cmdstream *cs,
{
#if FD_REPLAY_KGSL
static const char *tmpfilename = "/sdcard/Download/cmdstream_override.rd";
#else
#elif FD_REPLAY_MSM || FD_REPLAY_WSL
static const char *tmpfilename = "/tmp/cmdstream_override.rd";
#endif
@ -918,7 +1302,7 @@ override_cmdstream(struct device *dev, struct cmdstream *cs,
static int
handle_file(const char *filename, uint32_t first_submit, uint32_t last_submit,
uint32_t submit_to_override, const char *cmdstreamgen)
uint32_t submit_to_override, uint64_t base_addr, const char *cmdstreamgen)
{
struct io *io;
int submit = 0;
@ -938,7 +1322,7 @@ handle_file(const char *filename, uint32_t first_submit, uint32_t last_submit,
return -1;
}
struct device *dev = device_create();
struct device *dev = device_create(base_addr);
struct {
unsigned int len;