freedreno/crashdec: Split out mempool decoding

Before we start adding GMU HFI decoding, let's split the other big section-specific decoding (mempool) out into its own file. Signed-off-by: Rob Clark <robdclark@chromium.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13937>
2021-11-23 09:40:15 -08:00 · 2021-11-23 09:40:15 -08:00 · 2133d34b11
parent b234c538e8
commit 2133d34b11
4 changed files with 402 additions and 328 deletions
--- a/src/freedreno/decode/crashdec-mempool.c
+++ b/src/freedreno/decode/crashdec-mempool.c
@ -0,0 +1,313 @@
+/*
+ * Copyright © 2020 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "crashdec.h"
+
+
+/*
+ * Print a single register write that was found queued in the mem pool.
+ *
+ * reg:     register offset taken from the chunk
+ * data:    32-bit payload of the write
+ * context: context bit stored with the write (only printed for non-pipe
+ *          writes)
+ * pipe:    when true, reg is looked up in the rnn_pipe database instead of
+ *          going through regname()/dump_register_val()
+ */
+static void
+dump_mem_pool_reg_write(unsigned reg, uint32_t data, unsigned context,
+                        bool pipe)
+{
+   if (!pipe) {
+      printf("\t\twrite %s (%05x) context %d\n", regname(reg, 1), reg, context);
+      dump_register_val(reg, data, 2);
+      return;
+   }
+
+   /* NOTE(review): pipe_info and pipe_info->typeinfo are dereferenced
+    * without a NULL check - confirm rnn_reginfo() always fills them in.
+    */
+   struct rnndecaddrinfo *pipe_info = rnn_reginfo(rnn_pipe, reg);
+   printf("\t\twrite %s (%02x) pipe\n", pipe_info->name, reg);
+
+   /* Registers typed "void" ignore their payload, so only decode the value
+    * for everything else.
+    */
+   if (strcmp(pipe_info->typeinfo->name, "void") != 0) {
+      printf("\t\t\t");
+      dump_register(rnn_pipe, reg, data);
+   }
+}
+
+/*
+ * Decode one mem pool chunk (4 dwords) and print the register writes it
+ * holds.
+ *
+ * A chunk has room for two writes. Each one carries an enable bit, a 32-bit
+ * payload, an 18-bit register offset, a pipe flag and a context bit; only
+ * the writes whose enable bit is set are printed.
+ */
+static void
+dump_mem_pool_chunk(const uint32_t *chunk)
+{
+   struct __attribute__((packed)) {
+      bool reg0_enabled : 1;
+      bool reg1_enabled : 1;
+      uint32_t data0 : 32;
+      uint32_t data1 : 32;
+      uint32_t reg0 : 18;
+      uint32_t reg1 : 18;
+      bool reg0_pipe : 1;
+      bool reg1_pipe : 1;
+      uint32_t reg0_context : 1;
+      uint32_t reg1_context : 1;
+      uint32_t padding : 22;
+   } raw;
+
+   /* Overlay the bit-field view on a copy of the chunk's 4 dwords. */
+   memcpy(&raw, chunk, 4 * sizeof(uint32_t));
+
+   /* Unpack both slots so they can be handled by the same code below. */
+   const struct {
+      bool enabled;
+      bool pipe;
+      unsigned reg;
+      unsigned context;
+      uint32_t data;
+   } writes[2] = {
+      {
+         .enabled = raw.reg0_enabled,
+         .pipe = raw.reg0_pipe,
+         .reg = raw.reg0,
+         .context = raw.reg0_context,
+         .data = raw.data0,
+      },
+      {
+         .enabled = raw.reg1_enabled,
+         .pipe = raw.reg1_pipe,
+         .reg = raw.reg1,
+         .context = raw.reg1_context,
+         .data = raw.data1,
+      },
+   };
+
+   for (unsigned slot = 0; slot < 2; slot++) {
+      if (!writes[slot].enabled)
+         continue;
+
+      dump_mem_pool_reg_write(writes[slot].reg, writes[slot].data,
+                              writes[slot].context, writes[slot].pipe);
+   }
+}
+
+/*
+ * Decode and print the CP mem pool section of a crash dump.
+ *
+ * mempool points at the dumped pool contents, addressed in dwords. The
+ * function prints the raw "data2" area, the per-queue bookkeeping (when
+ * verbose is set) and then, for each of the 6 cluster queues, walks the
+ * chunks between the read and the write pointer and prints the register
+ * writes stored in them. The number of chunks walked is compared with the
+ * queue size stored in the dump and a message is printed on mismatch.
+ *
+ * The process exits with status 1 if a block index outside of the pool is
+ * encountered. The size of the buffer behind mempool is not known here, so
+ * the caller has to provide a dump that covers every offset used below.
+ */
+void
+dump_cp_mem_pool(uint32_t *mempool)
+{
+   /* The mem pool is a shared pool of memory used for storing in-flight
+    * register writes. There are 6 different queues, one for each
+    * cluster. Writing to $data (or for some special registers, $addr)
+    * pushes data onto the appropriate queue, and each queue is pulled
+    * from by the appropriate cluster. The queues are thus written to
+    * in-order, but may be read out-of-order.
+    *
+    * The queues are conceptually divided into 128-bit "chunks", and the
+    * read and write pointers are in units of chunks.  These chunks are
+    * organized internally into 8-chunk "blocks", and memory is allocated
+    * dynamically in terms of blocks. Each queue is represented as a
+    * singly-linked list of blocks, as well as 3-bit start/end chunk
+    * pointers that point within the first/last block.  The next pointers
+    * are located in a separate array, rather than inline.
+    */
+
+   /* TODO: The firmware CP_MEM_POOL save/restore routines do something
+    * like:
+    *
+    * cread $02, [ $00 + 0 ]
+    * and $02, $02, 0x118
+    * ...
+    * brne $02, 0, #label
+    * mov $03, 0x2000
+    * mov $03, 0x1000
+    * label:
+    * ...
+    *
+    * I think that control register 0 is the GPU version, and some
+    * versions have a smaller mem pool. It seems some models have a mem
+    * pool that's half the size, and a bunch of offsets are shifted
+    * accordingly. Unfortunately the kernel driver's dumping code doesn't
+    * seem to take this into account, even the downstream android driver,
+    * and we don't know which versions 0x8, 0x10, or 0x100 correspond
+    * to. Or maybe we can use CP_DBG_MEM_POOL_SIZE to figure this out?
+    */
+   bool small_mem_pool = false;
+
+   /* The array of next pointers for each block. */
+   const uint32_t *next_pointers =
+      small_mem_pool ? &mempool[0x800] : &mempool[0x1000];
+
+   /* Maximum number of blocks in the pool, also the size of the pointers
+    * array.
+    */
+   const uint32_t num_blocks = small_mem_pool ? 0x30 : 0x80;
+
+   /* Number of chunks in one block. */
+   const uint32_t chunks_per_block = 8;
+
+   /* Number of distinct chunk positions in the pool. A queue walk that
+    * visits more chunks than this must have revisited a position.
+    */
+   const uint32_t max_chunks = num_blocks * chunks_per_block;
+
+   /* Number of queues */
+   const unsigned num_queues = 6;
+
+   /* Name of the cluster served by each queue, indexed by queue number. */
+   static const char *const cluster_names[6] = {"FE",   "SP_VS", "PC_VS",
+                                                "GRAS", "SP_PS", "PS"};
+
+   /* Unfortunately the per-queue state is a little more complicated than
+    * a simple pair of begin/end pointers. Instead of a single beginning
+    * block, there are *two*, with the property that either the two are
+    * equal or the second is the "next" of the first. Similarly there are
+    * two end blocks. Thus the queue either looks like this:
+    *
+    * A -> B -> ... -> C -> D
+    *
+    * Or like this, or some combination:
+    *
+    * A/B -> ... -> C/D
+    *
+    * However, there's only one beginning/end chunk offset. Now the
+    * question is, which of A or B is the actual start? I.e. is the chunk
+    * offset an offset inside A or B? It depends. I'll show a typical read
+    * cycle, starting here (read pointer marked with a *) with a chunk
+    * offset of 0:
+    *
+    *         A                    B
+    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
+    * |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_| -> |_|_|_|_|_|_|_|_|
+    *
+    * Once the pointer advances far enough, the hardware decides to free
+    * A, after which the read-side state looks like:
+    *
+    *      (free)                A/B
+    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
+    * |_|_|_|_|_|_|_|_|    |_|_|_|*|_|_|_|_| -> |_|_|_|_|_|_|_|_|
+    *
+    * Then after advancing the pointer a bit more, the hardware fetches
+    * the "next" pointer for A and stores it in B:
+    *
+    *      (free)                 A                     B
+    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
+    * |_|_|_|_|_|_|_|_|    |_|_|_|_|_|_|_|*| -> |_|_|_|_|_|_|_|_|
+    *
+    * Then the read pointer advances into B, at which point we've come
+    * back to the first state having advanced a whole block:
+    *
+    *      (free)                 A                     B
+    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
+    * |_|_|_|_|_|_|_|_|    |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_|
+    *
+    *
+    * There is a similar cycle for the write pointer. Now, the question
+    * is, how do we know which state we're in? We need to know this to
+    * know whether the pointer (*) is in A or B if they're different. It
+    * seems like there should be some bit somewhere describing this, but
+    * after lots of experimentation I've come up empty-handed. For now we
+    * assume that if the pointer is in the first half, then we're in
+    * either the first or second state and use B, and otherwise we're in
+    * the second or third state and use A. So far I haven't seen anything
+    * that violates this assumption.
+    */
+
+   struct {
+      uint32_t unk0;
+      uint32_t padding0[7]; /* Mirrors of unk0 */
+
+      struct {
+         uint32_t chunk : 3;
+         uint32_t first_block : 32 - 3;
+      } writer[6];
+      uint32_t padding1[2]; /* Mirrors of writer[4], writer[5] */
+
+      uint32_t unk1;
+      uint32_t padding2[7]; /* Mirrors of unk1 */
+
+      uint32_t writer_second_block[6];
+      uint32_t padding3[2];
+
+      uint32_t unk2[6];
+      uint32_t padding4[2];
+
+      struct {
+         uint32_t chunk : 3;
+         uint32_t first_block : 32 - 3;
+      } reader[6];
+      uint32_t padding5[2]; /* Mirrors of reader[4], reader[5] */
+
+      uint32_t unk3;
+      uint32_t padding6[7]; /* Mirrors of unk3 */
+
+      uint32_t reader_second_block[6];
+      uint32_t padding7[2];
+
+      uint32_t block_count[6];
+      uint32_t padding8[2];
+
+      uint32_t unk4;
+      uint32_t padding9[7]; /* Mirrors of unk4 */
+   } data1;
+
+   const uint32_t *data1_ptr =
+      small_mem_pool ? &mempool[0xc00] : &mempool[0x1800];
+   memcpy(&data1, data1_ptr, sizeof(data1));
+
+   /* Based on the kernel, the first dword is the mem pool size (in
+    * blocks?) and mirrors CP_MEM_POOL_DBG_SIZE.
+    */
+   const uint32_t *data2_ptr =
+      small_mem_pool ? &mempool[0x1000] : &mempool[0x2000];
+   const int data2_size = 0x60;
+
+   /* This seems to be the size of each queue in chunks. */
+   const uint32_t *queue_sizes = &data2_ptr[0x18];
+
+   printf("\tdata2:\n");
+   dump_hex_ascii(data2_ptr, 4 * data2_size, 1);
+
+   /* These seem to be some kind of counter of allocated/deallocated blocks */
+   if (verbose) {
+      printf("\tunk0: %x\n", data1.unk0);
+      printf("\tunk1: %x\n", data1.unk1);
+      printf("\tunk3: %x\n", data1.unk3);
+      printf("\tunk4: %x\n\n", data1.unk4);
+   }
+
+   for (unsigned queue = 0; queue < num_queues; queue++) {
+      printf("\tCLUSTER_%s:\n\n", cluster_names[queue]);
+
+      if (verbose) {
+         printf("\t\twriter_first_block: 0x%x\n",
+                data1.writer[queue].first_block);
+         printf("\t\twriter_second_block: 0x%x\n",
+                data1.writer_second_block[queue]);
+         printf("\t\twriter_chunk: %d\n", data1.writer[queue].chunk);
+         printf("\t\treader_first_block: 0x%x\n",
+                data1.reader[queue].first_block);
+         printf("\t\treader_second_block: 0x%x\n",
+                data1.reader_second_block[queue]);
+         printf("\t\treader_chunk: %d\n", data1.reader[queue].chunk);
+         printf("\t\tblock_count: %d\n", data1.block_count[queue]);
+         printf("\t\tunk2: 0x%x\n", data1.unk2[queue]);
+         printf("\t\tqueue_size: %d\n\n", queue_sizes[queue]);
+      }
+
+      /* Pick the block each pointer lives in using the "which half of the
+       * block is the chunk offset in" heuristic described above.
+       */
+      uint32_t cur_chunk = data1.reader[queue].chunk;
+      uint32_t cur_block = cur_chunk > 3 ? data1.reader[queue].first_block
+                                         : data1.reader_second_block[queue];
+      uint32_t last_chunk = data1.writer[queue].chunk;
+      uint32_t last_block = last_chunk > 3 ? data1.writer[queue].first_block
+                                           : data1.writer_second_block[queue];
+
+      if (verbose)
+         printf("\tblock %x\n", cur_block);
+      if (cur_block >= num_blocks) {
+         fprintf(stderr, "block %x too large\n", cur_block);
+         exit(1);
+      }
+      unsigned calculated_queue_size = 0;
+      while (cur_block != last_block || cur_chunk != last_chunk) {
+         /* Every position visited so far differs from the write pointer.
+          * Once max_chunks of them have been dumped, the walk can only be
+          * going in circles (looping next pointers or a bogus write
+          * pointer), so stop instead of spinning forever on a corrupted
+          * dump.
+          */
+         if (calculated_queue_size >= max_chunks) {
+            fprintf(stderr, "queue %s never reaches its write pointer\n",
+                    cluster_names[queue]);
+            break;
+         }
+
+         calculated_queue_size++;
+
+         /* Dword offset of the chunk: 0x20 dwords per block, 4 per chunk. */
+         const uint32_t chunk_offset = cur_block * 0x20 + cur_chunk * 4;
+         const uint32_t *chunk_ptr = &mempool[chunk_offset];
+
+         dump_mem_pool_chunk(chunk_ptr);
+
+         /* The address column is the byte offset of the chunk inside the
+          * mem pool dump, i.e. the location of the dwords printed next to
+          * it.
+          */
+         printf("\t%05x: %08x %08x %08x %08x\n", 4 * chunk_offset,
+                chunk_ptr[0], chunk_ptr[1], chunk_ptr[2], chunk_ptr[3]);
+
+         cur_chunk++;
+         if (cur_chunk == chunks_per_block) {
+            /* Ran off the end of this block, follow the linked list. */
+            cur_block = next_pointers[cur_block];
+            if (verbose)
+               printf("\tblock %x\n", cur_block);
+            if (cur_block >= num_blocks) {
+               fprintf(stderr, "block %x too large\n", cur_block);
+               exit(1);
+            }
+            cur_chunk = 0;
+         }
+      }
+      if (calculated_queue_size != queue_sizes[queue]) {
+         printf("\t\tCALCULATED SIZE %d DOES NOT MATCH!\n",
+                calculated_queue_size);
+      }
+      printf("\n");
+   }
+}
+
--- a/src/freedreno/decode/crashdec.c
+++ b/src/freedreno/decode/crashdec.c
@ -36,54 +36,20 @@
 * or times out after 5min)
 */

-#include <assert.h>
-#include <getopt.h>
-#include <inttypes.h>
-#include <stdarg.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>

-#include "freedreno_pm4.h"
-
-#include "ir3/instr-a3xx.h"
-#include "buffers.h"
-#include "cffdec.h"
-#include "disasm.h"
-#include "pager.h"
-#include "rnnutil.h"
-#include "util.h"
+#include "crashdec.h"

 static FILE *in;
-static bool verbose;
+bool verbose;

-static struct rnn *rnn_gmu;
-static struct rnn *rnn_control;
-static struct rnn *rnn_pipe;
+struct rnn *rnn_gmu;
+struct rnn *rnn_control;
+struct rnn *rnn_pipe;

-static struct cffdec_options options = {
+struct cffdec_options options = {
   .draw_filter = -1,
 };

-static inline bool
-is_a6xx(void)
-{
-   return (600 <= options.gpu_id) && (options.gpu_id < 700);
-}
-static inline bool
-is_a5xx(void)
-{
-   return (500 <= options.gpu_id) && (options.gpu_id < 600);
-}
-static inline bool
-is_64b(void)
-{
-   return options.gpu_id >= 500;
-}
-
 /*
 * Helpers to read register values:
 */
@ -417,7 +383,7 @@ decode_bos(void)
 * Decode registers section:
 */

-static void
+void
 dump_register(struct rnn *rnn, uint32_t offset, uint32_t value)
 {
   struct rnndecaddrinfo *info = rnn_reginfo(rnn, offset);
@ -563,292 +529,6 @@ dump_cp_ucode_dbg(uint32_t *dbg)
   }
 }

-static void
-dump_mem_pool_reg_write(unsigned reg, uint32_t data, unsigned context,
-                        bool pipe)
-{
-   if (pipe) {
-      struct rnndecaddrinfo *info = rnn_reginfo(rnn_pipe, reg);
-      printf("\t\twrite %s (%02x) pipe\n", info->name, reg);
-
-      if (!strcmp(info->typeinfo->name, "void")) {
-         /* registers that ignore their payload */
-      } else {
-         printf("\t\t\t");
-         dump_register(rnn_pipe, reg, data);
-      }
-   } else {
-      printf("\t\twrite %s (%05x) context %d\n", regname(reg, 1), reg, context);
-      dump_register_val(reg, data, 2);
-   }
-}
-
-static void
-dump_mem_pool_chunk(const uint32_t *chunk)
-{
-   struct __attribute__((packed)) {
-      bool reg0_enabled : 1;
-      bool reg1_enabled : 1;
-      uint32_t data0 : 32;
-      uint32_t data1 : 32;
-      uint32_t reg0 : 18;
-      uint32_t reg1 : 18;
-      bool reg0_pipe : 1;
-      bool reg1_pipe : 1;
-      uint32_t reg0_context : 1;
-      uint32_t reg1_context : 1;
-      uint32_t padding : 22;
-   } fields;
-
-   memcpy(&fields, chunk, 4 * sizeof(uint32_t));
-
-   if (fields.reg0_enabled) {
-      dump_mem_pool_reg_write(fields.reg0, fields.data0, fields.reg0_context,
-                              fields.reg0_pipe);
-   }
-
-   if (fields.reg1_enabled) {
-      dump_mem_pool_reg_write(fields.reg1, fields.data1, fields.reg1_context,
-                              fields.reg1_pipe);
-   }
-}
-
-static void
-dump_cp_mem_pool(uint32_t *mempool)
-{
-   /* The mem pool is a shared pool of memory used for storing in-flight
-    * register writes. There are 6 different queues, one for each
-    * cluster. Writing to $data (or for some special registers, $addr)
-    * pushes data onto the appropriate queue, and each queue is pulled
-    * from by the appropriate cluster. The queues are thus written to
-    * in-order, but may be read out-of-order.
-    *
-    * The queues are conceptually divided into 128-bit "chunks", and the
-    * read and write pointers are in units of chunks.  These chunks are
-    * organized internally into 8-chunk "blocks", and memory is allocated
-    * dynamically in terms of blocks. Each queue is represented as a
-    * singly-linked list of blocks, as well as 3-bit start/end chunk
-    * pointers that point within the first/last block.  The next pointers
-    * are located in a separate array, rather than inline.
-    */
-
-   /* TODO: The firmware CP_MEM_POOL save/restore routines do something
-    * like:
-    *
-    * cread $02, [ $00 + 0 ]
-    * and $02, $02, 0x118
-    * ...
-    * brne $02, 0, #label
-    * mov $03, 0x2000
-    * mov $03, 0x1000
-    * label:
-    * ...
-    *
-    * I think that control register 0 is the GPU version, and some
-    * versions have a smaller mem pool. It seems some models have a mem
-    * pool that's half the size, and a bunch of offsets are shifted
-    * accordingly. Unfortunately the kernel driver's dumping code doesn't
-    * seem to take this into account, even the downstream android driver,
-    * and we don't know which versions 0x8, 0x10, or 0x100 correspond
-    * to. Or maybe we can use CP_DBG_MEM_POOL_SIZE to figure this out?
-    */
-   bool small_mem_pool = false;
-
-   /* The array of next pointers for each block. */
-   const uint32_t *next_pointers =
-      small_mem_pool ? &mempool[0x800] : &mempool[0x1000];
-
-   /* Maximum number of blocks in the pool, also the size of the pointers
-    * array.
-    */
-   const int num_blocks = small_mem_pool ? 0x30 : 0x80;
-
-   /* Number of queues */
-   const unsigned num_queues = 6;
-
-   /* Unfortunately the per-queue state is a little more complicated than
-    * a simple pair of begin/end pointers. Instead of a single beginning
-    * block, there are *two*, with the property that either the two are
-    * equal or the second is the "next" of the first. Similarly there are
-    * two end blocks. Thus the queue either looks like this:
-    *
-    * A -> B -> ... -> C -> D
-    *
-    * Or like this, or some combination:
-    *
-    * A/B -> ... -> C/D
-    *
-    * However, there's only one beginning/end chunk offset. Now the
-    * question is, which of A or B is the actual start? I.e. is the chunk
-    * offset an offset inside A or B? It depends. I'll show a typical read
-    * cycle, starting here (read pointer marked with a *) with a chunk
-    * offset of 0:
-    *
-    *	  A                    B
-    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
-    * |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_| -> |_|_|_|_|_|_|_|_|
-    *
-    * Once the pointer advances far enough, the hardware decides to free
-    * A, after which the read-side state looks like:
-    *
-    *	(free)                A/B
-    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
-    * |_|_|_|_|_|_|_|_|    |_|_|_|*|_|_|_|_| -> |_|_|_|_|_|_|_|_|
-    *
-    * Then after advancing the pointer a bit more, the hardware fetches
-    * the "next" pointer for A and stores it in B:
-    *
-    *	(free)                 A                     B
-    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
-    * |_|_|_|_|_|_|_|_|    |_|_|_|_|_|_|_|*| -> |_|_|_|_|_|_|_|_|
-    *
-    * Then the read pointer advances into B, at which point we've come
-    * back to the first state having advanced a whole block:
-    *
-    *	(free)                 A                     B
-    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
-    * |_|_|_|_|_|_|_|_|    |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_|
-    *
-    *
-    * There is a similar cycle for the write pointer. Now, the question
-    * is, how do we know which state we're in? We need to know this to
-    * know whether the pointer (*) is in A or B if they're different. It
-    * seems like there should be some bit somewhere describing this, but
-    * after lots of experimentation I've come up empty-handed. For now we
-    * assume that if the pointer is in the first half, then we're in
-    * either the first or second state and use B, and otherwise we're in
-    * the second or third state and use A. So far I haven't seen anything
-    * that violates this assumption.
-    */
-
-   struct {
-      uint32_t unk0;
-      uint32_t padding0[7]; /* Mirrors of unk0 */
-
-      struct {
-         uint32_t chunk : 3;
-         uint32_t first_block : 32 - 3;
-      } writer[6];
-      uint32_t padding1[2]; /* Mirrors of writer[4], writer[5] */
-
-      uint32_t unk1;
-      uint32_t padding2[7]; /* Mirrors of unk1 */
-
-      uint32_t writer_second_block[6];
-      uint32_t padding3[2];
-
-      uint32_t unk2[6];
-      uint32_t padding4[2];
-
-      struct {
-         uint32_t chunk : 3;
-         uint32_t first_block : 32 - 3;
-      } reader[6];
-      uint32_t padding5[2]; /* Mirrors of reader[4], reader[5] */
-
-      uint32_t unk3;
-      uint32_t padding6[7]; /* Mirrors of unk3 */
-
-      uint32_t reader_second_block[6];
-      uint32_t padding7[2];
-
-      uint32_t block_count[6];
-      uint32_t padding[2];
-
-      uint32_t unk4;
-      uint32_t padding9[7]; /* Mirrors of unk4 */
-   } data1;
-
-   const uint32_t *data1_ptr =
-      small_mem_pool ? &mempool[0xc00] : &mempool[0x1800];
-   memcpy(&data1, data1_ptr, sizeof(data1));
-
-   /* Based on the kernel, the first dword is the mem pool size (in
-    * blocks?) and mirrors CP_MEM_POOL_DBG_SIZE.
-    */
-   const uint32_t *data2_ptr =
-      small_mem_pool ? &mempool[0x1000] : &mempool[0x2000];
-   const int data2_size = 0x60;
-
-   /* This seems to be the size of each queue in chunks. */
-   const uint32_t *queue_sizes = &data2_ptr[0x18];
-
-   printf("\tdata2:\n");
-   dump_hex_ascii(data2_ptr, 4 * data2_size, 1);
-
-   /* These seem to be some kind of counter of allocated/deallocated blocks */
-   if (verbose) {
-      printf("\tunk0: %x\n", data1.unk0);
-      printf("\tunk1: %x\n", data1.unk1);
-      printf("\tunk3: %x\n", data1.unk3);
-      printf("\tunk4: %x\n\n", data1.unk4);
-   }
-
-   for (int queue = 0; queue < num_queues; queue++) {
-      const char *cluster_names[6] = {"FE",   "SP_VS", "PC_VS",
-                                      "GRAS", "SP_PS", "PS"};
-      printf("\tCLUSTER_%s:\n\n", cluster_names[queue]);
-
-      if (verbose) {
-         printf("\t\twriter_first_block: 0x%x\n",
-                data1.writer[queue].first_block);
-         printf("\t\twriter_second_block: 0x%x\n",
-                data1.writer_second_block[queue]);
-         printf("\t\twriter_chunk: %d\n", data1.writer[queue].chunk);
-         printf("\t\treader_first_block: 0x%x\n",
-                data1.reader[queue].first_block);
-         printf("\t\treader_second_block: 0x%x\n",
-                data1.reader_second_block[queue]);
-         printf("\t\treader_chunk: %d\n", data1.reader[queue].chunk);
-         printf("\t\tblock_count: %d\n", data1.block_count[queue]);
-         printf("\t\tunk2: 0x%x\n", data1.unk2[queue]);
-         printf("\t\tqueue_size: %d\n\n", queue_sizes[queue]);
-      }
-
-      uint32_t cur_chunk = data1.reader[queue].chunk;
-      uint32_t cur_block = cur_chunk > 3 ? data1.reader[queue].first_block
-                                         : data1.reader_second_block[queue];
-      uint32_t last_chunk = data1.writer[queue].chunk;
-      uint32_t last_block = last_chunk > 3 ? data1.writer[queue].first_block
-                                           : data1.writer_second_block[queue];
-
-      if (verbose)
-         printf("\tblock %x\n", cur_block);
-      if (cur_block >= num_blocks) {
-         fprintf(stderr, "block %x too large\n", cur_block);
-         exit(1);
-      }
-      unsigned calculated_queue_size = 0;
-      while (cur_block != last_block || cur_chunk != last_chunk) {
-         calculated_queue_size++;
-         uint32_t *chunk_ptr = &mempool[cur_block * 0x20 + cur_chunk * 4];
-
-         dump_mem_pool_chunk(chunk_ptr);
-
-         printf("\t%05x: %08x %08x %08x %08x\n",
-                4 * (cur_block * 0x20 + cur_chunk + 4), chunk_ptr[0],
-                chunk_ptr[1], chunk_ptr[2], chunk_ptr[3]);
-
-         cur_chunk++;
-         if (cur_chunk == 8) {
-            cur_block = next_pointers[cur_block];
-            if (verbose)
-               printf("\tblock %x\n", cur_block);
-            if (cur_block >= num_blocks) {
-               fprintf(stderr, "block %x too large\n", cur_block);
-               exit(1);
-            }
-            cur_chunk = 0;
-         }
-      }
-      if (calculated_queue_size != queue_sizes[queue]) {
-         printf("\t\tCALCULATED SIZE %d DOES NOT MATCH!\n",
-                calculated_queue_size);
-      }
-      printf("\n");
-   }
-}
-
 static void
 decode_indexed_registers(void)
 {
--- a/src/freedreno/decode/crashdec.h
+++ b/src/freedreno/decode/crashdec.h
@ -0,0 +1,77 @@
+/*
+ * Copyright © 2021 Google, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __CRASHDEC_H__
+#define __CRASHDEC_H__
+
+#include <assert.h>
+#include <getopt.h>
+#include <inttypes.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "freedreno_pm4.h"
+
+#include "ir3/instr-a3xx.h"
+#include "buffers.h"
+#include "cffdec.h"
+#include "disasm.h"
+#include "pager.h"
+#include "rnnutil.h"
+#include "util.h"
+
+extern struct rnn *rnn_gmu;
+extern struct rnn *rnn_control;
+extern struct rnn *rnn_pipe;
+
+extern bool verbose;
+
+extern struct cffdec_options options;
+
+/* True when options.gpu_id lies in the a6xx range, i.e. [600, 700). */
+static inline bool
+is_a6xx(void)
+{
+   return options.gpu_id >= 600 && options.gpu_id < 700;
+}
+
+/* True when options.gpu_id lies in the a5xx range, i.e. [500, 600). */
+static inline bool
+is_a5xx(void)
+{
+   return options.gpu_id >= 500 && options.gpu_id < 600;
+}
+
+/* True for every GPU whose options.gpu_id is 500 or higher (presumably the
+ * generations using 64-bit addresses, going by the name - confirm).
+ */
+static inline bool
+is_64b(void)
+{
+   return 500 <= options.gpu_id;
+}
+
+void dump_register(struct rnn *rnn, uint32_t offset, uint32_t value);
+void dump_cp_mem_pool(uint32_t *mempool);
+
+#endif /* __CRASHDEC_H__ */
--- a/src/freedreno/decode/meson.build
+++ b/src/freedreno/decode/meson.build
@ -132,7 +132,11 @@ endif

 crashdec = executable(
  'crashdec',
-  'crashdec.c',
+  [
+    'crashdec.c',
+    'crashdec.h',
+    'crashdec-mempool.c',
+  ],
  include_directories: [
    inc_freedreno,
    inc_freedreno_rnn,