diff --git a/src/freedreno/vulkan/meson.build b/src/freedreno/vulkan/meson.build
index 0b6fd62c38f..d9b5631777a 100644
--- a/src/freedreno/vulkan/meson.build
+++ b/src/freedreno/vulkan/meson.build
@@ -48,6 +48,7 @@ libtu_files = files(
   'tu_private.h',
   'tu_query.c',
   'tu_shader.c',
+  'tu_suballoc.c',
   'tu_util.c',
   'tu_util.h',
   'tu_perfetto.h',
diff --git a/src/freedreno/vulkan/tu_autotune.c b/src/freedreno/vulkan/tu_autotune.c
index bf1c07a8a7d..042d0f06d5e 100644
--- a/src/freedreno/vulkan/tu_autotune.c
+++ b/src/freedreno/vulkan/tu_autotune.c
@@ -711,4 +711,4 @@ void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
 
    tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
    tu_cs_emit(cs, ZPASS_DONE);
-}
\ No newline at end of file
+}
diff --git a/src/freedreno/vulkan/tu_cs.c b/src/freedreno/vulkan/tu_cs.c
index c95ffdcfaab..73b80758ca9 100644
--- a/src/freedreno/vulkan/tu_cs.c
+++ b/src/freedreno/vulkan/tu_cs.c
@@ -56,6 +56,25 @@ tu_cs_init_external(struct tu_cs *cs, struct tu_device *device,
    cs->end = end;
 }
 
+/**
+ * Initialize a sub-command stream as a wrapper to an externally sub-allocated
+ * buffer.
+ */
+void
+tu_cs_init_suballoc(struct tu_cs *cs, struct tu_device *device,
+                    struct tu_suballoc_bo *suballoc_bo)
+{
+   uint32_t *start = tu_suballoc_bo_map(suballoc_bo);
+   uint32_t *end = start + (suballoc_bo->size >> 2);
+
+   memset(cs, 0, sizeof(*cs));
+   cs->device = device;
+   cs->mode = TU_CS_MODE_SUB_STREAM;
+   cs->start = cs->reserved_end = cs->cur = start;
+   cs->end = end;
+   cs->refcount_bo = tu_bo_get_ref(suballoc_bo->bo);
+}
+
 /**
  * Finish and release all resources owned by a command stream.
  */
@@ -66,10 +85,24 @@ tu_cs_finish(struct tu_cs *cs)
       tu_bo_finish(cs->device, cs->bos[i]);
    }
 
+   if (cs->refcount_bo)
+      tu_bo_finish(cs->device, cs->refcount_bo);
+
    free(cs->entries);
    free(cs->bos);
 }
 
+static struct tu_bo *
+tu_cs_current_bo(const struct tu_cs *cs)
+{
+   if (cs->refcount_bo) {
+      return cs->refcount_bo;
+   } else {
+      assert(cs->bo_count);
+      return cs->bos[cs->bo_count - 1];
+   }
+}
+
 /**
  * Get the offset of the command packets emitted since the last call to
  * tu_cs_add_entry.
@@ -77,8 +110,7 @@ tu_cs_finish(struct tu_cs *cs)
 static uint32_t
 tu_cs_get_offset(const struct tu_cs *cs)
 {
-   assert(cs->bo_count);
-   return cs->start - (uint32_t *) cs->bos[cs->bo_count - 1]->map;
+   return cs->start - (uint32_t *) tu_cs_current_bo(cs)->map;
 }
 
 /*
@@ -90,6 +122,8 @@ tu_cs_add_bo(struct tu_cs *cs, uint32_t size)
 {
    /* no BO for TU_CS_MODE_EXTERNAL */
    assert(cs->mode != TU_CS_MODE_EXTERNAL);
+   /* No adding more BOs if suballocating from a suballoc_bo. */
+   assert(!cs->refcount_bo);
 
    /* no dangling command packet */
    assert(tu_cs_is_empty(cs));
@@ -176,7 +210,7 @@ tu_cs_add_entry(struct tu_cs *cs)
 
    /* add an entry for [cs->start, cs->cur] */
    cs->entries[cs->entry_count++] = (struct tu_cs_entry) {
-      .bo = cs->bos[cs->bo_count - 1],
+      .bo = tu_cs_current_bo(cs),
       .size = tu_cs_get_size(cs) * sizeof(uint32_t),
       .offset = tu_cs_get_offset(cs) * sizeof(uint32_t),
    };
@@ -281,7 +315,7 @@ tu_cs_alloc(struct tu_cs *cs,
    if (result != VK_SUCCESS)
       return result;
 
-   struct tu_bo *bo = cs->bos[cs->bo_count - 1];
+   struct tu_bo *bo = tu_cs_current_bo(cs);
    size_t offset = align(tu_cs_get_offset(cs), size);
 
    memory->map = bo->map + offset * sizeof(uint32_t);
@@ -303,7 +337,6 @@ struct tu_cs_entry
 tu_cs_end_sub_stream(struct tu_cs *cs, struct tu_cs *sub_cs)
 {
    assert(cs->mode == TU_CS_MODE_SUB_STREAM);
-   assert(cs->bo_count);
    assert(sub_cs->start == cs->cur && sub_cs->end == cs->reserved_end);
 
    tu_cs_sanity_check(sub_cs);
@@ -312,7 +345,7 @@ tu_cs_end_sub_stream(struct tu_cs *cs, struct tu_cs *sub_cs)
    cs->cur = sub_cs->cur;
 
    struct tu_cs_entry entry = {
-      .bo = cs->bos[cs->bo_count - 1],
+      .bo = tu_cs_current_bo(cs),
       .size = tu_cs_get_size(cs) * sizeof(uint32_t),
       .offset = tu_cs_get_offset(cs) * sizeof(uint32_t),
    };
@@ -397,7 +430,7 @@ void
 tu_cs_reset(struct tu_cs *cs)
 {
    if (cs->mode == TU_CS_MODE_EXTERNAL) {
-      assert(!cs->bo_count && !cs->entry_count);
+      assert(!cs->bo_count && !cs->refcount_bo && !cs->entry_count);
       cs->reserved_end = cs->cur = cs->start;
       return;
    }
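Note (illustrative, not part of the patch): the suballoc path replaces the tu_cs_init() + tu_cs_reserve_space() pair with a CS wrapping memory it does not own. A minimal sketch of how such a CS is driven, assuming a tu_suballoc_bo already filled in by tu_suballoc_bo_alloc(); REG_EXAMPLE is a placeholder register offset and error handling is elided:

   struct tu_cs cs;
   tu_cs_init_suballoc(&cs, dev, &pipeline->bo);

   struct tu_cs sub_cs;
   tu_cs_begin_sub_stream(&cs, 4, &sub_cs);
   tu_cs_emit_pkt4(&sub_cs, REG_EXAMPLE, 1);
   tu_cs_emit(&sub_cs, 0);

   /* The entry's .bo resolves to cs.refcount_bo via tu_cs_current_bo();
    * tu_cs_add_bo() can never be reached on this stream, hence the new
    * assert above. */
   struct tu_cs_entry entry = tu_cs_end_sub_stream(&cs, &sub_cs);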
diff --git a/src/freedreno/vulkan/tu_cs.h b/src/freedreno/vulkan/tu_cs.h
index 494d9d8fcb5..6e830a907db 100644
--- a/src/freedreno/vulkan/tu_cs.h
+++ b/src/freedreno/vulkan/tu_cs.h
@@ -39,6 +39,10 @@ void
 tu_cs_init_external(struct tu_cs *cs, struct tu_device *device,
                     uint32_t *start, uint32_t *end);
 
+void
+tu_cs_init_suballoc(struct tu_cs *cs, struct tu_device *device,
+                    struct tu_suballoc_bo *bo);
+
 void
 tu_cs_finish(struct tu_cs *cs);
 
diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c
index b12f49e2465..e2d22892fe2 100644
--- a/src/freedreno/vulkan/tu_device.c
+++ b/src/freedreno/vulkan/tu_device.c
@@ -1728,6 +1728,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    device->vk.check_status = tu_device_check_status;
 
    mtx_init(&device->bo_mutex, mtx_plain);
+   mtx_init(&device->pipeline_mutex, mtx_plain);
    u_rwlock_init(&device->dma_bo_lock);
    pthread_mutex_init(&device->submit_mutex, NULL);
 
@@ -1786,6 +1787,9 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    if (custom_border_colors)
       global_size += TU_BORDER_COLOR_COUNT * sizeof(struct bcolor_entry);
 
+   tu_bo_suballocator_init(&device->pipeline_suballoc, device,
+                           128 * 1024, TU_BO_ALLOC_GPU_READ_ONLY | TU_BO_ALLOC_ALLOW_DUMP);
+
    result = tu_bo_init_new(device, &device->global_bo, global_size,
                            TU_BO_ALLOC_ALLOW_DUMP);
    if (result != VK_SUCCESS) {
@@ -1987,6 +1991,8 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
 
    tu_autotune_fini(&device->autotune, device);
 
+   tu_bo_suballocator_finish(&device->pipeline_suballoc);
+
    util_sparse_array_finish(&device->bo_map);
    u_rwlock_destroy(&device->dma_bo_lock);
 
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index d8cdd1969a5..c3bae9582b9 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -2257,6 +2257,7 @@ tu_pipeline_allocate_cs(struct tu_device *dev,
                         struct tu_pipeline *pipeline,
                         struct tu_pipeline_layout *layout,
                         struct tu_pipeline_builder *builder,
+                        struct tu_pipeline_cache *cache,
                         struct ir3_shader_variant *compute)
 {
    uint32_t size = 2048 + tu6_load_state_size(pipeline, layout, compute);
@@ -2292,13 +2293,27 @@ tu_pipeline_allocate_cs(struct tu_device *dev,
       size += tu_xs_get_additional_cs_size_dwords(compute);
    }
 
-   tu_cs_init(&pipeline->cs, dev, TU_CS_MODE_SUB_STREAM, size);
+   /* Allocate the space for the pipeline out of the device's RO suballocator.
+    *
+    * Sub-allocating BOs saves memory and also kernel overhead in refcounting of
+    * BOs at exec time.
+    *
+    * The pipeline cache would seem like a natural place to stick the
+    * suballocator, except that it is not guaranteed to outlive the pipelines
+    * created from it, so you can't store any long-lived state there, and you
+    * can't use its EXTERNALLY_SYNCHRONIZED flag to avoid atomics because
+    * pipeline destroy isn't synchronized by the cache.
+    */
+   pthread_mutex_lock(&dev->pipeline_mutex);
+   VkResult result = tu_suballoc_bo_alloc(&pipeline->bo, &dev->pipeline_suballoc,
+                                          size * 4, 128);
+   pthread_mutex_unlock(&dev->pipeline_mutex);
+   if (result != VK_SUCCESS)
+      return result;
 
-   /* Reserve the space now such that tu_cs_begin_sub_stream never fails. Note
-    * that LOAD_STATE can potentially take up a large amount of space so we
-    * calculate its size explicitly.
-    */
-   return tu_cs_reserve_space(&pipeline->cs, size);
+   tu_cs_init_suballoc(&pipeline->cs, dev, &pipeline->bo);
+
+   return VK_SUCCESS;
 }
 
 static void
@@ -3259,6 +3274,9 @@ tu_pipeline_finish(struct tu_pipeline *pipeline,
                    const VkAllocationCallbacks *alloc)
 {
    tu_cs_finish(&pipeline->cs);
+   pthread_mutex_lock(&dev->pipeline_mutex);
+   tu_suballoc_bo_free(&dev->pipeline_suballoc, &pipeline->bo);
+   pthread_mutex_unlock(&dev->pipeline_mutex);
 
    if (pipeline->pvtmem_bo)
       tu_bo_finish(dev, pipeline->pvtmem_bo);
@@ -3288,7 +3306,7 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
    }
 
    result = tu_pipeline_allocate_cs(builder->device, *pipeline,
-                                    builder->layout, builder, NULL);
+                                    builder->layout, builder, builder->cache, NULL);
    if (result != VK_SUCCESS) {
       vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
       return result;
@@ -3340,11 +3358,6 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
    tu_pipeline_builder_parse_rasterization_order(builder, *pipeline);
    tu6_emit_load_state(*pipeline, builder->layout, false);
 
-   /* we should have reserved enough space upfront such that the CS never
-    * grows
-    */
-   assert((*pipeline)->cs.bo_count == 1);
-
    return VK_SUCCESS;
 }
 
@@ -3492,12 +3505,13 @@ tu_CreateGraphicsPipelines(VkDevice device,
 
 static VkResult
 tu_compute_pipeline_create(VkDevice device,
-                           VkPipelineCache _cache,
+                           VkPipelineCache pipelineCache,
                            const VkComputePipelineCreateInfo *pCreateInfo,
                            const VkAllocationCallbacks *pAllocator,
                            VkPipeline *pPipeline)
 {
    TU_FROM_HANDLE(tu_device, dev, device);
+   TU_FROM_HANDLE(tu_pipeline_cache, cache, pipelineCache);
    TU_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout);
    const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
    VkResult result;
@@ -3545,7 +3559,7 @@ tu_compute_pipeline_create(VkDevice device,
    tu_pipeline_set_linkage(&pipeline->program.link[MESA_SHADER_COMPUTE],
                            shader, v);
 
-   result = tu_pipeline_allocate_cs(dev, pipeline, layout, NULL, v);
+   result = tu_pipeline_allocate_cs(dev, pipeline, layout, NULL, cache, v);
    if (result != VK_SUCCESS)
       goto fail;
 
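Note (illustrative, not part of the patch): the locking contract here is that every alloc/free on dev->pipeline_suballoc happens under dev->pipeline_mutex, while the per-BO reference taken by the resulting tu_suballoc_bo is atomic and needs no lock. A hypothetical second consumer of the suballocator would follow the same pattern ("my_state_size" is made up; the 128-byte alignment mirrors the pipeline path):

   struct tu_suballoc_bo suballoc_bo;

   pthread_mutex_lock(&dev->pipeline_mutex);
   VkResult result = tu_suballoc_bo_alloc(&suballoc_bo, &dev->pipeline_suballoc,
                                          my_state_size, 128);
   pthread_mutex_unlock(&dev->pipeline_mutex);
   if (result != VK_SUCCESS)
      return result;

   /* ... fill the memory through tu_suballoc_bo_map(&suballoc_bo) and point
    * the GPU at suballoc_bo.iova ... */

   pthread_mutex_lock(&dev->pipeline_mutex);
   tu_suballoc_bo_free(&dev->pipeline_suballoc, &suballoc_bo);
   pthread_mutex_unlock(&dev->pipeline_mutex);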
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index 39c1e695f7f..b992c381cfd 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -297,6 +297,13 @@ bool
 tu_physical_device_extension_supported(struct tu_physical_device *dev,
                                        const char *name);
 
+enum tu_bo_alloc_flags
+{
+   TU_BO_ALLOC_NO_FLAGS = 0,
+   TU_BO_ALLOC_ALLOW_DUMP = 1 << 0,
+   TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1,
+};
+
 struct cache_entry;
 
 struct tu_pipeline_cache
@@ -379,6 +386,47 @@ struct tu_bo
    bool implicit_sync : 1;
 };
 
+/* externally-synchronized BO suballocator. */
+struct tu_suballocator
+{
+   struct tu_device *dev;
+
+   uint32_t default_size;
+   enum tu_bo_alloc_flags flags;
+
+   /** Current BO we're suballocating out of. */
+   struct tu_bo *bo;
+   uint32_t next_offset;
+
+   /** Optional BO cached for recycling as the next suballoc->bo, instead
+    *  of having to allocate one. */
+   struct tu_bo *cached_bo;
+};
+
+struct tu_suballoc_bo
+{
+   struct tu_bo *bo;
+   uint64_t iova;
+   uint32_t size; /* bytes */
+};
+
+void
+tu_bo_suballocator_init(struct tu_suballocator *suballoc,
+                        struct tu_device *dev,
+                        uint32_t default_size, uint32_t flags);
+void
+tu_bo_suballocator_finish(struct tu_suballocator *suballoc);
+
+VkResult
+tu_suballoc_bo_alloc(struct tu_suballoc_bo *suballoc_bo, struct tu_suballocator *suballoc,
+                     uint32_t size, uint32_t align);
+
+void *
+tu_suballoc_bo_map(struct tu_suballoc_bo *bo);
+
+void
+tu_suballoc_bo_free(struct tu_suballocator *suballoc, struct tu_suballoc_bo *bo);
+
 enum global_shader {
    GLOBAL_SH_VS_BLIT,
    GLOBAL_SH_VS_CLEAR,
@@ -460,6 +508,12 @@ struct tu_device
 
    uint32_t implicit_sync_bo_count;
 
+   /* Device-global BO suballocator for reducing BO management overhead for
+    * (read-only) pipeline state. Synchronized by pipeline_mutex.
+    */
+   struct tu_suballocator pipeline_suballoc;
+   mtx_t pipeline_mutex;
+
    /* the blob seems to always use 8K factor and 128K param sizes, copy them */
 #define TU_TESS_FACTOR_SIZE (8 * 1024)
 #define TU_TESS_PARAM_SIZE (128 * 1024)
@@ -495,8 +549,9 @@ struct tu_device
     * freed. Otherwise, if we are not self-importing, we get two different BO
     * handles, and we want to free each one individually.
     *
-    * The BOs in this map all have a refcnt with the reference counter and
-    * only self-imported BOs will ever have a refcnt > 1.
+    * The refcount is also useful for being able to maintain BOs across
+    * VK object lifetimes, such as pipelines suballocating out of BOs
+    * allocated on the device.
     */
    struct util_sparse_array bo_map;
 
@@ -545,13 +600,6 @@ tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);
 VkResult
 tu_device_check_status(struct vk_device *vk_device);
 
-enum tu_bo_alloc_flags
-{
-   TU_BO_ALLOC_NO_FLAGS = 0,
-   TU_BO_ALLOC_ALLOW_DUMP = 1 << 0,
-   TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1,
-};
-
 VkResult
 tu_bo_init_new(struct tu_device *dev, struct tu_bo **bo, uint64_t size,
                enum tu_bo_alloc_flags flags);
@@ -573,6 +621,13 @@ tu_device_lookup_bo(struct tu_device *device, uint32_t handle)
    return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle);
 }
 
+static inline struct tu_bo *
+tu_bo_get_ref(struct tu_bo *bo)
+{
+   p_atomic_inc(&bo->refcnt);
+   return bo;
+}
+
 /* Get a scratch bo for use inside a command buffer. This will always return
  * the same bo given the same size or similar sizes, so only one scratch bo
  * can be used at the same time. It's meant for short-lived things where we
@@ -695,6 +750,9 @@ struct tu_cs
    uint32_t bo_count;
    uint32_t bo_capacity;
 
+   /* Optional BO that this CS is sub-allocated from for TU_CS_MODE_SUB_STREAM */
+   struct tu_bo *refcount_bo;
+
    /* state for cond_exec_start/cond_exec_end */
    uint32_t cond_flags;
    uint32_t *cond_dwords;
@@ -1312,6 +1370,7 @@ struct tu_pipeline
    struct vk_object_base base;
 
    struct tu_cs cs;
+   struct tu_suballoc_bo bo;
 
    /* Separate BO for private memory since it should GPU writable */
    struct tu_bo *pvtmem_bo;
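Note (illustrative, not part of the patch): the bo_map comment change above is the key lifetime point. Each tu_suballoc_bo takes its own atomic reference on the underlying tu_bo via tu_bo_get_ref(), which is what lets a pipeline outlive the suballocator's interest in that BO. A sketch of the reference flow (locking elided; see the tu_pipeline.c hunks):

   struct tu_suballoc_bo pipeline_bo;
   tu_suballoc_bo_alloc(&pipeline_bo, &dev->pipeline_suballoc, 4096, 128);
   /* refcnt == 2: one ref held by dev->pipeline_suballoc.bo, one by
    * pipeline_bo. */

   /* Later the suballocator fills up and drops its ref (refcnt -> 1); the
    * pipeline's CS contents stay valid because pipeline_bo still holds its
    * reference. */

   tu_suballoc_bo_free(&dev->pipeline_suballoc, &pipeline_bo);
   /* Last ref gone: the BO is either stashed as cached_bo for reuse or freed. */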
diff --git a/src/freedreno/vulkan/tu_suballoc.c b/src/freedreno/vulkan/tu_suballoc.c
new file mode 100644
index 00000000000..d61edf66ab4
--- /dev/null
+++ b/src/freedreno/vulkan/tu_suballoc.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright © 2022 Google LLC
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * Suballocator for space within BOs.
+ *
+ * BOs are allocated at PAGE_SIZE (typically 4k) granularity, so small
+ * allocations are a waste to have in their own BO. Moreover, on DRM we track a
+ * list of all BOs currently allocated and submit the whole list for validation
+ * (busy tracking and implicit sync) on every submit, and that validation is a
+ * non-trivial cost. So, being able to pack multiple allocations into a BO can
+ * be a significant performance win.
+ *
+ * The allocator tracks a current BO it is linearly allocating from, and up to
+ * one extra BO returned to the pool when all of its previous suballocations
+ * have been freed. This means that fragmentation can be an issue for
+ * default_size > PAGE_SIZE and small allocations. Also, excessive BO
+ * reallocation may happen for workloads where default_size < working set size.
+ */
+
+#include "tu_private.h"
+
+/* Initializes a BO sub-allocator using refcounts on BOs. */
+void
+tu_bo_suballocator_init(struct tu_suballocator *suballoc,
+                        struct tu_device *dev,
+                        uint32_t default_size, uint32_t flags)
+{
+   suballoc->dev = dev;
+   suballoc->default_size = default_size;
+   suballoc->flags = flags;
+   suballoc->bo = NULL;
+   suballoc->cached_bo = NULL;
+}
+
+void
+tu_bo_suballocator_finish(struct tu_suballocator *suballoc)
+{
+   if (suballoc->bo)
+      tu_bo_finish(suballoc->dev, suballoc->bo);
+   if (suballoc->cached_bo)
+      tu_bo_finish(suballoc->dev, suballoc->cached_bo);
+}
+
+VkResult
+tu_suballoc_bo_alloc(struct tu_suballoc_bo *suballoc_bo,
+                     struct tu_suballocator *suballoc,
+                     uint32_t size, uint32_t align)
+{
+   struct tu_bo *bo = suballoc->bo;
+   if (bo) {
+      uint32_t offset = ALIGN(suballoc->next_offset, align);
+      if (offset + size <= bo->size) {
+         suballoc_bo->bo = tu_bo_get_ref(bo);
+         suballoc_bo->iova = bo->iova + offset;
+         suballoc_bo->size = size;
+
+         suballoc->next_offset = offset + size;
+         return VK_SUCCESS;
+      } else {
+         tu_bo_finish(suballoc->dev, bo);
+         suballoc->bo = NULL;
+      }
+   }
+
+   uint32_t alloc_size = MAX2(size, suballoc->default_size);
+
+   /* Reuse a recycled suballoc BO if we have one and it's big enough,
+    * otherwise free it.
+    */
+   if (suballoc->cached_bo) {
+      if (alloc_size <= suballoc->cached_bo->size)
+         suballoc->bo = suballoc->cached_bo;
+      else
+         tu_bo_finish(suballoc->dev, suballoc->cached_bo);
+      suballoc->cached_bo = NULL;
+   }
+
+   /* Allocate the new BO if we didn't have one cached. */
+   if (!suballoc->bo) {
+      VkResult result = tu_bo_init_new(suballoc->dev, &suballoc->bo,
+                                       alloc_size,
+                                       suballoc->flags);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   VkResult result = tu_bo_map(suballoc->dev, suballoc->bo);
+   if (result != VK_SUCCESS) {
+      tu_bo_finish(suballoc->dev, suballoc->bo);
+      suballoc->bo = NULL;
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+   }
+
+   suballoc_bo->bo = tu_bo_get_ref(suballoc->bo);
+   suballoc_bo->iova = suballoc_bo->bo->iova;
+   suballoc_bo->size = size;
+
+   suballoc->next_offset = size;
+
+   return VK_SUCCESS;
+}
+
+void
+tu_suballoc_bo_free(struct tu_suballocator *suballoc, struct tu_suballoc_bo *bo)
+{
+   if (!bo->bo)
+      return;
+
+   /* If we held the last reference to this BO, just move it to the
+    * suballocator for the next time we need to allocate.
+    */
+   if (p_atomic_read(&bo->bo->refcnt) == 1 && !suballoc->cached_bo) {
+      suballoc->cached_bo = bo->bo;
+      return;
+   }
+
+   /* Otherwise, drop the refcount on it normally. */
+   tu_bo_finish(suballoc->dev, bo->bo);
+}
+
+void *
+tu_suballoc_bo_map(struct tu_suballoc_bo *bo)
+{
+   return bo->bo->map + (bo->iova - bo->bo->iova);
+}
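Note (illustrative, not part of the patch): a concrete walk-through of the recycling path at the end of the file, assuming a fresh suballocator with default_size = 64 KiB and a single, externally synchronized consumer ("suballoc", "a", and "b" are made up):

   struct tu_suballoc_bo a, b;

   tu_suballoc_bo_alloc(&a, &suballoc, 64 * 1024, 4); /* fills BO #1 exactly */
   tu_suballoc_bo_alloc(&b, &suballoc, 4096, 4);      /* no room: BO #1 is
                                                       * retired, BO #2 allocated */

   tu_suballoc_bo_free(&suballoc, &a); /* last ref to BO #1: it becomes cached_bo */
   tu_suballoc_bo_free(&suballoc, &b); /* BO #2 still held by suballoc->bo, so
                                        * only the refcount drops */

   /* The next time suballoc->bo fills up, BO #1 is reused as the new current
    * BO (if the needed size fits) instead of allocating a fresh one. */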