panfrost: Separate core ID range from core count

To query the core count, the hardware has a SHADERS_PRESENT register containing
a mask of the shader cores connected. The core count equals the number of 1-bits
in the mask, regardless of their placement. This value is useful for public
consumption (like in clinfo).
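
As a minimal sketch (not driver code; __builtin_popcount stands in for Mesa's
util_bitcount), the count is a population count of the mask:

    #include <stdint.h>

    /* Core count = number of set bits in the SHADERS_PRESENT mask,
     * independent of which bit positions are set. */
    static unsigned
    core_count_from_mask(uint32_t shaders_present)
    {
        return __builtin_popcount(shaders_present);
    }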

However, internally we are interested in the range of core IDs.
We usually query core count to determine how many cores to allocate various
per-core buffers for (performance counters, occlusion queries, and the stack).
In each case, the hardware writes at the index of its core ID, so we have to
allocate enough for the entire range of core IDs. If the core mask is
discontiguous, this necessarily overallocates.
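
For example, with a hypothetical mask of 0x1007 (cores 0-2 and 12 present), the
core count is 4, but a per-core buffer must cover IDs 0 through 12. A sketch,
again with GCC builtins standing in for Mesa's util_bitcount/util_last_bit:

    #include <stdint.h>
    #include <stdlib.h>

    static uint64_t *
    alloc_per_core_slots(uint32_t shaders_present)
    {
        /* 0x1007 -> 4 cores actually present */
        unsigned core_count = __builtin_popcount(shaders_present);

        /* 0x1007 -> highest core ID is 12, so the ID range is 13
         * (mask is nonzero here; __builtin_clz(0) is undefined) */
        unsigned core_id_range = 32 - __builtin_clz(shaders_present);

        /* Each core writes at the index of its own ID, so size the
         * buffer by the ID range; slots 3-11 go unused. */
        (void)core_count;
        return calloc(core_id_range, sizeof(uint64_t));
    }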

Rename the existing core_count to core_id_range, better reflecting its
definition and purpose, and repurpose core_count for the actual core count.

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17265>
Alyssa Rosenzweig, 2022-06-24 17:43:09 -04:00, committed by Marge Bot
parent 5aa740bc8e
commit 6f3eea5ddb
11 changed files with 33 additions and 24 deletions


@@ -1585,7 +1585,7 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
panfrost_batch_get_scratchpad(batch,
ss->info.tls_size,
dev->thread_tls_alloc,
-dev->core_count);
+dev->core_id_range);
info.tls.ptr = bo->ptr.gpu;
}
@@ -1593,7 +1593,7 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
unsigned size =
pan_wls_adjust_size(info.wls.size) *
pan_wls_instances(&info.wls.dim) *
-dev->core_count;
+dev->core_id_range;
struct panfrost_bo *bo =
panfrost_batch_get_shared_memory(batch, size, 1);
@@ -2732,7 +2732,7 @@ emit_tls(struct panfrost_batch *batch)
panfrost_batch_get_scratchpad(batch,
batch->stack_size,
dev->thread_tls_alloc,
-dev->core_count):
+dev->core_id_range):
NULL;
struct pan_tls_info tls = {
.tls = {
@@ -2754,7 +2754,7 @@ emit_fbd(struct panfrost_batch *batch, const struct pan_fb_info *fb)
panfrost_batch_get_scratchpad(batch,
batch->stack_size,
dev->thread_tls_alloc,
-dev->core_count):
+dev->core_id_range):
NULL;
struct pan_tls_info tls = {
.tls = {


@@ -869,7 +869,7 @@ panfrost_begin_query(struct pipe_context *pipe, struct pipe_query *q)
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
-unsigned size = sizeof(uint64_t) * dev->core_count;
+unsigned size = sizeof(uint64_t) * dev->core_id_range;
/* Allocate a resource for the query results to be stored */
if (!query->rsrc) {
@@ -953,7 +953,7 @@ panfrost_get_query_result(struct pipe_context *pipe,
if (query->type == PIPE_QUERY_OCCLUSION_COUNTER) {
uint64_t passed = 0;
-for (int i = 0; i < dev->core_count; ++i)
+for (int i = 0; i < dev->core_id_range; ++i)
passed += result[i];
if (dev->arch <= 5 && !query->msaa)


@@ -388,11 +388,11 @@ struct panfrost_bo *
panfrost_batch_get_scratchpad(struct panfrost_batch *batch,
unsigned size_per_thread,
unsigned thread_tls_alloc,
-unsigned core_count)
+unsigned core_id_range)
{
unsigned size = panfrost_get_total_stack_size(size_per_thread,
thread_tls_alloc,
-core_count);
+core_id_range);
if (batch->scratchpad) {
assert(batch->scratchpad->size >= size);


@@ -251,7 +251,7 @@ void
panfrost_batch_adjust_stack_size(struct panfrost_batch *batch);
struct panfrost_bo *
-panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned size, unsigned thread_tls_alloc, unsigned core_count);
+panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned size, unsigned thread_tls_alloc, unsigned core_id_range);
struct panfrost_bo *
panfrost_batch_get_shared_memory(struct panfrost_batch *batch, unsigned size, unsigned workgroup_count);


@@ -144,7 +144,7 @@ pan_wls_mem_size(const struct panfrost_device *dev,
{
unsigned instances = pan_wls_instances(dim);
-return pan_wls_adjust_size(wls_size) * instances * dev->core_count;
+return pan_wls_adjust_size(wls_size) * instances * dev->core_id_range;
}
#ifdef PAN_ARCH


@@ -184,7 +184,15 @@ struct panfrost_device {
unsigned arch;
unsigned gpu_id;
unsigned revision;
+/* Number of shader cores */
unsigned core_count;
+/* Range of core IDs, equal to the maximum core ID + 1. Satisfies
+ * core_id_range >= core_count.
+ */
+unsigned core_id_range;
unsigned thread_tls_alloc;
struct panfrost_tiler_features tiler_features;
const struct panfrost_model *model;


@@ -75,7 +75,7 @@ unsigned
panfrost_get_total_stack_size(
unsigned thread_size,
unsigned threads_per_core,
-unsigned core_count);
+unsigned core_id_range);
/* Attributes / instancing */


@@ -152,20 +152,21 @@ panfrost_query_tiler_features(int fd)
}
static unsigned
-panfrost_query_core_count(int fd)
+panfrost_query_core_count(int fd, unsigned *core_id_range)
{
/* On older kernels, worst-case to 16 cores */
unsigned mask = panfrost_query_raw(fd,
DRM_PANFROST_PARAM_SHADER_PRESENT, false, 0xffff);
-/* Some cores might be absent. For TLS computation purposes, we care
- * about the greatest ID + 1, which equals the core count if all cores
- * are present, but allocates space for absent cores if needed.
- * util_last_bit is defined to return the greatest bit set + 1, which
- * is exactly what we need. */
-return util_last_bit(mask);
+/* Some cores might be absent. In some cases, we care
+ * about the range of core IDs (that is, the greatest core ID + 1). If
+ * the core mask is contiguous, this equals the core count.
+ */
+*core_id_range = util_last_bit(mask);
+
+/* The actual core count skips over the gaps */
+return util_bitcount(mask);
}
/* Architectural maximums, since this register may be not implemented
@@ -263,7 +264,7 @@ panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev)
dev->memctx = memctx;
dev->gpu_id = panfrost_query_gpu_version(fd);
dev->arch = pan_arch(dev->gpu_id);
-dev->core_count = panfrost_query_core_count(fd);
+dev->core_count = panfrost_query_core_count(fd, &dev->core_id_range);
dev->thread_tls_alloc = panfrost_query_thread_tls_alloc(fd, dev->arch);
dev->kernel_version = drmGetVersion(fd);
dev->revision = panfrost_query_gpu_revision(fd);


@@ -78,10 +78,10 @@ unsigned
panfrost_get_total_stack_size(
unsigned thread_size,
unsigned threads_per_core,
-unsigned core_count)
+unsigned core_id_range)
{
unsigned size_per_thread = (thread_size == 0) ? 0 :
util_next_power_of_two(ALIGN_POT(thread_size, 16));
-return size_per_thread * threads_per_core * core_count;
+return size_per_thread * threads_per_core * core_id_range;
}


@@ -42,7 +42,7 @@ panfrost_perf_counter_read(const struct panfrost_perf_counter *counter,
// If counter belongs to shader core, accumulate values for all other cores
if (counter->category_index == PAN_SHADER_CORE_INDEX) {
-for (uint32_t core = 1; core < perf->dev->core_count; ++core) {
+for (uint32_t core = 1; core < perf->dev->core_id_range; ++core) {
ret += perf->counter_values[offset + PAN_COUNTERS_PER_CATEGORY * core];
}
}
@@ -77,7 +77,7 @@ panfrost_perf_init(struct panfrost_perf *perf, struct panfrost_device *dev)
// Generally counter blocks are laid out in the following order:
// Job manager, tiler, one or more L2 caches, and one or more shader cores.
unsigned l2_slices = panfrost_query_l2_slices(dev);
-uint32_t n_blocks = 2 + l2_slices + dev->core_count;
+uint32_t n_blocks = 2 + l2_slices + dev->core_id_range;
perf->n_counter_values = PAN_COUNTERS_PER_CATEGORY * n_blocks;
perf->counter_values = ralloc_array(perf, uint32_t, perf->n_counter_values);


@@ -118,7 +118,7 @@ panvk_per_arch(cmd_close_batch)(struct panvk_cmd_buffer *cmdbuf)
if (batch->tlsinfo.tls.size) {
unsigned size = panfrost_get_total_stack_size(batch->tlsinfo.tls.size,
pdev->thread_tls_alloc,
-pdev->core_count);
+pdev->core_id_range);
batch->tlsinfo.tls.ptr =
pan_pool_alloc_aligned(&cmdbuf->tls_pool.base, size, 4096).gpu;
}