diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index c86ee288d96..478d1410ce4 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -1585,7 +1585,7 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch, panfrost_batch_get_scratchpad(batch, ss->info.tls_size, dev->thread_tls_alloc, - dev->core_count); + dev->core_id_range); info.tls.ptr = bo->ptr.gpu; } @@ -1593,7 +1593,7 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch, unsigned size = pan_wls_adjust_size(info.wls.size) * pan_wls_instances(&info.wls.dim) * - dev->core_count; + dev->core_id_range; struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch, size, 1); @@ -2732,7 +2732,7 @@ emit_tls(struct panfrost_batch *batch) panfrost_batch_get_scratchpad(batch, batch->stack_size, dev->thread_tls_alloc, - dev->core_count): + dev->core_id_range): NULL; struct pan_tls_info tls = { .tls = { @@ -2754,7 +2754,7 @@ emit_fbd(struct panfrost_batch *batch, const struct pan_fb_info *fb) panfrost_batch_get_scratchpad(batch, batch->stack_size, dev->thread_tls_alloc, - dev->core_count): + dev->core_id_range): NULL; struct pan_tls_info tls = { .tls = { diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c index c268e07d1b8..c051679d09f 100644 --- a/src/gallium/drivers/panfrost/pan_context.c +++ b/src/gallium/drivers/panfrost/pan_context.c @@ -869,7 +869,7 @@ panfrost_begin_query(struct pipe_context *pipe, struct pipe_query *q) case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: { - unsigned size = sizeof(uint64_t) * dev->core_count; + unsigned size = sizeof(uint64_t) * dev->core_id_range; /* Allocate a resource for the query results to be stored */ if (!query->rsrc) { @@ -953,7 +953,7 @@ panfrost_get_query_result(struct pipe_context *pipe, if (query->type == 
PIPE_QUERY_OCCLUSION_COUNTER) { uint64_t passed = 0; - for (int i = 0; i < dev->core_count; ++i) + for (int i = 0; i < dev->core_id_range; ++i) passed += result[i]; if (dev->arch <= 5 && !query->msaa) diff --git a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c index 71703d6b7e4..81d69bba0ba 100644 --- a/src/gallium/drivers/panfrost/pan_job.c +++ b/src/gallium/drivers/panfrost/pan_job.c @@ -388,11 +388,11 @@ struct panfrost_bo * panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned size_per_thread, unsigned thread_tls_alloc, - unsigned core_count) + unsigned core_id_range) { unsigned size = panfrost_get_total_stack_size(size_per_thread, thread_tls_alloc, - core_count); + core_id_range); if (batch->scratchpad) { assert(batch->scratchpad->size >= size); diff --git a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h index f79197b2059..3f2d291aa76 100644 --- a/src/gallium/drivers/panfrost/pan_job.h +++ b/src/gallium/drivers/panfrost/pan_job.h @@ -251,7 +251,7 @@ void panfrost_batch_adjust_stack_size(struct panfrost_batch *batch); struct panfrost_bo * -panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned size, unsigned thread_tls_alloc, unsigned core_count); +panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned size, unsigned thread_tls_alloc, unsigned core_id_range); struct panfrost_bo * panfrost_batch_get_shared_memory(struct panfrost_batch *batch, unsigned size, unsigned workgroup_count); diff --git a/src/panfrost/lib/pan_cs.h b/src/panfrost/lib/pan_cs.h index db4dcbf089a..2ffa6017ef7 100644 --- a/src/panfrost/lib/pan_cs.h +++ b/src/panfrost/lib/pan_cs.h @@ -144,7 +144,7 @@ pan_wls_mem_size(const struct panfrost_device *dev, { unsigned instances = pan_wls_instances(dim); - return pan_wls_adjust_size(wls_size) * instances * dev->core_count; + return pan_wls_adjust_size(wls_size) * instances * dev->core_id_range; } #ifdef PAN_ARCH diff --git 
a/src/panfrost/lib/pan_device.h b/src/panfrost/lib/pan_device.h index dbd9fafb4fa..0cbebe09820 100644 --- a/src/panfrost/lib/pan_device.h +++ b/src/panfrost/lib/pan_device.h @@ -184,7 +184,15 @@ struct panfrost_device { unsigned arch; unsigned gpu_id; unsigned revision; + + /* Number of shader cores */ unsigned core_count; + + /* Range of core IDs, equal to the maximum core ID + 1. Satisfies + * core_id_range >= core_count. + */ + unsigned core_id_range; + unsigned thread_tls_alloc; struct panfrost_tiler_features tiler_features; const struct panfrost_model *model; diff --git a/src/panfrost/lib/pan_encoder.h b/src/panfrost/lib/pan_encoder.h index 585eaf7476b..68349996cde 100644 --- a/src/panfrost/lib/pan_encoder.h +++ b/src/panfrost/lib/pan_encoder.h @@ -75,7 +75,7 @@ unsigned panfrost_get_total_stack_size( unsigned thread_size, unsigned threads_per_core, - unsigned core_count); + unsigned core_id_range); /* Attributes / instancing */ diff --git a/src/panfrost/lib/pan_props.c b/src/panfrost/lib/pan_props.c index 3cc16169d4c..1627e55e4b8 100644 --- a/src/panfrost/lib/pan_props.c +++ b/src/panfrost/lib/pan_props.c @@ -152,20 +152,21 @@ panfrost_query_tiler_features(int fd) } static unsigned -panfrost_query_core_count(int fd) +panfrost_query_core_count(int fd, unsigned *core_id_range) { /* On older kernels, worst-case to 16 cores */ unsigned mask = panfrost_query_raw(fd, DRM_PANFROST_PARAM_SHADER_PRESENT, false, 0xffff); - /* Some cores might be absent. For TLS computation purposes, we care - * about the greatest ID + 1, which equals the core count if all cores - * are present, but allocates space for absent cores if needed. - * util_last_bit is defined to return the greatest bit set + 1, which - * is exactly what we need. */ + /* Some cores might be absent. In some cases, we care + * about the range of core IDs (that is, the greatest core ID + 1). If + * the core mask is contiguous, this equals the core count. 
+ */ + *core_id_range = util_last_bit(mask); - return util_last_bit(mask); + /* The actual core count skips over the gaps */ + return util_bitcount(mask); } /* Architectural maximums, since this register may be not implemented @@ -263,7 +264,7 @@ panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev) dev->memctx = memctx; dev->gpu_id = panfrost_query_gpu_version(fd); dev->arch = pan_arch(dev->gpu_id); - dev->core_count = panfrost_query_core_count(fd); + dev->core_count = panfrost_query_core_count(fd, &dev->core_id_range); dev->thread_tls_alloc = panfrost_query_thread_tls_alloc(fd, dev->arch); dev->kernel_version = drmGetVersion(fd); dev->revision = panfrost_query_gpu_revision(fd); diff --git a/src/panfrost/lib/pan_scratch.c b/src/panfrost/lib/pan_scratch.c index 2742acb90e2..91d8bd65564 100644 --- a/src/panfrost/lib/pan_scratch.c +++ b/src/panfrost/lib/pan_scratch.c @@ -78,10 +78,10 @@ unsigned panfrost_get_total_stack_size( unsigned thread_size, unsigned threads_per_core, - unsigned core_count) + unsigned core_id_range) { unsigned size_per_thread = (thread_size == 0) ? 
0 : util_next_power_of_two(ALIGN_POT(thread_size, 16)); - return size_per_thread * threads_per_core * core_count; + return size_per_thread * threads_per_core * core_id_range; } diff --git a/src/panfrost/perf/pan_perf.c b/src/panfrost/perf/pan_perf.c index 930eae78d11..c543d7f0dbb 100644 --- a/src/panfrost/perf/pan_perf.c +++ b/src/panfrost/perf/pan_perf.c @@ -42,7 +42,7 @@ panfrost_perf_counter_read(const struct panfrost_perf_counter *counter, // If counter belongs to shader core, accumulate values for all other cores if (counter->category_index == PAN_SHADER_CORE_INDEX) { - for (uint32_t core = 1; core < perf->dev->core_count; ++core) { + for (uint32_t core = 1; core < perf->dev->core_id_range; ++core) { ret += perf->counter_values[offset + PAN_COUNTERS_PER_CATEGORY * core]; } } @@ -77,7 +77,7 @@ panfrost_perf_init(struct panfrost_perf *perf, struct panfrost_device *dev) // Generally counter blocks are laid out in the following order: // Job manager, tiler, one or more L2 caches, and one or more shader cores. unsigned l2_slices = panfrost_query_l2_slices(dev); - uint32_t n_blocks = 2 + l2_slices + dev->core_count; + uint32_t n_blocks = 2 + l2_slices + dev->core_id_range; perf->n_counter_values = PAN_COUNTERS_PER_CATEGORY * n_blocks; perf->counter_values = ralloc_array(perf, uint32_t, perf->n_counter_values); diff --git a/src/panfrost/vulkan/panvk_vX_cmd_buffer.c b/src/panfrost/vulkan/panvk_vX_cmd_buffer.c index a4c2d4e75c7..638954c9f1e 100644 --- a/src/panfrost/vulkan/panvk_vX_cmd_buffer.c +++ b/src/panfrost/vulkan/panvk_vX_cmd_buffer.c @@ -118,7 +118,7 @@ panvk_per_arch(cmd_close_batch)(struct panvk_cmd_buffer *cmdbuf) if (batch->tlsinfo.tls.size) { unsigned size = panfrost_get_total_stack_size(batch->tlsinfo.tls.size, pdev->thread_tls_alloc, - pdev->core_count); + pdev->core_id_range); batch->tlsinfo.tls.ptr = pan_pool_alloc_aligned(&cmdbuf->tls_pool.base, size, 4096).gpu; }