mesa/src/intel/compiler/brw_ir_performance.h

/* -*- c++ -*- */
/*
 * Copyright © 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_IR_PERFORMANCE_H
#define BRW_IR_PERFORMANCE_H

class fs_visitor;

namespace brw {
   class vec4_visitor;

   /**
    * Statically computed performance estimates for a shader program.
    *
    * The estimates are exposed as plain public data members.  Copying is
    * restricted: the copy constructor and the assignment operator are
    * declared private below.
    */
   struct performance {
      /* One constructor per visitor flavour.  Presumably these run the
       * static model over the given program and fill in the members below;
       * the definitions live outside this header -- confirm there.
       */
      performance(const fs_visitor *v);
      performance(const vec4_visitor *v);

      /* Defined out of line.  NOTE(review): presumably releases
       * block_latency -- verify against the implementation.
       */
      ~performance();

      /**
       * Report the classes of program changes this result depends on:
       * the union of the instruction and block dependency classes.
       */
      analysis_dependency_class dependency_class() const
      {
         return DEPENDENCY_INSTRUCTIONS | DEPENDENCY_BLOCKS;
      }

      /**
       * Consistency check hook.  No checking is implemented here: the
       * argument is ignored and the result is unconditionally true.
       */
      bool validate(const backend_shader *) const
      {
         return true;
      }

      /**
       * Per-basic-block runtime estimates, in cycles, stored as an array
       * with one entry for each block of the program.
       */
      unsigned *block_latency;

      /**
       * Runtime estimate for the program as a whole, in cycles, under the
       * assumption that execution is uncontended.
       */
      unsigned latency;

      /**
       * Throughput estimate for the program as a whole, expressed in
       * invocations per cycle.
       *
       * This can come out lower than the program's dispatch width divided
       * by its latency estimate whenever performance stops scaling with
       * thread parallelism, for instance because a shared function acts as
       * a bottleneck.
       */
      float throughput;

   private:
      /* Declared private so code outside this struct cannot copy or assign
       * a performance object.  No definitions are visible in this header.
       */
      performance(const performance &perf);
      performance &operator=(performance u);
   };
}

#endif
intel/ir: Import shader performance analysis pass.

This introduces an analysis pass intended to estimate several performance statistics of the shader, including cycle count latency and throughput values, based on static modeling. It has instruction performance information more comprehensive than the current scheduling pass for all platforms between Gen4-11, and works on both the FS and VEC4 back-end.

The most immediate purpose of this pass is to implement a heuristic meant to determine whether using SIMD32 dispatch for a fragment shader can be expected to help more than it hurts. In addition this will allow the effect of passes run after scheduling (e.g. the TGL software scoreboard pass and the VEC4 dependency control pass) to be visible in shader-db statistics.

But that isn't the end of the story: other potential applications of this pass (not part of this MR) I've been playing around with are:

- Implement a similar SIMD16 heuristic allowing the identification of inefficient SIMD16 fragment shaders.
- Implement similar SIMD16 and SIMD32 heuristics for the compute shader stage -- Currently compute shader builds always use the SIMD16 shader if available and never use the SIMD32 shader unless strictly necessary, which is suboptimal under certain conditions.
- Hook up to the instruction scheduler in order to improve the accuracy of its timing information.
- Use as a heuristic in order to drive the selection of scheduling modes (Matt was experimenting with that).
- Plug to the TGL software scoreboard pass in order to implement a more effective SBID token allocation algorithm, since in general the optimal token allocation depends on the timings of all instructions in the program.
- Use its bottleneck detection functionality in order to implement a heuristic computing a more optimal bound for the number of fragment shader threads executed in parallel (by adjusting the MaximumNumberofThreadsPerPSD control of 3DSTATE_PS).

As a follow-up I'm planning to submit updated timing information for Gen12 platforms -- Everything else required to support Gen12 like SWSB handling is already included in this patch, but there were some IP concerns regarding the TGL timing parameters since they cannot currently be obtained with the documentation and hardware which is publicly available. The timing parameters for any previous Gen7-11 platforms can be obtained by anyone by sampling the timestamp register using e.g. shader_time, though I have some more convenient instrumentation coming up. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> 2020-03-26 21:59:02 +00:00			`/* -*- c++ -*- */`
			`/*`
			`* Copyright © 2020 Intel Corporation`
			`*`
			`* Permission is hereby granted, free of charge, to any person obtaining a`
			`* copy of this software and associated documentation files (the "Software"),`
			`* to deal in the Software without restriction, including without limitation`
			`* the rights to use, copy, modify, merge, publish, distribute, sublicense,`
			`* and/or sell copies of the Software, and to permit persons to whom the`
			`* Software is furnished to do so, subject to the following conditions:`
			`*`
			`* The above copyright notice and this permission notice (including the next`
			`* paragraph) shall be included in all copies or substantial portions of the`
			`* Software.`
			`*`
			`* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR`
			`* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,`
			`* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL`
			`* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER`
			`* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING`
			`* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS`
			`* IN THE SOFTWARE.`
			`*/`

			`#ifndef BRW_IR_PERFORMANCE_H`
			`#define BRW_IR_PERFORMANCE_H`

			`class fs_visitor;`

			`namespace brw {`
			`class vec4_visitor;`

			`/**`
			`* Various estimates of the performance of a shader based on static`
			`* analysis.`
			`*/`
			`struct performance {`
			`performance(const fs_visitor *v);`
			`performance(const vec4_visitor *v);`
			`~performance();`

			`analysis_dependency_class`
			`dependency_class() const`
			`{`
			`return (DEPENDENCY_INSTRUCTIONS |`
			`DEPENDENCY_BLOCKS);`
			`}`

			`bool`
			`validate(const backend_shader *) const`
			`{`
			`return true;`
			`}`

			`/**`
			`* Array containing estimates of the runtime of each basic block of the`
			`* program in cycle units.`
			`*/`
			`unsigned *block_latency;`

			`/**`
			`* Estimate of the runtime of the whole program in cycle units assuming`
			`* uncontended execution.`
			`*/`
			`unsigned latency;`

			`/**`
			`* Estimate of the throughput of the whole program in`
			`* invocations-per-cycle units.`
			`*`
			`* Note that this might be lower than the ratio between the dispatch`
			`* width of the program and its latency estimate in cases where`
			`* performance doesn't scale without limits as a function of its thread`
			`* parallelism, e.g. due to the existence of a bottleneck in a shared`
			`* function.`
			`*/`
			`float throughput;`

			`private:`
			`performance(const performance &perf);`
			`performance &`
			`operator=(performance u);`
			`};`
			`}`

			`#endif`