Add template and move metric descriptions to SoC YAML files #450

Draft · wants to merge 5 commits into base: amd-staging
3 changes: 3 additions & 0 deletions docs/conceptual/command-processor.rst
@@ -46,6 +46,9 @@ processor’s metrics therefore are focused on reporting, for example:
Command processor fetcher (CPF)
===============================

.. datatemplate:yaml:: ../../src/omniperf_soc/analysis_configs/gfx90a/0500_command-processor.yaml
:template: ../templates/performance-metric-table.tmpl

.. list-table::
:header-rows: 1

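The datatemplate:yaml directive added above comes from the sphinx-datatemplates extension: it parses the referenced YAML file and renders it through the Jinja template named by the :template: option, exposing the parsed document to that template as data. The sketch below (not part of this PR) reproduces the same rendering outside of Sphinx so the generated reStructuredText can be inspected directly; the plain Jinja environment is an assumption here, since Sphinx normally supplies its own template loader.

# Sketch only: emulate what datatemplate:yaml does for this file so the
# generated rst can be previewed without a full Sphinx build.
import yaml
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader("docs/templates"), trim_blocks=True)
template = env.get_template("performance-metric-table.tmpl")

with open("src/omniperf_soc/analysis_configs/gfx90a/0500_command-processor.yaml") as f:
    data = yaml.safe_load(f)  # the directive exposes the parsed mapping as `data`

print(template.render(data=data))  # prints the generated list-table sections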
1 change: 1 addition & 0 deletions docs/conf.py
@@ -49,6 +49,7 @@

html_static_path = ["sphinx/static/css"]
html_css_files = ["o_custom.css"]
templates_path = ["templates"]

external_toc_path = "./sphinx/_toc.yml"
external_projects_current_project = "omniperf"
26 changes: 26 additions & 0 deletions docs/templates/performance-metric-table.tmpl
@@ -0,0 +1,26 @@
.. -*- mode: rst -*-

{% for data_source in data["Panel Config"]["data source"] %}

{# Anchor for internal linking. #}
{# For example, :ref:`command-processor-fetcher-metrics`. #}
.. _{{data_source.metric_table.title|lower|replace(" ", "-") }}-metrics:

{{ data_source.metric_table.title }}
-------------------------------------------------------------------------------

.. list-table::
:header-rows: 1

* - Metric
- Description
- Unit

{% for name, desc in data_source.metric_table.metric.items() %}
* - {{ name }}
- {{ desc.tips }}
- {{ desc.unit }}

{% endfor %}
{% endfor %}
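
The template loops over every entry under Panel Config -> data source, writes an rst label derived from the table title (lower-cased, spaces replaced by hyphens, with a -metrics suffix), and then emits a list-table with one Metric / Description / Unit row per metric, pulling the hover text from tips and the unit from unit. A hypothetical miniature of that data shape, useful for checking the label and rows in isolation, could look like the sketch below; the title and description strings are placeholders rather than values copied from the real YAML.

# Hypothetical miniature of the structure the template iterates over.
sample = {
    "Panel Config": {
        "data source": [
            {
                "metric_table": {
                    "title": "Command Processor Fetcher",  # placeholder title
                    "metric": {
                        "CPF Utilization": {
                            "tips": "placeholder description",
                            "unit": "pct",
                        },
                    },
                },
            },
        ],
    },
}

title = sample["Panel Config"]["data source"][0]["metric_table"]["title"]
# Mirrors the template's |lower|replace(" ", "-") filter chain.
print(title.lower().replace(" ", "-") + "-metrics")  # command-processor-fetcher-metrics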

Large diffs are not rendered by default.

@@ -2,6 +2,18 @@
# Add a description/tip for each metric in this section,
# so that it can be shown on hover.
Metric Description:
CPF_Utilization: &CPF_Utilization_tip
Percent of total cycles where the CPF was busy actively doing any work. The
ratio of CPF busy cycles over total cycles counted by the CPF.
CPF_Stall: &CPF_Stall_tip
Percent of CPF busy cycles where the CPF was stalled for any reason.
CPF-L2_Utilization: &CPF-L2_Utilization_tip
Percent of total cycles counted by the CPF-L2 interface where the CPF-L2
interface was active doing any work. The ratio of CPF-L2 busy cycles over total cycles counted by the CPF-L2.
CPF-L2_Stall: &CPF-L2_Stall_tip
Percent of CPF-L2 busy cycles where the CPF-L2 interface was stalled for any reason.
CPF-UTCL1_Stall: &CPF-UTCL1_Stall_tip
Percent of CPF busy cycles where the CPF was stalled by address translation.

# Define the panel properties and properties of each metric in the panel.
Panel Config:
@@ -27,7 +39,7 @@ Panel Config:
max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
unit: pct
tips:
tips: *CPF_Utilization_tip
CPF Stall:
avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
!= 0) else None))
@@ -36,7 +48,7 @@ Panel Config:
max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
!= 0) else None))
unit: pct
tips:
tips: *CPF_Stall_tip
CPF-L2 Utilization:
avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
@@ -45,7 +57,7 @@ Panel Config:
max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
unit: pct
tips:
tips: *CPF-L2_Utilization_tip
CPF-L2 Stall:
avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
!= 0) else None))
@@ -54,7 +66,7 @@ Panel Config:
max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
!= 0) else None))
unit: pct
tips:
tips: *CPF-L2_Stall_tip
CPF-UTCL1 Stall:
avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
!= 0) else None)
@@ -63,7 +75,7 @@ Panel Config:
max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
!= 0) else None)
unit: pct
tips:
tips: *CPF-UTCL1_Stall_tip

- metric_table:
id: 502
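The edit above relies on plain YAML anchors and aliases: each description is written once under Metric Description with an anchor such as &CPF_Utilization_tip, and each metric's tips field refers back to it with the matching alias, so after parsing the tips value is the very same string. The sketch below demonstrates that behavior with a simplified layout (the real file nests the metrics under metric_table entries).

# Sketch: the alias in `tips:` resolves to the string anchored once under
# Metric Description. The nesting is simplified relative to the real file.
import yaml

doc = """
Metric Description:
  CPF_Utilization: &CPF_Utilization_tip
    Percent of total cycles where the CPF was busy actively doing any work.
Panel Config:
  CPF Utilization:
    unit: pct
    tips: *CPF_Utilization_tip
"""

parsed = yaml.safe_load(doc)
print(parsed["Panel Config"]["CPF Utilization"]["tips"])
# -> Percent of total cycles where the CPF was busy actively doing any work.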
@@ -2,6 +2,76 @@
# Add a description/tip for each metric in this section,
# so that it can be shown on hover.
Metric Description:
Workgroup Manager Utilizations:
Accelerator Utilization: &Accelerator_Utilization_tip >-
The percent of cycles in the kernel where the accelerator was actively doing any work.
Scheduler-Pipe Utilization: &Scheduler-Pipe_Utilization_tip >-
The percent of total scheduler-pipe cycles in the kernel where the scheduler-pipes were
actively doing any work. Note: this value is expected to range between 0% and 25%.
Workgroup Manager Utilization: &Workgroup_Manager_Utilization_tip >-
The percent of cycles in the kernel where the workgroup manager was actively doing any work.
Shader Engine Utilization: &Shader_Engine_Utilization_tip >-
The percent of total shader engine cycles in the kernel where any CU in a shader-engine was
actively doing any work, normalized over all shader-engines. Low values (e.g., << 100%)
indicate that the accelerator was not fully saturated by the kernel, or a potential
load-imbalance issue.
SIMD Utilization: &SIMD_Utilization_tip >-
The percent of total SIMD cycles in the kernel where any SIMD on a CU was actively doing any
work, summed over all CUs. Low values (less than 100%) indicate that the accelerator was not
fully saturated by the kernel, or a potential load-imbalance issue.
Dispatched Workgroups: &Dispatched_Workgroups_tip >-
The total number of workgroups forming this kernel launch.
Dispatched Wavefronts: &Dispatched_Wavefronts_tip >-
The total number of wavefronts, summed over all workgroups, forming this kernel launch.
VGPR Writes: &VGPR_Writes_tip >-
The average number of cycles spent initializing VGPRs at wave creation.
SGPR Writes: &SGPR_Writes_tip >-
The average number of cycles spent initializing SGPRs at wave creation.
Workgroup Manager - Resource Allocation:
Not-Scheduled Rate (Workgroup Manager): &Not-Scheduled_Rate_Workgroup_Manager_tip >-
The percent of total scheduler-pipe cycles in the kernel where a workgroup could not be
scheduled to a CU due to a bottleneck within the workgroup manager rather than a lack of a CU
or SIMD with sufficient resources. Note: this value is expected to range between 0-25%. See
note in workgroup manager description.
Not-Scheduled Rate (Scheduler-Pipe): &Not-Scheduled_Rate_Scheduler-Pipe_tip >-
The percent of total scheduler-pipe cycles in the kernel where a workgroup could not be
scheduled to a CU due to a bottleneck within the scheduler-pipes rather than a lack of a CU or
SIMD with sufficient resources. Note: this value is expected to range between 0-25%, see note
in workgroup manager description.
Scheduler-Pipe Stall Rate: &Scheduler-Pipe_Stall_Rate_tip >-
The percent of total scheduler-pipe cycles in the kernel where a workgroup could not be
scheduled to a CU due to occupancy limitations (like a lack of a CU or SIMD with sufficient
resources). Note: this value is expected to range between 0-25%, see note in workgroup manager
description.
Scratch Stall Rate: &Scratch_Stall_Rate_tip >-
The percent of total shader-engine cycles in the kernel where a workgroup could not be
scheduled to a CU due to lack of private (a.k.a., scratch) memory slots. While this can reach
up to 100%, note that the actual occupancy limitations on a kernel using private memory are
typically quite small (for example, less than 1% of the total number of waves that can be
scheduled to an accelerator).
Insufficient SIMD Waveslots: &Insufficient_SIMD_Waveslots_tip >-
The percent of total SIMD cycles in the kernel where a workgroup could not be scheduled to a
SIMD due to lack of available waveslots.
Insufficient SIMD VGPRs: &Insufficient_SIMD_VGPRs_tip >-
The percent of total SIMD cycles in the kernel where a workgroup could not be scheduled to a
SIMD due to lack of available VGPRs.
Insufficient SIMD SGPRs: &Insufficient_SIMD_SGPRs_tip >-
The percent of total SIMD cycles in the kernel where a workgroup could not be scheduled to a
SIMD due to lack of available SGPRs.
Insufficient CU LDS: &Insufficient_CU_LDS_tip >-
The percent of total CU cycles in the kernel where a workgroup could not be scheduled to a CU
due to lack of available LDS.
Insufficient CU Barriers: &Insufficient_CU_Barriers_tip >-
The percent of total CU cycles in the kernel where a workgroup could not be scheduled to a CU
due to lack of available barriers.
Reached CU Workgroup Limit: &Reached_CU_Workgroup_Limit_tip >-
The percent of total CU cycles in the kernel where a workgroup could not be scheduled to a CU
due to limits within the workgroup manager. This is expected to always be zero on CDNA2 or
newer accelerators (and small for previous accelerators).
Reached CU Wavefront Limit: &Reached_CU_Wavefront_Limit_tip >-
The percent of total CU cycles in the kernel where a wavefront could not be scheduled to a CU
due to limits within the workgroup manager. This is expected to always be zero on CDNA2 or
newer accelerators (and small for previous accelerators).

# Define the panel properties and properties of each metric in the panel.
Panel Config:
@@ -24,43 +94,43 @@ Panel Config:
min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
unit: Pct
tips:
tips: *Accelerator_Utilization_tip
Scheduler-Pipe Utilization:
avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
unit: Pct
tips:
tips: *Scheduler-Pipe_Utilization_tip
Workgroup Manager Utilization:
avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
unit: Pct
tips:
tips: *Workgroup_Manager_Utilization_tip
Shader Engine Utilization:
avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
unit: Pct
tips:
tips: *Shader_Engine_Utilization_tip
SIMD Utilization:
avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
unit: Pct
tips:
tips: *SIMD_Utilization_tip
Dispatched Workgroups:
avg: AVG(SPI_CSN_NUM_THREADGROUPS)
min: MIN(SPI_CSN_NUM_THREADGROUPS)
max: MAX(SPI_CSN_NUM_THREADGROUPS)
unit: Workgroups
tips:
tips: *Dispatched_Workgroups_tip
Dispatched Wavefronts:
avg: AVG(SPI_CSN_WAVE)
min: MIN(SPI_CSN_WAVE)
max: MAX(SPI_CSN_WAVE)
unit: Wavefronts
tips:
tips: *Dispatched_Wavefronts_tip
VGPR Writes:
avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else
None))
@@ -69,7 +139,7 @@ Panel Config:
max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else
None))
unit: Cycles/wave
tips:
tips: *VGPR_Writes_tip
SGPR Writes:
avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else
None))
@@ -78,7 +148,7 @@ Panel Config:
max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else
None))
unit: Cycles/wave
tips:
tips: *SGPR_Writes_tip
- metric_table:
id: 602
title: Workgroup Manager - Resource Allocation
@@ -98,7 +168,7 @@ Panel Config:
max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD !=
0) else None)
unit: Pct
tips:
tips: *Not-Scheduled_Rate_Workgroup_Manager_tip
Not-scheduled Rate (Scheduler-Pipe):
avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD !=
0) else None)
@@ -107,7 +177,7 @@ Panel Config:
max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD !=
0) else None)
unit: Pct
tips:
tips: *Not-Scheduled_Rate_Scheduler-Pipe_tip
Scheduler-Pipe Stall Rate:
avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD !=
0) else None))
@@ -116,52 +186,52 @@ Panel Config:
max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD !=
0) else None))
unit: Pct
tips:
tips: *Scheduler-Pipe_Stall_Rate_tip
Scratch Stall Rate:
avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
unit: Pct
tips:
tips: *Scratch_Stall_Rate_tip
Insufficient SIMD Waveslots:
avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
unit: Pct
tips:
tips: *Insufficient_SIMD_Waveslots_tip
Insufficient SIMD VGPRs:
avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
unit: Pct
tips:
tips: *Insufficient_SIMD_VGPRs_tip
Insufficient SIMD SGPRs:
avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
unit: Pct
tips:
tips: *Insufficient_SIMD_SGPRs_tip
Insufficient CU LDS:
avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
unit: Pct
tips:
tips: *Insufficient_CU_LDS_tip
Insufficient CU Barriers:
avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
unit: Pct
tips:
tips: *Insufficient_CU_Barriers_tip
Reached CU Workgroup Limit:
avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
unit: Pct
tips:
tips: *Reached_CU_Workgroup_Limit_tip
Reached CU Wavefront Limit:
avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
unit: Pct
tips:
tips: *Reached_CU_Wavefront_Limit_tip
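The workgroup-manager file follows the same anchor-and-alias pattern, but writes each description as a folded block scalar (the >- indicator), so the wrapped source lines fold into a single space-separated string with no trailing newline before being reused by every tips alias. One detail worth noting is that PyYAML's scanner only accepts alphanumerics, '-' and '_' in anchor and alias names, so characters such as parentheses cannot appear in them. A short sketch of the folded-scalar behavior:

# Sketch: a folded block scalar (>-) attached to an anchor parses to one
# space-joined line with no trailing newline.
import yaml

snippet = """
Shader Engine Utilization: &Shader_Engine_Utilization_tip >-
  The percent of total shader engine cycles in the kernel where any CU in a
  shader-engine was actively doing any work, normalized over all shader-engines.
"""

print(repr(yaml.safe_load(snippet)["Shader Engine Utilization"]))
# -> 'The percent of total shader engine cycles ... over all shader-engines.'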