diff --git a/docs/conceptual/command-processor.rst b/docs/conceptual/command-processor.rst index a055768a1..a43ea884b 100644 --- a/docs/conceptual/command-processor.rst +++ b/docs/conceptual/command-processor.rst @@ -46,6 +46,9 @@ processor’s metrics therefore are focused on reporting, for example: Command processor fetcher (CPF) =============================== +.. datatemplate:yaml:: ../../src/omniperf_soc/analysis_configs/gfx90a/0500_command-processor.yaml + :template: ../templates/performance-metric-table.tmpl + .. list-table:: :header-rows: 1 diff --git a/docs/conf.py b/docs/conf.py index f74f95ecd..4bc554069 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -49,6 +49,7 @@ html_static_path = ["sphinx/static/css"] html_css_files = ["o_custom.css"] +templates_path = ["templates"] external_toc_path = "./sphinx/_toc.yml" external_projects_current_project = "omniperf" diff --git a/docs/templates/performance-metric-table.tmpl b/docs/templates/performance-metric-table.tmpl new file mode 100644 index 000000000..b5dfe1682 --- /dev/null +++ b/docs/templates/performance-metric-table.tmpl @@ -0,0 +1,26 @@ +.. -*- mode: rst -*- + +{% for data_source in data["Panel Config"]["data source"] %} + +## Anchor for internal linking. +## For example, :ref:`command-processor-fetcher-metrics`. +.. _{{data_source.metric_table.title|lower|replace(" ", "-") }}-metrics: + +{{ data_source.metric_table.title }} +------------------------------------------------------------------------------- + +.. list-table:: + :header-rows: 1 + + * - Metric + - Description + - Unit + + {% for name, desc in data_source.metric_table.metric.items() %} + * - {{ name }} + - {{ desc.tips }} + - {{ desc.unit }} + + {% endfor %} +{% endfor %} + diff --git a/src/omniperf_soc/analysis_configs/gfx90a/0200_system-speed-of-light.yaml b/src/omniperf_soc/analysis_configs/gfx90a/0200_system-speed-of-light.yaml index 2deb5c7cc..c25c9f8ed 100644 --- a/src/omniperf_soc/analysis_configs/gfx90a/0200_system-speed-of-light.yaml +++ b/src/omniperf_soc/analysis_configs/gfx90a/0200_system-speed-of-light.yaml @@ -2,7 +2,133 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: - SALU: &SALU_anchor Scalar Arithmetic Logic Unit + Speed-of-Light: + VALU FLOPs: &VALU_FLOPs_tip >- + The total floating-point operations executed per second on the VALU. This is also presented as + a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this + does not include any floating-point operations from MFMA instructions. + VALU IOPs: &VALU_IOPs_tip >- + The total integer operations executed per second on the VALU. This is also presented as a + percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does + not include any integer operations from MFMA instructions. + MFMA FLOPs (BF16): &MFMA_FLOPs_(BF16)_tip >- + The total number of 16-bit brain floating point MFMA operations executed per second. Note: + this does not include any 16-bit brain floating point operations from VALU instructions. This + is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the + specific accelerator. + MFMA FLOPs (F16): &MFMA_FLOPs_(F16)_tip >- + The total number of 16-bit floating point MFMA operations executed per second. Note: this does + not include any 16-bit floating point operations from VALU instructions. This is also + presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific + accelerator. 
+ MFMA FLOPs (F32): &MFMA_FLOPs_(F32)_tip >- + The total number of 32-bit floating point MFMA operations executed per second. Note: this does + not include any 32-bit floating point operations from VALU instructions. This is also + presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific + accelerator. + MFMA FLOPs (F64): &MFMA_FLOPs_(F64)_tip >- + The total number of 64-bit floating point MFMA operations executed per second. Note: this does + not include any 64-bit floating point operations from VALU instructions. This is also + presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific + accelerator. + MFMA IOPs (INT8): &MFMA_IOPs_(INT8)_tip >- + The total number of 8-bit integer MFMA operations executed per second. Note: this does not + include any 8-bit integer operations from VALU instructions. This is also presented as a + percent of the peak theoretical INT8 MFMA operations achievable on the specific accelerator. + Active CUs: &Active_CUs_tip >- + SALU Utilization: &SALU_Utilization_tip >- + Indicates what percent of the kernel's duration the SALU was busy executing instructions. + Computed as the ratio of the total number of cycles spent by the scheduler issuing SALU or + SMEM instructions over the total CU cycles. + VALU Utilization: &VALU_Utilization_tip >- + Indicates what percent of the kernel's duration the VALU was busy executing instructions. Does + not include VMEM operations. Computed as the ratio of the total number of cycles spent by the + scheduler issuing VALU instructions over the total CU cycles. + MFMA Utilization: &MFMA_Utilization_tip >- + Indicates what percent of the kernel's duration the MFMA unit was busy executing instructions. + Computed as the ratio of the total number of cycles the MFMA was busy over the total CU + cycles. + VMEM Utilization: &VMEM_Utilization_tip >- + Indicates what percent of the kernel's duration the VMEM unit was busy executing instructions, + including both global/generic and spill/scratch operations (see the VMEM instruction count + metrics for more detail). Does not include VALU operations. Computed as the ratio of the + total number of cycles spent by the scheduler issuing VMEM instructions over the total CU + cycles. + Branch Utilization: &Branch_Utilization_tip >- + Indicates what percent of the kernel's duration the branch unit was busy executing + instructions. Computed as the ratio of the total number of cycles spent by the scheduler + issuing branch instructions over the total CU cycles. + VALU Active Threads: &VALU_Active_Threads_tip >- + Indicates the average level of divergence within a wavefront over the lifetime of the kernel. + The number of work-items that were active in a wavefront during execution of each VALU + instruction, time-averaged over all VALU instructions run on all wavefronts in the kernel. + IPC: &IPC_tip >- + The ratio of the total number of instructions executed on the CU over the total active CU + cycles. This is also presented as a percent of the peak theoretical bandwidth achievable on + the specific accelerator. + Wavefront Occupancy: &Wavefront_Occupancy_tip >- + The time-averaged number of wavefronts resident on the accelerator over the lifetime of the + kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). This is + also presented as a percent of the peak theoretical occupancy achievable on the specific + accelerator.
+ Theoretical LDS Bandwidth: &Theoretical_LDS_Bandwidth_tip >- + Indicates the maximum number of bytes that could have been loaded from, stored to, or + atomically updated in the LDS per unit time (see LDS Bandwidth example for more detail). This + is also presented as a percent of the peak theoretical LDS bandwidth achievable on the + specific accelerator. + LDS Bank Conflicts/Access: &LDS_Bank_Conflicts/Access_tip >- + The ratio of the number of cycles spent in the LDS scheduler due to bank conflicts (as + determined by the conflict resolution hardware) to the base number of cycles that would be + spent in the LDS scheduler in a completely uncontended case. This is also presented in + normalized form (i.e., the Bank Conflict Rate). + vL1D Cache Hit Rate: &vL1D_Cache_Hit_Rate_tip >- + The ratio of the number of vL1D cache line requests that hit in vL1D cache over the total + number of cache line requests to the vL1D cache RAM. + vL1D Cache BW: &vL1D_Cache_BW_tip >- + The number of bytes looked up in the vL1D cache as a result of VMEM instructions per unit + time. The number of bytes is calculated as the number of cache lines requested multiplied by + the cache line size. This value does not consider partial requests, so e.g., if only a single + value is requested in a cache line, the data movement will still be counted as a full cache + line. This is also presented as a percent of the peak theoretical bandwidth achievable on the + specific accelerator. + L2 Cache Hit Rate: &L2_Cache_Hit_Rate_tip >- + The ratio of the number of L2 cache line requests that hit in the L2 cache over the total + number of incoming cache line requests to the L2 cache. + L2 Cache BW: &L2_Cache_BW_tip >- + The number of bytes looked up in the L2 cache per unit time. The number of bytes is + calculated as the number of cache lines requested multiplied by the cache line size. This + value does not consider partial requests, so e.g., if only a single value is requested in a + cache line, the data movement will still be counted as a full cache line. This is also + presented as a percent of the peak theoretical bandwidth achievable on the specific + accelerator. + L2-Fabric Read BW: &L2-Fabric_Read_BW_tip >- + The number of bytes read by the L2 over the Infinity Fabric™ interface per unit time. This is + also presented as a percent of the peak theoretical bandwidth achievable on the specific + accelerator. + L2-Fabric Write BW: &L2-Fabric_Write_BW_tip >- + The number of bytes sent by the L2 over the Infinity Fabric interface by write and atomic + operations per unit time. This is also presented as a percent of the peak theoretical + bandwidth achievable on the specific accelerator. + L2-Fabric Read Latency: &L2-Fabric_Read_Latency_tip >- + The time-averaged number of cycles read requests spent in Infinity Fabric before data was + returned to the L2. + L2-Fabric Write Latency: &L2-Fabric_Write_Latency_tip >- + The time-averaged number of cycles write requests spent in Infinity Fabric before a completion + acknowledgement was returned to the L2. + sL1D Cache Hit Rate: &sL1D_Cache_Hit_Rate_tip >- + The percent of sL1D requests that hit on a previously loaded line in the cache. Calculated as the + ratio of the number of sL1D requests that hit over the number of all sL1D requests. + sL1D Cache BW: &sL1D_Cache_BW_tip >- + The number of bytes looked up in the sL1D cache per unit time. This is also presented as a + percent of the peak theoretical bandwidth achievable on the specific accelerator.
+ L1I Hit Rate: &L1I_Hit_Rate_tip >- + The percent of L1I requests that hit on a previously loaded line in the cache. Calculated as the + ratio of the number of L1I requests that hit over the number of all L1I requests. + L1I BW: &L1I_BW_tip >- + The number of bytes looked up in the L1I cache per unit time. This is also presented as a + percent of the peak theoretical bandwidth achievable on the specific accelerator. + L1I Fetch Latency: &L1I_Fetch_Latency_tip >- + The average number of cycles spent to fetch instructions to a CU. # Define the panel properties and properties of each metric in the panel. Panel Config: @@ -34,67 +160,67 @@ Panel Config: + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - tips: + tips: *VALU_FLOPs_tip VALU IOPs: value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp))) unit: GIOP peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - tips: + tips: *VALU_IOPs_tip MFMA FLOPs (BF16): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - tips: + tips: *MFMA_FLOPs_(BF16)_tip MFMA FLOPs (F16): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - tips: + tips: *MFMA_FLOPs_(F16)_tip MFMA FLOPs (F32): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: + tips: *MFMA_FLOPs_(F32)_tip MFMA FLOPs (F64): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: + tips: *MFMA_FLOPs_(F64)_tip MFMA IOPs (Int8): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) unit: GIOP peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - tips: + tips: *MFMA_IOPs_(INT8)_tip Active CUs: value: $numActiveCUs unit: CUs peak: $cu_per_gpu pop: ((100 * $numActiveCUs) / $cu_per_gpu) - tips: + tips: *Active_CUs_tip SALU Utilization: value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct peak: 100 pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - tips: + tips: *SALU_Utilization_tip VALU Utilization: value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct peak: 100 pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
$cu_per_gpu))) - tips: + tips: *VALU_Utilization_tip MFMA Utilization: value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) * 4))) @@ -102,19 +228,19 @@ Panel Config: peak: 100 pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) * 4))) - tips: + tips: *MFMA_Utilization_tip VMEM Utilization: value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) unit: pct peak: 100 pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - tips: + tips: *VMEM_Utilization_tip Branch Utilization: value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) unit: pct peak: 100 pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - tips: + tips: *Branch_Utilization_tip VALU Active Threads: value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) @@ -122,13 +248,13 @@ Panel Config: peak: 64 pop: (AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) * 1.5625) - tips: + tips: *VALU_Active_Threads_tip IPC: value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) unit: Instr/cycle peak: 5 pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) - tips: + tips: *IPC_tip Wavefront Occupancy: value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) unit: Wavefronts @@ -136,7 +262,7 @@ Panel Config: pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu * $cu_per_gpu)))) coll_level: SQ_LEVEL_WAVES - tips: + tips: *Wavefront_Occupancy_tip Theoretical LDS Bandwidth: value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) / (End_Timestamp - Start_Timestamp))) @@ -144,7 +270,7 @@ Panel Config: peak: (($max_sclk * $cu_per_gpu) * 0.128) pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) - tips: + tips: *Theoretical_LDS_Bandwidth_tip LDS Bank Conflicts/Access: value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) @@ -152,7 +278,7 @@ Panel Config: peak: 32 pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) - tips: + tips: *LDS_Bank_Conflicts/Access_tip vL1D Cache Hit Rate: value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) @@ -164,14 +290,14 @@ Panel Config: TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else None)) - tips: + tips: *vL1D_Cache_Hit_Rate_tip vL1D Cache BW: value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp))) unit: GB/s peak: ((($max_sclk / 1000) * 64) * $cu_per_gpu) pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu)) - tips: + tips: *vL1D_Cache_BW_tip L2 Cache Hit Rate: value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else None)) @@ -179,14 +305,14 @@ Panel Config: peak: 100 pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 
None)) - tips: + tips: *L2_Cache_Hit_Rate_tip L2 Cache BW: value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) unit: GB/s peak: ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)) pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))) - tips: + tips: *L2_Cache_BW_tip L2-Fabric Read BW: value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp))) @@ -194,7 +320,7 @@ Panel Config: peak: $hbm_bw pop: ((100 * AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp)))) / $hbm_bw) - tips: + tips: *L2-Fabric_Read_BW_tip L2-Fabric Write BW: value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) @@ -202,21 +328,21 @@ Panel Config: peak: $hbm_bw pop: ((100 * AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) / $hbm_bw) - tips: + tips: *L2-Fabric_Write_BW_tip L2-Fabric Read Latency: value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) unit: Cycles peak: None pop: None - tips: + tips: *L2-Fabric_Read_Latency_tip L2-Fabric Write Latency: value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) unit: Cycles peak: None pop: None - tips: + tips: *L2-Fabric_Write_Latency_tip sL1D Cache Hit Rate: value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) @@ -224,31 +350,31 @@ Panel Config: peak: 100 pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - tips: + tips: *sL1D_Cache_Hit_Rate_tip sL1D Cache BW: value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) unit: GB/s peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) - tips: + tips: *sL1D_Cache_BW_tip L1I Hit Rate: value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) unit: pct peak: 100 pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - tips: + tips: *L1I_Hit_Rate_tip L1I BW: value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) unit: GB/s peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) - tips: + tips: *L1I_BW_tip L1I Fetch Latency: value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) unit: Cycles peak: None pop: None coll_level: SQ_IFETCH_LEVEL - tips: + tips: *L1I_Fetch_Latency_tip diff --git a/src/omniperf_soc/analysis_configs/gfx90a/0500_command-processor.yaml b/src/omniperf_soc/analysis_configs/gfx90a/0500_command-processor.yaml index b4a1f0b10..2eaf678f7 100644 --- a/src/omniperf_soc/analysis_configs/gfx90a/0500_command-processor.yaml +++ b/src/omniperf_soc/analysis_configs/gfx90a/0500_command-processor.yaml @@ -2,6 +2,18 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + CPF_Utilization: &CPF_Utilization_tip + Percent of total cycles where the CPF was busy actively doing any work. The + ratio of CPF busy cycles over total cycles counted by the CPF. 
+ CPF_Stall: &CPF_Stall_tip + Percent of CPF busy cycles where the CPF was stalled for any reason. + CPF-L2_Utilization: &CPF-L2_Utilization_tip + Percent of total cycles counted by the CPF-L2 interface where the CPF-L2 + interface was active doing any work. The ratio of CPF-L2 busy cycles over total cycles counted by the CPF-L2. + CPF-L2_Stall: &CPF-L2_Stall_tip + Percent of CPF-L2 busy cycles where the CPF-L2 interface was stalled for any reason. + CPF-UTCL1_Stall: &CPF-UTCL1_Stall_tip + Percent of CPF busy cycles where the CPF was stalled by address translation. # Define the panel properties and properties of each metric in the panel. Panel Config: @@ -27,7 +39,7 @@ Panel Config: max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) unit: pct - tips: + tips: *CPF_Utilization_tip CPF Stall: avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY != 0) else None)) @@ -36,7 +48,7 @@ Panel Config: max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY != 0) else None)) unit: pct - tips: + tips: *CPF_Stall_tip CPF-L2 Utilization: avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) @@ -45,7 +57,7 @@ Panel Config: max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) unit: pct - tips: + tips: *CPF-L2_Utilization_tip CPF-L2 Stall: avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY != 0) else None)) @@ -54,7 +66,7 @@ Panel Config: max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY != 0) else None)) unit: pct - tips: + tips: *CPF-L2_Stall_tip CPF-UTCL1 Stall: avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY != 0) else None) @@ -63,7 +75,7 @@ Panel Config: max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY != 0) else None) unit: pct - tips: + tips: *CPF-UTCL1_Stall_tip - metric_table: id: 502 diff --git a/src/omniperf_soc/analysis_configs/gfx90a/0600_shader-processor-input.yaml b/src/omniperf_soc/analysis_configs/gfx90a/0600_shader-processor-input.yaml index fde76d28f..d37ab66ed 100644 --- a/src/omniperf_soc/analysis_configs/gfx90a/0600_shader-processor-input.yaml +++ b/src/omniperf_soc/analysis_configs/gfx90a/0600_shader-processor-input.yaml @@ -2,6 +2,76 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + Workgroup Manager Utilizations: + Accelerator Utilization: &Accelerator_Utilization_tip >- + The percent of cycles in the kernel where the accelerator was actively doing any work. + Scheduler-Pipe Utilization: &Scheduler-Pipe_Utilization_tip >- + The percent of total scheduler-pipe cycles in the kernel where the scheduler-pipes were + actively doing any work. Note: this value is expected to range between 0% and 25%. + Workgroup Manager Utilization: &Workgroup_Manager_Utilization_tip >- + The percent of cycles in the kernel where the workgroup manager was actively doing any work. + Shader Engine Utilization: &Shader_Engine_Utilization_tip >- + The percent of total shader engine cycles in the kernel where any CU in a shader-engine was + actively doing any work, normalized over all shader-engines. 
Low values (e.g., << 100%) + indicate that the accelerator was not fully saturated by the kernel, or a potential + load-imbalance issue. + SIMD Utilization: &SIMD_Utilization_tip >- + The percent of total SIMD cycles in the kernel where any SIMD on a CU was actively doing any + work, summed over all CUs. Low values (less than 100%) indicate that the accelerator was not + fully saturated by the kernel, or a potential load-imbalance issue. + Dispatched Workgroups: &Dispatched_Workgroups_tip >- + The total number of workgroups forming this kernel launch. + Dispatched Wavefronts: &Dispatched_Wavefronts_tip >- + The total number of wavefronts, summed over all workgroups, forming this kernel launch. + VGPR Writes: &VGPR_Writes_tip >- + The average number of cycles spent initializing VGPRs at wave creation. + SGPR Writes: &SGPR_Writes_tip >- + The average number of cycles spent initializing SGPRs at wave creation. + Workgroup Manager - Resource Allocation: + Not-Scheduled Rate (Workgroup Manager): &Not-Scheduled_Rate_(Workgroup_Manager)_tip >- + The percent of total scheduler-pipe cycles in the kernel where a workgroup could not be + scheduled to a CU due to a bottleneck within the workgroup manager rather than a lack of a CU + or SIMD with sufficient resources. Note: this value is expected to range between 0-25%. See + note in workgroup manager description. + Not-Scheduled Rate (Scheduler-Pipe): &Not-Scheduled_Rate_(Scheduler-Pipe)_tip >- + The percent of total scheduler-pipe cycles in the kernel where a workgroup could not be + scheduled to a CU due to a bottleneck within the scheduler-pipes rather than a lack of a CU or + SIMD with sufficient resources. Note: this value is expected to range between 0-25%, see note + in workgroup manager description. + Scheduler-Pipe Stall Rate: &Scheduler-Pipe_Stall_Rate_tip >- + The percent of total scheduler-pipe cycles in the kernel where a workgroup could not be + scheduled to a CU due to occupancy limitations (like a lack of a CU or SIMD with sufficient + resources). Note: this value is expected to range between 0-25%, see note in workgroup manager + description. + Scratch Stall Rate: &Scratch_Stall_Rate_tip >- + The percent of total shader-engine cycles in the kernel where a workgroup could not be + scheduled to a CU due to lack of private (a.k.a., scratch) memory slots. While this can reach + up to 100%, note that the actual occupancy limitations on a kernel using private memory are + typically quite small (for example, less than 1% of the total number of waves that can be + scheduled to an accelerator). + Insufficient SIMD Waveslots: &Insufficient_SIMD_Waveslots_tip >- + The percent of total SIMD cycles in the kernel where a workgroup could not be scheduled to a + SIMD due to lack of available waveslots. + Insufficient SIMD VGPRs: &Insufficient_SIMD_VGPRs_tip >- + The percent of total SIMD cycles in the kernel where a workgroup could not be scheduled to a + SIMD due to lack of available VGPRs. + Insufficient SIMD SGPRs: &Insufficient_SIMD_SGPRs_tip >- + The percent of total SIMD cycles in the kernel where a workgroup could not be scheduled to a + SIMD due to lack of available SGPRs. + Insufficient CU LDS: &Insufficient_CU_LDS_tip >- + The percent of total CU cycles in the kernel where a workgroup could not be scheduled to a CU + due to lack of available LDS. + Insufficient CU Barriers: &Insufficient_CU_Barriers_tip >- + The percent of total CU cycles in the kernel where a workgroup could not be scheduled to a CU + due to lack of available barriers. 
+ Reached CU Workgroup Limit: &Reached_CU_Workgroup_Limit_tip >- + The percent of total CU cycles in the kernel where a workgroup could not be scheduled to a CU + due to limits within the workgroup manager. This is expected to always be zero on CDNA2 or + newer accelerators (and small for previous accelerators). + Reached CU Wavefront Limit: &Reached_CU_Wavefront_Limit_tip >- + The percent of total CU cycles in the kernel where a wavefront could not be scheduled to a CU + due to limits within the workgroup manager. This is expected to always be zero on CDNA2 or + newer accelerators (and small for previous accelerators). # Define the panel properties and properties of each metric in the panel. Panel Config: @@ -24,43 +94,43 @@ Panel Config: min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) unit: Pct - tips: + tips: *Accelerator_Utilization_tip Scheduler-Pipe Utilization: avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) unit: Pct - tips: + tips: *Scheduler-Pipe_Utilization_tip Workgroup Manager Utilization: avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) unit: Pct - tips: + tips: *Workgroup_Manager_Utilization_tip Shader Engine Utilization: avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) unit: Pct - tips: + tips: *Shader_Engine_Utilization_tip SIMD Utilization: avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) unit: Pct - tips: + tips: *SIMD_Utilization_tip Dispatched Workgroups: avg: AVG(SPI_CSN_NUM_THREADGROUPS) min: MIN(SPI_CSN_NUM_THREADGROUPS) max: MAX(SPI_CSN_NUM_THREADGROUPS) unit: Workgroups - tips: + tips: *Dispatched_Workgroups_tip Dispatched Wavefronts: avg: AVG(SPI_CSN_WAVE) min: MIN(SPI_CSN_WAVE) max: MAX(SPI_CSN_WAVE) unit: Wavefronts - tips: + tips: *Dispatched_Wavefronts_tip VGPR Writes: avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else None)) @@ -69,7 +139,7 @@ Panel Config: max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else None)) unit: Cycles/wave - tips: + tips: *VGPR_Writes_tip SGPR Writes: avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else None)) @@ -78,7 +148,7 @@ Panel Config: max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else None)) unit: Cycles/wave - tips: + tips: *SGPR_Writes_tip - metric_table: id: 602 title: Workgroup Manager - Resource Allocation @@ -98,7 +168,7 @@ Panel Config: max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) unit: Pct - tips: + tips: *Not-Scheduled_Rate_(Workgroup_Manager)_tip Not-scheduled Rate (Scheduler-Pipe): avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) @@ -107,7 +177,7 @@ Panel Config: max:
MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) unit: Pct - tips: + tips: *Not-Scheduled_Rate_(Scheduler-Pipe)_tip Scheduler-Pipe Stall Rate: avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) @@ -116,52 +186,52 @@ Panel Config: max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) unit: Pct - tips: + tips: *Scheduler-Pipe_Stall_Rate_tip Scratch Stall Rate: avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) unit: Pct - tips: + tips: *Scratch_Stall_Rate_tip Insufficient SIMD Waveslots: avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) unit: Pct - tips: + tips: *Insufficient_SIMD_Waveslots_tip Insufficient SIMD VGPRs: avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) unit: Pct - tips: + tips: *Insufficient_SIMD_VGPRs_tip Insufficient SIMD SGPRs: avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) unit: Pct - tips: + tips: *Insufficient_SIMD_SGPRs_tip Insufficient CU LDS: avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) unit: Pct - tips: + tips: *Insufficient_CU_LDS_tip Insufficient CU Barriers: avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) unit: Pct - tips: + tips: *Insufficient_CU_Barriers_tip Reached CU Workgroup Limit: avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) unit: Pct - tips: + tips: *Reached_CU_Workgroup_Limit_tip Reached CU Wavefront Limit: avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) unit: Pct - tips: + tips: *Reached_CU_Wavefront_Limit_tip diff --git a/src/omniperf_soc/analysis_configs/gfx90a/0700_wavefront-launch.yaml b/src/omniperf_soc/analysis_configs/gfx90a/0700_wavefront-launch.yaml index bd761f948..0cdc7f803 100644 --- 
a/src/omniperf_soc/analysis_configs/gfx90a/0700_wavefront-launch.yaml +++ b/src/omniperf_soc/analysis_configs/gfx90a/0700_wavefront-launch.yaml @@ -2,6 +2,81 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + Wavefront Launch Stats: + Grid Size: &Grid_Size_tip >- + The total number of work-items (or, threads) launched as a part of the kernel dispatch. In + HIP, this is equivalent to the total grid size multiplied by the total workgroup (or, block) + size. + Workgroup Size: &Workgroup_Size_tip >- + The total number of work-items (or, threads) in each workgroup (or, block) launched as part of + the kernel dispatch. In HIP, this is equivalent to the total block size. + Total Wavefronts: &Total_Wavefronts_tip >- + The total number of wavefronts launched as part of the kernel dispatch. On AMD Instinct™ CDNA™ + accelerators and GCN™ GPUs, the wavefront size is always 64 work-items. Thus, the total + number of wavefronts should be equivalent to the ceiling of grid size divided by 64. + Saved Wavefronts: &Saved_Wavefronts_tip >- + The total number of wavefronts saved at a context-save. + Restored Wavefronts: &Restored_Wavefronts_tip >- + The total number of wavefronts restored from a context-save. + VGPRs: &VGPRs_tip >- + The number of architected vector general-purpose registers allocated for the kernel, see VALU. + Note: this may not exactly match the number of VGPRs requested by the compiler due to + allocation granularity. + AGPRs: &AGPRs_tip >- + The number of accumulation vector general-purpose registers allocated for the kernel, see + AGPRs. Note: this may not exactly match the number of AGPRs requested by the compiler due to + allocation granularity. + SGPRs: &SGPRs_tip >- + The number of scalar general-purpose registers allocated for the kernel, see SALU. Note: this + may not exactly match the number of SGPRs requested by the compiler due to allocation + granularity. + LDS Allocation: &LDS_Allocation_tip >- + The number of bytes of LDS memory (or, shared memory) allocated for this kernel. Note: This + may also be larger than what was requested at compile time due to both allocation granularity + and dynamic per-dispatch LDS allocations. + Scratch Allocation: &Scratch_Allocation_tip >- + The number of bytes of scratch memory requested per work-item for this kernel. Scratch memory + is used for stack memory on the accelerator, as well as for register spills and restores. + Wavefront Runtime Stats: + Kernel Time: &Kernel_Time_tip >- + The total duration of the executed kernel. Note: this should not be directly compared to the + wavefront cycles / timings below. + Kernel Cycles: &Kernel_Cycles_tip >- + The total duration of the executed kernel in cycles. Note: this should not be directly + compared to the wavefront cycles / timings below. + Instructions Per Wavefront: &Instructions_Per_Wavefront_tip >- + The average number of instructions (of all types) executed per wavefront. This is averaged + over all wavefronts in a kernel dispatch. + Wave Cycles: &Wave_Cycles_tip >- + The number of cycles a wavefront in the kernel dispatch spent resident on a compute unit per + normalization unit. This is averaged over all wavefronts in a kernel dispatch. Note: this + should not be directly compared to the kernel cycles above. + Dependency Wait Cycles: &Dependency_Wait_Cycles_tip >- + The number of cycles a wavefront in the kernel dispatch stalled waiting on memory of any kind + (e.g., instruction fetch, vector or scalar memory, etc.) 
per normalization unit. This counter + is incremented at every cycle by *all* wavefronts on a CU stalled at a memory operation. As + such, it is most useful to get a sense of how waves were spending their time, rather than + identification of a precise limiter because another wave could be actively executing while a + wave is stalled. The sum of this metric, Issue Wait Cycles and Active Cycles should be equal + to the total Wave Cycles metric. + Issue Wait Cycles: &Issue_Wait_Cycles_tip >- + The number of cycles a wavefront in the kernel dispatch was unable to issue an instruction for + any reason (e.g., execution pipe back-pressure, arbitration loss, etc.) per normalization + unit. This counter is incremented at every cycle by *all* wavefronts on a CU unable to issue + an instruction. As such, it is most useful to get a sense of how waves were spending their + time, rather than identification of a precise limiter because another wave could be actively + executing while a wave is issue stalled. The sum of this metric, Dependency Wait Cycles and + Active Cycles should be equal to the total Wave Cycles metric. + Active Cycles: &Active_Cycles_tip >- + The average number of cycles a wavefront in the kernel dispatch was actively executing + instructions per normalization unit. This measurement is made on a per-wavefront basis, and + may include cycles that another wavefront spent actively executing (on another execution unit, + for example) or was stalled. As such, it is most useful to get a sense of how waves were + spending their time, rather than identification of a precise limiter. The sum of this metric, + Issue Wait Cycles and Dependency Wait Cycles should be equal to the total Wave Cycles metric. + Wavefront Occupancy: &Wavefront_Occupancy_tip >- + The time-averaged number of wavefronts resident on the accelerator over the lifetime of the + kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). # Define the panel properties and properties of each metric in the panel.
Panel Config: @@ -24,61 +99,61 @@ Panel Config: min: MIN(Grid_Size) max: MAX(Grid_Size) unit: Work Items - tips: + tips: *Grid_Size_tip Workgroup Size: avg: AVG(Workgroup_Size) min: MIN(Workgroup_Size) max: MAX(Workgroup_Size) unit: Work Items - tips: + tips: *Workgroup_Size_tip Total Wavefronts: avg: AVG(SPI_CSN_WAVE) min: MIN(SPI_CSN_WAVE) max: MAX(SPI_CSN_WAVE) unit: Wavefronts - tips: + tips: *Total_Wavefronts_tip Saved Wavefronts: avg: AVG(SQ_WAVES_SAVED) min: MIN(SQ_WAVES_SAVED) max: MAX(SQ_WAVES_SAVED) unit: Wavefronts - tips: + tips: *Saved_Wavefronts_tip Restored Wavefronts: avg: AVG(SQ_WAVES_RESTORED) min: MIN(SQ_WAVES_RESTORED) max: MAX(SQ_WAVES_RESTORED) unit: Wavefronts - tips: + tips: *Restored_Wavefronts_tip VGPRs: avg: AVG(Arch_VGPR) min: MIN(Arch_VGPR) max: MAX(Arch_VGPR) unit: Registers - tips: + tips: *VGPRs_tip AGPRs: avg: AVG(Accum_VGPR) min: MIN(Accum_VGPR) max: MAX(Accum_VGPR) unit: Registers - tips: + tips: *AGPRs_tip SGPRs: avg: AVG(SGPR) min: MIN(SGPR) max: MAX(SGPR) unit: Registers - tips: + tips: *SGPRs_tip LDS Allocation: avg: AVG(LDS_Per_Workgroup) min: MIN(LDS_Per_Workgroup) max: MAX(LDS_Per_Workgroup) unit: Bytes - tips: + tips: *LDS_Allocation_tip Scratch Allocation: avg: AVG(Scratch_Per_Workitem) min: MIN(Scratch_Per_Workitem) max: MAX(Scratch_Per_Workitem) unit: Bytes/Workitem - tips: + tips: *Scratch_Allocation_tip - metric_table: id: 702 @@ -96,47 +171,47 @@ Panel Config: min: MIN((End_Timestamp - Start_Timestamp)) max: MAX((End_Timestamp - Start_Timestamp)) unit: ns - tips: + tips: *Kernel_Time_tip Kernel Time (Cycles): avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) min: MIN($GRBM_GUI_ACTIVE_PER_XCD) max: MAX($GRBM_GUI_ACTIVE_PER_XCD) unit: Cycle - tips: + tips: *Kernel_Cycles_tip Instructions per wavefront: avg: AVG((SQ_INSTS / SQ_WAVES)) min: MIN((SQ_INSTS / SQ_WAVES)) max: MAX((SQ_INSTS / SQ_WAVES)) unit: Instr/wavefront - tips: + tips: *Instructions_Per_Wavefront_tip Wave Cycles: avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *Wave_Cycles_tip Dependency Wait Cycles: avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) min: MIN(((4 * SQ_WAIT_ANY) / $denom)) max: MAX(((4 * SQ_WAIT_ANY) / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *Dependency_Wait_Cycles_tip Issue Wait Cycles: avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *Issue_Wait_Cycles_tip Active Cycles: avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *Active_Cycles_tip Wavefront Occupancy: avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) unit: Wavefronts coll_level: SQ_LEVEL_WAVES - tips: + tips: *Wavefront_Occupancy_tip diff --git a/src/omniperf_soc/analysis_configs/gfx90a/1000_compute-unit-instruction-mix.yaml b/src/omniperf_soc/analysis_configs/gfx90a/1000_compute-unit-instruction-mix.yaml index 436a220b9..db84cc703 100644 --- a/src/omniperf_soc/analysis_configs/gfx90a/1000_compute-unit-instruction-mix.yaml +++ b/src/omniperf_soc/analysis_configs/gfx90a/1000_compute-unit-instruction-mix.yaml @@ -2,6 +2,37 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. 
Metric Description: + Overall Instruction Mix: + VALU: &VALU_tip >- + VMEM: &VMEM_tip >- + LDS: &LDS_tip >- + MFMA: &MFMA_tip >- + SALU: &SALU_tip >- + SMEM: &SMEM_tip >- + Branch: &Branch_tip >- + VALU Arithmetic Instruction Mix: + INT32: &INT32_tip >- + INT64: &INT64_tip >- + F16-ADD: &F16-ADD_tip >- + F16-MUL: &F16-MUL_tip >- + F16-FMA: &F16-FMA_tip >- + F16-Trans: &F16-Trans_tip >- + F32-ADD: &F32-ADD_tip >- + F32-MUL: &F32-MUL_tip >- + F32-FMA: &F32-FMA_tip >- + F32-Trans: &F32-Trans_tip >- + F64-ADD: &F64-ADD_tip >- + F64-MUL: &F64-MUL_tip >- + F64-FMA: &F64-FMA_tip >- + F64-Trans: &F64-Trans_tip >- + Conversion: &Conversion_tip >- + VMEM Instruction Mix: + MFMA Arithmetic Instruction Mix: + MFMA-I8: &MFMA-I8_tip >- + MFMA-F16: &MFMA-F16_tip >- + MFMA-BF16: &MFMA-BF16_tip >- + MFMA-F32: &MFMA-F32_tip >- + MFMA-F64: &MFMA-F64_tip >- # Define the panel properties and properties of each metric in the panel. Panel Config: diff --git a/src/omniperf_soc/analysis_configs/gfx90a/1100_compute-unit-compute-pipeline.yaml b/src/omniperf_soc/analysis_configs/gfx90a/1100_compute-unit-compute-pipeline.yaml index bda6f5cc9..54eda7ead 100644 --- a/src/omniperf_soc/analysis_configs/gfx90a/1100_compute-unit-compute-pipeline.yaml +++ b/src/omniperf_soc/analysis_configs/gfx90a/1100_compute-unit-compute-pipeline.yaml @@ -2,6 +2,104 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + Speed-of-Light: + VALU FLOPs: &VALU_FLOPs_tip >- + The total floating-point operations executed per second on the VALU. This is also presented as + a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this + does not include any floating-point operations from MFMA instructions. + VALU IOPs: &VALU_IOPs_tip >- + The total integer operations executed per second on the VALU. This is also presented as a + percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does + not include any integer operations from MFMA instructions. + MFMA FLOPs (BF16): &MFMA_FLOPs_(BF16)_tip >- + The total number of 16-bit brain floating point MFMA operations executed per second. Note: + this does not include any 16-bit brain floating point operations from VALU instructions. This + is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the + specific accelerator. + MFMA FLOPs (F16): &MFMA_FLOPs_(F16)_tip >- + The total number of 16-bit floating point MFMA operations executed per second. Note: this does + not include any 16-bit floating point operations from VALU instructions. This is also + presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific + accelerator. + MFMA FLOPs (F32): &MFMA_FLOPs_(F32)_tip >- + The total number of 32-bit floating point MFMA operations executed per second. Note: this does + not include any 32-bit floating point operations from VALU instructions. This is also + presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific + accelerator. + MFMA FLOPs (F64): &MFMA_FLOPs_(F64)_tip >- + The total number of 64-bit floating point MFMA operations executed per second. Note: this does + not include any 64-bit floating point operations from VALU instructions. This is also + presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific + accelerator. + MFMA IOPs (INT8): &MFMA_IOPs_(INT8)_tip >- + The total number of 8-bit integer MFMA operations executed per second. 
Note: this does not + include any 8-bit integer operations from VALU instructions. This is also presented as a + percent of the peak theoretical INT8 MFMA operations achievable on the specific accelerator. + Pipeline Stats: + IPC: &IPC_tip >- + The ratio of the total number of instructions executed on the CU over the total active CU + cycles. + IPC (Issued): &IPC_(Issued)_tip >- + The ratio of the total number of (non-internal) instructions issued over the number of cycles + where the scheduler was actively working on issuing instructions. + SALU Utilization: &SALU_Utilization_tip >- + Indicates what percent of the kernel's duration the SALU was busy executing instructions. + Computed as the ratio of the total number of cycles spent by the scheduler issuing SALU / SMEM + instructions over the total CU cycles. + VALU Utilization: &VALU_Utilization_tip >- + Indicates what percent of the kernel's duration the VALU was busy executing instructions. Does + not include VMEM operations. Computed as the ratio of the total number of cycles spent by the + scheduler issuing VALU instructions over the total CU cycles. + VMEM Utilization: &VMEM_Utilization_tip >- + Indicates what percent of the kernel's duration the VMEM unit was busy executing instructions, + including both global/generic and spill/scratch operations (see the VMEM instruction count + metrics for more detail). Does not include VALU operations. Computed as the ratio of the total + number of cycles spent by the scheduler issuing VMEM instructions over the total CU cycles. + Branch Utilization: &Branch_Utilization_tip >- + Indicates what percent of the kernel's duration the branch unit was busy executing + instructions. Computed as the ratio of the total number of cycles spent by the scheduler + issuing branch instructions over the total CU cycles. + VALU Active Threads: &VALU_Active_Threads_tip >- + Indicates the average level of divergence within a wavefront over the lifetime of the kernel. + The number of work-items that were active in a wavefront during execution of each VALU + instruction, time-averaged over all VALU instructions run on all wavefronts in the kernel. + MFMA Utilization: &MFMA_Utilization_tip >- + Indicates what percent of the kernel's duration the MFMA unit was busy executing instructions. + Computed as the ratio of the total number of cycles the MFMA unit was busy over the total + CU cycles. + MFMA Instruction Cycles: &MFMA_Instruction_Cycles_tip >- + The average duration of MFMA instructions in this kernel in cycles. Computed as the ratio of + the total number of cycles the MFMA unit was busy over the total number of MFMA instructions. + VMEM Latency: &VMEM_Latency_tip >- + The average number of round-trip cycles (that is, from issue to data return / acknowledgment) + required for a VMEM instruction to complete. + SMEM Latency: &SMEM_Latency_tip >- + The average number of round-trip cycles (that is, from issue to data return / acknowledgment) + required for a SMEM instruction to complete. + Arithmetic Operations: + FLOPs (Total): &FLOPs_(Total)_tip >- + The total number of floating-point operations executed on either the VALU or MFMA units, per + normalization unit. + IOPs (Total): &IOPs_(Total)_tip >- + The total number of integer operations executed on either the VALU or MFMA units, per + normalization unit. + F16 OPs: &F16_OPs_tip >- + The total number of 16-bit floating-point operations executed on either the VALU or MFMA + units, per normalization unit.
+ BF16 OPs: &BF16_OPs_tip >- + The total number of 16-bit brain floating-point operations executed on either the VALU or MFMA + units, per normalization unit. Note: on current CDNA accelerators, the VALU has no native BF16 + instructions. + F32 OPs: &F32_OPs_tip >- + The total number of 32-bit floating-point operations executed on either the VALU or MFMA + units, per normalization unit. + F64 OPs: &F64_OPs_tip >- + The total number of 64-bit floating-point operations executed on either the VALU or MFMA + units, per normalization unit. + INT8 OPs: &INT8_OPs_tip >- + The total number of 8-bit integer operations executed on either the VALU or MFMA units, per + normalization unit. Note: on current CDNA accelerators, the VALU has no native INT8 + instructions. # Define the panel properties and properties of each metric in the panel. Panel Config: @@ -257,4 +355,4 @@ Panel Config: min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) unit: (OPs + $normUnit) - tips: \ No newline at end of file + tips:
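
Suggested local check (reviewer sketch, not part of the patch): this change wires dozens of YAML anchors to tips: aliases and adds a Jinja template that the new datatemplate:yaml directive renders at docs-build time, so a mistyped anchor or a template/key mismatch only shows up when the configs are actually loaded. The Python sketch below assumes it is run from the repository root with PyYAML and Jinja2 available; the script name and helper functions are hypothetical and simply mirror the paths touched in this diff.

# verify_metric_tips.py -- illustrative smoke test for this change (not part of the repository).
# An alias that points at a missing or misspelled anchor makes PyYAML fail while composing the
# document, so loading every analysis config catches that class of mistake.
import glob
import sys

import yaml
from jinja2 import Environment, FileSystemLoader

CONFIG_GLOB = "src/omniperf_soc/analysis_configs/gfx90a/*.yaml"
TEMPLATE_DIR = "docs/templates"
TEMPLATE_NAME = "performance-metric-table.tmpl"


def configs_parse():
    """Return True when every analysis config still loads, i.e. all anchors/aliases resolve."""
    ok = True
    for path in sorted(glob.glob(CONFIG_GLOB)):
        try:
            with open(path) as fh:
                yaml.safe_load(fh)
        except yaml.YAMLError as err:
            print(f"FAIL {path}\n  {err}")
            ok = False
    return ok


def preview(config_path):
    """Render the new template against one config to eyeball the generated reST."""
    env = Environment(loader=FileSystemLoader(TEMPLATE_DIR))
    with open(config_path) as fh:
        data = yaml.safe_load(fh)
    print(env.get_template(TEMPLATE_NAME).render(data=data))


if __name__ == "__main__":
    if not configs_parse():
        sys.exit(1)
    # Preview the table that docs/conceptual/command-processor.rst now pulls in.
    preview("src/omniperf_soc/analysis_configs/gfx90a/0500_command-processor.yaml")

A broken alias surfaces as a yaml ComposerError on load, and the preview prints the reST that the command-processor page will embed, which is a quick way to sanity-check the generated table before running the full Sphinx build.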