diff --git a/docs/conceptual/command-processor.rst b/docs/conceptual/command-processor.rst index a055768a1..a43ea884b 100644 --- a/docs/conceptual/command-processor.rst +++ b/docs/conceptual/command-processor.rst @@ -46,6 +46,9 @@ processor’s metrics therefore are focused on reporting, for example: Command processor fetcher (CPF) =============================== +.. datatemplate:yaml:: ../../src/omniperf_soc/analysis_configs/gfx90a/0500_command-processor.yaml + :template: ../templates/performance-metric-table.tmpl + .. list-table:: :header-rows: 1 diff --git a/docs/conf.py b/docs/conf.py index f74f95ecd..4bc554069 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -49,6 +49,7 @@ html_static_path = ["sphinx/static/css"] html_css_files = ["o_custom.css"] +templates_path = ["templates"] external_toc_path = "./sphinx/_toc.yml" external_projects_current_project = "omniperf" diff --git a/docs/templates/performance-metric-table.tmpl b/docs/templates/performance-metric-table.tmpl new file mode 100644 index 000000000..b5dfe1682 --- /dev/null +++ b/docs/templates/performance-metric-table.tmpl @@ -0,0 +1,26 @@ +.. -*- mode: rst -*- + +{% for data_source in data["Panel Config"]["data source"] %} + +## Anchor for internal linking. +## For example, :ref:`command-processor-fetcher-metrics`. +.. _{{data_source.metric_table.title|lower|replace(" ", "-") }}-metrics: + +{{ data_source.metric_table.title }} +------------------------------------------------------------------------------- + +.. list-table:: + :header-rows: 1 + + * - Metric + - Description + - Unit + + {% for name, desc in data_source.metric_table.metric.items() %} + * - {{ name }} + - {{ desc.tips }} + - {{ desc.unit }} + + {% endfor %} +{% endfor %} + diff --git a/src/omniperf_soc/analysis_configs/gfx90a/0200_system-speed-of-light.yaml b/src/omniperf_soc/analysis_configs/gfx90a/0200_system-speed-of-light.yaml index 2deb5c7cc..c25c9f8ed 100644 --- a/src/omniperf_soc/analysis_configs/gfx90a/0200_system-speed-of-light.yaml +++ b/src/omniperf_soc/analysis_configs/gfx90a/0200_system-speed-of-light.yaml @@ -2,7 +2,133 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: - SALU: &SALU_anchor Scalar Arithmetic Logic Unit + Speed-of-Light: + VALU FLOPs: &VALU_FLOPs_tip >- + The total floating-point operations executed per second on the VALU. This is also presented as + a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this + does not include any floating-point operations from MFMA instructions. + VALU IOPs: &VALU_IOPs_tip >- + The total integer operations executed per second on the VALU. This is also presented as a + percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does + not include any integer operations from MFMA instructions. + MFMA FLOPs (BF16): &MFMA_FLOPs_(BF16)_tip >- + The total number of 16-bit brain floating point MFMA operations executed per second. Note: + this does not include any 16-bit brain floating point operations from VALU instructions. This + is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the + specific accelerator. + MFMA FLOPs (F16): &MFMA_FLOPs_(F16)_tip >- + The total number of 16-bit floating point MFMA operations executed per second. Note: this does + not include any 16-bit floating point operations from VALU instructions. This is also + presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific + accelerator. 
+ MFMA FLOPs (F32): &MFMA_FLOPs_(F32)_tip >- + The total number of 32-bit floating point MFMA operations executed per second. Note: this does + not include any 32-bit floating point operations from VALU instructions. This is also + presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific + accelerator. + MFMA FLOPs (F64): &MFMA_FLOPs_(F64)_tip >- + The total number of 64-bit floating point MFMA operations executed per second. Note: this does + not include any 64-bit floating point operations from VALU instructions. This is also + presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific + accelerator. + MFMA IOPs (INT8): &MFMA_IOPs_(INT8)_tip >- + The total number of 8-bit integer MFMA operations executed per second. Note: this does not + include any 8-bit integer operations from VALU instructions. This is also presented as a + percent of the peak theoretical INT8 MFMA operations achievable on the specific accelerator. + Active CUs: &Active_CUs_tip >- + SALU Utilization: &SALU_Utilization_tip >- + Indicates what percent of the kernel's duration the SALU was busy executing instructions. + Computed as the ratio of the total number of cycles spent by the scheduler issuing SALU or + SMEM instructions over the total CU cycles. + VALU Utilization: &VALU_Utilization_tip >- + Indicates what percent of the kernel's duration the VALU was busy executing instructions. Does + not include VMEM operations. Computed as the ratio of the total number of cycles spent by the + scheduler issuing VALU instructions over the total CU cycles. + MFMA Utilization: &MFMA_Utilization_tip >- + Indicates what percent of the kernel's duration the MFMA unit was busy executing instructions. + Computed as the ratio of the total number of cycles the MFMA was busy over the total CU + cycles. + VMEM Utilization: &VMEM_Utilization_tip >- + Indicates what percent of the kernel's duration the VMEM unit was busy executing instructions, + including both global/generic and spill/scratch operations (see the VMEM instruction count + metrics for more detail). Does not include VALU operations. Computed as the ratio of the + total number of cycles spent by the scheduler issuing VMEM instructions over the total CU + cycles. + Branch Utilization: &Branch_Utilization_tip >- + Indicates what percent of the kernel's duration the branch unit was busy executing + instructions. Computed as the ratio of the total number of cycles spent by the scheduler + issuing branch instructions over the total CU cycles. + VALU Active Threads: &VALU_Active_Threads_tip >- + Indicates the average level of divergence within a wavefront over the lifetime of the kernel. + The number of work-items that were active in a wavefront during execution of each VALU + instruction, time-averaged over all VALU instructions run on all wavefronts in the kernel. + IPC: &IPC_tip >- + The ratio of the total number of instructions executed on the CU over the total active CU + cycles. This is also presented as a percent of the peak theoretical bandwidth achievable on + the specific accelerator. + Wavefront Occupancy: &Wavefront_Occupancy_tip >- + The time-averaged number of wavefronts resident on the accelerator over the lifetime of the + kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). This is + also presented as a percent of the peak theoretical occupancy achievable on the specific + accelerator.
+ Theoretical LDS Bandwidth: &Theoretical_LDS_Bandwidth_tip >- + Indicates the maximum number of bytes that could have been loaded from, stored to, or + atomically updated in the LDS per unit time (see LDS Bandwidth example for more detail). This + is also presented as a percent of the peak theoretical LDS bandwidth achievable on the + specific accelerator. + LDS Bank Conflicts/Access: &LDS_Bank_Conflicts/Access_tip >- + The ratio of the number of cycles spent in the LDS scheduler due to bank conflicts (as + determined by the conflict resolution hardware) to the base number of cycles that would be + spent in the LDS scheduler in a completely uncontended case. This is also presented in + normalized form (i.e., the Bank Conflict Rate). + vL1D Cache Hit Rate: &vL1D_Cache_Hit_Rate_tip >- + The ratio of the number of vL1D cache line requests that hit in vL1D cache over the total + number of cache line requests to the vL1D cache RAM. + vL1D Cache BW: &vL1D_Cache_BW_tip >- + The number of bytes looked up in the vL1D cache as a result of VMEM instructions per unit + time. The number of bytes is calculated as the number of cache lines requested multiplied by + the cache line size. This value does not consider partial requests, so e.g., if only a single + value is requested in a cache line, the data movement will still be counted as a full cache + line. This is also presented as a percent of the peak theoretical bandwidth achievable on the + specific accelerator. + L2 Cache Hit Rate: &L2_Cache_Hit_Rate_tip >- + The ratio of the number of L2 cache line requests that hit in the L2 cache over the total + number of incoming cache line requests to the L2 cache. + L2 Cache BW: &L2_Cache_BW_tip >- + The number of bytes looked up in the L2 cache per unit time. The number of bytes is + calculated as the number of cache lines requested multiplied by the cache line size. This + value does not consider partial requests, so e.g., if only a single value is requested in a + cache line, the data movement will still be counted as a full cache line. This is also + presented as a percent of the peak theoretical bandwidth achievable on the specific + accelerator. + L2-Fabric Read BW: &L2-Fabric_Read_BW_tip >- + The number of bytes read by the L2 over the Infinity Fabric™ interface per unit time. This is + also presented as a percent of the peak theoretical bandwidth achievable on the specific + accelerator. + L2-Fabric Write BW: &L2-Fabric_Write_BW_tip >- + The number of bytes sent by the L2 over the Infinity Fabric interface by write and atomic + operations per unit time. This is also presented as a percent of the peak theoretical + bandwidth achievable on the specific accelerator. + L2-Fabric Read Latency: &L2-Fabric_Read_Latency_tip >- + The time-averaged number of cycles read requests spent in Infinity Fabric before data was + returned to the L2. + L2-Fabric Write Latency: &L2-Fabric_Write_Latency_tip >- + The time-averaged number of cycles write requests spent in Infinity Fabric before a completion + acknowledgement was returned to the L2. + sL1D Cache Hit Rate: &sL1D_Cache_Hit_Rate_tip >- + The percent of sL1D requests that hit on a previously loaded line in the cache. Calculated as the + ratio of the number of sL1D requests that hit over the number of all sL1D requests. + sL1D Cache BW: &sL1D_Cache_BW_tip >- + The number of bytes looked up in the sL1D cache per unit time. This is also presented as a + percent of the peak theoretical bandwidth achievable on the specific accelerator.
+ L1I Hit Rate: &L1I_Hit_Rate_tip >- + The percent of L1I requests that hit on a previously loaded line in the cache. Calculated as the + ratio of the number of L1I requests that hit over the number of all L1I requests. + L1I BW: &L1I_BW_tip >- + The number of bytes looked up in the L1I cache per unit time. This is also presented as a + percent of the peak theoretical bandwidth achievable on the specific accelerator. + L1I Fetch Latency: &L1I_Fetch_Latency_tip >- + The average number of cycles spent to fetch instructions to a CU. # Define the panel properties and properties of each metric in the panel. Panel Config: @@ -34,67 +160,67 @@ Panel Config: + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - tips: + tips: *VALU_FLOPs_tip VALU IOPs: value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp))) unit: GIOP peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - tips: + tips: *VALU_IOPs_tip MFMA FLOPs (BF16): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - tips: + tips: *MFMA_FLOPs_(BF16)_tip MFMA FLOPs (F16): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - tips: + tips: *MFMA_FLOPs_(F16)_tip MFMA FLOPs (F32): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: + tips: *MFMA_FLOPs_(F32)_tip MFMA FLOPs (F64): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - tips: + tips: *MFMA_FLOPs_(F64)_tip MFMA IOPs (Int8): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) unit: GIOP peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - tips: + tips: *MFMA_IOPs_(INT8)_tip Active CUs: value: $numActiveCUs unit: CUs peak: $cu_per_gpu pop: ((100 * $numActiveCUs) / $cu_per_gpu) - tips: + tips: *Active_CUs_tip SALU Utilization: value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct peak: 100 pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - tips: + tips: *SALU_Utilization_tip VALU Utilization: value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) unit: pct peak: 100 pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
$cu_per_gpu))) - tips: + tips: *VALU_Utilization_tip MFMA Utilization: value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) * 4))) @@ -102,19 +228,19 @@ Panel Config: peak: 100 pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) * 4))) - tips: + tips: *MFMA_Utilization_tip VMEM Utilization: value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) unit: pct peak: 100 pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - tips: + tips: *VMEM_Utilization_tip Branch Utilization: value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) unit: pct peak: 100 pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - tips: + tips: *Branch_Utilization_tip VALU Active Threads: value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) @@ -122,13 +248,13 @@ Panel Config: peak: 64 pop: (AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU != 0) else None)) * 1.5625) - tips: + tips: *VALU_Active_Threads_tip IPC: value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) unit: Instr/cycle peak: 5 pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) - tips: + tips: *IPC_tip Wavefront Occupancy: value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) unit: Wavefronts @@ -136,7 +262,7 @@ Panel Config: pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu * $cu_per_gpu)))) coll_level: SQ_LEVEL_WAVES - tips: + tips: *Wavefront_Occupancy_tip Theoretical LDS Bandwidth: value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) / (End_Timestamp - Start_Timestamp))) @@ -144,7 +270,7 @@ Panel Config: peak: (($max_sclk * $cu_per_gpu) * 0.128) pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128))) - tips: + tips: *Theoretical_LDS_Bandwidth_tip LDS Bank Conflicts/Access: value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) @@ -152,7 +278,7 @@ Panel Config: peak: 32 pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / 32) - tips: + tips: *LDS_Bank_Conflicts/Access_tip vL1D Cache Hit Rate: value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) @@ -164,14 +290,14 @@ Panel Config: TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != 0) else None)) - tips: + tips: *vL1D_Cache_Hit_Rate_tip vL1D Cache BW: value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp))) unit: GB/s peak: ((($max_sclk / 1000) * 64) * $cu_per_gpu) pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu)) - tips: + tips: *vL1D_Cache_BW_tip L2 Cache Hit Rate: value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else None)) @@ -179,14 +305,14 @@ Panel Config: peak: 100 pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 
None)) - tips: + tips: *L2_Cache_Hit_Rate_tip L2 Cache BW: value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) unit: GB/s peak: ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)) pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))) - tips: + tips: *L2_Cache_BW_tip L2-Fabric Read BW: value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp))) @@ -194,7 +320,7 @@ Panel Config: peak: $hbm_bw pop: ((100 * AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp)))) / $hbm_bw) - tips: + tips: *L2-Fabric_Read_BW_tip L2-Fabric Write BW: value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) @@ -202,21 +328,21 @@ Panel Config: peak: $hbm_bw pop: ((100 * AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) / $hbm_bw) - tips: + tips: *L2-Fabric_Write_BW_tip L2-Fabric Read Latency: value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum != 0) else None)) unit: Cycles peak: None pop: None - tips: + tips: *L2-Fabric_Read_Latency_tip L2-Fabric Write Latency: value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum != 0) else None)) unit: Cycles peak: None pop: None - tips: + tips: *L2-Fabric_Write_Latency_tip sL1D Cache Hit Rate: value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) @@ -224,31 +350,31 @@ Panel Config: peak: 100 pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - tips: + tips: *sL1D_Cache_Hit_Rate_tip sL1D Cache BW: value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) unit: GB/s peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) - tips: + tips: *sL1D_Cache_BW_tip L1I Hit Rate: value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) unit: pct peak: 100 pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - tips: + tips: *L1I_Hit_Rate_tip L1I BW: value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) unit: GB/s peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) - tips: + tips: *L1I_BW_tip L1I Fetch Latency: value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) unit: Cycles peak: None pop: None coll_level: SQ_IFETCH_LEVEL - tips: + tips: *L1I_Fetch_Latency_tip diff --git a/src/omniperf_soc/analysis_configs/gfx90a/0500_command-processor.yaml b/src/omniperf_soc/analysis_configs/gfx90a/0500_command-processor.yaml index b4a1f0b10..2eaf678f7 100644 --- a/src/omniperf_soc/analysis_configs/gfx90a/0500_command-processor.yaml +++ b/src/omniperf_soc/analysis_configs/gfx90a/0500_command-processor.yaml @@ -2,6 +2,18 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + CPF_Utilization: &CPF_Utilization_tip + Percent of total cycles where the CPF was busy actively doing any work. The + ratio of CPF busy cycles over total cycles counted by the CPF. 
+ CPF_Stall: &CPF_Stall_tip + Percent of CPF busy cycles where the CPF was stalled for any reason. + CPF-L2_Utilization: &CPF-L2_Utilization_tip + Percent of total cycles counted by the CPF-L2 interface where the CPF-L2 + interface was active doing any work. The ratio of CPF-L2 busy cycles over total cycles counted by the CPF-L2. + CPF-L2_Stall: &CPF-L2_Stall_tip + Percent of CPF-L2 busy cycles where the CPF-L2 interface was stalled for any reason. + CPF-UTCL1_Stall: &CPF-UTCL1_Stall_tip + Percent of CPF busy cycles where the CPF was stalled by address translation. # Define the panel properties and properties of each metric in the panel. Panel Config: @@ -27,7 +39,7 @@ Panel Config: max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) unit: pct - tips: + tips: *CPF_Utilization_tip CPF Stall: avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY != 0) else None)) @@ -36,7 +48,7 @@ Panel Config: max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY != 0) else None)) unit: pct - tips: + tips: *CPF_Stall_tip CPF-L2 Utilization: avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) @@ -45,7 +57,7 @@ Panel Config: max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) unit: pct - tips: + tips: *CPF-L2_Utilization_tip CPF-L2 Stall: avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY != 0) else None)) @@ -54,7 +66,7 @@ Panel Config: max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY != 0) else None)) unit: pct - tips: + tips: *CPF-L2_Stall_tip CPF-UTCL1 Stall: avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY != 0) else None) @@ -63,7 +75,7 @@ Panel Config: max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY != 0) else None) unit: pct - tips: + tips: *CPF-UTCL1_Stall_tip - metric_table: id: 502 diff --git a/src/omniperf_soc/analysis_configs/gfx90a/0600_shader-processor-input.yaml b/src/omniperf_soc/analysis_configs/gfx90a/0600_shader-processor-input.yaml index fde76d28f..d37ab66ed 100644 --- a/src/omniperf_soc/analysis_configs/gfx90a/0600_shader-processor-input.yaml +++ b/src/omniperf_soc/analysis_configs/gfx90a/0600_shader-processor-input.yaml @@ -2,6 +2,76 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + Workgroup Manager Utilizations: + Accelerator Utilization: &Accelerator_Utilization_tip >- + The percent of cycles in the kernel where the accelerator was actively doing any work. + Scheduler-Pipe Utilization: &Scheduler-Pipe_Utilization_tip >- + The percent of total scheduler-pipe cycles in the kernel where the scheduler-pipes were + actively doing any work. Note: this value is expected to range between 0% and 25%. + Workgroup Manager Utilization: &Workgroup_Manager_Utilization_tip >- + The percent of cycles in the kernel where the workgroup manager was actively doing any work. + Shader Engine Utilization: &Shader_Engine_Utilization_tip >- + The percent of total shader engine cycles in the kernel where any CU in a shader-engine was + actively doing any work, normalized over all shader-engines. 
Low values (e.g., << 100%) + indicate that the accelerator was not fully saturated by the kernel, or a potential + load-imbalance issue. + SIMD Utilization: &SIMD_Utilization_tip >- + The percent of total SIMD cycles in the kernel where any SIMD on a CU was actively doing any + work, summed over all CUs. Low values (less than 100%) indicate that the accelerator was not + fully saturated by the kernel, or a potential load-imbalance issue. + Dispatched Workgroups: &Dispatched_Workgroups_tip >- + The total number of workgroups forming this kernel launch. + Dispatched Wavefronts: &Dispatched_Wavefronts_tip >- + The total number of wavefronts, summed over all workgroups, forming this kernel launch. + VGPR Writes: &VGPR_Writes_tip >- + The average number of cycles spent initializing VGPRs at wave creation. + SGPR Writes: &SGPR_Writes_tip >- + The average number of cycles spent initializing SGPRs at wave creation. + Workgroup Manager - Resource Allocation: + Not-Scheduled Rate (Workgroup Manager): &Not-Scheduled_Rate_(Workgroup_Manager)_tip >- + The percent of total scheduler-pipe cycles in the kernel where a workgroup could not be + scheduled to a CU due to a bottleneck within the workgroup manager rather than a lack of a CU + or SIMD with sufficient resources. Note: this value is expected to range between 0-25%. See + note in workgroup manager description. + Not-Scheduled Rate (Scheduler-Pipe): &Not-Scheduled_Rate_(Scheduler-Pipe)_tip >- + The percent of total scheduler-pipe cycles in the kernel where a workgroup could not be + scheduled to a CU due to a bottleneck within the scheduler-pipes rather than a lack of a CU or + SIMD with sufficient resources. Note: this value is expected to range between 0-25%, see note + in workgroup manager description. + Scheduler-Pipe Stall Rate: &Scheduler-Pipe_Stall_Rate_tip >- + The percent of total scheduler-pipe cycles in the kernel where a workgroup could not be + scheduled to a CU due to occupancy limitations (like a lack of a CU or SIMD with sufficient + resources). Note: this value is expected to range between 0-25%, see note in workgroup manager + description. + Scratch Stall Rate: &Scratch_Stall_Rate_tip >- + The percent of total shader-engine cycles in the kernel where a workgroup could not be + scheduled to a CU due to lack of private (a.k.a., scratch) memory slots. While this can reach + up to 100%, note that the actual occupancy limitations on a kernel using private memory are + typically quite small (for example, less than 1% of the total number of waves that can be + scheduled to an accelerator). + Insufficient SIMD Waveslots: &Insufficient_SIMD_Waveslots_tip >- + The percent of total SIMD cycles in the kernel where a workgroup could not be scheduled to a + SIMD due to lack of available waveslots. + Insufficient SIMD VGPRs: &Insufficient_SIMD_VGPRs_tip >- + The percent of total SIMD cycles in the kernel where a workgroup could not be scheduled to a + SIMD due to lack of available VGPRs. + Insufficient SIMD SGPRs: &Insufficient_SIMD_SGPRs_tip >- + The percent of total SIMD cycles in the kernel where a workgroup could not be scheduled to a + SIMD due to lack of available SGPRs. + Insufficient CU LDS: &Insufficient_CU_LDS_tip >- + The percent of total CU cycles in the kernel where a workgroup could not be scheduled to a CU + due to lack of available LDS. + Insufficient CU Barriers: &Insufficient_CU_Barriers_tip >- + The percent of total CU cycles in the kernel where a workgroup could not be scheduled to a CU + due to lack of available barriers. 
+ Reached CU Workgroup Limit: &Reached_CU_Workgroup_Limit_tip >- + The percent of total CU cycles in the kernel where a workgroup could not be scheduled to a CU + due to limits within the workgroup manager. This is expected to always be zero on CDNA2 or + newer accelerators (and small for previous accelerators). + Reached CU Wavefront Limit: &Reached_CU_Wavefront_Limit_tip >- + The percent of total CU cycles in the kernel where a wavefront could not be scheduled to a CU + due to limits within the workgroup manager. This is expected to always be zero on CDNA2 or + newer accelerators (and small for previous accelerators). # Define the panel properties and properties of each metric in the panel. Panel Config: @@ -24,43 +94,43 @@ Panel Config: min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) unit: Pct - tips: + tips: *Accelerator_Utilization_tip Scheduler-Pipe Utilization: avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) unit: Pct - tips: + tips: *Scheduler-Pipe_Utilization_tip Workgroup Manager Utilization: avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) unit: Pct - tips: + tips: *Workgroup_Manager_Utilization_tip Shader Engine Utilization: avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) unit: Pct - tips: + tips: *Shader_Engine_Utilization_tip SIMD Utilization: avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) unit: Pct - tips: + tips: *SIMD_Utilization_tip Dispatched Workgroups: avg: AVG(SPI_CSN_NUM_THREADGROUPS) min: MIN(SPI_CSN_NUM_THREADGROUPS) max: MAX(SPI_CSN_NUM_THREADGROUPS) unit: Workgroups - tips: + tips: *Dispatched_Workgroups_tip Dispatched Wavefronts: avg: AVG(SPI_CSN_WAVE) min: MIN(SPI_CSN_WAVE) max: MAX(SPI_CSN_WAVE) unit: Wavefronts - tips: + tips: *Dispatched_Wavefronts_tip VGPR Writes: avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else None)) @@ -69,7 +139,7 @@ Panel Config: max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else None)) unit: Cycles/wave - tips: + tips: *VGPR_Writes_tip SGPR Writes: avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else None)) @@ -78,7 +148,7 @@ Panel Config: max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) else None)) unit: Cycles/wave - tips: + tips: *SGPR_Writes_tip - metric_table: id: 602 title: Workgroup Manager - Resource Allocation @@ -98,7 +168,7 @@ Panel Config: max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) unit: Pct - tips: + tips: *Not-Scheduled_Rate_(Workgroup_Manager)_tip Not-scheduled Rate (Scheduler-Pipe): avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) @@ -107,7 +177,7 @@ Panel Config: max:
MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) unit: Pct - tips: + tips: *Not-Scheduled_Rate_(Scheduler-Pipe)_tip Scheduler-Pipe Stall Rate: avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) @@ -116,52 +186,52 @@ Panel Config: max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) unit: Pct - tips: + tips: *Scheduler-Pipe_Stall_Rate_tip Scratch Stall Rate: avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) unit: Pct - tips: + tips: *Scratch_Stall_Rate_tip Insufficient SIMD Waveslots: avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) unit: Pct - tips: + tips: *Insufficient_SIMD_Waveslots_tip Insufficient SIMD VGPRs: avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) unit: Pct - tips: + tips: *Insufficient_SIMD_VGPRs_tip Insufficient SIMD SGPRs: avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) unit: Pct - tips: + tips: *Insufficient_SIMD_SGPRs_tip Insufficient CU LDS: avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) unit: Pct - tips: + tips: *Insufficient_CU_LDS_tip Insufficient CU Barriers: avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) unit: Pct - tips: + tips: *Insufficient_CU_Barriers_tip Reached CU Workgroup Limit: avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) unit: Pct - tips: + tips: *Reached_CU_Workgroup_Limit_tip Reached CU Wavefront Limit: avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) unit: Pct - tips: + tips: *Reached_CU_Wavefront_Limit_tip diff --git a/src/omniperf_soc/analysis_configs/gfx90a/0700_wavefront-launch.yaml b/src/omniperf_soc/analysis_configs/gfx90a/0700_wavefront-launch.yaml index bd761f948..0cdc7f803 100644 --- 
a/src/omniperf_soc/analysis_configs/gfx90a/0700_wavefront-launch.yaml +++ b/src/omniperf_soc/analysis_configs/gfx90a/0700_wavefront-launch.yaml @@ -2,6 +2,81 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + Wavefront Launch Stats: + Grid Size: &Grid_Size_tip >- + The total number of work-items (or, threads) launched as a part of the kernel dispatch. In + HIP, this is equivalent to the total grid size multiplied by the total workgroup (or, block) + size. + Workgroup Size: &Workgroup_Size_tip >- + The total number of work-items (or, threads) in each workgroup (or, block) launched as part of + the kernel dispatch. In HIP, this is equivalent to the total block size. + Total Wavefronts: &Total_Wavefronts_tip >- + The total number of wavefronts launched as part of the kernel dispatch. On AMD Instinct™ CDNA™ + accelerators and GCN™ GPUs, the wavefront size is always 64 work-items. Thus, the total + number of wavefronts should be equivalent to the ceiling of grid size divided by 64. + Saved Wavefronts: &Saved_Wavefronts_tip >- + The total number of wavefronts saved at a context-save. + Restored Wavefronts: &Restored_Wavefronts_tip >- + The total number of wavefronts restored from a context-save. + VGPRs: &VGPRs_tip >- + The number of architected vector general-purpose registers allocated for the kernel, see VALU. + Note: this may not exactly match the number of VGPRs requested by the compiler due to + allocation granularity. + AGPRs: &AGPRs_tip >- + The number of accumulation vector general-purpose registers allocated for the kernel, see + AGPRs. Note: this may not exactly match the number of AGPRs requested by the compiler due to + allocation granularity. + SGPRs: &SGPRs_tip >- + The number of scalar general-purpose registers allocated for the kernel, see SALU. Note: this + may not exactly match the number of SGPRs requested by the compiler due to allocation + granularity. + LDS Allocation: &LDS_Allocation_tip >- + The number of bytes of LDS memory (or, shared memory) allocated for this kernel. Note: This + may also be larger than what was requested at compile time due to both allocation granularity + and dynamic per-dispatch LDS allocations. + Scratch Allocation: &Scratch_Allocation_tip >- + The number of bytes of scratch memory requested per work-item for this kernel. Scratch memory + is used for stack memory on the accelerator, as well as for register spills and restores. + Wavefront Runtime Stats: + Kernel Time: &Kernel_Time_tip >- + The total duration of the executed kernel. Note: this should not be directly compared to the + wavefront cycles / timings below. + Kernel Cycles: &Kernel_Cycles_tip >- + The total duration of the executed kernel in cycles. Note: this should not be directly + compared to the wavefront cycles / timings below. + Instructions Per Wavefront: &Instructions_Per_Wavefront_tip >- + The average number of instructions (of all types) executed per wavefront. This is averaged + over all wavefronts in a kernel dispatch. + Wave Cycles: &Wave_Cycles_tip >- + The number of cycles a wavefront in the kernel dispatch spent resident on a compute unit per + normalization unit. This is averaged over all wavefronts in a kernel dispatch. Note: this + should not be directly compared to the kernel cycles above. + Dependency Wait Cycles: &Dependency_Wait_Cycles_tip >- + The number of cycles a wavefront in the kernel dispatch stalled waiting on memory of any kind + (e.g., instruction fetch, vector or scalar memory, etc.) 
per normalization unit. This counter + is incremented at every cycle by *all* wavefronts on a CU stalled at a memory operation. As + such, it is most useful to get a sense of how waves were spending their time, rather than + identification of a precise limiter because another wave could be actively executing while a + wave is stalled. The sum of this metric, Issue Wait Cycles and Active Cycles should be equal + to the total Wave Cycles metric. + Issue Wait Cycles: &Issue_Wait_Cycles_tip >- + The number of cycles a wavefront in the kernel dispatch was unable to issue an instruction for + any reason (e.g., execution pipe back-pressure, arbitration loss, etc.) per normalization + unit. This counter is incremented at every cycle by *all* wavefronts on a CU unable to issue + an instruction. As such, it is most useful to get a sense of how waves were spending their + time, rather than identification of a precise limiter because another wave could be actively + executing while a wave is issue stalled. The sum of this metric, Dependency Wait Cycles and + Active Cycles should be equal to the total Wave Cycles metric. + Active Cycles: &Active_Cycles_tip >- + The average number of cycles a wavefront in the kernel dispatch was actively executing + instructions per normalization unit. This measurement is made on a per-wavefront basis, and + may include cycles that another wavefront spent actively executing (on another execution unit, + for example) or was stalled. As such, it is most useful to get a sense of how waves were + spending their time, rather than identification of a precise limiter. The sum of this metric, + Issue Wait Cycles and Dependency Wait Cycles should be equal to the total Wave Cycles metric. + Wavefront Occupancy: &Wavefront_Occupancy_tip >- + The time-averaged number of wavefronts resident on the accelerator over the lifetime of the + kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). # Define the panel properties and properties of each metric in the panel.
Panel Config: @@ -24,61 +99,61 @@ Panel Config: min: MIN(Grid_Size) max: MAX(Grid_Size) unit: Work Items - tips: + tips: *Grid_Size_tip Workgroup Size: avg: AVG(Workgroup_Size) min: MIN(Workgroup_Size) max: MAX(Workgroup_Size) unit: Work Items - tips: + tips: *Workgroup_Size_tip Total Wavefronts: avg: AVG(SPI_CSN_WAVE) min: MIN(SPI_CSN_WAVE) max: MAX(SPI_CSN_WAVE) unit: Wavefronts - tips: + tips: *Total_Wavefronts_tip Saved Wavefronts: avg: AVG(SQ_WAVES_SAVED) min: MIN(SQ_WAVES_SAVED) max: MAX(SQ_WAVES_SAVED) unit: Wavefronts - tips: + tips: *Saved_Wavefronts_tip Restored Wavefronts: avg: AVG(SQ_WAVES_RESTORED) min: MIN(SQ_WAVES_RESTORED) max: MAX(SQ_WAVES_RESTORED) unit: Wavefronts - tips: + tips: *Restored_Wavefronts_tip VGPRs: avg: AVG(Arch_VGPR) min: MIN(Arch_VGPR) max: MAX(Arch_VGPR) unit: Registers - tips: + tips: *VGPRs_tip AGPRs: avg: AVG(Accum_VGPR) min: MIN(Accum_VGPR) max: MAX(Accum_VGPR) unit: Registers - tips: + tips: *AGPRs_tip SGPRs: avg: AVG(SGPR) min: MIN(SGPR) max: MAX(SGPR) unit: Registers - tips: + tips: *SGPRs_tip LDS Allocation: avg: AVG(LDS_Per_Workgroup) min: MIN(LDS_Per_Workgroup) max: MAX(LDS_Per_Workgroup) unit: Bytes - tips: + tips: *LDS_Allocation_tip Scratch Allocation: avg: AVG(Scratch_Per_Workitem) min: MIN(Scratch_Per_Workitem) max: MAX(Scratch_Per_Workitem) unit: Bytes/Workitem - tips: + tips: *Scratch_Allocation_tip - metric_table: id: 702 @@ -96,47 +171,47 @@ Panel Config: min: MIN((End_Timestamp - Start_Timestamp)) max: MAX((End_Timestamp - Start_Timestamp)) unit: ns - tips: + tips: *Kernel_Time_tip Kernel Time (Cycles): avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) min: MIN($GRBM_GUI_ACTIVE_PER_XCD) max: MAX($GRBM_GUI_ACTIVE_PER_XCD) unit: Cycle - tips: + tips: *Kernel_Cycles_tip Instructions per wavefront: avg: AVG((SQ_INSTS / SQ_WAVES)) min: MIN((SQ_INSTS / SQ_WAVES)) max: MAX((SQ_INSTS / SQ_WAVES)) unit: Instr/wavefront - tips: + tips: *Instructions_Per_Wavefront_tip Wave Cycles: avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *Wave_Cycles_tip Dependency Wait Cycles: avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) min: MIN(((4 * SQ_WAIT_ANY) / $denom)) max: MAX(((4 * SQ_WAIT_ANY) / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *Dependency_Wait_Cycles_tip Issue Wait Cycles: avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *Issue_Wait_Cycles_tip Active Cycles: avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) unit: (Cycles + $normUnit) - tips: + tips: *Active_Cycles_tip Wavefront Occupancy: avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) unit: Wavefronts coll_level: SQ_LEVEL_WAVES - tips: + tips: *Wavefront_Occupancy_tip diff --git a/src/omniperf_soc/analysis_configs/gfx90a/1000_compute-unit-instruction-mix.yaml b/src/omniperf_soc/analysis_configs/gfx90a/1000_compute-unit-instruction-mix.yaml index 436a220b9..db84cc703 100644 --- a/src/omniperf_soc/analysis_configs/gfx90a/1000_compute-unit-instruction-mix.yaml +++ b/src/omniperf_soc/analysis_configs/gfx90a/1000_compute-unit-instruction-mix.yaml @@ -2,6 +2,37 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. 
Metric Description: + Overall Instruction Mix: + VALU: &VALU_tip >- + VMEM: &VMEM_tip >- + LDS: &LDS_tip >- + MFMA: &MFMA_tip >- + SALU: &SALU_tip >- + SMEM: &SMEM_tip >- + Branch: &Branch_tip >- + VALU Arithmetic Instruction Mix: + INT32: &INT32_tip >- + INT64: &INT64_tip >- + F16-ADD: &F16-ADD_tip >- + F16-MUL: &F16-MUL_tip >- + F16-FMA: &F16-FMA_tip >- + F16-Trans: &F16-Trans_tip >- + F32-ADD: &F32-ADD_tip >- + F32-MUL: &F32-MUL_tip >- + F32-FMA: &F32-FMA_tip >- + F32-Trans: &F32-Trans_tip >- + F64-ADD: &F64-ADD_tip >- + F64-MUL: &F64-MUL_tip >- + F64-FMA: &F64-FMA_tip >- + F64-Trans: &F64-Trans_tip >- + Conversion: &Conversion_tip >- + VMEM Instruction Mix: + MFMA Arithmetic Instruction Mix: + MFMA-I8: &MFMA-I8_tip >- + MFMA-F16: &MFMA-F16_tip >- + MFMA-BF16: &MFMA-BF16_tip >- + MFMA-F32: &MFMA-F32_tip >- + MFMA-F64: &MFMA-F64_tip >- # Define the panel properties and properties of each metric in the panel. Panel Config: diff --git a/src/omniperf_soc/analysis_configs/gfx90a/1100_compute-unit-compute-pipeline.yaml b/src/omniperf_soc/analysis_configs/gfx90a/1100_compute-unit-compute-pipeline.yaml index bda6f5cc9..54eda7ead 100644 --- a/src/omniperf_soc/analysis_configs/gfx90a/1100_compute-unit-compute-pipeline.yaml +++ b/src/omniperf_soc/analysis_configs/gfx90a/1100_compute-unit-compute-pipeline.yaml @@ -2,6 +2,104 @@ # Add description/tips for each metric in this section. # So it could be shown in hover. Metric Description: + Speed-of-Light: + VALU FLOPs: &VALU_FLOPs_tip >- + The total floating-point operations executed per second on the VALU. This is also presented as + a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this + does not include any floating-point operations from MFMA instructions. + VALU IOPs: &VALU_IOPs_tip >- + The total integer operations executed per second on the VALU. This is also presented as a + percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does + not include any integer operations from MFMA instructions. + MFMA FLOPs (BF16): &MFMA_FLOPs_(BF16)_tip >- + The total number of 16-bit brain floating point MFMA operations executed per second. Note: + this does not include any 16-bit brain floating point operations from VALU instructions. This + is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the + specific accelerator. + MFMA FLOPs (F16): &MFMA_FLOPs_(F16)_tip >- + The total number of 16-bit floating point MFMA operations executed per second. Note: this does + not include any 16-bit floating point operations from VALU instructions. This is also + presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific + accelerator. + MFMA FLOPs (F32): &MFMA_FLOPs_(F32)_tip >- + The total number of 32-bit floating point MFMA operations executed per second. Note: this does + not include any 32-bit floating point operations from VALU instructions. This is also + presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific + accelerator. + MFMA FLOPs (F64): &MFMA_FLOPs_(F64)_tip >- + The total number of 64-bit floating point MFMA operations executed per second. Note: this does + not include any 64-bit floating point operations from VALU instructions. This is also + presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific + accelerator. + MFMA IOPs (INT8): &MFMA_IOPs_(INT8)_tip >- + The total number of 8-bit integer MFMA operations executed per second. 
Note: this does not + include any 8-bit integer operations from VALU instructions. This is also presented as a + percent of the peak theoretical INT8 MFMA operations achievable on the specific accelerator. + Pipeline Stats: + IPC: &IPC_tip >- + The ratio of the total number of instructions executed on the CU over the total active CU + cycles. + IPC (Issued): &IPC_(Issued)_tip >- + The ratio of the total number of (non-internal) instructions issued over the number of cycles + where the scheduler was actively working on issuing instructions. + SALU Utilization: &SALU_Utilization_tip >- + Indicates what percent of the kernel's duration the SALU was busy executing instructions. + Computed as the ratio of the total number of cycles spent by the scheduler issuing SALU / SMEM + instructions over the total CU cycles. + VALU Utilization: &VALU_Utilization_tip >- + Indicates what percent of the kernel's duration the VALU was busy executing instructions. Does + not include VMEM operations. Computed as the ratio of the total number of cycles spent by the + scheduler issuing VALU instructions over the total CU cycles. + VMEM Utilization: &VMEM_Utilization_tip >- + Indicates what percent of the kernel's duration the VMEM unit was busy executing instructions, + including both global/generic and spill/scratch operations (see the VMEM instruction count + metrics for more detail). Does not include VALU operations. Computed as the ratio of the total + number of cycles spent by the scheduler issuing VMEM instructions over the total CU cycles. + Branch Utilization: &Branch_Utilization_tip >- + Indicates what percent of the kernel's duration the branch unit was busy executing + instructions. Computed as the ratio of the total number of cycles spent by the scheduler + issuing branch instructions over the total CU cycles. + VALU Active Threads: &VALU_Active_Threads_tip >- + Indicates the average level of divergence within a wavefront over the lifetime of the kernel. + The number of work-items that were active in a wavefront during execution of each VALU + instruction, time-averaged over all VALU instructions run on all wavefronts in the kernel. + MFMA Utilization: &MFMA_Utilization_tip >- + Indicates what percent of the kernel's duration the MFMA unit was busy executing instructions. + Computed as the ratio of the total number of cycles the MFMA unit was busy over the total + CU cycles. + MFMA Instruction Cycles: &MFMA_Instruction_Cycles_tip >- + The average duration of MFMA instructions in this kernel in cycles. Computed as the ratio of + the total number of cycles the MFMA unit was busy over the total number of MFMA instructions. + VMEM Latency: &VMEM_Latency_tip >- + The average number of round-trip cycles (that is, from issue to data return / acknowledgment) + required for a VMEM instruction to complete. + SMEM Latency: &SMEM_Latency_tip >- + The average number of round-trip cycles (that is, from issue to data return / acknowledgment) + required for a SMEM instruction to complete. + Arithmetic Operations: + FLOPs (Total): &FLOPs_(Total)_tip >- + The total number of floating-point operations executed on either the VALU or MFMA units, per + normalization unit. + IOPs (Total): &IOPs_(Total)_tip >- + The total number of integer operations executed on either the VALU or MFMA units, per + normalization unit. + F16 OPs: &F16_OPs_tip >- + The total number of 16-bit floating-point operations executed on either the VALU or MFMA + units, per normalization unit.
+ BF16 OPs: &BF16_OPs_tip >- + The total number of 16-bit brain floating-point operations executed on either the VALU or MFMA + units, per normalization unit. Note: on current CDNA accelerators, the VALU has no native BF16 + instructions. + F32 OPs: &F32_OPs_tip >- + The total number of 32-bit floating-point operations executed on either the VALU or MFMA + units, per normalization unit. + F64 OPs: &F64_OPs_tip >- + The total number of 64-bit floating-point operations executed on either the VALU or MFMA + units, per normalization unit. + INT8 OPs: &INT8_OPs_tip >- + The total number of 8-bit integer operations executed on either the VALU or MFMA units, per + normalization unit. Note: on current CDNA accelerators, the VALU has no native INT8 + instructions. # Define the panel properties and properties of each metric in the panel. Panel Config: @@ -257,4 +355,4 @@ Panel Config: min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) unit: (OPs + $normUnit) - tips: \ No newline at end of file + tips:
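
Suggested local check (reviewer sketch, not part of the patch): this change wires dozens of YAML anchors to tips: aliases and adds a Jinja template that the new datatemplate:yaml directive renders at docs-build time, so a mistyped anchor or a template/key mismatch only shows up when the configs are actually loaded. The Python sketch below assumes it is run from the repository root with PyYAML and Jinja2 available; the script name and helper functions are hypothetical and simply mirror the paths touched in this diff.

# verify_metric_tips.py -- illustrative smoke test for this change (not part of the repository).
# An alias that points at a missing or misspelled anchor makes PyYAML fail while composing the
# document, so loading every analysis config catches that class of mistake.
import glob
import sys

import yaml
from jinja2 import Environment, FileSystemLoader

CONFIG_GLOB = "src/omniperf_soc/analysis_configs/gfx90a/*.yaml"
TEMPLATE_DIR = "docs/templates"
TEMPLATE_NAME = "performance-metric-table.tmpl"


def configs_parse():
    """Return True when every analysis config still loads, i.e. all anchors/aliases resolve."""
    ok = True
    for path in sorted(glob.glob(CONFIG_GLOB)):
        try:
            with open(path) as fh:
                yaml.safe_load(fh)
        except yaml.YAMLError as err:
            print(f"FAIL {path}\n  {err}")
            ok = False
    return ok


def preview(config_path):
    """Render the new template against one config to eyeball the generated reST."""
    env = Environment(loader=FileSystemLoader(TEMPLATE_DIR))
    with open(config_path) as fh:
        data = yaml.safe_load(fh)
    print(env.get_template(TEMPLATE_NAME).render(data=data))


if __name__ == "__main__":
    if not configs_parse():
        sys.exit(1)
    # Preview the table that docs/conceptual/command-processor.rst now pulls in.
    preview("src/omniperf_soc/analysis_configs/gfx90a/0500_command-processor.yaml")

A broken alias surfaces as a yaml ComposerError on load, and the preview prints the reST that the command-processor page will embed, which is a quick way to sanity-check the generated table before running the full Sphinx build.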