Skip to content

Commit

Permalink
perf vendor events intel: Update Icelake+ metric constraints
Browse files Browse the repository at this point in the history
Avoid grouping events especially in cases where the kernel's PMU
driver fails to not open the events, causing the events to report back
as "<not counted>".

This update comes from:

  intel/perfmon#94

Fixes issues reported with patch:

  https://lore.kernel.org/lkml/20230719001836.198363-3-irogers@google.com/

Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xing Zhengjun <zhengjun.xing@linux.intel.com>
Link: https://lore.kernel.org/r/20230801053634.1142634-5-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
  • Loading branch information
captain5050 authored and acmel committed Aug 3, 2023
1 parent b691f30 commit 9a7d82c
Show file tree
Hide file tree
Showing 7 changed files with 38 additions and 24 deletions.
11 changes: 7 additions & 4 deletions tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,7 @@
},
{
"BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the Last Level Cache (LLC) or other core with HITE/F/M.",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "cpu_atom@MEM_BOUND_STALLS.LOAD_LLC_HIT@ / tma_info_core_clks - max((cpu_atom@MEM_BOUND_STALLS.LOAD@ - cpu_atom@LD_HEAD.L1_MISS_AT_RET@) / tma_info_core_clks, 0) * cpu_atom@MEM_BOUND_STALLS.LOAD_LLC_HIT@ / cpu_atom@MEM_BOUND_STALLS.LOAD@",
"MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group",
"MetricName": "tma_l3_bound",
Expand Down Expand Up @@ -800,6 +801,7 @@
},
{
"BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a store forward block.",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "LD_HEAD.ST_ADDR_AT_RET / tma_info_core_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
"MetricName": "tma_store_fwd_blk",
Expand Down Expand Up @@ -1058,7 +1060,6 @@
},
{
"BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)",
"MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector",
"MetricGroup": "HPC;TopdownL3;tma_L3_group;tma_light_operations_group",
"MetricName": "tma_fp_arith",
Expand Down Expand Up @@ -1230,6 +1231,7 @@
},
{
"BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck",
"MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
"MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL",
"MetricName": "tma_info_botlnk_l2_ic_misses",
Expand Down Expand Up @@ -1267,6 +1269,7 @@
},
{
"BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks",
"MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))",
"MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
"MetricName": "tma_info_bottleneck_memory_bandwidth",
Expand Down Expand Up @@ -1355,15 +1358,13 @@
},
{
"BriefDescription": "Floating Point Operations Per Cycle",
"MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@) + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / tma_info_core_core_clks",
"MetricGroup": "Flops;Ret",
"MetricName": "tma_info_core_flopc",
"Unit": "cpu_core"
},
{
"BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)",
"MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "(cpu_core@FP_ARITH_DISPATCHED.PORT_0@ + cpu_core@FP_ARITH_DISPATCHED.PORT_1@ + cpu_core@FP_ARITH_DISPATCHED.PORT_5@) / (2 * tma_info_core_core_clks)",
"MetricGroup": "Cor;Flops;HPC",
"MetricName": "tma_info_core_fp_arith_utilization",
Expand Down Expand Up @@ -1769,7 +1770,6 @@
},
{
"BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
"MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "tma_retiring * tma_info_thread_slots / cpu_core@UOPS_RETIRED.SLOTS\\,cmask\\=1@",
"MetricGroup": "Pipeline;Ret",
"MetricName": "tma_info_pipeline_retire",
Expand Down Expand Up @@ -2002,6 +2002,7 @@
},
{
"BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "(cpu_core@MEMORY_ACTIVITY.STALLS_L2_MISS@ - cpu_core@MEMORY_ACTIVITY.STALLS_L3_MISS@) / tma_info_thread_clks",
"MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
"MetricName": "tma_l3_bound",
Expand Down Expand Up @@ -2375,6 +2376,7 @@
},
{
"BriefDescription": "This metric represents rate of split store accesses",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group",
"MetricName": "tma_split_stores",
Expand Down Expand Up @@ -2405,6 +2407,7 @@
},
{
"BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "13 * cpu_core@LD_BLOCKS.STORE_FORWARD@ / tma_info_thread_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
"MetricName": "tma_store_fwd_blk",
Expand Down
2 changes: 2 additions & 0 deletions tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,7 @@
},
{
"BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the Last Level Cache (LLC) or other core with HITE/F/M.",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "MEM_BOUND_STALLS.LOAD_LLC_HIT / tma_info_core_clks - max((MEM_BOUND_STALLS.LOAD - LD_HEAD.L1_MISS_AT_RET) / tma_info_core_clks, 0) * MEM_BOUND_STALLS.LOAD_LLC_HIT / MEM_BOUND_STALLS.LOAD",
"MetricGroup": "TopdownL3;tma_L3_group;tma_memory_bound_group",
"MetricName": "tma_l3_bound",
Expand Down Expand Up @@ -682,6 +683,7 @@
},
{
"BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a store forward block.",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "LD_HEAD.ST_ADDR_AT_RET / tma_info_core_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
"MetricName": "tma_store_fwd_blk",
Expand Down
10 changes: 6 additions & 4 deletions tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
},
{
"BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
"MetricName": "tma_4k_aliasing",
Expand Down Expand Up @@ -319,7 +320,6 @@
},
{
"BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)",
"MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector",
"MetricGroup": "HPC;TopdownL3;tma_L3_group;tma_light_operations_group",
"MetricName": "tma_fp_arith",
Expand Down Expand Up @@ -464,6 +464,7 @@
},
{
"BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck",
"MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
"MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL",
"MetricName": "tma_info_botlnk_l2_ic_misses",
Expand Down Expand Up @@ -497,6 +498,7 @@
},
{
"BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks",
"MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))",
"MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
"MetricName": "tma_info_bottleneck_memory_bandwidth",
Expand Down Expand Up @@ -574,14 +576,12 @@
},
{
"BriefDescription": "Floating Point Operations Per Cycle",
"MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks",
"MetricGroup": "Flops;Ret",
"MetricName": "tma_info_core_flopc"
},
{
"BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)",
"MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@) / (2 * tma_info_core_core_clks)",
"MetricGroup": "Cor;Flops;HPC",
"MetricName": "tma_info_core_fp_arith_utilization",
Expand Down Expand Up @@ -927,7 +927,6 @@
},
{
"BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
"MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "tma_retiring * tma_info_thread_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@",
"MetricGroup": "Pipeline;Ret",
"MetricName": "tma_info_pipeline_retire"
Expand Down Expand Up @@ -1100,6 +1099,7 @@
},
{
"BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks",
"MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
"MetricName": "tma_l3_bound",
Expand Down Expand Up @@ -1419,6 +1419,7 @@
},
{
"BriefDescription": "This metric represents rate of split store accesses",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group",
"MetricName": "tma_split_stores",
Expand Down Expand Up @@ -1446,6 +1447,7 @@
},
{
"BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
"MetricName": "tma_store_fwd_blk",
Expand Down
10 changes: 6 additions & 4 deletions tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,7 @@
},
{
"BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
"MetricName": "tma_4k_aliasing",
Expand Down Expand Up @@ -523,7 +524,6 @@
},
{
"BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)",
"MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector",
"MetricGroup": "HPC;TopdownL3;tma_L3_group;tma_light_operations_group",
"MetricName": "tma_fp_arith",
Expand Down Expand Up @@ -668,6 +668,7 @@
},
{
"BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck",
"MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
"MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL",
"MetricName": "tma_info_botlnk_l2_ic_misses",
Expand Down Expand Up @@ -701,6 +702,7 @@
},
{
"BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks",
"MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "100 * tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_pmm_bound + tma_store_bound) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk))",
"MetricGroup": "Mem;MemoryBW;Offcore;tma_issueBW",
"MetricName": "tma_info_bottleneck_memory_bandwidth",
Expand Down Expand Up @@ -778,14 +780,12 @@
},
{
"BriefDescription": "Floating Point Operations Per Cycle",
"MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * cpu@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE\\,umask\\=0x18@ + 8 * cpu@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE\\,umask\\=0x60@ + 16 * FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE) / tma_info_core_core_clks",
"MetricGroup": "Flops;Ret",
"MetricName": "tma_info_core_flopc"
},
{
"BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)",
"MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "(cpu@FP_ARITH_INST_RETIRED.SCALAR_SINGLE\\,umask\\=0x03@ + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@) / (2 * tma_info_core_core_clks)",
"MetricGroup": "Cor;Flops;HPC",
"MetricName": "tma_info_core_fp_arith_utilization",
Expand Down Expand Up @@ -1144,7 +1144,6 @@
},
{
"BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
"MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "tma_retiring * tma_info_thread_slots / cpu@UOPS_RETIRED.SLOTS\\,cmask\\=1@",
"MetricGroup": "Pipeline;Ret",
"MetricName": "tma_info_pipeline_retire"
Expand Down Expand Up @@ -1369,6 +1368,7 @@
},
{
"BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks",
"MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
"MetricName": "tma_l3_bound",
Expand Down Expand Up @@ -1715,6 +1715,7 @@
},
{
"BriefDescription": "This metric represents rate of split store accesses",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group",
"MetricName": "tma_split_stores",
Expand Down Expand Up @@ -1742,6 +1743,7 @@
},
{
"BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
"MetricName": "tma_store_fwd_blk",
Expand Down
Loading

0 comments on commit 9a7d82c

Please sign in to comment.