[StructuralHash] Support Differences (llvm#112638)

This computes a structural hash while allowing certain operands to be selectively ignored, as decided by a caller-provided function. Instead of a single hash value, the new StructuralHashWithDifferences returns a FunctionHashInfo, which contains the function hash, a mapping from instruction indices to instructions, and a map from the location (instruction index, operand index) of each ignored operand to that operand's hash value. Depends on llvm#112621. This is a patch for https://discourse.llvm.org/t/rfc-global-function-merging/82608.
Ericsson · Oct 27, 2024 · 0dd9fdc · 0dd9fdc
1 parent 242c770
commit 0dd9fdc
Show file tree

Hide file tree

Showing 8 changed files with 304 additions and 40 deletions.
diff --git a/llvm/include/llvm/Analysis/StructuralHash.h b/llvm/include/llvm/Analysis/StructuralHash.h
@@ -13,15 +13,22 @@
 
 namespace llvm {
 
+enum class StructuralHashOptions {
+  None,              ///< Hash with opcode only.
+  Detailed,          ///< Hash with opcode and operands.
+  CallTargetIgnored, ///< Ignore call target operand when computing hash.
+};
+
 /// Printer pass for  StructuralHashes
 class StructuralHashPrinterPass
     : public PassInfoMixin<StructuralHashPrinterPass> {
   raw_ostream &OS;
-  bool EnableDetailedStructuralHash;
+  const StructuralHashOptions Options;
 
 public:
-  explicit StructuralHashPrinterPass(raw_ostream &OS, bool Detailed)
-      : OS(OS), EnableDetailedStructuralHash(Detailed) {}
+  explicit StructuralHashPrinterPass(raw_ostream &OS,
+                                     StructuralHashOptions Options)
+      : OS(OS), Options(Options) {}
 
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
 

diff --git a/llvm/include/llvm/IR/StructuralHash.h b/llvm/include/llvm/IR/StructuralHash.h
@@ -14,7 +14,9 @@
 #ifndef LLVM_IR_STRUCTURALHASH_H
 #define LLVM_IR_STRUCTURALHASH_H
 
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/StableHashing.h"
+#include "llvm/IR/Instruction.h"
 #include <cstdint>
 
 namespace llvm {
@@ -35,6 +37,49 @@ stable_hash StructuralHash(const Function &F, bool DetailedHash = false);
 /// composed the module hash.
 stable_hash StructuralHash(const Module &M, bool DetailedHash = false);
 
+/// The pair of an instruction index and an operand index.
+using IndexPair = std::pair<unsigned, unsigned>;
+
+/// A map from an instruction index to an instruction pointer.
+using IndexInstrMap = MapVector<unsigned, Instruction *>;
+
+/// A map from an IndexPair to a stable hash.
+using IndexOperandHashMapType = DenseMap<IndexPair, stable_hash>;
+
+/// A function that takes an instruction and an operand index and returns true
+/// if the operand should be ignored in the function hash computation.
+using IgnoreOperandFunc = std::function<bool(const Instruction *, unsigned)>;
+
+struct FunctionHashInfo {
+  /// A hash value representing the structural content of the function
+  stable_hash FunctionHash;
+  /// A mapping from instruction indices to instruction pointers
+  std::unique_ptr<IndexInstrMap> IndexInstruction;
+  /// A mapping from pairs of instruction indices and operand indices
+  /// to the hashes of the operands. This can be used to analyze or
+  /// reconstruct the differences in ignored operands
+  std::unique_ptr<IndexOperandHashMapType> IndexOperandHashMap;
+
+  FunctionHashInfo(stable_hash FuntionHash,
+                   std::unique_ptr<IndexInstrMap> IndexInstruction,
+                   std::unique_ptr<IndexOperandHashMapType> IndexOperandHashMap)
+      : FunctionHash(FuntionHash),
+        IndexInstruction(std::move(IndexInstruction)),
+        IndexOperandHashMap(std::move(IndexOperandHashMap)) {}
+};
+
+/// Computes a structural hash of a given function, considering the structure
+/// and content of the function's instructions while allowing for selective
+/// ignoring of certain operands based on custom criteria. This hash can be used
+/// to identify functions that are structurally similar or identical, which is
+/// useful in optimizations, deduplication, or analysis tasks.
+/// \param F The function to hash.
+/// \param IgnoreOp A callable that takes an instruction and an operand index,
+/// and returns true if the operand should be ignored in the hash computation.
+/// \return A FunctionHashInfo structure
+FunctionHashInfo StructuralHashWithDifferences(const Function &F,
+                                               IgnoreOperandFunc IgnoreOp);
+
 } // end namespace llvm
 
 #endif
diff --git a/llvm/lib/Analysis/StructuralHash.cpp b/llvm/lib/Analysis/StructuralHash.cpp
@@ -21,14 +21,33 @@ using namespace llvm;
 PreservedAnalyses StructuralHashPrinterPass::run(Module &M,
                                                  ModuleAnalysisManager &MAM) {
   OS << "Module Hash: "
-     << format("%016" PRIx64, StructuralHash(M, EnableDetailedStructuralHash))
+     << format("%016" PRIx64,
+               StructuralHash(M, Options != StructuralHashOptions::None))
      << "\n";
   for (Function &F : M) {
     if (F.isDeclaration())
       continue;
-    OS << "Function " << F.getName() << " Hash: "
-       << format("%016" PRIx64, StructuralHash(F, EnableDetailedStructuralHash))
-       << "\n";
+    if (Options == StructuralHashOptions::CallTargetIgnored) {
+      auto IgnoreOp = [&](const Instruction *I, unsigned OpndIdx) {
+        return I->getOpcode() == Instruction::Call &&
+               isa<Constant>(I->getOperand(OpndIdx));
+      };
+      auto FuncHashInfo = StructuralHashWithDifferences(F, IgnoreOp);
+      OS << "Function " << F.getName()
+         << " Hash: " << format("%016" PRIx64, FuncHashInfo.FunctionHash)
+         << "\n";
+      for (auto &[IndexPair, OpndHash] : *FuncHashInfo.IndexOperandHashMap) {
+        auto [InstIndex, OpndIndex] = IndexPair;
+        OS << "\tIgnored Operand Hash: " << format("%016" PRIx64, OpndHash)
+           << " at (" << InstIndex << "," << OpndIndex << ")\n";
+      }
+    } else {
+      OS << "Function " << F.getName() << " Hash: "
+         << format(
+                "%016" PRIx64,
+                StructuralHash(F, Options == StructuralHashOptions::Detailed))
+         << "\n";
+    }
   }
   return PreservedAnalyses::all();
 }
diff --git a/llvm/lib/IR/StructuralHash.cpp b/llvm/lib/IR/StructuralHash.cpp
@@ -34,14 +34,18 @@ class StructuralHashImpl {
   static constexpr stable_hash FunctionHeaderHash = 0x62642d6b6b2d6b72;
   static constexpr stable_hash GlobalHeaderHash = 23456;
 
-  // This will produce different values on 32-bit and 64-bit systens as
-  // hash_combine returns a size_t. However, this is only used for
-  // detailed hashing which, in-tree, only needs to distinguish between
-  // differences in functions.
-  // TODO: This is not stable.
-  template <typename T> stable_hash hashArbitaryType(const T &V) {
-    return hash_combine(V);
-  }
+  /// IgnoreOp is a function that returns true if the operand should be ignored.
+  IgnoreOperandFunc IgnoreOp = nullptr;
+  /// A mapping from instruction indices to instruction pointers.
+  /// The index represents the position of an instruction based on the order in
+  /// which it is first encountered.
+  std::unique_ptr<IndexInstrMap> IndexInstruction = nullptr;
+  /// A mapping from pairs of instruction indices and operand indices
+  /// to the hashes of the operands.
+  std::unique_ptr<IndexOperandHashMapType> IndexOperandHashMap = nullptr;
+
+  /// Assign a unique ID to each Value in the order they are first seen.
+  DenseMap<const Value *, int> ValueToId;
 
   stable_hash hashType(Type *ValueType) {
     SmallVector<stable_hash> Hashes;
@@ -53,23 +57,95 @@ class StructuralHashImpl {
 
 public:
   StructuralHashImpl() = delete;
-  explicit StructuralHashImpl(bool DetailedHash) : DetailedHash(DetailedHash) {}
+  explicit StructuralHashImpl(bool DetailedHash,
+                              IgnoreOperandFunc IgnoreOp = nullptr)
+      : DetailedHash(DetailedHash), IgnoreOp(IgnoreOp) {
+    if (IgnoreOp) {
+      IndexInstruction = std::make_unique<IndexInstrMap>();
+      IndexOperandHashMap = std::make_unique<IndexOperandHashMapType>();
+    }
+  }
+
+  stable_hash hashAPInt(const APInt &I) {
+    SmallVector<stable_hash> Hashes;
+    Hashes.emplace_back(I.getBitWidth());
+    auto RawVals = ArrayRef<uint64_t>(I.getRawData(), I.getNumWords());
+    Hashes.append(RawVals.begin(), RawVals.end());
+    return stable_hash_combine(Hashes);
+  }
+
+  stable_hash hashAPFloat(const APFloat &F) {
+    return hashAPInt(F.bitcastToAPInt());
+  }
+
+  stable_hash hashGlobalValue(const GlobalValue *GV) {
+    if (!GV->hasName())
+      return 0;
+    return stable_hash_name(GV->getName());
+  }
 
+  // Compute a hash for a Constant. This function is logically similar to
+  // FunctionComparator::cmpConstants() in FunctionComparator.cpp, but here
+  // we're interested in computing a hash rather than comparing two Constants.
+  // Some of the logic is simplified, e.g., we don't expand GEPOperator.
   stable_hash hashConstant(Constant *C) {
     SmallVector<stable_hash> Hashes;
-    // TODO: hashArbitaryType() is not stable.
-    if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(C)) {
-      Hashes.emplace_back(hashArbitaryType(ConstInt->getValue()));
-    } else if (ConstantFP *ConstFP = dyn_cast<ConstantFP>(C)) {
-      Hashes.emplace_back(hashArbitaryType(ConstFP->getValue()));
-    } else if (Function *Func = dyn_cast<Function>(C)) {
-      // Hashing the name will be deterministic as LLVM's hashing infrastructure
-      // has explicit support for hashing strings and will not simply hash
-      // the pointer.
-      Hashes.emplace_back(hashArbitaryType(Func->getName()));
+
+    Type *Ty = C->getType();
+    Hashes.emplace_back(hashType(Ty));
+
+    if (C->isNullValue()) {
+      Hashes.emplace_back(static_cast<stable_hash>('N'));
+      return stable_hash_combine(Hashes);
     }
 
-    return stable_hash_combine(Hashes);
+    if (auto *G = dyn_cast<GlobalValue>(C)) {
+      Hashes.emplace_back(hashGlobalValue(G));
+      return stable_hash_combine(Hashes);
+    }
+
+    if (const auto *Seq = dyn_cast<ConstantDataSequential>(C)) {
+      Hashes.emplace_back(xxh3_64bits(Seq->getRawDataValues()));
+      return stable_hash_combine(Hashes);
+    }
+
+    switch (C->getValueID()) {
+    case Value::ConstantIntVal: {
+      const APInt &Int = cast<ConstantInt>(C)->getValue();
+      Hashes.emplace_back(hashAPInt(Int));
+      return stable_hash_combine(Hashes);
+    }
+    case Value::ConstantFPVal: {
+      const APFloat &APF = cast<ConstantFP>(C)->getValueAPF();
+      Hashes.emplace_back(hashAPFloat(APF));
+      return stable_hash_combine(Hashes);
+    }
+    case Value::ConstantArrayVal:
+    case Value::ConstantStructVal:
+    case Value::ConstantVectorVal:
+    case Value::ConstantExprVal: {
+      for (const auto &Op : C->operands()) {
+        auto H = hashConstant(cast<Constant>(Op));
+        Hashes.emplace_back(H);
+      }
+      return stable_hash_combine(Hashes);
+    }
+    case Value::BlockAddressVal: {
+      const BlockAddress *BA = cast<BlockAddress>(C);
+      auto H = hashGlobalValue(BA->getFunction());
+      Hashes.emplace_back(H);
+      return stable_hash_combine(Hashes);
+    }
+    case Value::DSOLocalEquivalentVal: {
+      const auto *Equiv = cast<DSOLocalEquivalent>(C);
+      auto H = hashGlobalValue(Equiv->getGlobalValue());
+      Hashes.emplace_back(H);
+      return stable_hash_combine(Hashes);
+    }
+    default:
+      // Skip other types of constants for simplicity.
+      return stable_hash_combine(Hashes);
+    }
   }
 
   stable_hash hashValue(Value *V) {
@@ -83,6 +159,10 @@ class StructuralHashImpl {
     if (Argument *Arg = dyn_cast<Argument>(V))
       Hashes.emplace_back(Arg->getArgNo());
 
+    // Get an index (an insertion order) for the non-constant value.
+    auto [It, WasInserted] = ValueToId.try_emplace(V, ValueToId.size());
+    Hashes.emplace_back(It->second);
+
     return stable_hash_combine(Hashes);
   }
 
@@ -107,8 +187,20 @@ class StructuralHashImpl {
     if (const auto *ComparisonInstruction = dyn_cast<CmpInst>(&Inst))
       Hashes.emplace_back(ComparisonInstruction->getPredicate());
 
-    for (const auto &Op : Inst.operands())
-      Hashes.emplace_back(hashOperand(Op));
+    unsigned InstIdx = 0;
+    if (IndexInstruction) {
+      InstIdx = IndexInstruction->size();
+      IndexInstruction->try_emplace(InstIdx, const_cast<Instruction *>(&Inst));
+    }
+
+    for (const auto [OpndIdx, Op] : enumerate(Inst.operands())) {
+      auto OpndHash = hashOperand(Op);
+      if (IgnoreOp && IgnoreOp(&Inst, OpndIdx)) {
+        assert(IndexOperandHashMap);
+        IndexOperandHashMap->try_emplace({InstIdx, OpndIdx}, OpndHash);
+      } else
+        Hashes.emplace_back(OpndHash);
+    }
 
     return stable_hash_combine(Hashes);
   }
@@ -188,6 +280,14 @@ class StructuralHashImpl {
   }
 
   uint64_t getHash() const { return Hash; }
+
+  std::unique_ptr<IndexInstrMap> getIndexInstrMap() {
+    return std::move(IndexInstruction);
+  }
+
+  std::unique_ptr<IndexOperandHashMapType> getIndexPairOpndHashMap() {
+    return std::move(IndexOperandHashMap);
+  }
 };
 
 } // namespace
@@ -203,3 +303,12 @@ stable_hash llvm::StructuralHash(const Module &M, bool DetailedHash) {
   H.update(M);
   return H.getHash();
 }
+
+FunctionHashInfo
+llvm::StructuralHashWithDifferences(const Function &F,
+                                    IgnoreOperandFunc IgnoreOp) {
+  StructuralHashImpl H(/*DetailedHash=*/true, IgnoreOp);
+  H.update(F);
+  return FunctionHashInfo(H.getHash(), H.getIndexInstrMap(),
+                          H.getIndexPairOpndHashMap());
+}
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
@@ -1175,9 +1175,17 @@ Expected<std::string> parseMemProfUsePassOptions(StringRef Params) {
   return Result;
 }
 
-Expected<bool> parseStructuralHashPrinterPassOptions(StringRef Params) {
-  return PassBuilder::parseSinglePassOption(Params, "detailed",
-                                            "StructuralHashPrinterPass");
+Expected<StructuralHashOptions>
+parseStructuralHashPrinterPassOptions(StringRef Params) {
+  if (Params.empty())
+    return StructuralHashOptions::None;
+  if (Params == "detailed")
+    return StructuralHashOptions::Detailed;
+  if (Params == "call-target-ignored")
+    return StructuralHashOptions::CallTargetIgnored;
+  return make_error<StringError>(
+      formatv("invalid structural hash printer parameter '{0}' ", Params).str(),
+      inconvertibleErrorCode());
 }
 
 Expected<bool> parseWinEHPrepareOptions(StringRef Params) {

diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
@@ -220,10 +220,11 @@ MODULE_PASS_WITH_PARAMS(
     parseMSanPassOptions, "recover;kernel;eager-checks;track-origins=N")
 MODULE_PASS_WITH_PARAMS(
     "print<structural-hash>", "StructuralHashPrinterPass",
-    [](bool EnableDetailedStructuralHash) {
-      return StructuralHashPrinterPass(dbgs(), EnableDetailedStructuralHash);
+    [](StructuralHashOptions Options) {
+      return StructuralHashPrinterPass(dbgs(), Options);
     },
-    parseStructuralHashPrinterPassOptions, "detailed")
+    parseStructuralHashPrinterPassOptions, "detailed;call-target-ignored")
+
 #undef MODULE_PASS_WITH_PARAMS
 
 #ifndef CGSCC_ANALYSIS

diff --git a/llvm/test/Analysis/StructuralHash/structural-hash-printer.ll b/llvm/test/Analysis/StructuralHash/structural-hash-printer.ll
@@ -1,17 +1,21 @@
 ; RUN: opt -passes='print<structural-hash>' -disable-output %s 2>&1 | FileCheck %s
 ; RUN: opt -passes='print<structural-hash><detailed>' -disable-output %s 2>&1 | FileCheck %s -check-prefix=DETAILED-HASH
+; RUN: opt -passes='print<structural-hash><call-target-ignored>' -disable-output %s 2>&1 | FileCheck %s -check-prefix=CALLTARGETIGNORED-HASH
 
 ; Add a declaration so that we can test we skip it.
-declare i64 @d1()
+declare i64 @d1(i64)
+declare i64 @e1(i64)
 
 define i64 @f1(i64 %a) {
 	%b = add i64 %a, 1
-	ret i64 %b
+	%c = call i64 @d1(i64 %b)
+	ret i64 %c
 }
 
-define i32 @f2(i32 %a) {
-	%b = add i32 %a, 2
-	ret i32 %b
+define i64 @f2(i64 %a) {
+	%b = add i64 %a, 1
+	%c = call i64 @e1(i64 %b)
+	ret i64 %c
 }
 
 ; CHECK: Module Hash: {{([a-f0-9]{16,})}}
@@ -22,3 +26,13 @@ define i32 @f2(i32 %a) {
 ; DETAILED-HASH-NEXT: Function f1 Hash: [[DF1H:([a-f0-9]{16,})]]
 ; DETAILED-HASH-NOT: [[DF1H]]
 ; DETAILED-HASH-NEXT: Function f2 Hash: {{([a-f0-9]{16,})}}
+
+; When ignoring the call target, check that `f1` and `f2` produce the same function hash.
+; The index for the call instruction is 1, and the index of the call target operand is 1.
+; The ignored operand hashes for different call targets should be different.
+; CALLTARGETIGNORED-HASH: Module Hash: {{([a-f0-9]{16,})}}
+; CALLTARGETIGNORED-HASH-NEXT: Function f1 Hash: [[IF1H:([a-f0-9]{16,})]]
+; CALLTARGETIGNORED-HASH-NEXT:   Ignored Operand Hash: [[IO1H:([a-f0-9]{16,})]] at (1,1)
+; CALLTARGETIGNORED-HASH-NEXT: Function f2 Hash: [[IF1H]]
+; CALLTARGETIGNORED-HASH-NOT: [[IO1H]]
+; CALLTARGETIGNORED-HASH-NEXT:   Ignored Operand Hash: {{([a-f0-9]{16,})}} at (1,1)