[StructuralHash] Support Differences

This comutes a structural hash while allowing for selective ignoring of certain operands based on a custom function that is provided. Instead of a single hash value, it now returns FunctionHashInfo which includes a hash value, an instruction mapping, and a map to track the operand location and its corresponding hash value that is ignored.
llvm · Oct 17, 2024 · 6225d74 · 6225d74
1 parent e715fc6
commit 6225d74
Show file tree

Hide file tree

Showing 3 changed files with 275 additions and 14 deletions.
diff --git a/llvm/include/llvm/IR/StructuralHash.h b/llvm/include/llvm/IR/StructuralHash.h
@@ -14,7 +14,9 @@
 #ifndef LLVM_IR_STRUCTURALHASH_H
 #define LLVM_IR_STRUCTURALHASH_H
 
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/StableHashing.h"
+#include "llvm/IR/Instruction.h"
 #include <cstdint>
 
 namespace llvm {
@@ -23,6 +25,7 @@ class Function;
 class Module;
 
 using IRHash = stable_hash;
+using OpndHash = stable_hash;
 
 /// Returns a hash of the function \p F.
 /// \param F The function to hash.
@@ -37,6 +40,49 @@ IRHash StructuralHash(const Function &F, bool DetailedHash = false);
 /// composed the module hash.
 IRHash StructuralHash(const Module &M, bool DetailedHash = false);
 
+/// The pair of an instruction index and a operand index.
+using IndexPair = std::pair<unsigned, unsigned>;
+
+/// A map from an instruction index to an instruction pointer.
+using IndexInstrMap = MapVector<unsigned, Instruction *>;
+
+/// A map from an IndexPair to an OpndHash.
+using IndexOperandHashMapType = DenseMap<IndexPair, OpndHash>;
+
+/// A function that takes an instruction and an operand index and returns true
+/// if the operand should be ignored in the function hash computation.
+using IgnoreOperandFunc = std::function<bool(const Instruction *, unsigned)>;
+
+struct FunctionHashInfo {
+  /// A hash value representing the structural content of the function
+  IRHash FunctionHash;
+  /// A mapping from instruction indices to instruction pointers
+  std::unique_ptr<IndexInstrMap> IndexInstruction;
+  /// A mapping from pairs of instruction indices and operand indices
+  /// to the hashes of the operands. This can be used to analyze or
+  /// reconstruct the differences in ignored operands
+  std::unique_ptr<IndexOperandHashMapType> IndexOperandHashMap;
+
+  FunctionHashInfo(IRHash FuntionHash,
+                   std::unique_ptr<IndexInstrMap> IndexInstruction,
+                   std::unique_ptr<IndexOperandHashMapType> IndexOperandHashMap)
+      : FunctionHash(FuntionHash),
+        IndexInstruction(std::move(IndexInstruction)),
+        IndexOperandHashMap(std::move(IndexOperandHashMap)) {}
+};
+
+/// Computes a structural hash of a given function, considering the structure
+/// and content of the function's instructions while allowing for selective
+/// ignoring of certain operands based on custom criteria. This hash can be used
+/// to identify functions that are structurally similar or identical, which is
+/// useful in optimizations, deduplication, or analysis tasks.
+/// \param F The function to hash.
+/// \param IgnoreOp A callable that takes an instruction and an operand index,
+/// and returns true if the operand should be ignored in the hash computation.
+/// \return A FunctionHashInfo structure
+FunctionHashInfo StructuralHashWithDifferences(const Function &F,
+                                               IgnoreOperandFunc IgnoreOp);
+
 } // end namespace llvm
 
 #endif
diff --git a/llvm/lib/IR/StructuralHash.cpp b/llvm/lib/IR/StructuralHash.cpp
@@ -28,6 +28,19 @@ class StructuralHashImpl {
 
   bool DetailedHash;
 
+  /// IgnoreOp is a function that returns true if the operand should be ignored.
+  IgnoreOperandFunc IgnoreOp = nullptr;
+  /// A mapping from instruction indices to instruction pointers.
+  /// The index represents the position of an instruction based on the order in
+  /// which it is first encountered.
+  std::unique_ptr<IndexInstrMap> IndexInstruction = nullptr;
+  /// A mapping from pairs of instruction indices and operand indices
+  /// to the hashes of the operands.
+  std::unique_ptr<IndexOperandHashMapType> IndexOperandHashMap = nullptr;
+
+  /// Assign a unique ID to each Value in the order they are first seen.
+  DenseMap<const Value *, int> ValueToId;
+
   // This will produce different values on 32-bit and 64-bit systens as
   // hash_combine returns a size_t. However, this is only used for
   // detailed hashing which, in-tree, only needs to distinguish between
@@ -47,24 +60,140 @@ class StructuralHashImpl {
 
 public:
   StructuralHashImpl() = delete;
-  explicit StructuralHashImpl(bool DetailedHash) : DetailedHash(DetailedHash) {}
+  explicit StructuralHashImpl(bool DetailedHash,
+                              IgnoreOperandFunc IgnoreOp = nullptr)
+      : DetailedHash(DetailedHash), IgnoreOp(IgnoreOp) {
+    if (IgnoreOp) {
+      IndexInstruction = std::make_unique<IndexInstrMap>();
+      IndexOperandHashMap = std::make_unique<IndexOperandHashMapType>();
+    }
+  }
 
-  stable_hash hashConstant(Constant *C) {
+  stable_hash hashAPInt(const APInt &I) {
     SmallVector<stable_hash> Hashes;
-    // TODO: hashArbitaryType() is not stable.
-    if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(C)) {
-      Hashes.emplace_back(hashArbitaryType(ConstInt->getValue()));
-    } else if (ConstantFP *ConstFP = dyn_cast<ConstantFP>(C)) {
-      Hashes.emplace_back(hashArbitaryType(ConstFP->getValue()));
-    } else if (Function *Func = dyn_cast<Function>(C))
-      // Hashing the name will be deterministic as LLVM's hashing infrastructure
-      // has explicit support for hashing strings and will not simply hash
-      // the pointer.
-      Hashes.emplace_back(hashArbitaryType(Func->getName()));
+    Hashes.emplace_back(I.getBitWidth());
+    for (unsigned J = 0; J < I.getNumWords(); ++J)
+      Hashes.emplace_back((I.getRawData())[J]);
+    return stable_hash_combine(Hashes);
+  }
 
+  stable_hash hashAPFloat(const APFloat &F) {
+    SmallVector<stable_hash> Hashes;
+    const fltSemantics &S = F.getSemantics();
+    Hashes.emplace_back(APFloat::semanticsPrecision(S));
+    Hashes.emplace_back(APFloat::semanticsMaxExponent(S));
+    Hashes.emplace_back(APFloat::semanticsMinExponent(S));
+    Hashes.emplace_back(APFloat::semanticsSizeInBits(S));
+    Hashes.emplace_back(hashAPInt(F.bitcastToAPInt()));
     return stable_hash_combine(Hashes);
   }
 
+  stable_hash hashGlobalValue(const GlobalValue *GV) {
+    if (!GV->hasName())
+      return 0;
+    return stable_hash_name(GV->getName());
+  }
+
+  // Compute a hash for a Constant. This function is logically similar to
+  // FunctionComparator::cmpConstants() in FunctionComparator.cpp, but here
+  // we're interested in computing a hash rather than comparing two Constants.
+  // Some of the logic is simplified, e.g, we don't expand GEPOperator.
+  stable_hash hashConstant(Constant *C) {
+    SmallVector<stable_hash> Hashes;
+
+    Type *Ty = C->getType();
+    Hashes.emplace_back(hashType(Ty));
+
+    if (C->isNullValue()) {
+      Hashes.emplace_back(static_cast<stable_hash>('N'));
+      return stable_hash_combine(Hashes);
+    }
+
+    auto *G = dyn_cast<GlobalValue>(C);
+    if (G) {
+      Hashes.emplace_back(hashGlobalValue(G));
+      return stable_hash_combine(Hashes);
+    }
+
+    if (const auto *Seq = dyn_cast<ConstantDataSequential>(C)) {
+      Hashes.emplace_back(xxh3_64bits(Seq->getRawDataValues()));
+      return stable_hash_combine(Hashes);
+    }
+
+    switch (C->getValueID()) {
+    case Value::UndefValueVal:
+    case Value::PoisonValueVal:
+    case Value::ConstantTokenNoneVal: {
+      return stable_hash_combine(Hashes);
+    }
+    case Value::ConstantIntVal: {
+      const APInt &Int = cast<ConstantInt>(C)->getValue();
+      Hashes.emplace_back(hashAPInt(Int));
+      return stable_hash_combine(Hashes);
+    }
+    case Value::ConstantFPVal: {
+      const APFloat &APF = cast<ConstantFP>(C)->getValueAPF();
+      Hashes.emplace_back(hashAPFloat(APF));
+      return stable_hash_combine(Hashes);
+    }
+    case Value::ConstantArrayVal: {
+      const ConstantArray *A = cast<ConstantArray>(C);
+      uint64_t NumElements = cast<ArrayType>(Ty)->getNumElements();
+      Hashes.emplace_back(NumElements);
+      for (auto &Op : A->operands()) {
+        auto H = hashConstant(cast<Constant>(Op));
+        Hashes.emplace_back(H);
+      }
+      return stable_hash_combine(Hashes);
+    }
+    case Value::ConstantStructVal: {
+      const ConstantStruct *S = cast<ConstantStruct>(C);
+      unsigned NumElements = cast<StructType>(Ty)->getNumElements();
+      Hashes.emplace_back(NumElements);
+      for (auto &Op : S->operands()) {
+        auto H = hashConstant(cast<Constant>(Op));
+        Hashes.emplace_back(H);
+      }
+      return stable_hash_combine(Hashes);
+    }
+    case Value::ConstantVectorVal: {
+      const ConstantVector *V = cast<ConstantVector>(C);
+      unsigned NumElements = cast<FixedVectorType>(Ty)->getNumElements();
+      Hashes.emplace_back(NumElements);
+      for (auto &Op : V->operands()) {
+        auto H = hashConstant(cast<Constant>(Op));
+        Hashes.emplace_back(H);
+      }
+      return stable_hash_combine(Hashes);
+    }
+    case Value::ConstantExprVal: {
+      const ConstantExpr *E = cast<ConstantExpr>(C);
+      unsigned NumOperands = E->getNumOperands();
+      Hashes.emplace_back(NumOperands);
+      for (auto &Op : E->operands()) {
+        auto H = hashConstant(cast<Constant>(Op));
+        Hashes.emplace_back(H);
+      }
+      return stable_hash_combine(Hashes);
+    }
+    case Value::BlockAddressVal: {
+      const BlockAddress *BA = cast<BlockAddress>(C);
+      auto H = hashGlobalValue(BA->getFunction());
+      Hashes.emplace_back(H);
+      return stable_hash_combine(Hashes);
+    }
+    case Value::DSOLocalEquivalentVal: {
+      const auto *Equiv = cast<DSOLocalEquivalent>(C);
+      auto H = hashGlobalValue(Equiv->getGlobalValue());
+      Hashes.emplace_back(H);
+      return stable_hash_combine(Hashes);
+    }
+    default: // Unknown constant, abort.
+      llvm_unreachable("Constant ValueID not recognized.");
+    }
+    return Hash;
+  }
+
   stable_hash hashValue(Value *V) {
     // Check constant and return its hash.
     Constant *C = dyn_cast<Constant>(V);
@@ -76,6 +205,10 @@ class StructuralHashImpl {
     if (Argument *Arg = dyn_cast<Argument>(V))
       Hashes.emplace_back(Arg->getArgNo());
 
+    // Get an index (an insertion order) for the non-constant value.
+    auto I = ValueToId.insert({V, ValueToId.size()});
+    Hashes.emplace_back(I.first->second);
+
     return stable_hash_combine(Hashes);
   }
 
@@ -100,8 +233,20 @@ class StructuralHashImpl {
     if (const auto *ComparisonInstruction = dyn_cast<CmpInst>(&Inst))
       Hashes.emplace_back(ComparisonInstruction->getPredicate());
 
-    for (const auto &Op : Inst.operands())
-      Hashes.emplace_back(hashOperand(Op));
+    unsigned InstIdx = 0;
+    if (IndexInstruction) {
+      InstIdx = IndexInstruction->size();
+      IndexInstruction->insert({InstIdx, const_cast<Instruction *>(&Inst)});
+    }
+
+    for (const auto [OpndIdx, Op] : enumerate(Inst.operands())) {
+      auto OpndHash = hashOperand(Op);
+      if (IgnoreOp && IgnoreOp(&Inst, OpndIdx)) {
+        assert(IndexOperandHashMap);
+        IndexOperandHashMap->insert({{InstIdx, OpndIdx}, OpndHash});
+      } else
+        Hashes.emplace_back(OpndHash);
+    }
 
     return stable_hash_combine(Hashes);
   }
@@ -184,6 +329,12 @@ class StructuralHashImpl {
   }
 
   uint64_t getHash() const { return Hash; }
+  std::unique_ptr<IndexInstrMap> getIndexInstrMap() {
+    return std::move(IndexInstruction);
+  }
+  std::unique_ptr<IndexOperandHashMapType> getIndexPairOpndHashMap() {
+    return std::move(IndexOperandHashMap);
+  }
 };
 
 } // namespace
@@ -199,3 +350,12 @@ IRHash llvm::StructuralHash(const Module &M, bool DetailedHash) {
   H.update(M);
   return H.getHash();
 }
+
+FunctionHashInfo
+llvm::StructuralHashWithDifferences(const Function &F,
+                                    IgnoreOperandFunc IgnoreOp) {
+  StructuralHashImpl H(/*DetailedHash=*/true, IgnoreOp);
+  H.update(F);
+  return FunctionHashInfo(H.getHash(), H.getIndexInstrMap(),
+                          H.getIndexPairOpndHashMap());
+}
diff --git a/llvm/unittests/IR/StructuralHashTest.cpp b/llvm/unittests/IR/StructuralHashTest.cpp
@@ -239,4 +239,59 @@ TEST(StructuralHashTest, ArgumentNumber) {
   EXPECT_EQ(StructuralHash(*M1), StructuralHash(*M2));
   EXPECT_NE(StructuralHash(*M1, true), StructuralHash(*M2, true));
 }
+
+TEST(StructuralHashTest, Differences) {
+  LLVMContext Ctx;
+  std::unique_ptr<Module> M1 = parseIR(Ctx, "define i64 @f(i64 %a) {\n"
+                                            "  %c = add i64 %a, 1\n"
+                                            "  %b = call i64 @f1(i64 %c)\n"
+                                            "  ret i64 %b\n"
+                                            "}\n"
+                                            "declare i64 @f1(i64)");
+  auto *F1 = M1->getFunction("f");
+  std::unique_ptr<Module> M2 = parseIR(Ctx, "define i64 @g(i64 %a) {\n"
+                                            "  %c = add i64 %a, 1\n"
+                                            "  %b = call i64 @f2(i64 %c)\n"
+                                            "  ret i64 %b\n"
+                                            "}\n"
+                                            "declare i64 @f2(i64)");
+  auto *F2 = M2->getFunction("g");
+
+  // They are originally different when not ignoring any operand.
+  EXPECT_NE(StructuralHash(*F1, true), StructuralHash(*F2, true));
+  EXPECT_NE(StructuralHashWithDifferences(*F1, nullptr).FunctionHash,
+            StructuralHashWithDifferences(*F2, nullptr).FunctionHash);
+
+  // When we ignore the call target f1 vs f2, they have the same hash.
+  auto IgnoreOp = [&](const Instruction *I, unsigned OpndIdx) {
+    return I->getOpcode() == Instruction::Call && OpndIdx == 1;
+  };
+  auto FuncHashInfo1 = StructuralHashWithDifferences(*F1, IgnoreOp);
+  auto FuncHashInfo2 = StructuralHashWithDifferences(*F2, IgnoreOp);
+  EXPECT_EQ(FuncHashInfo1.FunctionHash, FuncHashInfo2.FunctionHash);
+
+  // There are total 3 instructions.
+  EXPECT_EQ(FuncHashInfo1.IndexInstruction->size(), 3u);
+  EXPECT_EQ(FuncHashInfo2.IndexInstruction->size(), 3u);
+
+  // The only 1 operand (the call target) has been ignored.
+  EXPECT_EQ(FuncHashInfo1.IndexOperandHashMap->size(), 1u);
+  EXPECT_EQ(FuncHashInfo2.IndexOperandHashMap->size(), 1u);
+
+  // The index pair of instruction and operand (1, 1) is a key in the map.
+  ASSERT_TRUE(FuncHashInfo1.IndexOperandHashMap->count({1, 1}));
+  ASSERT_TRUE(FuncHashInfo2.IndexOperandHashMap->count({1, 1}));
+
+  // The indexed instruciton must be the call instruction as shown in the
+  // IgnoreOp above.
+  EXPECT_EQ(FuncHashInfo1.IndexInstruction->lookup(1)->getOpcode(),
+            Instruction::Call);
+  EXPECT_EQ(FuncHashInfo2.IndexInstruction->lookup(1)->getOpcode(),
+            Instruction::Call);
+
+  // The ignored operand hashes (for f1 vs. f2) are different.
+  EXPECT_NE(FuncHashInfo1.IndexOperandHashMap->lookup({1, 1}),
+            FuncHashInfo2.IndexOperandHashMap->lookup({1, 1}));
+}
+
 } // end anonymous namespace