From 25a223d700002f66dec9a70b4bb1d1f03e4df11f Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Thu, 26 Jun 2025 00:18:37 +0000 Subject: [PATCH 1/4] Key-type customization in `CanonicalValueStore` and `ClangDecl` cleanups The `ClangDecl` struct caused some confusion here -- it is embedding extra data into a `CanonicalValueStore` that isn't used for lookups or canonicalization, but is useful to store along side. This changes the `CanonicalValueStore` to support customized key type for `Lookup` so that we can provide the more direct API that only takes the relevant key. This in turn takes advantage of the support for heterogenous keys in the underlying `Set` as long as hashing and equality are consistent. We do need to add support for heterogenous equality comparison with `clang::Decl*`, but that is fairly easily done now that the argument-reversed form isn't needed as well. Lastly, this cleans up the `ClangDecl` customization points to be more idiomatic by using `operator==` and `CarbonHashValue`. While there, I've added comments to make it unambiguous why we can use the pointer value for the underlying `clang::Decl` due to the Clang AST's address-as-identity model. Resolves the immediate TODOs around this type. Future work might involve changing from the current `Add` API to one more like `Map` and `Set`'s API where a callback is used to create the object, but that level of API complexity isn't necessarily motivated yet and can easily be a follow-on if and when its worth doing. The `Add` code paths *are* working with the `inst_id` in order to create an instruction if we are importing the Clang declaration. It is the `Lookup` code paths that never needed to know about the `inst_id` and became more confusing for having to stub it out in the API. --- toolchain/base/value_store.h | 26 +++++++++++++++++++++----- toolchain/check/import_cpp.cpp | 11 ++++------- toolchain/sem_ir/clang_decl.h | 34 ++++++++++++++++++++++++++-------- 3 files changed, 51 insertions(+), 20 deletions(-) diff --git a/toolchain/base/value_store.h b/toolchain/base/value_store.h index d1c70187228c0..e3aefd9899635 100644 --- a/toolchain/base/value_store.h +++ b/toolchain/base/value_store.h @@ -5,6 +5,7 @@ #ifndef CARBON_TOOLCHAIN_BASE_VALUE_STORE_H_ #define CARBON_TOOLCHAIN_BASE_VALUE_STORE_H_ +#include #include #include #include @@ -37,10 +38,12 @@ template class ValueStoreRange; // Common calculation for ValueStore types. -template +template class ValueStoreTypes { public: using ValueType = std::decay_t; + using KeyType = std::decay_t; // Typically we want to use `ValueType&` and `const ValueType& to avoid // copies, but when the value type is a `StringRef`, we assume external @@ -53,6 +56,14 @@ class ValueStoreTypes { llvm::StringRef, const ValueType&>; }; +// If `IdT` provides a distinct `IdT::KeyType`, default to that for the key +// type. +template + requires(!std::same_as) +class ValueStoreTypes + : public ValueStoreTypes {}; + // A simple wrapper for accumulating values, providing IDs to later retrieve the // value. This does not do deduplication. // @@ -221,11 +232,16 @@ class ValueStoreRange { // A wrapper for accumulating immutable values with deduplication, providing IDs // to later retrieve the value. // -// IdT::ValueType must represent the type being indexed. +// `IdT::ValueType` must represent the type being indexed. +// +// `IdT::KeyType` can optionally be present, and if so is used for the argument +// to `Lookup`. It must be valid to use both `KeyType` and `ValueType` as lookup +// types in the underlying `Set`. template class CanonicalValueStore { public: using ValueType = ValueStoreTypes::ValueType; + using KeyType = ValueStoreTypes::KeyType; using RefType = ValueStoreTypes::RefType; using ConstRefType = ValueStoreTypes::ConstRefType; @@ -237,7 +253,7 @@ class CanonicalValueStore { // Looks up the canonical ID for a value, or returns `None` if not in the // store. - auto Lookup(ValueType value) const -> IdT; + auto Lookup(KeyType key) const -> IdT; // Reserves space. auto Reserve(size_t size) -> void; @@ -290,8 +306,8 @@ auto CanonicalValueStore::Add(ValueType value) -> IdT { } template -auto CanonicalValueStore::Lookup(ValueType value) const -> IdT { - if (auto result = set_.Lookup(value, KeyContext(&values_))) { +auto CanonicalValueStore::Lookup(KeyType key) const -> IdT { + if (auto result = set_.Lookup(key, KeyContext(&values_))) { return result.key(); } return IdT::None; diff --git a/toolchain/check/import_cpp.cpp b/toolchain/check/import_cpp.cpp index 7e331724f8633..23a62fd339874 100644 --- a/toolchain/check/import_cpp.cpp +++ b/toolchain/check/import_cpp.cpp @@ -350,9 +350,8 @@ static auto AsCarbonNamespace(Context& context, auto& clang_decls = context.sem_ir().clang_decls(); // Check if the decl context is already mapped to a Carbon namespace. - if (auto context_clang_decl_id = clang_decls.Lookup( - {.decl = clang::dyn_cast(decl_context), - .inst_id = SemIR::InstId::None}); + if (auto context_clang_decl_id = + clang_decls.Lookup(clang::dyn_cast(decl_context)); context_clang_decl_id.has_value()) { return clang_decls.Get(context_clang_decl_id).inst_id; } @@ -365,8 +364,7 @@ static auto AsCarbonNamespace(Context& context, decl_contexts.push_back(decl_context); decl_context = decl_context->getParent(); parent_decl_id = - clang_decls.Lookup({.decl = clang::dyn_cast(decl_context), - .inst_id = SemIR::InstId::None}); + clang_decls.Lookup(clang::dyn_cast(decl_context)); } while (!parent_decl_id.has_value()); // We know the parent of the last decl context is mapped, map the rest. @@ -532,8 +530,7 @@ static auto MapRecordType(Context& context, SemIR::LocId loc_id, if (record_decl && !record_decl->isUnion()) { auto& clang_decls = context.sem_ir().clang_decls(); SemIR::InstId record_inst_id = SemIR::InstId::None; - if (auto record_clang_decl_id = clang_decls.Lookup( - {.decl = record_decl, .inst_id = SemIR::InstId::None}); + if (auto record_clang_decl_id = clang_decls.Lookup(record_decl); record_clang_decl_id.has_value()) { record_inst_id = clang_decls.Get(record_clang_decl_id).inst_id; } else { diff --git a/toolchain/sem_ir/clang_decl.h b/toolchain/sem_ir/clang_decl.h index d918f92b49786..ef69382079d98 100644 --- a/toolchain/sem_ir/clang_decl.h +++ b/toolchain/sem_ir/clang_decl.h @@ -20,17 +20,22 @@ class Decl; namespace Carbon::SemIR { // A Clang declaration mapped to a Carbon instruction. -// Using custom hashing since the declaration is keyed by the `decl` member for -// lookup. -// TODO: Avoid custom hashing by either having the data structure support keying -// or create a dedicated mapping. See -// https://discord.com/channels/655572317891461132/768530752592805919/1384999468293947537 +// +// Note that Clang's AST uses address-identity for nodes, which means the +// pointer is the canonical way to represent a specific AST node and is expected +// to be sufficient for comparison, hashing, etc. struct ClangDecl : public Printable { auto Print(llvm::raw_ostream& out) const -> void; - friend auto CarbonHashtableEq(const ClangDecl& lhs, const ClangDecl& rhs) - -> bool { - return HashtableEq(lhs.decl, rhs.decl); + // Equality comparison uses the address-identity property of the Clang AST and + // just compares the `decl` pointers. We canonicalize on these to avoid ever + // having different instructions for the same declaration. + auto operator==(const ClangDecl& rhs) const -> bool { + return decl == rhs.decl; + } + // Support direct comparison with the Clang AST node pointer. + auto operator==(const clang::Decl* rhs_decl) const -> bool { + return decl == rhs_decl; } // Hashing for ClangDecl. See common/hashing.h. @@ -45,15 +50,28 @@ struct ClangDecl : public Printable { clang::Decl* decl = nullptr; // The instruction the Clang declaration is mapped to. + // + // This is stored along side the `decl` pointer to avoid having to lookup both + // the pointer and the instruction ID in two separate areas of storage. InstId inst_id; }; // The ID of a `ClangDecl`. +// +// These IDs are importantly distinct from the `inst_id` associated with each +// declaration. These form a dense range of IDs that is used to reference the +// AST node pointers without storing those pointers directly into SemIR and +// needing space to hold a full pointer. We can't avoid having these IDs without +// embedding pointers directly into the storage of SemIR as part of an +// instruction. struct ClangDeclId : public IdBase { static constexpr llvm::StringLiteral Label = "clang_decl_id"; using ValueType = ClangDecl; + // Use the AST node pointer directly when doing `Lookup` to find an ID. + using KeyType = clang::Decl*; + using IdBase::IdBase; }; From db4ab3a56978b5534aaa9206b9250873e7f4156e Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Fri, 27 Jun 2025 11:05:09 -0700 Subject: [PATCH 2/4] Apply suggestions from code review Co-authored-by: Geoff Romer --- toolchain/sem_ir/clang_decl.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/toolchain/sem_ir/clang_decl.h b/toolchain/sem_ir/clang_decl.h index ef69382079d98..901b2fd219ab4 100644 --- a/toolchain/sem_ir/clang_decl.h +++ b/toolchain/sem_ir/clang_decl.h @@ -28,8 +28,9 @@ struct ClangDecl : public Printable { auto Print(llvm::raw_ostream& out) const -> void; // Equality comparison uses the address-identity property of the Clang AST and - // just compares the `decl` pointers. We canonicalize on these to avoid ever - // having different instructions for the same declaration. + // just compares the `decl` pointers. We can disregard `inst_id` because `ClangDecl`s + // are only used as entries in `SemIR::clang_decls`, which canonicalizes them to + // ensure that the same `decl` can't be associated with more than one `inst_id`. auto operator==(const ClangDecl& rhs) const -> bool { return decl == rhs.decl; } From 80a8f98e3c958fe6fabb315be79aabf95dae6520 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Fri, 27 Jun 2025 18:11:26 +0000 Subject: [PATCH 3/4] reword and relocate comments --- toolchain/sem_ir/clang_decl.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/toolchain/sem_ir/clang_decl.h b/toolchain/sem_ir/clang_decl.h index 901b2fd219ab4..c17570d60c270 100644 --- a/toolchain/sem_ir/clang_decl.h +++ b/toolchain/sem_ir/clang_decl.h @@ -24,13 +24,18 @@ namespace Carbon::SemIR { // Note that Clang's AST uses address-identity for nodes, which means the // pointer is the canonical way to represent a specific AST node and is expected // to be sufficient for comparison, hashing, etc. +// +// This type is specifically designed for use in a `CanonicalValueStore` and +// provide a single canonical access from SemIR to each `clang::Decl*` used. +// This also ensures that a given `clang::Decl*` is associated with exactly one +// instruction, and the `inst_id` here provides access to that instruction from +// either the `ClangDeclId` or the `clang::Decl*`. struct ClangDecl : public Printable { auto Print(llvm::raw_ostream& out) const -> void; // Equality comparison uses the address-identity property of the Clang AST and - // just compares the `decl` pointers. We can disregard `inst_id` because `ClangDecl`s - // are only used as entries in `SemIR::clang_decls`, which canonicalizes them to - // ensure that the same `decl` can't be associated with more than one `inst_id`. + // just compares the `decl` pointers. The `inst_id` is always the same due to + // the canonicalization. auto operator==(const ClangDecl& rhs) const -> bool { return decl == rhs.decl; } From 7c671a0fdedcb3a49777b466c61f6215b84c9f4e Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Fri, 27 Jun 2025 18:21:47 +0000 Subject: [PATCH 4/4] todo --- toolchain/base/value_store.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/toolchain/base/value_store.h b/toolchain/base/value_store.h index e3aefd9899635..1318ca1a9dad1 100644 --- a/toolchain/base/value_store.h +++ b/toolchain/base/value_store.h @@ -43,6 +43,10 @@ template ; + + // TODO: Would be a bit cleaner to not have this here as it's only meaningful + // to the `CanonicalValueStore`, not to other `ValueStore`s. Planned to fix + // with a larger refactoring. using KeyType = std::decay_t; // Typically we want to use `ValueType&` and `const ValueType& to avoid