diff --git a/Cargo.lock b/Cargo.lock index 9785964..86e7580 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -58,9 +58,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.3" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" [[package]] name = "memchr" diff --git a/Cargo.toml b/Cargo.toml index 572be69..05db054 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,8 +11,8 @@ license = "MPL-2.0" rust-version = "1.70" [dependencies] -# Used for HashMap and RawTable for a custom bag structure. -hashbrown = { version = "0.14", default-features = false, features = ["raw"] } +# Used for HashMap and HashTable for a custom bag structure. +hashbrown = { version = "0.15", default-features = false } # Used as the default BuildHasher when the `default-hasher` feature flag # is enabled (which it is by default). ahash = { version = "0.8", default-features = false, optional = true } diff --git a/docs/internals.md b/docs/internals.md index 9cb87d0..8582435 100644 --- a/docs/internals.md +++ b/docs/internals.md @@ -37,7 +37,7 @@ type WordList = HashBag; The word list is one of the two central data structures. It's a lookup table for the pairs of `(stem, flag_set)` defined in a dictionary's `.dic` file. We need to look up whether a word is in the dictionary (and what flags it has) very quickly. A natural choice might be `HashMap` or `BTreeSet`. Unlike flag sets and boxed slices and strs mentioned above, it's ok for this type to be resizable. There's only one instance of it in a dictionary and the API can support adding words to the dictionary to enable building a personal dictionary feature. Instead the snag with this type is that there can be duplicate stems in the dictionary with different flag sets. Merging the flag sets together isn't correct: the combination of flags might allow one prefix/suffix to apply but not work in a compounds while another entry provides a different prefix/suffix which can compound. -So what we need is something closer to `HashMap>`. The extra `Vec` is more overhead though that isn't necessary in most cases since duplicate stems are fairly rare. In other languages like C++ this is where a [multi map](https://en.cppreference.com/w/cpp/container/unordered_multimap) might fit. It's the same idea as a hash map but allows for duplicate keys. Building a type like this in Rust is actually pretty straightforward with the [`hashbrown`] raw API which exposes enough internals to build a new hash table type. The table's `Iterator` implementation is identical. Insertion is slightly simpler: we don't need to check if the key is already in the table, we can just blindly insert. Reading from the table works very similarly to `HashMap::get`. Lookup in a regular hash map can stop searching the table when the first entry matching the key is found. For a multi map though we continue to look up until we find an entry that doesn't match. +So what we need is something closer to `HashMap>`. The extra `Vec` is more overhead though that isn't necessary in most cases since duplicate stems are fairly rare. In other languages like C++ this is where a [multi map](https://en.cppreference.com/w/cpp/container/unordered_multimap) might fit. It's the same idea as a hash map but allows for duplicate keys. Building a type like this in Rust is actually pretty straightforward with the [`hashbrown`] `HashTable` API. Insertion is slightly simpler than a `HashMap`: we don't need to check if the key is already in the table, we can just blindly insert. Reading from the table works very similarly to `HashMap::get`. Lookup in a regular hash map can stop searching the table when the first entry matching the key is found. For a multi map though we continue to look up until we find an entry that doesn't match. See the implementation details for this in [`src/hash_bag.rs`](../src/hash_bag.rs). diff --git a/src/hash_bag.rs b/src/hash_bag.rs index 6fa3986..fd0e972 100644 --- a/src/hash_bag.rs +++ b/src/hash_bag.rs @@ -2,12 +2,9 @@ use core::{ borrow::Borrow, fmt::Debug, hash::{BuildHasher, Hash}, - marker::PhantomData, }; -// TODO: `use hashbrown::{hash_table, HashTable}` instead once `HashTable` supports `iter_hash`. -// See the `hash-table` branch. -use hashbrown::raw::{RawIter, RawIterHash, RawTable}; +use hashbrown::hash_table::{self, HashTable, IterHash}; /// A collection of key-value pairs - similar to a HashMap - which allows for duplicate keys. /// @@ -30,14 +27,14 @@ use hashbrown::raw::{RawIter, RawIterHash, RawTable}; /// [Swiss Tables]: https://abseil.io/blog/20180927-swisstables #[derive(Clone)] pub struct HashBag { - table: RawTable<(K, V)>, + table: HashTable<(K, V)>, build_hasher: S, } impl HashBag { pub fn new() -> Self { Self { - table: RawTable::new(), + table: HashTable::new(), build_hasher: S::default(), } } @@ -49,9 +46,7 @@ impl HashBag { /// The ordering of the pairs returned by the iterator is undefined. pub fn iter(&self) -> Iter<'_, K, V> { Iter { - inner: unsafe { self.table.iter() }, - // Here we tie the lifetime of self to the iter. - marker: PhantomData, + inner: self.table.iter(), } } @@ -68,7 +63,7 @@ where { pub fn with_capacity_and_hasher(capacity: usize, build_hasher: S) -> Self { Self { - table: RawTable::with_capacity(capacity), + table: HashTable::with_capacity(capacity), build_hasher, } } @@ -79,9 +74,8 @@ where pub fn insert(&mut self, k: K, v: V) { let hash = make_hash(&self.build_hasher, &k); let hasher = make_hasher(&self.build_hasher); - self.table.reserve(1, make_hasher(&self.build_hasher)); // Insert without attempting to find an existing entry with this key. - self.table.insert(hash, (k, v), hasher); + self.table.insert_unique(hash, (k, v), hasher); } /// Gets all key-value pairs in the bag with the given key. @@ -95,10 +89,8 @@ where let hash = make_hash(&self.build_hasher, k); GetAllIter { - inner: unsafe { self.table.iter_hash(hash) }, + inner: self.table.iter_hash(hash), key: k, - // Here we tie the lifetime of self to the iter. - marker: PhantomData, } } } @@ -135,23 +127,18 @@ where move |val| make_hash::(hash_builder, &val.0) } +// This is a very thin wrapper around `hash_table::Iter` which rearranges the reference so that +// we return `(&k, &v)` instead of `&(k, v)`. pub struct Iter<'a, K, V> { - inner: RawIter<(K, V)>, - marker: PhantomData<(&'a K, &'a V)>, + inner: hash_table::Iter<'a, (K, V)>, } impl<'a, K, V> Iterator for Iter<'a, K, V> { type Item = (&'a K, &'a V); fn next(&mut self) -> Option<(&'a K, &'a V)> { - // Avoid `Option::map` because it bloats LLVM IR. - match self.inner.next() { - Some(x) => unsafe { - let r = x.as_ref(); - Some((&r.0, &r.1)) - }, - None => None, - } + let (k, v) = self.inner.next()?; + Some((k, v)) } fn size_hint(&self) -> (usize, Option) { @@ -170,9 +157,8 @@ where K: Borrow, Q: Hash + Eq, { - inner: RawIterHash<(K, V)>, + inner: IterHash<'bag, (K, V)>, key: &'key Q, - marker: PhantomData<(&'bag K, &'bag V)>, } impl<'bag, 'key, Q: ?Sized, K, V> Iterator for GetAllIter<'bag, 'key, Q, K, V> @@ -185,13 +171,9 @@ where fn next(&mut self) -> Option { loop { match self.inner.next() { - Some(bucket) => { - // SAFETY: the creator of the iterator (`get`) ensures that the reference - // to the value outlives the RawTable. It also prevents concurrent - // modifications to the table. - let element = unsafe { bucket.as_ref() }; - if self.key.eq(element.0.borrow()) { - return Some((&element.0, &element.1)); + Some((k, v)) => { + if self.key.eq(k.borrow()) { + return Some((k, v)); } continue; }