diff --git a/spec/std/hash_spec.cr b/spec/std/hash_spec.cr index 79157dcf301f..89d68cfc8773 100644 --- a/spec/std/hash_spec.cr +++ b/spec/std/hash_spec.cr @@ -806,18 +806,25 @@ describe "Hash" do it "creates with initial capacity" do hash = Hash(Int32, Int32).new(initial_capacity: 1234) - hash.@buckets_size.should eq(1234) + hash.capacity.should eq(1536_u32) end it "creates with initial capacity and default value" do hash = Hash(Int32, Int32).new(default_value: 3, initial_capacity: 1234) hash[1].should eq(3) - hash.@buckets_size.should eq(1234) + hash.capacity.should eq(1536_u32) end it "creates with initial capacity and block" do hash = Hash(Int32, Int32).new(initial_capacity: 1234) { |h, k| h[k] = 3 } hash[1].should eq(3) - hash.@buckets_size.should eq(1234) + hash.capacity.should eq(1536_u32) + end +end + +class Hash(K, V) + # :nodoc: + def capacity + nentries(@format) end end diff --git a/src/hash.cr b/src/hash.cr index 538f09619d96..247c4736a436 100644 --- a/src/hash.cr +++ b/src/hash.cr @@ -8,19 +8,26 @@ class Hash(K, V) include Iterable({K, V}) getter size : Int32 - @buckets_size : Int32 - @first : Entry(K, V)? - @last : Entry(K, V)? + @format : UInt8 + @rebuild_num : UInt16 + @first : UInt32 + @last : UInt32 + @index : Pointer(UInt32) + @entries : Pointer(Void) @block : (self, K -> V)? def initialize(block : (Hash(K, V), K -> V)? = nil, initial_capacity = nil) - initial_capacity ||= 11 - initial_capacity = 11 if initial_capacity < 11 - initial_capacity = initial_capacity.to_i - @buckets = Pointer(Entry(K, V)?).malloc(initial_capacity) - @buckets_size = initial_capacity @size = 0 + @format = 0_u8 + @rebuild_num = 0_u16 + @first = 0_u32 + @last = 0_u32 + @index = Pointer(UInt32).new(0) + @entries = Pointer(Void).new(0) @block = block + if initial_capacity + resize_data(calculate_new_size(initial_capacity)) + end end def self.new(initial_capacity = nil, &block : (Hash(K, V), K -> V)) @@ -38,22 +45,14 @@ class Hash(K, V) # h["foo"] = "bar" # h["foo"] # => "bar" # ``` - def []=(key : K, value : V) - rehash if @size > 5 * @buckets_size - - index = bucket_index key - entry = insert_in_bucket index, key, value - return value unless entry - - @size += 1 - - if last = @last - last.fore = entry - entry.back = last + def []=(key, value) + hash = hash_key(key) + entry = find_entry(hash, key) + if entry + entry.value.value = value + else + push_entry(hash, key, value) end - - @last = entry - @first = entry unless @first value end @@ -85,7 +84,7 @@ class Hash(K, V) # h.has_key?("bar") # => false # ``` def has_key?(key) - !!find_entry(key) + !!find_entry(hash_key(key), key) end # Returns `true` when value given by *value* exists, otherwise `false`. @@ -148,8 +147,9 @@ class Hash(K, V) # h.fetch("bar") { |key| key.upcase } # => "BAR" # ``` def fetch(key) - entry = find_entry(key) - entry ? entry.value : yield key + hash = hash_key(key) + entry = find_entry(hash, key) + entry ? entry.value.value : yield key end # Returns a tuple populated with the elements at the given *indexes*. @@ -220,43 +220,16 @@ class Hash(K, V) # h.delete("baz") { |key| "#{key} not found" } # => "baz not found" # ``` def delete(key) - index = bucket_index(key) - entry = @buckets[index] - - previous_entry = nil - while entry - if entry.key == key - back_entry = entry.back - fore_entry = entry.fore - if fore_entry - if back_entry - back_entry.fore = fore_entry - fore_entry.back = back_entry - else - @first = fore_entry - fore_entry.back = nil - end - else - if back_entry - back_entry.fore = nil - @last = back_entry - else - @first = nil - @last = nil - end - end - if previous_entry - previous_entry.next = entry.next - else - @buckets[index] = entry.next - end - @size -= 1 - return entry.value - end - previous_entry = entry - entry = entry.next + hash = hash_key(key) + entry = find_entry_fix_delete(hash, key) + unless entry.null? + value = entry.value.value + clear_entry(entry) + @size -= 1 + value + else + yield key end - yield key end # Deletes each key-value pair for which the given block returns `true`. @@ -267,12 +240,11 @@ class Hash(K, V) # h # => { "bar" => "qux" } # ``` def delete_if - keys_to_delete = [] of K - each do |key, value| - keys_to_delete << key if yield(key, value) - end - keys_to_delete.each do |key| - delete(key) + each_entry do |entry| + if yield(entry.value.pair) + clear_entry(entry) + @size -= 1 + end end self end @@ -305,10 +277,8 @@ class Hash(K, V) # end # ``` def each : Nil - current = @first - while current - yield({current.key, current.value}) - current = current.fore + each_entry do |entry| + yield(entry.value.pair) end end @@ -323,7 +293,7 @@ class Hash(K, V) # iterator.next # => {"baz", "qux"} # ``` def each - EntryIterator(K, V).new(self, @first) + EntryIterator(K, V).new(self, @first, @rebuild_num) end # Calls the given block for each key-value pair and passes in the key. @@ -354,7 +324,7 @@ class Hash(K, V) # key # => "baz" # ``` def each_key - KeyIterator(K, V).new(self, @first) + KeyIterator(K, V).new(self, @first, @rebuild_num) end # Calls the given block for each key-value pair and passes in the value. @@ -385,7 +355,7 @@ class Hash(K, V) # value # => "qux" # ``` def each_value - ValueIterator(K, V).new(self, @first) + ValueIterator(K, V).new(self, @first, @rebuild_num) end # Returns a new `Array` with all the keys. @@ -421,8 +391,11 @@ class Hash(K, V) # h.key_index("qux") # => nil # ``` def key_index(key) - each_with_index do |(my_key, my_value), index| - return index if key == my_key + hash = hash_key(key) + index = 0 + each_entry do |entry| + return index if entry.value.hashsum == hash && entry.value.key == key + index += 1 end nil end @@ -504,11 +477,9 @@ class Hash(K, V) end # Equivalent to `Hash#reject`, but makes modification on the current object rather that returning a new one. Returns `nil` if no changes were made. - def reject!(&block : K, V -> _) + def reject! num_entries = size - each do |key, value| - delete(key) if yield(key, value) - end + delete_if { |k, v| yield k, v } num_entries == size ? nil : self end @@ -559,8 +530,7 @@ class Hash(K, V) # h # => {"a" => 1, "c" => 3} # ``` def select!(keys : Array | Tuple) - each { |k, v| delete(k) unless keys.includes?(k) } - self + delete_if { |k, v| !keys.includes?(k) } end def select!(*keys) @@ -606,7 +576,7 @@ class Hash(K, V) # Returns the first key in the hash. def first_key - @first.not_nil!.key + first_entry.not_nil!.value.key end # Returns the first key if it exists, or returns `nil`. @@ -618,17 +588,17 @@ class Hash(K, V) # hash.first_key? # => nil # ``` def first_key? - @first.try &.key + first_entry.try &.value.key end # Returns the first value in the hash. def first_value - @first.not_nil!.value + first_entry.not_nil!.value.value end # Similar to `#first_key?`, but returns its value. def first_value? - @first.try &.value + first_entry.try &.value.value end # Deletes and returns the first key-value pair in the hash, @@ -673,10 +643,12 @@ class Hash(K, V) # hash # => {} # ``` def shift - first = @first - if first - delete first.key - {first.key, first.value} + entry = first_entry + if entry + res = entry.value.pair + clear_entry(entry) + @size -= 1 + res else yield end @@ -689,21 +661,25 @@ class Hash(K, V) # hash.clear # => {} # ``` def clear - @buckets_size.times do |i| - @buckets[i] = nil - end + #resize_data(0_u8) + # have to explicitely clear @index cause it is reused as UInt32 + #@index = Pointer(UInt32).null + #@rebuild_num += 1_u16 + #@first = 0_u32 + #@last = 0_u32 @size = 0 - @first = nil - @last = nil + @first = @last + rehash self end # Compares with *other*. Returns `true` if all key-value pairs are the same. def ==(other : Hash) return false unless size == other.size - each do |key, value| - entry = other.find_entry(key) - return false unless entry && entry.value == value + each_entry do |entry| + other_entry = other.find_entry(entry.value.hashsum, entry.value.key) + return false if other_entry.null? + return false unless other_entry.value.value == entry.value.value end true end @@ -714,11 +690,10 @@ class Hash(K, V) # order of the keys. result = hasher.result - each do |key, value| + each_entry do |entry| copy = hasher - copy = key.hash(copy) - copy = value.hash(copy) - result += copy.result + copy = entry.value.value.hash(copy) + result += copy.result ^ entry.value.hashsum end result.hash(hasher) @@ -733,11 +708,36 @@ class Hash(K, V) # hash_a # => {"foo" => "bar"} # ``` def dup - hash = Hash(K, V).new(initial_capacity: @buckets_size) - each do |key, value| - hash[key] = value + copy = super + copy.init_dup(self) + end + + protected def init_dup(original) + index = nindex(@format) + unless index.zero? + @index = Pointer(UInt32).malloc(index) + @index.copy_from(original.@index, index) end - hash + + nchunks = nentries(@format) / CHUNK + if nchunks > 1 + new_chunks = Pointer(Pointer(Entry(K, V))).malloc(nchunks) + @entries = new_chunks.as(Pointer(Void)) + old_chunks = original.@entries.as(Pointer(Pointer(Entry(K, V)))) + else + new_chunks = pointerof(@entries).as(Pointer(Pointer(Entry(K, V)))) + old_chunks = pointerof(original.@entries).as(Pointer(Pointer(Entry(K, V)))) + end + new_chunks.clear(nchunks) + if !empty? + last_chunkn = (@last - 1) / CHUNK + 0.upto(last_chunkn) do |i| + chunk = Pointer(Entry(K, V)).malloc(CHUNK) + chunk.copy_from(old_chunks[i], CHUNK) + new_chunks[i] = chunk + end + end + self end # Similar to `#dup`, but duplicates the values as well. @@ -749,11 +749,16 @@ class Hash(K, V) # hash_a # => {"foobar" => {"foo" => "bar"}} # ``` def clone - hash = Hash(K, V).new(initial_capacity: @buckets_size) - each do |key, value| - hash[key] = value.clone + copy = dup + copy.init_clone + end + + protected def init_clone + each_entry do |entry| + entry.value.key = entry.value.key.clone + entry.value.value = entry.value.value.clone end - hash + self end def inspect(io : IO) @@ -804,20 +809,6 @@ class Hash(K, V) self end - def rehash - new_size = calculate_new_size(@size) - @buckets = @buckets.realloc(new_size) - new_size.times { |i| @buckets[i] = nil } - @buckets_size = new_size - entry = @last - while entry - index = bucket_index entry.key - entry.next = @buckets[index] - @buckets[index] = entry - entry = entry.back - end - end - # Inverts keys and values. If there are duplicated values, the last key becomes the new value. # # ``` @@ -825,92 +816,326 @@ class Hash(K, V) # {"foo" => "bar", "baz" => "bar"}.invert # => {"bar" => "baz"} # ``` def invert - hash = Hash(V, K).new(initial_capacity: @buckets_size) + hash = Hash(V, K).new(initial_capacity: @size) self.each do |k, v| hash[v] = k end hash end - protected def find_entry(key) - return nil if empty? + # Implementation + + CHUNK = 8_u32 - index = bucket_index key - entry = @buckets[index] - find_entry_in_bucket entry, key + @[AlwaysInline] + private def chunks_ptr + capa = nentries(@format) + if capa <= CHUNK + pointerof(@entries).as(Pointer(Pointer(Entry(K, V)))) + else + @entries.as(Pointer(Pointer(Entry(K, V)))) + end end - private def insert_in_bucket(index, key, value) - entry = @buckets[index] - if entry - while entry - if entry.key == key - entry.value = value - return nil + protected def entry_at(i, chunks = chunks_ptr) + chunks[i / CHUNK] + i % CHUNK + end + + @[AlwaysInline] + private def index_ptr + if indexmask(@format) == 0 + pointerof(@index).as(UInt32*) + else + @index + end + end + + @[AlwaysInline] + private def each_entry + return if empty? + rnum = @rebuild_num + chunks = chunks_ptr + @first.upto(@last - 1) do |i| + entry = entry_at(i, chunks) + if !entry.value.empty? + yield entry + raise "Hash modified during iteration" unless rnum == @rebuild_num + elsif @first == i + @first += 1 + end + end + nil + end + + @[AlwaysInline] + private def first_entry + each_entry { |entry| break entry } + end + + protected def find_entry(hash, key) : Pointer(Entry(K, V)) + unless empty? + chunks = chunks_ptr + mask = indexmask(@format) + pos = hash & mask + idx = index_ptr[pos] + while idx != 0 + entry = entry_at(~idx, chunks) + if entry.value.hashsum == hash && entry.value.key == key + return entry end - if entry.next - entry = entry.next + idx = entry.value.next + end + end + Pointer(Entry(K, V)).null + end + + protected def find_entry_fix_delete(hash, key) : Pointer(Entry(K, V)) + unless empty? + chunks = chunks_ptr + mask = indexmask(@format) + pos = hash & mask + prev_idx = index_ptr + pos + idx = prev_idx.value + while idx != 0 + entry = entry_at(~idx, chunks) + idx = entry.value.next + if entry.value.hashsum == hash && entry.value.key == key + return entry + elsif entry.value.empty? + prev_idx.value = idx else - return entry.next = Entry(K, V).new(key, value) + prev_idx = entry.value.next_ptr end end + end + Pointer(Entry(K, V)).null + end + + def rehash + @rebuild_num += 1_u16 + if needs_shrink(@size, @format) + reclaim_without_index + # attention: be careful for @format underflow + # currently it is safe because of `format > 1` in needs_shrink + #if needs_shrink(@size, @format - 2) + # resize_data(@format - 1) + #end + fix_index + elsif nentries(@format + 1) == 0 + raise "Hash table too big" else - return @buckets[index] = Entry(K, V).new(key, value) + resize_data(@format + 1) + if indexmask(@format) != indexmask(@format - 1) + reclaim_without_index + fix_index + end end end - private def find_entry_in_bucket(entry, key) - while entry - if entry.key == key - return entry + private def resize_data(newsz) + oldsz = @format + old_nindex = nindex(oldsz) + new_nindex = nindex(newsz) + if new_nindex != old_nindex + if old_nindex == 0 + # explicitely alloc because index was reused as UInt32 + @index = Pointer(UInt32).malloc(new_nindex) + elsif new_nindex == 0 + @index.realloc(0) + # explicitely clear because index will be reused as UInt32 + @index = Pointer(UInt32).null + else + @index = @index.realloc(new_nindex) end - entry = entry.next end - nil + + old_nchunks = nentries(oldsz) / CHUNK + new_nchunks = nentries(newsz) / CHUNK + if new_nchunks > 1 + new_chunks = Pointer(Pointer(Entry(K, V))).malloc(new_nchunks) + else + new_chunks = pointerof(@entries).as(Pointer(Pointer(Entry(K, V)))) + end + if old_nchunks > 1 + old_chunks = @entries.as(Pointer(Pointer(Entry(K, V)))) + else + old_chunks = pointerof(@entries).as(Pointer(Pointer(Entry(K, V)))) + end + if old_nchunks < new_nchunks + old_chunks.copy_to(new_chunks, old_nchunks) + (new_chunks + old_nchunks).clear(new_nchunks - old_nchunks) + else + old_chunks.copy_to(new_chunks, new_nchunks) + end + if old_nchunks > 1 + old_chunks.realloc(0) + end + if new_nchunks > 1 + @entries = new_chunks.as(Pointer(Void)) + end + @format = newsz + end + + private def needs_shrink(size : Int32, format : UInt8) : Bool + format > 1 && size < nentries(format - 1) + end + + private def reclaim_without_index + @rebuild_num += 1_u16 + pos = 0_u32 + chunks = chunks_ptr + unless empty? + idx = @first + if @first == 0_u32 + if @last == @size + pos = idx = @last + else + while true + entry = entry_at(idx, chunks) + break if entry.value.empty? + idx += 1 + end + pos = idx + end + end + idx.upto(@last - 1) do |i| + entry = entry_at(i, chunks) + unless entry.value.empty? + entry_at(pos, chunks).value = entry.value + pos += 1 + end + end + end + # clean tail to help garbage collector + chunkn = (pos - 1) / CHUNK + if (chpos = pos % CHUNK) != 0 + chunk = chunks[chunkn] + (chunk + chpos).clear(CHUNK - chpos) + end + ((pos + CHUNK - 1) / CHUNK).upto((@last - 1) / CHUNK) do |i| + chunks[i].clear(CHUNK) + end + @first = 0_u32 + @last = pos + end + + private def fix_index + index = index_ptr + mask = indexmask(@format) + index.clear(mask + 1) + return if empty? + chunks = chunks_ptr + 0_u32.upto(@last - 1) do |i| + entry = entry_at(i, chunks) + pos = entry.value.hashsum & mask + entry.value.next = index[pos] + index[pos] = ~i + end end - private def bucket_index(key) - key.hash.remainder(@buckets_size).to_i + private def push_entry(hash : UInt32, key, val) : UInt32 + if @last == nentries(@format) + rehash + end + idx = @last + chunks = chunks_ptr + chunk = chunks[idx / CHUNK] + if chunk.null? + chunk = Pointer(Entry(K, V)).malloc(CHUNK) + chunk.clear(CHUNK) + chunks[idx / CHUNK] = chunk + end + entry = chunk + idx % CHUNK + entry.value.hashsum = hash + entry.value.key = key + entry.value.value = val + + mask = indexmask(@format) + index = index_ptr + pos = hash & mask + entry.value.next = index[pos] + index[pos] = ~idx + + @last += 1 + @size += 1 + idx + end + + def hash_key(key) + h = key.hash.to_u32 + {% if flag?(:bits32) %} + h | 0xC0000000_u32 + {% else %} + h | 0x80000000_u32 + {% end %} + end + + private def nindex(sz) + mask = FORMATS[sz].indexmask + mask + (mask != 0 ? 1 : 0) + end + + private def indexmask(sz) + FORMATS[sz].indexmask + end + + private def nentries(sz) + FORMATS[sz].nentries end private def calculate_new_size(size) - new_size = 8 - HASH_PRIMES.each do |hash_size| - return hash_size if new_size > size - new_size <<= 1 + (1...FORMATS.size).each do |i| + return i.to_u8 if FORMATS[i].nentries >= size end raise "Hash table too big" end - private class Entry(K, V) - getter key : K + private def clear_entry(entry) + nxt = entry.value.next + entry.clear + entry.value.next = nxt + end + + private struct Entry(K, V) + property hashsum : UInt32 + property next : UInt32 + property key : K property value : V - # Next in the linked list of each bucket - property next : self? + def initialize(@key : K, @value : V, @hashsum : UInt32, @next : UInt32) + end - # Next in the ordered sense of hash - property fore : self? + def pair : {K, V} + {@key, @value} + end - # Previous in the ordered sense of hash - property back : self? + def empty? + @hashsum.zero? + end - def initialize(@key : K, @value : V) + def next_ptr + pointerof(@next) end end private module BaseIterator - def initialize(@hash, @current) + def initialize(@hash, @current, @rebuild_num) end def base_next - if current = @current - value = yield current - @current = current.fore - value - else - stop + if @hash.@rebuild_num != @rebuild_num + raise "Hash modified during iteration" end + while @current < @hash.@last + entry = @hash.entry_at(@current) + unless entry.value.empty? + value = yield entry + @current += 1_u32 + return value + end + @current += 1_u32 + end + stop end def rewind @@ -923,10 +1148,11 @@ class Hash(K, V) include Iterator({K, V}) @hash : Hash(K, V) - @current : Entry(K, V)? + @current : UInt32 + @rebuild_num : UInt16 def next - base_next { |entry| {entry.key, entry.value} } + base_next { |entry| {entry.value.key, entry.value.value} } end end @@ -935,10 +1161,11 @@ class Hash(K, V) include Iterator(K) @hash : Hash(K, V) - @current : Entry(K, V)? + @current : UInt32 + @rebuild_num : UInt16 def next - base_next &.key + base_next &.value.key end end @@ -947,43 +1174,44 @@ class Hash(K, V) include Iterator(V) @hash : Hash(K, V) - @current : Entry(K, V)? + @current : UInt32 + @rebuild_num : UInt16 def next - base_next &.value + base_next &.value.value end end # :nodoc: - HASH_PRIMES = [ - 8 + 3, - 16 + 3, - 32 + 5, - 64 + 3, - 128 + 3, - 256 + 27, - 512 + 9, - 1024 + 9, - 2048 + 5, - 4096 + 3, - 8192 + 27, - 16384 + 43, - 32768 + 3, - 65536 + 45, - 131072 + 29, - 262144 + 3, - 524288 + 21, - 1048576 + 7, - 2097152 + 17, - 4194304 + 15, - 8388608 + 9, - 16777216 + 43, - 33554432 + 35, - 67108864 + 15, - 134217728 + 29, - 268435456 + 3, - 536870912 + 11, - 1073741824 + 85, - 0, - ] + record Format, nentries : UInt32, indexmask : UInt32 + {% begin %} + # :nodoc: + FORMATS = StaticArray[ + Format.new(0_u32, 0_u32), + Format.new(8_u32, 0_u32), + {% if flag?(:bits32) %} + + {% for i in 4..26 %} + {% p = 1 << i %} + Format.new({{p}}_u32, {{p/2 - 1}}_u32), + Format.new({{p + p/2}}_u32, {{p/2 - 1}}_u32), + {% end %} + + {% else %} + + {% for i in 4..10 %} + {% p = 1 << i %} + Format.new({{p}}_u32, {{p - 1}}_u32), + Format.new({{p + p/2}}_u32, {{p - 1}}_u32), + {% end %} + {% for i in 11..30 %} + {% p = 1 << i %} + Format.new({{p}}_u32, {{p*2 - 1}}_u32), + Format.new({{p + p/2}}_u32, {{p*2 - 1}}_u32), + {% end %} + + {% end %} + Format.new(0_u32, 0_u32), + ] +{% end %} end