-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 74c247c
Showing
7 changed files
with
468 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
name: CI | ||
on: | ||
push: | ||
branches: | ||
- main | ||
pull_request: | ||
|
||
jobs: | ||
build: | ||
name: CI | ||
runs-on: ubuntu-latest | ||
|
||
steps: | ||
- name: Log | ||
env: | ||
CI_EVENT_ACTION: ${{ github.event.action }} | ||
CI_PR_TITLE: ${{ github.event.pull_request.title }} | ||
CI_PR_PREV_TITLE: ${{ github.event.changes.title.from }} | ||
run: | | ||
echo github.event.action=$CI_EVENT_ACTION | ||
echo github.event.pull_request.title=$CI_PR_TITLE | ||
echo github.event.changes.title.from=$CI_PR_PREV_TITLE | ||
- name: Set up Go | ||
uses: actions/setup-go@v2 | ||
with: | ||
go-version: '~1.16.6' | ||
id: go | ||
|
||
- name: Install utilities | ||
run: | | ||
go install golang.org/x/lint/golint@latest | ||
go install golang.org/x/tools/cmd/goimports@latest | ||
go install honnef.co/go/tools/cmd/staticcheck@latest | ||
# display Go environment for reference | ||
go env | ||
- name: Check out code | ||
uses: actions/checkout@v2 | ||
|
||
- uses: actions/cache@v2 | ||
with: | ||
path: ~/go/pkg/mod | ||
key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} | ||
restore-keys: | | ||
${{ runner.os }}-go- | ||
- name: Get dependencies | ||
run: | | ||
go mod tidy | ||
/usr/bin/git diff --exit-code | ||
- name: Build | ||
run: | | ||
go build -v ./... | ||
- name: Check | ||
run: | | ||
go vet ./... | ||
golint ./... | ||
staticcheck ./... | ||
goimports -w . | ||
/usr/bin/git diff --exit-code | ||
- name: Test | ||
run: | | ||
go test -v ./... |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
Copyright 2021 The Sensible Code Company Ltd | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and | ||
associated documentation files (the "Software"), to deal in the Software without restriction, | ||
including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, | ||
subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all copies | ||
or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT | ||
NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | ||
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH | ||
THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
# faststringmap | ||
|
||
`faststringmap` is a fast read-only string keyed map for Go (golang). | ||
For our use case it is approximately 5 times faster than using Go's | ||
built-in map type with a string key. It also has the following advantages: | ||
|
||
* look up strings and byte slices without use of the `unsafe` package | ||
* minimal impact on GC due to lack of pointers in the data structure | ||
* data structure can be trivially serialized to disk or network | ||
|
||
The code provided implements a map from string to `uint32` which fits our | ||
use case, but you can easily substitute other value types. | ||
|
||
`faststringmap` is a variant of a data structure called a [Trie](https://en.wikipedia.org/wiki/Trie). | ||
At each level we use a slice to hold the next possible byte values. | ||
This slice is of length one plus the difference between the lowest and highest | ||
possible next bytes of strings in the map. Not all the entries in the slice are | ||
valid next bytes. `faststringmap` is thus more space efficient for keys using a | ||
small set of nearby runes, for example those using a lot of digits. | ||
|
||
## Example | ||
|
||
Example usage can be found in [``uint32_store_example_test.go``](uint32_store_example_test.go). | ||
|
||
## Motivation | ||
|
||
I created `faststringmap` in order to improve the speed of parsing CSV | ||
where the fields were category codes from survey data. The majority of these | ||
were numeric (`"1"`, `"2"`, `"3"`...) plus a distinct code for "not applicable". | ||
I was struck that in the simplest possible cases (e.g. `"1"` ... `"5"`) the map | ||
should be a single slice lookup. | ||
|
||
Our fast CSV parser provides fields as byte slices into the read buffer to | ||
avoid creating string objects. So I also wanted to facilitate key lookup from a | ||
`[]byte` rather than a string. This is not possible using a built-in Go map without | ||
use of the `unsafe` package. | ||
|
||
## Benchmarks | ||
|
||
Example benchmarks from my laptop: | ||
``` | ||
cpu: Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz | ||
BenchmarkUint32Store | ||
BenchmarkUint32Store-8 218463 4959 ns/op | ||
BenchmarkGoStringToUint32 | ||
BenchmarkGoStringToUint32-8 49279 24483 ns/op | ||
``` | ||
|
||
## Improvements | ||
|
||
You can improve the performance further by using a slice for the ``next`` fields. | ||
This avoids a bounds check when looking up the entry for a byte. However, it | ||
comes at the cost of easy serialization and introduces a lot of pointers which | ||
will have an impact on GC. It is not possible to directly construct the slice version | ||
in the same way so that the whole store is one block of memory. Either create as in | ||
this code and then derive the slice version or create distinct slice objects at each level. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
module github.com/sensiblecodeio/faststringmap | ||
|
||
go 1.16 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
// Copyright 2021 The Sensible Code Company Ltd | ||
// Author: Duncan Harris | ||
|
||
package faststringmap | ||
|
||
import ( | ||
"sort" | ||
) | ||
|
||
// Uint32Store is a fast read-only map from string to uint32.
// Lookups are about 5x faster than the built-in Go map type.
type Uint32Store struct {
	store []byteValue
}

// byteValue is one element of the store. It records whether the byte
// sequence that leads to it is a key, and where in the store the elements
// for the possible next bytes are found.
type byteValue struct {
	nextLo     uint32 // index in store of the first element for the next bytes
	nextLen    byte   // number of elements in store used for the possible next bytes
	nextOffset byte   // byte value that corresponds to the element at nextLo
	valid      bool   // true if the byte sequence ending here is a key in the map
	value      uint32 // value for the byte sequence ending here
}

// Uint32Source supplies the data used to initialise a Uint32Store.
type Uint32Source interface {
	// AppendKeys should append the keys of the map to the supplied slice
	// and return the resulting slice.
	AppendKeys([]string) []string
	// Get should return the value for the supplied key.
	Get(string) uint32
}
|
||
// NewUint32Store creates from the data supplied in srcMap | ||
func NewUint32Store(srcMap Uint32Source) Uint32Store { | ||
m := Uint32Store{store: make([]byteValue, 1)} | ||
if keys := srcMap.AppendKeys([]string(nil)); len(keys) > 0 { | ||
sort.Strings(keys) | ||
m.makeByteValue(&m.store[0], keys, 0, srcMap) | ||
} | ||
return m | ||
} | ||
|
||
// makeByteValue will initialise the supplied byteValue for | ||
// the sorted strings in slice a considering bytes at byteIndex in the strings | ||
func (m *Uint32Store) makeByteValue(bv *byteValue, a []string, byteIndex int, srcMap Uint32Source) { | ||
// if there is a string with no more bytes then it is always first because they are sorted | ||
if len(a[0]) == byteIndex { | ||
bv.valid = true | ||
bv.value = srcMap.Get(a[0]) | ||
a = a[1:] | ||
} | ||
if len(a) == 0 { | ||
return | ||
} | ||
bv.nextOffset = a[0][byteIndex] // lowest value for next byte | ||
bv.nextLen = a[len(a)-1][byteIndex] - // highest value for next byte | ||
bv.nextOffset + 1 // minus lowest value +1 = number of possible next bytes | ||
bv.nextLo = uint32(len(m.store)) // first byteValue struct to use | ||
|
||
// allocate enough byteValue structs - they default to "not valid" | ||
m.store = append(m.store, make([]byteValue, bv.nextLen)...) | ||
|
||
for i, n := 0, len(a); i < n; { | ||
// find range of strings starting with the same byte | ||
iSameByteHi := i + 1 | ||
for iSameByteHi < n && a[iSameByteHi][byteIndex] == a[i][byteIndex] { | ||
iSameByteHi++ | ||
} | ||
nextStoreIndex := bv.nextLo + uint32(a[i][byteIndex]-bv.nextOffset) | ||
m.makeByteValue(&m.store[nextStoreIndex], a[i:iSameByteHi], byteIndex+1, srcMap) | ||
i = iSameByteHi | ||
} | ||
} | ||
|
||
// LookupString looks up the supplied string in the map | ||
func (m *Uint32Store) LookupString(s string) (uint32, bool) { | ||
bv := &m.store[0] | ||
for i, n := 0, len(s); i < n; i++ { | ||
b := s[i] | ||
if b < bv.nextOffset { | ||
return 0, false | ||
} | ||
ni := b - bv.nextOffset | ||
if ni >= bv.nextLen { | ||
return 0, false | ||
} | ||
bv = &m.store[bv.nextLo+uint32(ni)] | ||
} | ||
return bv.value, bv.valid | ||
} | ||
|
||
// LookupBytes looks up the supplied byte slice in the map | ||
func (m *Uint32Store) LookupBytes(s []byte) (uint32, bool) { | ||
bv := &m.store[0] | ||
for _, b := range s { | ||
if b < bv.nextOffset { | ||
return 0, false | ||
} | ||
ni := b - bv.nextOffset | ||
if ni >= bv.nextLen { | ||
return 0, false | ||
} | ||
bv = &m.store[bv.nextLo+uint32(ni)] | ||
} | ||
return bv.value, bv.valid | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
package faststringmap_test | ||
|
||
import ( | ||
"fmt" | ||
"sort" | ||
"strings" | ||
|
||
"github.com/sensiblecodeio/faststringmap" | ||
) | ||
|
||
func Example() { | ||
m := exampleSource{ | ||
"key1": 42, | ||
"key2": 27644437, | ||
"l": 2, | ||
} | ||
|
||
fm := faststringmap.NewUint32Store(m) | ||
|
||
// add an entry that is not in the fast map | ||
m["m"] = 4 | ||
|
||
// sort the keys so output is the same for each test run | ||
keys := make([]string, 0, len(m)) | ||
for k := range m { | ||
keys = append(keys, k) | ||
} | ||
sort.Strings(keys) | ||
|
||
// lookup every key in the fast map and print the corresponding value | ||
for _, k := range keys { | ||
v, ok := fm.LookupString(k) | ||
fmt.Printf("%q: %d, %v\n", k, v, ok) | ||
} | ||
|
||
// Dump out the store to aid in understanding the implementation | ||
fmt.Println() | ||
dump := fmt.Sprintf("%+v", fm) | ||
dump = strings.ReplaceAll(dump, "}", "}\n") | ||
dump = strings.ReplaceAll(dump, "[", "[\n ") | ||
fmt.Println(dump) | ||
|
||
// Output: | ||
// | ||
// "key1": 42, true | ||
// "key2": 27644437, true | ||
// "l": 2, true | ||
// "m": 0, false | ||
// | ||
// {store:[ | ||
// {nextLo:1 nextLen:2 nextOffset:107 valid:false value:0} | ||
// {nextLo:3 nextLen:1 nextOffset:101 valid:false value:0} | ||
// {nextLo:0 nextLen:0 nextOffset:0 valid:true value:2} | ||
// {nextLo:4 nextLen:1 nextOffset:121 valid:false value:0} | ||
// {nextLo:5 nextLen:2 nextOffset:49 valid:false value:0} | ||
// {nextLo:0 nextLen:0 nextOffset:0 valid:true value:42} | ||
// {nextLo:0 nextLen:0 nextOffset:0 valid:true value:27644437} | ||
// ]} | ||
} | ||
|
||
// exampleSource is a plain Go map that supplies the keys and values
// from which the example store is built.
type exampleSource map[string]uint32

// AppendKeys appends every key of the map to a, in the map's iteration
// order, and returns the extended slice.
func (s exampleSource) AppendKeys(a []string) []string {
	keys := a
	for name := range s {
		keys = append(keys, name)
	}
	return keys
}

// Get returns the value held for k, or 0 when k is not a key of the map.
func (s exampleSource) Get(k string) uint32 {
	if value, present := s[k]; present {
		return value
	}
	return 0
}
Oops, something went wrong.