diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml index 04bac33..2ed7a17 100644 --- a/.github/workflows/ccpp.yml +++ b/.github/workflows/ccpp.yml @@ -9,8 +9,10 @@ jobs: strategy: matrix: os: [ubuntu-latest, macOS-latest] - + steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 + with: + submodules: true - name: make run: make diff --git a/.gitignore b/.gitignore index 820c203..ef20697 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,4 @@ -swuniq *.o sonar-project.properties -bin/* -bin -benchsuite/corpustest +out/* +out diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..9f60d81 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "uthash"] + path = uthash + url = https://github.com/troydhanson/uthash +[submodule "xxHash"] + path = xxHash + url = https://github.com/Cyan4973/xxHash diff --git a/Makefile b/Makefile index 3dc6d6e..056a796 100644 --- a/Makefile +++ b/Makefile @@ -1,23 +1,39 @@ SHELL=/bin/sh TARGET=swuniq -SRCS=swuniq.c xxhash.h -CFLAGS=-O2 +SRCS=swuniq.c DESTDIR= prefix=/usr/local/bin INSTALL=install INSTALL_PROGRAM=$(INSTALL) +CFLAGS ?= -O2 -pie +DEBUGFLAGS+=-Wall -Wextra -Wconversion -Wcast-qual -Wcast-align \ + -Wshadow -Wundef -Wstrict-overflow=5 -Wstrict-prototypes \ + -Wswitch-enum -Wredundant-decls -Wvla -Wnarrowing \ + -Wpointer-arith -Wformat-security -Wformat=2 \ + -Winit-self -Wfloat-equal -Wwrite-strings +CFLAGS += $(DEBUGFLAGS) + +# Add support for xxHash dispatch +ifeq ($(DISPATCH),1) + CFLAGS += -DXXHSUM_DISPATCH=1 + SRCS += xxHash/xxh_x86dispatch.c +endif + +.PHONY: swuniq swuniq: $(SRCS) mkdir -p out - $(CC) $(CFLAGS) $(TARGET).c -o out/$(TARGET) + $(CC) $(CFLAGS) $(SRCS) -o out/$(TARGET) static: $(SRCS) mkdir -p out - $(CC) $(CFLAGS) -static $(TARGET).c -o out/$(TARGET)-static - -.PHONY: all -all: swuniq static + $(CC) $(CFLAGS) -static $(SRCS) -o out/$(TARGET)-static +# ## dispatch only works for x86/x64 systems +# dispatch: CPPFLAGS += -DXXHSUM_DISPATCH=1 +# dispatch: xxHash/xxhash.o xxHash/xxh_x86dispatch.o swuniq.c +# $(CC) $(CFLAGS) $^ $(LDFLAGS) +# xxh_x86dispatch.o: xxHash/xxh_x86dispatch.c xxHash/xxh_x86dispatch.h xxHash/xxhash.h install: swuniq mkdir -p $(DESTDIR)$(prefix) @@ -43,7 +59,7 @@ install-strip-all: .PHONY: check check: - @if [ "$$({ seq 1 10; seq 1 10; } | out/swuniq -w 10 | wc -l)" -eq 10 ]; then \ + @if [ "$$({ seq 1 10; seq 1 10; } | out/swuniq -w 10 | wc -l)" -eq 10 ]; then \ echo 'Test suite result [SUCCESS]' \ exit 0 ;\ else \ diff --git a/swuniq.c b/swuniq.c index 25a49fe..b40edee 100644 --- a/swuniq.c +++ b/swuniq.c @@ -1,138 +1,197 @@ /* -* swuniq - sliding window uniq -* -* MIT License -* -* Copyright (c) 2018 Miguel Terron -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in all -* copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -* -*/ + * swuniq - sliding window uniq + * + * MIT License + * + * Copyright (c) 2018 Miguel Terron + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ /* swuniq : * TODO: Description */ - /* ************************************ * Includes **************************************/ -#include +#include +#include +#include +#include #include +#include #include -#include - #include -#include -#include #define XXH_PRIVATE_API #define XXH_STATIC_LINKING_ONLY -#define XXH_INLINE_ALL -#include "xxhash.h" - -#include "utringbuffer.h" +#include "xxHash/xxhash.h" + +#ifdef XXHSUM_DISPATCH +# include "xxHash/xxh_x86dispatch.h" +#endif + +#include "uthash/src/utringbuffer.h" + +/* makes the next part easier */ +#if defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64) +# define ARCH_X64 1 +# define ARCH_X86 "x86_64" +#elif defined(__i386__) || defined(_M_IX86) || defined(_M_IX86_FP) +# define ARCH_X86 "i386" +#endif +/* Try to detect the architecture. */ +#if defined(ARCH_X86) +# if defined(XXHSUM_DISPATCH) +# define ARCH ARCH_X86 " autoVec" +# elif defined(__AVX512F__) +# define ARCH ARCH_X86 " + AVX512" +# elif defined(__AVX2__) +# define ARCH ARCH_X86 " + AVX2" +# elif defined(__AVX__) +# define ARCH ARCH_X86 " + AVX" +# elif defined(_M_X64) || defined(_M_AMD64) || defined(__x86_64__) \ + || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP == 2) +# define ARCH ARCH_X86 " + SSE2" +# else +# define ARCH ARCH_X86 +# endif +#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) +# define ARCH "aarch64 + NEON" +#elif defined(__arm__) || defined(__thumb__) || defined(__thumb2__) || defined(_M_ARM) +/* ARM has a lot of different features that can change xxHash significantly. 
*/ +# if defined(__thumb2__) || (defined(__thumb__) && (__thumb__ == 2 || __ARM_ARCH >= 7)) +# define ARCH_THUMB " Thumb-2" +# elif defined(__thumb__) +# define ARCH_THUMB " Thumb-1" +# else +# define ARCH_THUMB "" +# endif +/* ARMv7 has unaligned by default */ +# if defined(__ARM_FEATURE_UNALIGNED) || __ARM_ARCH >= 7 || defined(_M_ARMV7VE) +# define ARCH_UNALIGNED " + unaligned" +# else +# define ARCH_UNALIGNED "" +# endif +# if defined(__ARM_NEON) || defined(__ARM_NEON__) +# define ARCH_NEON " + NEON" +# else +# define ARCH_NEON "" +# endif +# define ARCH "ARMv" EXPAND_AND_QUOTE(__ARM_ARCH) ARCH_THUMB ARCH_NEON ARCH_UNALIGNED +#elif defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) +# if defined(__GNUC__) && defined(__POWER9_VECTOR__) +# define ARCH "ppc64 + POWER9 vector" +# elif defined(__GNUC__) && defined(__POWER8_VECTOR__) +# define ARCH "ppc64 + POWER8 vector" +# else +# define ARCH "ppc64" +# endif +#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) +# define ARCH "ppc" +#elif defined(__AVR) +# define ARCH "AVR" +#elif defined(__mips64) +# define ARCH "mips64" +#elif defined(__mips) +# define ARCH "mips" +#elif defined(__s390x__) +# define ARCH "s390x" +#elif defined(__s390__) +# define ARCH "s390" +#else +# define ARCH "unknown" +#endif +static const int g_nbBits = (int)(sizeof(void*)*8); /********************************************************************************************************************/ -unsigned long long hashString(const void* buffer, size_t length) -{ - unsigned long long const seed = 1029384756; - unsigned long long const hash = XXH3_64bits_withSeed(buffer, length, seed); +unsigned long long hashString(const char *data) { + unsigned long long const hash = XXH3_64bits(data, strlen(data)); return hash; } /* returns 1 if the hash already exists on the ringbuffer */ -int lookup(const unsigned long long hash, const UT_ringbuffer* rbuffer) -{ - int out = 0; - - if (utringbuffer_len(rbuffer) == 0) return(out); - else - { - //unsigned long long *item; - for (int i=0; i < utringbuffer_len(rbuffer); i++) { +bool lookup(unsigned long long hash, const UT_ringbuffer *rbuffer) { + bool out = false; + + if (utringbuffer_len(rbuffer) == 0) + return (out); + else { + for (unsigned int i = 0; i < utringbuffer_len(rbuffer); i++) { unsigned long long *item = utringbuffer_eltptr(rbuffer, i); out = (hash == *item); - if (out) break; + if (out) + break; } - return(out); + return (out); } } /********************************************************** -* Main -**********************************************************/ -int main (int argc, char *argv[]){ - int wsize = 10; // Default window size + * Main + **********************************************************/ +int main(int argc, char *argv[]) { + unsigned int wsize = 10; // Default window size int c; - while ((c = getopt (argc, argv, "hw:")) != -1) - { - switch (c) - { - case 'w': - wsize = strtoumax(optarg, NULL, 10); - break; - case 'h': - default: - fprintf(stderr,"Usage: swuniq [-w N] INPUT\nFilter matching lines (within a configurable window) from INPUT\n(or stdin), writing to stdout.\n\n\t-w N Size of the sliding window to use for deduplication\nNote: By default swuniq will use a window of 10 lines.\n\n"); - exit(1); + while ((c = getopt(argc, argv, "hw:")) != -1) { + switch (c) { + case 'w': + wsize = strtoumax(optarg, NULL, 10); + break; + case 'h': + default: + #define HELP_MESSAGE "swuniq 0.6 by Miguel Terron compiled as %i-bit %s\nFilter matching lines (within a configurable window) 
from INPUT\n(or stdin), writing to stdout.\n\nUsage: swuniq [-w N] INPUT\n\t-w N Size of the sliding window to use for deduplication\nNote: By default swuniq will use a window of 10 lines.\n", g_nbBits, ARCH + fprintf(stderr, HELP_MESSAGE); + exit(1); } } // Open file if filename is provided - if(optind < argc) { - if ( freopen(argv[optind], "r", stdin) == NULL) - { - fprintf(stderr,"Can't open file %s",argv[optind]); + if (optind < argc) { + if (freopen(argv[optind], "r", stdin) == NULL) { + fprintf(stderr, "Can't open file %s", argv[optind]); exit(1); } } - char *buffer; - size_t bufsize = 6000; - UT_ringbuffer *history; - UT_icd ut_long_long_icd = {sizeof(long long), NULL, NULL, NULL }; + UT_icd ut_long_long_icd = {sizeof(long long), NULL, NULL, NULL}; utringbuffer_new(history, wsize, &ut_long_long_icd); - unsigned long long digest; - buffer = (char *)malloc(bufsize * sizeof(char)); - if( buffer == NULL ) - { - perror("Unable to allocate buffer"); - exit(1); - } - - while( -1 != getline(&buffer, &bufsize, stdin) ) - { - digest = hashString(buffer, strlen(buffer)); - if (!lookup(digest,history)) - { + char* line; + size_t bufsize = sysconf(_SC_PAGESIZE); + unsigned long long digest; + while (-1 != getline(&line, &bufsize, stdin)) { + digest = hashString(line); + if (!lookup(digest, history)) { utringbuffer_push_back(history, &digest); - printf("%s",buffer); + printf("%s", line); fflush(stdout); } } fclose(stdin); -// utringbuffer_free(history); + free(line); + // utringbuffer_free(history); exit(0); } diff --git a/utarray.h b/utarray.h deleted file mode 100644 index 6ed0dce..0000000 --- a/utarray.h +++ /dev/null @@ -1,238 +0,0 @@ -/* -Copyright (c) 2008-2018, Troy D. Hanson http://troydhanson.github.com/uthash/ -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER -OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*/ - -/* a dynamic array implementation using macros - */ -#ifndef UTARRAY_H -#define UTARRAY_H - -#define UTARRAY_VERSION 2.1.0 - -#include /* size_t */ -#include /* memset, etc */ -#include /* exit */ - -#ifdef __GNUC__ -#define UTARRAY_UNUSED __attribute__((__unused__)) -#else -#define UTARRAY_UNUSED -#endif - -#ifndef oom -#define oom() exit(-1) -#endif - -typedef void (ctor_f)(void *dst, const void *src); -typedef void (dtor_f)(void *elt); -typedef void (init_f)(void *elt); -typedef struct { - size_t sz; - init_f *init; - ctor_f *copy; - dtor_f *dtor; -} UT_icd; - -typedef struct { - unsigned i,n;/* i: index of next available slot, n: num slots */ - UT_icd icd; /* initializer, copy and destructor functions */ - char *d; /* n slots of size icd->sz*/ -} UT_array; - -#define utarray_init(a,_icd) do { \ - memset(a,0,sizeof(UT_array)); \ - (a)->icd = *(_icd); \ -} while(0) - -#define utarray_done(a) do { \ - if ((a)->n) { \ - if ((a)->icd.dtor) { \ - unsigned _ut_i; \ - for(_ut_i=0; _ut_i < (a)->i; _ut_i++) { \ - (a)->icd.dtor(utarray_eltptr(a,_ut_i)); \ - } \ - } \ - free((a)->d); \ - } \ - (a)->n=0; \ -} while(0) - -#define utarray_new(a,_icd) do { \ - (a) = (UT_array*)malloc(sizeof(UT_array)); \ - if ((a) == NULL) oom(); \ - utarray_init(a,_icd); \ -} while(0) - -#define utarray_free(a) do { \ - utarray_done(a); \ - free(a); \ -} while(0) - -#define utarray_reserve(a,by) do { \ - if (((a)->i+(by)) > (a)->n) { \ - char *utarray_tmp; \ - while (((a)->i+(by)) > (a)->n) { (a)->n = ((a)->n ? (2*(a)->n) : 8); } \ - utarray_tmp=(char*)realloc((a)->d, (a)->n*(a)->icd.sz); \ - if (utarray_tmp == NULL) oom(); \ - (a)->d=utarray_tmp; \ - } \ -} while(0) - -#define utarray_push_back(a,p) do { \ - utarray_reserve(a,1); \ - if ((a)->icd.copy) { (a)->icd.copy( _utarray_eltptr(a,(a)->i++), p); } \ - else { memcpy(_utarray_eltptr(a,(a)->i++), p, (a)->icd.sz); }; \ -} while(0) - -#define utarray_pop_back(a) do { \ - if ((a)->icd.dtor) { (a)->icd.dtor( _utarray_eltptr(a,--((a)->i))); } \ - else { (a)->i--; } \ -} while(0) - -#define utarray_extend_back(a) do { \ - utarray_reserve(a,1); \ - if ((a)->icd.init) { (a)->icd.init(_utarray_eltptr(a,(a)->i)); } \ - else { memset(_utarray_eltptr(a,(a)->i),0,(a)->icd.sz); } \ - (a)->i++; \ -} while(0) - -#define utarray_len(a) ((a)->i) - -#define utarray_eltptr(a,j) (((j) < (a)->i) ? 
_utarray_eltptr(a,j) : NULL) -#define _utarray_eltptr(a,j) ((a)->d + ((a)->icd.sz * (j))) - -#define utarray_insert(a,p,j) do { \ - if ((j) > (a)->i) utarray_resize(a,j); \ - utarray_reserve(a,1); \ - if ((j) < (a)->i) { \ - memmove( _utarray_eltptr(a,(j)+1), _utarray_eltptr(a,j), \ - ((a)->i - (j))*((a)->icd.sz)); \ - } \ - if ((a)->icd.copy) { (a)->icd.copy( _utarray_eltptr(a,j), p); } \ - else { memcpy(_utarray_eltptr(a,j), p, (a)->icd.sz); }; \ - (a)->i++; \ -} while(0) - -#define utarray_inserta(a,w,j) do { \ - if (utarray_len(w) == 0) break; \ - if ((j) > (a)->i) utarray_resize(a,j); \ - utarray_reserve(a,utarray_len(w)); \ - if ((j) < (a)->i) { \ - memmove(_utarray_eltptr(a,(j)+utarray_len(w)), \ - _utarray_eltptr(a,j), \ - ((a)->i - (j))*((a)->icd.sz)); \ - } \ - if ((a)->icd.copy) { \ - unsigned _ut_i; \ - for(_ut_i=0;_ut_i<(w)->i;_ut_i++) { \ - (a)->icd.copy(_utarray_eltptr(a, (j) + _ut_i), _utarray_eltptr(w, _ut_i)); \ - } \ - } else { \ - memcpy(_utarray_eltptr(a,j), _utarray_eltptr(w,0), \ - utarray_len(w)*((a)->icd.sz)); \ - } \ - (a)->i += utarray_len(w); \ -} while(0) - -#define utarray_resize(dst,num) do { \ - unsigned _ut_i; \ - if ((dst)->i > (unsigned)(num)) { \ - if ((dst)->icd.dtor) { \ - for (_ut_i = (num); _ut_i < (dst)->i; ++_ut_i) { \ - (dst)->icd.dtor(_utarray_eltptr(dst, _ut_i)); \ - } \ - } \ - } else if ((dst)->i < (unsigned)(num)) { \ - utarray_reserve(dst, (num) - (dst)->i); \ - if ((dst)->icd.init) { \ - for (_ut_i = (dst)->i; _ut_i < (unsigned)(num); ++_ut_i) { \ - (dst)->icd.init(_utarray_eltptr(dst, _ut_i)); \ - } \ - } else { \ - memset(_utarray_eltptr(dst, (dst)->i), 0, (dst)->icd.sz*((num) - (dst)->i)); \ - } \ - } \ - (dst)->i = (num); \ -} while(0) - -#define utarray_concat(dst,src) do { \ - utarray_inserta(dst, src, utarray_len(dst)); \ -} while(0) - -#define utarray_erase(a,pos,len) do { \ - if ((a)->icd.dtor) { \ - unsigned _ut_i; \ - for (_ut_i = 0; _ut_i < (len); _ut_i++) { \ - (a)->icd.dtor(utarray_eltptr(a, (pos) + _ut_i)); \ - } \ - } \ - if ((a)->i > ((pos) + (len))) { \ - memmove(_utarray_eltptr(a, pos), _utarray_eltptr(a, (pos) + (len)), \ - ((a)->i - ((pos) + (len))) * (a)->icd.sz); \ - } \ - (a)->i -= (len); \ -} while(0) - -#define utarray_renew(a,u) do { \ - if (a) utarray_clear(a); \ - else utarray_new(a, u); \ -} while(0) - -#define utarray_clear(a) do { \ - if ((a)->i > 0) { \ - if ((a)->icd.dtor) { \ - unsigned _ut_i; \ - for(_ut_i=0; _ut_i < (a)->i; _ut_i++) { \ - (a)->icd.dtor(_utarray_eltptr(a, _ut_i)); \ - } \ - } \ - (a)->i = 0; \ - } \ -} while(0) - -#define utarray_sort(a,cmp) do { \ - qsort((a)->d, (a)->i, (a)->icd.sz, cmp); \ -} while(0) - -#define utarray_find(a,v,cmp) bsearch((v),(a)->d,(a)->i,(a)->icd.sz,cmp) - -#define utarray_front(a) (((a)->i) ? (_utarray_eltptr(a,0)) : NULL) -#define utarray_next(a,e) (((e)==NULL) ? utarray_front(a) : ((((a)->i) > (utarray_eltidx(a,e)+1)) ? _utarray_eltptr(a,utarray_eltidx(a,e)+1) : NULL)) -#define utarray_prev(a,e) (((e)==NULL) ? utarray_back(a) : ((utarray_eltidx(a,e) > 0) ? _utarray_eltptr(a,utarray_eltidx(a,e)-1) : NULL)) -#define utarray_back(a) (((a)->i) ? (_utarray_eltptr(a,(a)->i-1)) : NULL) -#define utarray_eltidx(a,e) (((char*)(e) >= (a)->d) ? (((char*)(e) - (a)->d)/(a)->icd.sz) : -1) - -/* last we pre-define a few icd for common utarrays of ints and strings */ -static void utarray_str_cpy(void *dst, const void *src) { - char **_src = (char**)src, **_dst = (char**)dst; - *_dst = (*_src == NULL) ? 
NULL : strdup(*_src); -} -static void utarray_str_dtor(void *elt) { - char **eltc = (char**)elt; - if (*eltc != NULL) free(*eltc); -} -static const UT_icd ut_str_icd UTARRAY_UNUSED = {sizeof(char*),NULL,utarray_str_cpy,utarray_str_dtor}; -static const UT_icd ut_int_icd UTARRAY_UNUSED = {sizeof(int),NULL,NULL,NULL}; -static const UT_icd ut_ptr_icd UTARRAY_UNUSED = {sizeof(void*),NULL,NULL,NULL}; - - -#endif /* UTARRAY_H */ diff --git a/uthash b/uthash new file mode 160000 index 0000000..8e67ced --- /dev/null +++ b/uthash @@ -0,0 +1 @@ +Subproject commit 8e67ced1d1c5bd8141c542a22630e6de78aa6b90 diff --git a/utringbuffer.h b/utringbuffer.h deleted file mode 100644 index ce2890e..0000000 --- a/utringbuffer.h +++ /dev/null @@ -1,108 +0,0 @@ -/* -Copyright (c) 2015-2018, Troy D. Hanson http://troydhanson.github.com/uthash/ -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER -OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*/ - -/* a ring-buffer implementation using macros - */ -#ifndef UTRINGBUFFER_H -#define UTRINGBUFFER_H - -#define UTRINGBUFFER_VERSION 2.1.0 - -#include -#include -#include "utarray.h" // for "UT_icd" - -typedef struct { - unsigned i; /* index of next available slot; wraps at n */ - unsigned n; /* capacity */ - unsigned char f; /* full */ - UT_icd icd; /* initializer, copy and destructor functions */ - char *d; /* n slots of size icd->sz */ -} UT_ringbuffer; - -#define utringbuffer_init(a, _n, _icd) do { \ - memset(a, 0, sizeof(UT_ringbuffer)); \ - (a)->icd = *(_icd); \ - (a)->n = (_n); \ - if ((a)->n) { (a)->d = (char*)malloc((a)->n * (_icd)->sz); } \ -} while(0) - -#define utringbuffer_clear(a) do { \ - if ((a)->icd.dtor) { \ - if ((a)->f) { \ - unsigned _ut_i; \ - for (_ut_i = 0; _ut_i < (a)->n; ++_ut_i) { \ - (a)->icd.dtor(utringbuffer_eltptr(a, _ut_i)); \ - } \ - } else { \ - unsigned _ut_i; \ - for (_ut_i = 0; _ut_i < (a)->i; ++_ut_i) { \ - (a)->icd.dtor(utringbuffer_eltptr(a, _ut_i)); \ - } \ - } \ - } \ - (a)->i = 0; \ - (a)->f = 0; \ -} while(0) - -#define utringbuffer_done(a) do { \ - utringbuffer_clear(a); \ - free((a)->d); (a)->d = NULL; \ - (a)->n = 0; \ -} while(0) - -#define utringbuffer_new(a,n,_icd) do { \ - a = (UT_ringbuffer*)malloc(sizeof(UT_ringbuffer)); \ - utringbuffer_init(a, n, _icd); \ -} while(0) - -#define utringbuffer_free(a) do { \ - utringbuffer_done(a); \ - free(a); \ -} while(0) - -#define utringbuffer_push_back(a,p) do { \ - if ((a)->icd.dtor && (a)->f) { (a)->icd.dtor(_utringbuffer_internalptr(a,(a)->i)); } \ - if ((a)->icd.copy) { (a)->icd.copy( _utringbuffer_internalptr(a,(a)->i), p); } \ - else { memcpy(_utringbuffer_internalptr(a,(a)->i), p, (a)->icd.sz); }; \ - if (++(a)->i == (a)->n) { (a)->i = 0; (a)->f = 1; } \ -} while(0) - -#define utringbuffer_len(a) ((a)->f ? (a)->n : (a)->i) -#define utringbuffer_empty(a) ((a)->i == 0 && !(a)->f) -#define utringbuffer_full(a) ((a)->f != 0) - -#define _utringbuffer_real_idx(a,j) ((a)->f ? ((j) + (a)->i) % (a)->n : (j)) -#define _utringbuffer_internalptr(a,j) ((void*)((a)->d + ((a)->icd.sz * (j)))) -#define utringbuffer_eltptr(a,j) ((0 <= (j) && (j) < utringbuffer_len(a)) ? _utringbuffer_internalptr(a,_utringbuffer_real_idx(a,j)) : NULL) - -#define _utringbuffer_fake_idx(a,j) ((a)->f ? ((j) + (a)->n - (a)->i) % (a)->n : (j)) -#define _utringbuffer_internalidx(a,e) (((char*)(e) >= (a)->d) ? (((char*)(e) - (a)->d)/(a)->icd.sz) : -1) -#define utringbuffer_eltidx(a,e) _utringbuffer_fake_idx(a, _utringbuffer_internalidx(a,e)) - -#define utringbuffer_front(a) utringbuffer_eltptr(a,0) -#define utringbuffer_next(a,e) ((e)==NULL ? utringbuffer_front(a) : utringbuffer_eltptr(a, utringbuffer_eltidx(a,e)+1)) -#define utringbuffer_prev(a,e) ((e)==NULL ? utringbuffer_back(a) : utringbuffer_eltptr(a, utringbuffer_eltidx(a,e)-1)) -#define utringbuffer_back(a) (utringbuffer_empty(a) ? 
NULL : utringbuffer_eltptr(a, utringbuffer_len(a) - 1)) - -#endif /* UTRINGBUFFER_H */ diff --git a/xxHash b/xxHash new file mode 160000 index 0000000..94e5f23 --- /dev/null +++ b/xxHash @@ -0,0 +1 @@ +Subproject commit 94e5f23e736f2bb67ebdf90727353e65344f9fc0 diff --git a/xxh3.h b/xxh3.h deleted file mode 100644 index 9e3e88a..0000000 --- a/xxh3.h +++ /dev/null @@ -1,2112 +0,0 @@ -/* - * xxHash - Extremely Fast Hash algorithm - * Development source file for `xxh3` - * Copyright (C) 2019-present, Yann Collet - * - * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * You can contact the author at: - * - xxHash homepage: https://www.xxhash.com - * - xxHash source repository: https://github.com/Cyan4973/xxHash - */ - -/* - * Note: This file is separated for development purposes. - * It will be integrated into `xxhash.h` when development stage is completed. 
- * - * Credit: most of the work on vectorial and asm variants comes from @easyaspi314 - */ - -#ifndef XXH3_H_1397135465 -#define XXH3_H_1397135465 - -/* === Dependencies === */ -#ifndef XXHASH_H_5627135585666179 -/* special: when including `xxh3.h` directly, turn on XXH_INLINE_ALL */ -# undef XXH_INLINE_ALL /* avoid redefinition */ -# define XXH_INLINE_ALL -#endif -#include "xxhash.h" - - -/* === Compiler specifics === */ - -#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ -# define XXH_RESTRICT restrict -#else -/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */ -# define XXH_RESTRICT /* disable */ -#endif - -#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ - || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ - || defined(__clang__) -# define XXH_likely(x) __builtin_expect(x, 1) -# define XXH_unlikely(x) __builtin_expect(x, 0) -#else -# define XXH_likely(x) (x) -# define XXH_unlikely(x) (x) -#endif - -#if defined(__GNUC__) -# if defined(__AVX2__) -# include -# elif defined(__SSE2__) -# include -# elif defined(__ARM_NEON__) || defined(__ARM_NEON) -# define inline __inline__ /* clang bug */ -# include -# undef inline -# endif -#elif defined(_MSC_VER) -# include -#endif - -/* - * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while - * remaining a true 64-bit/128-bit hash function. - * - * This is done by prioritizing a subset of 64-bit operations that can be - * emulated without too many steps on the average 32-bit machine. - * - * For example, these two lines seem similar, and run equally fast on 64-bit: - * - * xxh_u64 x; - * x ^= (x >> 47); // good - * x ^= (x >> 13); // bad - * - * However, to a 32-bit machine, there is a major difference. - * - * x ^= (x >> 47) looks like this: - * - * x.lo ^= (x.hi >> (47 - 32)); - * - * while x ^= (x >> 13) looks like this: - * - * // note: funnel shifts are not usually cheap. - * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); - * x.hi ^= (x.hi >> 13); - * - * The first one is significantly faster than the second, simply because the - * shift is larger than 32. This means: - * - All the bits we need are in the upper 32 bits, so we can ignore the lower - * 32 bits in the shift. - * - The shift result will always fit in the lower 32 bits, and therefore, - * we can ignore the upper 32 bits in the xor. - * - * Thanks to this optimization, XXH3 only requires these features to be efficient: - * - * - Usable unaligned access - * - A 32-bit or 64-bit ALU - * - If 32-bit, a decent ADC instruction - * - A 32 or 64-bit multiply with a 64-bit result - * - For the 128-bit variant, a decent byteswap helps short inputs. - * - * The first two are already required by XXH32, and almost all 32-bit and 64-bit - * platforms which can run XXH32 can run XXH3 efficiently. - * - * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one - * notable exception. - * - * First of all, Thumb-1 lacks support for the UMULL instruction which - * performs the important long multiply. This means numerous __aeabi_lmul - * calls. - * - * Second of all, the 8 functional registers are just not enough. - * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need - * Lo registers, and this shuffling results in thousands more MOVs than A32. - * - * A32 and T32 don't have this limitation. They can access all 14 registers, - * do a 32->64 multiply with UMULL, and the flexible operand allowing free - * shifts is helpful, too. - * - * Therefore, we do a quick sanity check. 
- * - * If compiling Thumb-1 for a target which supports ARM instructions, we will - * emit a warning, as it is not a "sane" platform to compile for. - * - * Usually, if this happens, it is because of an accident and you probably need - * to specify -march, as you likely meant to compile for a newer architecture. - */ -#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) -# warning "XXH3 is highly inefficient without ARM or Thumb-2." -#endif - -/* ========================================== - * Vectorization detection - * ========================================== */ -#define XXH_SCALAR 0 /* Portable scalar version */ -#define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */ -#define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */ -#define XXH_NEON 3 /* NEON for most ARMv7-A and all AArch64 */ -#define XXH_VSX 4 /* VSX and ZVector for POWER8/z13 */ - -#ifndef XXH_VECTOR /* can be defined on command line */ -# if defined(__AVX2__) -# define XXH_VECTOR XXH_AVX2 -# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) -# define XXH_VECTOR XXH_SSE2 -# elif defined(__GNUC__) /* msvc support maybe later */ \ - && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \ - && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ - || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) -# define XXH_VECTOR XXH_NEON -# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ - || (defined(__s390x__) && defined(__VEC__)) \ - && defined(__GNUC__) /* TODO: IBM XL */ -# define XXH_VECTOR XXH_VSX -# else -# define XXH_VECTOR XXH_SCALAR -# endif -#endif - -/* - * Controls the alignment of the accumulator. - * This is for compatibility with aligned vector loads, which are usually faster. - */ -#ifndef XXH_ACC_ALIGN -# if XXH_VECTOR == XXH_SCALAR /* scalar */ -# define XXH_ACC_ALIGN 8 -# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ -# define XXH_ACC_ALIGN 16 -# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ -# define XXH_ACC_ALIGN 32 -# elif XXH_VECTOR == XXH_NEON /* neon */ -# define XXH_ACC_ALIGN 16 -# elif XXH_VECTOR == XXH_VSX /* vsx */ -# define XXH_ACC_ALIGN 16 -# endif -#endif - -/* - * UGLY HACK: - * GCC usually generates the best code with -O3 for xxHash. - * - * However, when targeting AVX2, it is overzealous in its unrolling resulting - * in code roughly 3/4 the speed of Clang. - * - * There are other issues, such as GCC splitting _mm256_loadu_si256 into - * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which - * only applies to Sandy and Ivy Bridge... which don't even support AVX2. - * - * That is why when compiling the AVX2 version, it is recommended to use either - * -O2 -mavx2 -march=haswell - * or - * -O2 -mavx2 -mno-avx256-split-unaligned-load - * for decent performance, or to use Clang instead. - * - * Fortunately, we can control the first one with a pragma that forces GCC into - * -O2, but the other one we can't control without "failed to inline always - * inline function due to target mismatch" warnings. - */ -#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ - && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ - && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ -# pragma GCC push_options -# pragma GCC optimize("-O2") -#endif - - -#if XXH_VECTOR == XXH_NEON -/* - * NEON's setup for vmlal_u32 is a little more complicated than it is on - * SSE2, AVX2, and VSX. 
- * - * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. - * - * To do the same operation, the 128-bit 'Q' register needs to be split into - * two 64-bit 'D' registers, performing this operation:: - * - * [ a | b ] - * | '---------. .--------' | - * | x | - * | .---------' '--------. | - * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] - * - * Due to significant changes in aarch64, the fastest method for aarch64 is - * completely different than the fastest method for ARMv7-A. - * - * ARMv7-A treats D registers as unions overlaying Q registers, so modifying - * D11 will modify the high half of Q5. This is similar to how modifying AH - * will only affect bits 8-15 of AX on x86. - * - * VZIP takes two registers, and puts even lanes in one register and odd lanes - * in the other. - * - * On ARMv7-A, this strangely modifies both parameters in place instead of - * taking the usual 3-operand form. - * - * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the - * lower and upper halves of the Q register to end up with the high and low - * halves where we want - all in one instruction. - * - * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } - * - * Unfortunately we need inline assembly for this: Instructions modifying two - * registers at once is not possible in GCC or Clang's IR, and they have to - * create a copy. - * - * aarch64 requires a different approach. - * - * In order to make it easier to write a decent compiler for aarch64, many - * quirks were removed, such as conditional execution. - * - * NEON was also affected by this. - * - * aarch64 cannot access the high bits of a Q-form register, and writes to a - * D-form register zero the high bits, similar to how writes to W-form scalar - * registers (or DWORD registers on x86_64) work. - * - * The formerly free vget_high intrinsics now require a vext (with a few - * exceptions) - * - * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent - * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one - * operand. - * - * The equivalent of the VZIP.32 on the lower and upper halves would be this - * mess: - * - * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } - * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } - * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } - * - * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN): - * - * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); - * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); - * - * This is available on ARMv7-A, but is less efficient than a single VZIP.32. 
- */ - -/* - * Function-like macro: - * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi) - * { - * outLo = (uint32x2_t)(in & 0xFFFFFFFF); - * outHi = (uint32x2_t)(in >> 32); - * in = UNDEFINED; - * } - */ -# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ - && defined(__GNUC__) \ - && !defined(__aarch64__) && !defined(__arm64__) -# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ - do { \ - /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \ - /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \ - /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \ - __asm__("vzip.32 %e0, %f0" : "+w" (in)); \ - (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \ - (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ - } while (0) -# else -# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ - do { \ - (outLo) = vmovn_u64 (in); \ - (outHi) = vshrn_n_u64 ((in), 32); \ - } while (0) -# endif -#endif /* XXH_VECTOR == XXH_NEON */ - -/* - * VSX and Z Vector helpers. - * - * This is very messy, and any pull requests to clean this up are welcome. - * - * There are a lot of problems with supporting VSX and s390x, due to - * inconsistent intrinsics, spotty coverage, and multiple endiannesses. - */ -#if XXH_VECTOR == XXH_VSX -# if defined(__s390x__) -# include -# else -# include -# endif - -# undef vector /* Undo the pollution */ - -typedef __vector unsigned long long xxh_u64x2; -typedef __vector unsigned char xxh_u8x16; -typedef __vector unsigned xxh_u32x4; - -# ifndef XXH_VSX_BE -# if defined(__BIG_ENDIAN__) \ - || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) -# define XXH_VSX_BE 1 -# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ -# warning "-maltivec=be is not recommended. Please use native endianness." -# define XXH_VSX_BE 1 -# else -# define XXH_VSX_BE 0 -# endif -# endif /* !defined(XXH_VSX_BE) */ - -# if XXH_VSX_BE -/* A wrapper for POWER9's vec_revb. */ -# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) -# define XXH_vec_revb vec_revb -# else -XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) -{ - xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, - 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; - return vec_perm(val, val, vByteSwap); -} -# endif -# endif /* XXH_VSX_BE */ - -/* - * Performs an unaligned load and byte swaps it on big endian. - */ -XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) -{ - xxh_u64x2 ret; - memcpy(&ret, ptr, sizeof(xxh_u64x2)); -# if XXH_VSX_BE - ret = XXH_vec_revb(ret); -# endif - return ret; -} - -/* - * vec_mulo and vec_mule are very problematic intrinsics on PowerPC - * - * These intrinsics weren't added until GCC 8, despite existing for a while, - * and they are endian dependent. Also, their meaning swap depending on version. - * */ -# if defined(__s390x__) - /* s390x is always big endian, no issue on this platform */ -# define XXH_vec_mulo vec_mulo -# define XXH_vec_mule vec_mule -# elif defined(__clang__) && __has_builtin(__builtin_altivec_vmuleuw) -/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ -# define XXH_vec_mulo __builtin_altivec_vmulouw -# define XXH_vec_mule __builtin_altivec_vmuleuw -# else -/* gcc needs inline assembly */ -/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. 
*/ -XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) -{ - xxh_u64x2 result; - __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); - return result; -} -XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) -{ - xxh_u64x2 result; - __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); - return result; -} -# endif /* XXH_vec_mulo, XXH_vec_mule */ -#endif /* XXH_VECTOR == XXH_VSX */ - - -/* prefetch - * can be disabled, by declaring XXH_NO_PREFETCH build macro */ -#if defined(XXH_NO_PREFETCH) -# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ -#else -# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ -# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ -# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) -# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) -# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) -# else -# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ -# endif -#endif /* XXH_NO_PREFETCH */ - - -/* ========================================== - * XXH3 default settings - * ========================================== */ - -#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ - -#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) -# error "default keyset is not large enough" -#endif - -/* Pseudorandom secret taken directly from FARSH */ -XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = { - 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, - 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, - 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, - 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, - 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, - 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, - 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, - 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, - - 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, - 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, - 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, - 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, -}; - -/* - * Does a 32-bit to 64-bit long multiply. - * - * Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't - * need to (but it shouldn't need to anyways, it is about 7 instructions to do - * a 64x64 multiply...). Since we know that this will _always_ emit MULL, we - * use that instead of the normal method. - * - * If you are compiling for platforms like Thumb-1 and don't have a better option, - * you may also want to write your own long multiply routine here. 
- * - * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) - * { - * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); - * } - */ -#if defined(_MSC_VER) && defined(_M_IX86) -# include -# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) -#else -/* - * Downcast + upcast is usually better than masking on older compilers like - * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. - * - * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands - * and perform a full 64x64 multiply -- entirely redundant on 32-bit. - */ -# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) -#endif - -/* - * Calculates a 64->128-bit long multiply. - * - * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version. - */ -static XXH128_hash_t -XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) -{ - /* - * GCC/Clang __uint128_t method. - * - * On most 64-bit targets, GCC and Clang define a __uint128_t type. - * This is usually the best way as it usually uses a native long 64-bit - * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. - * - * Usually. - * - * Despite being a 32-bit platform, Clang (and emscripten) define this type - * despite not having the arithmetic for it. This results in a laggy - * compiler builtin call which calculates a full 128-bit multiply. - * In that case it is best to use the portable one. - * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 - */ -#if defined(__GNUC__) && !defined(__wasm__) \ - && defined(__SIZEOF_INT128__) \ - || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - - __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs; - XXH128_hash_t const r128 = { (xxh_u64)(product), (xxh_u64)(product >> 64) }; - return r128; - - /* - * MSVC for x64's _umul128 method. - * - * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); - * - * This compiles to single operand MUL on x64. - */ -#elif defined(_M_X64) || defined(_M_IA64) - -#ifndef _MSC_VER -# pragma intrinsic(_umul128) -#endif - xxh_u64 product_high; - xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); - XXH128_hash_t const r128 = { product_low, product_high }; - return r128; - -#else - /* - * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. - * - * This is a fast and simple grade school multiply, which is shown below - * with base 10 arithmetic instead of base 0x100000000. - * - * 9 3 // D2 lhs = 93 - * x 7 5 // D2 rhs = 75 - * ---------- - * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 - * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 - * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 - * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 - * --------- - * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 - * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 - * --------- - * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 - * - * The reasons for adding the products like this are: - * 1. It avoids manual carry tracking. Just like how - * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. - * This avoids a lot of complexity. - * - * 2. 
It hints for, and on Clang, compiles to, the powerful UMAAL - * instruction available in ARM's Digital Signal Processing extension - * in 32-bit ARMv6 and later, which is shown below: - * - * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) - * { - * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; - * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); - * *RdHi = (xxh_u32)(product >> 32); - * } - * - * This instruction was designed for efficient long multiplication, and - * allows this to be calculated in only 4 instructions at speeds - * comparable to some 64-bit ALUs. - * - * 3. It isn't terrible on other platforms. Usually this will be a couple - * of 32-bit ADD/ADCs. - */ - - /* First calculate all of the cross products. */ - xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); - xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); - xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); - xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); - - /* Now add the products together. These will never overflow. */ - xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; - xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - - XXH128_hash_t r128 = { lower, upper }; - return r128; -#endif -} - -/* - * Does a 64-bit to 128-bit multiply, then XOR folds it. - * - * The reason for the separate function is to prevent passing too many structs - * around by value. This will hopefully inline the multiply, but we don't force it. - */ -static xxh_u64 -XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) -{ - XXH128_hash_t product = XXH_mult64to128(lhs, rhs); - return product.low64 ^ product.high64; -} - -/* Seems to produce slightly better code on GCC for some reason. */ -XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) -{ - XXH_ASSERT(0 <= shift && shift < 64); - return v64 ^ (v64 >> shift); -} - -/* - * We don't need to (or want to) mix as much as XXH64. - * - * Short hashes are more evenly distributed, so it isn't necessary. - */ -static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) -{ - h64 = XXH_xorshift64(h64, 37); - h64 *= 0x165667919E3779F9ULL; - h64 = XXH_xorshift64(h64, 32); - return h64; -} - - -/* ========================================== - * Short keys - * ========================================== - * One of the shortcomings of XXH32 and XXH64 was that their performance was - * sub-optimal on short lengths. It used an iterative algorithm which strongly - * favored lengths that were a multiple of 4 or 8. - * - * Instead of iterating over individual inputs, we use a set of single shot - * functions which piece together a range of lengths and operate in constant time. - * - * Additionally, the number of multiplies has been significantly reduced. This - * reduces latency, especially when emulating 64-bit multiplies on 32-bit. - * - * Depending on the platform, this may or may not be faster than XXH32, but it - * is almost guaranteed to be faster than XXH64. - */ - -/* - * At very short lengths, there isn't enough input to fully hide secrets, or use - * the entire secret. - * - * There is also only a limited amount of mixing we can do before significantly - * impacting performance. - * - * Therefore, we use different sections of the secret and always mix two secret - * samples with an XOR. 
This should have no effect on performance on the - * seedless or withSeed variants because everything _should_ be constant folded - * by modern compilers. - * - * The XOR mixing hides individual parts of the secret and increases entropy. - * - * This adds an extra layer of strength for custom secrets. - */ -XXH_FORCE_INLINE XXH64_hash_t -XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(input != NULL); - XXH_ASSERT(1 <= len && len <= 3); - XXH_ASSERT(secret != NULL); - /* - * len = 1: combined = { input[0], 0x01, input[0], input[0] } - * len = 2: combined = { input[1], 0x02, input[0], input[1] } - * len = 3: combined = { input[2], 0x03, input[0], input[1] } - */ - { xxh_u8 const c1 = input[0]; - xxh_u8 const c2 = input[len >> 1]; - xxh_u8 const c3 = input[len - 1]; - xxh_u32 const combined = ((xxh_u32)c1<<16) | (((xxh_u32)c2) << 24) | (((xxh_u32)c3) << 0) | (((xxh_u32)len) << 8); - xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; - xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; - xxh_u64 const mixed = keyed * PRIME64_1; - return XXH3_avalanche(mixed); - } -} - -XXH_FORCE_INLINE XXH64_hash_t -XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(input != NULL); - XXH_ASSERT(secret != NULL); - XXH_ASSERT(4 <= len && len < 8); - seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; - { xxh_u32 const input1 = XXH_readLE32(input); - xxh_u32 const input2 = XXH_readLE32(input + len - 4); - xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; - xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); - xxh_u64 x = input64 ^ bitflip; - /* this mix is inspired by Pelle Evensen's rrmxmx */ - x ^= XXH_rotl64(x, 49) ^ XXH_rotl64(x, 24); - x *= 0x9FB21C651E98DF25ULL; - x ^= (x >> 35) + len ; - x *= 0x9FB21C651E98DF25ULL; - return XXH_xorshift64(x, 28); - } -} - -XXH_FORCE_INLINE XXH64_hash_t -XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(input != NULL); - XXH_ASSERT(secret != NULL); - XXH_ASSERT(8 <= len && len <= 16); - { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; - xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; - xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; - xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; - xxh_u64 const acc = len - + XXH_swap64(input_lo) + input_hi - + XXH3_mul128_fold64(input_lo, input_hi); - return XXH3_avalanche(acc); - } -} - -XXH_FORCE_INLINE XXH64_hash_t -XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(len <= 16); - { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); - if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); - if (len) return XXH3_len_1to3_64b(input, len, secret, seed); - return XXH3_avalanche((PRIME64_1 + seed) ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); - } -} - -/* - * DISCLAIMER: There are known *seed-dependent* multicollisions here due to - * multiplication by zero, affecting hashes of lengths 17 to 240. - * - * However, they are very unlikely. - * - * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all - * unseeded non-cryptographic hashes, it does not attempt to defend itself - * against specially crafted inputs, only random inputs. 
- * - * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes - * cancelling out the secret is taken an arbitrary number of times (addressed - * in XXH3_accumulate_512), this collision is very unlikely with random inputs - * and/or proper seeding: - * - * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a - * function that is only called up to 16 times per hash with up to 240 bytes of - * input. - * - * This is not too bad for a non-cryptographic hash function, especially with - * only 64 bit outputs. - * - * The 128-bit variant (which trades some speed for strength) is NOT affected - * by this, although it is always a good idea to use a proper seed if you care - * about strength. - */ -XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, - const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) -{ -#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ - && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ - && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ - /* - * UGLY HACK: - * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in - * slower code. - * - * By forcing seed64 into a register, we disrupt the cost model and - * cause it to scalarize. See `XXH32_round()` - * - * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, - * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on - * GCC 9.2, despite both emitting scalar code. - * - * GCC generates much better scalar code than Clang for the rest of XXH3, - * which is why finding a more optimal codepath is an interest. - */ - __asm__ ("" : "+r" (seed64)); -#endif - { xxh_u64 const input_lo = XXH_readLE64(input); - xxh_u64 const input_hi = XXH_readLE64(input+8); - return XXH3_mul128_fold64( - input_lo ^ (XXH_readLE64(secret) + seed64), - input_hi ^ (XXH_readLE64(secret+8) - seed64) - ); - } -} - -/* For mid range keys, XXH3 uses a Mum-hash variant. 
*/ -XXH_FORCE_INLINE XXH64_hash_t -XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH64_hash_t seed) -{ - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; - XXH_ASSERT(16 < len && len <= 128); - - { xxh_u64 acc = len * PRIME64_1; - if (len > 32) { - if (len > 64) { - if (len > 96) { - acc += XXH3_mix16B(input+48, secret+96, seed); - acc += XXH3_mix16B(input+len-64, secret+112, seed); - } - acc += XXH3_mix16B(input+32, secret+64, seed); - acc += XXH3_mix16B(input+len-48, secret+80, seed); - } - acc += XXH3_mix16B(input+16, secret+32, seed); - acc += XXH3_mix16B(input+len-32, secret+48, seed); - } - acc += XXH3_mix16B(input+0, secret+0, seed); - acc += XXH3_mix16B(input+len-16, secret+16, seed); - - return XXH3_avalanche(acc); - } -} - -#define XXH3_MIDSIZE_MAX 240 - -XXH_NO_INLINE XXH64_hash_t -XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH64_hash_t seed) -{ - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; - XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); - - #define XXH3_MIDSIZE_STARTOFFSET 3 - #define XXH3_MIDSIZE_LASTOFFSET 17 - - { xxh_u64 acc = len * PRIME64_1; - int const nbRounds = (int)len / 16; - int i; - for (i=0; i<8; i++) { - acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); - } - acc = XXH3_avalanche(acc); - XXH_ASSERT(nbRounds >= 8); -#if defined(__clang__) /* Clang */ \ - && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ - && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ - /* - * UGLY HACK: - * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. - * In everywhere else, it uses scalar code. - * - * For 64->128-bit multiplies, even if the NEON was 100% optimal, it - * would still be slower than UMAAL (see XXH_mult64to128). - * - * Unfortunately, Clang doesn't handle the long multiplies properly and - * converts them to the nonexistent "vmulq_u64" intrinsic, which is then - * scalarized into an ugly mess of VMOV.32 instructions. - * - * This mess is difficult to avoid without turning autovectorization - * off completely, but they are usually relatively minor and/or not - * worth it to fix. - * - * This loop is the easiest to fix, as unlike XXH32, this pragma - * _actually works_ because it is a loop vectorization instead of an - * SLP vectorization. - */ - #pragma clang loop vectorize(disable) -#endif - for (i=8 ; i < nbRounds; i++) { - acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); - } - /* last bytes */ - acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); - return XXH3_avalanche(acc); - } -} - - -/* === Long Keys === */ - -#define STRIPE_LEN 64 -#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ -#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64)) - -typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e; - -/* - * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. - * - * It is a hardened version of UMAC, based off of FARSH's implementation. - * - * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD - * implementations, and it is ridiculously fast. - * - * We harden it by mixing the original input to the accumulators as well as the product. 
- * - * This means that in the (relatively likely) case of a multiply by zero, the - * original input is preserved. - * - * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve - * cross-pollination, as otherwise the upper and lower halves would be - * essentially independent. - * - * This doesn't matter on 64-bit hashes since they all get merged together in - * the end, so we skip the extra step. - * - * Both XXH3_64bits and XXH3_128bits use this subroutine. - */ -XXH_FORCE_INLINE void -XXH3_accumulate_512( void* XXH_RESTRICT acc, - const void* XXH_RESTRICT input, - const void* XXH_RESTRICT secret, - XXH3_accWidth_e accWidth) -{ -#if (XXH_VECTOR == XXH_AVX2) - - XXH_ASSERT((((size_t)acc) & 31) == 0); - { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ - const __m256i* const xinput = (const __m256i *) input; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ - const __m256i* const xsecret = (const __m256i *) secret; - - size_t i; - for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { - /* data_vec = xinput[i]; */ - __m256i const data_vec = _mm256_loadu_si256 (xinput+i); - /* key_vec = xsecret[i]; */ - __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); - /* data_key = data_vec ^ key_vec; */ - __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); - /* data_key_lo = data_key >> 32; */ - __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); - /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ - __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); - if (accWidth == XXH3_acc_128bits) { - /* xacc[i] += swap(data_vec); */ - __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); - __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); - /* xacc[i] += product; */ - xacc[i] = _mm256_add_epi64(product, sum); - } else { /* XXH3_acc_64bits */ - /* xacc[i] += data_vec; */ - __m256i const sum = _mm256_add_epi64(xacc[i], data_vec); - /* xacc[i] += product; */ - xacc[i] = _mm256_add_epi64(product, sum); - } - } } - -#elif (XXH_VECTOR == XXH_SSE2) - - /* SSE2 is just a half-scale version of the AVX2 version. */ - XXH_ASSERT((((size_t)acc) & 15) == 0); - { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ - const __m128i* const xinput = (const __m128i *) input; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ - const __m128i* const xsecret = (const __m128i *) secret; - - size_t i; - for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { - /* data_vec = xinput[i]; */ - __m128i const data_vec = _mm_loadu_si128 (xinput+i); - /* key_vec = xsecret[i]; */ - __m128i const key_vec = _mm_loadu_si128 (xsecret+i); - /* data_key = data_vec ^ key_vec; */ - __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); - /* data_key_lo = data_key >> 32; */ - __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); - /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ - __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); - if (accWidth == XXH3_acc_128bits) { - /* xacc[i] += swap(data_vec); */ - __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); - __m128i const sum = _mm_add_epi64(xacc[i], data_swap); - /* xacc[i] += product; */ - xacc[i] = _mm_add_epi64(product, sum); - } else { /* XXH3_acc_64bits */ - /* xacc[i] += data_vec; */ - __m128i const sum = _mm_add_epi64(xacc[i], data_vec); - /* xacc[i] += product; */ - xacc[i] = _mm_add_epi64(product, sum); - } - } } - -#elif (XXH_VECTOR == XXH_NEON) - - XXH_ASSERT((((size_t)acc) & 15) == 0); - { - XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc; - /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */ - uint8_t const* const xinput = (const uint8_t *) input; - uint8_t const* const xsecret = (const uint8_t *) secret; - - size_t i; - for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) { - /* data_vec = xinput[i]; */ - uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); - /* key_vec = xsecret[i]; */ - uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); - /* data_key = data_vec ^ key_vec; */ - uint64x2_t data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); - uint32x2_t data_key_lo, data_key_hi; - if (accWidth == XXH3_acc_64bits) { - /* xacc[i] += data_vec; */ - xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); - } else { /* XXH3_acc_128bits */ - /* xacc[i] += swap(data_vec); */ - uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); - uint64x2_t const swapped = vextq_u64(data64, data64, 1); - xacc[i] = vaddq_u64 (xacc[i], swapped); - } - /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); - * data_key_hi = (uint32x2_t) (data_key >> 32); - * data_key = UNDEFINED; */ - XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); - /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ - xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); - - } - } - -#elif (XXH_VECTOR == XXH_VSX) - xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */ - xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ - xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ - xxh_u64x2 const v32 = { 32, 32 }; - size_t i; - for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { - /* data_vec = xinput[i]; */ - xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); - /* key_vec = xsecret[i]; */ - xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); - xxh_u64x2 const data_key = data_vec ^ key_vec; - /* shuffled = (data_key << 32) | (data_key >> 32); */ - xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); - /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ - xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); - xacc[i] += product; - - if (accWidth == XXH3_acc_64bits) { - xacc[i] += data_vec; - } else { /* 
XXH3_acc_128bits */ - /* swap high and low halves */ -#ifdef __s390x__ - xxh_u64x2 const data_swapped = vec_permi(data_vec, data_vec, 2); -#else - xxh_u64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2); -#endif - xacc[i] += data_swapped; - } - } - -#else /* scalar variant of Accumulator - universal */ - - XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ - const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ - const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ - size_t i; - XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); - for (i=0; i < ACC_NB; i++) { - xxh_u64 const data_val = XXH_readLE64(xinput + 8*i); - xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8); - - if (accWidth == XXH3_acc_64bits) { - xacc[i] += data_val; - } else { - xacc[i ^ 1] += data_val; /* swap adjacent lanes */ - } - xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); - } -#endif -} - -/* - * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. - * - * Multiplication isn't perfect, as explained by Google in HighwayHash: - * - * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to - * // varying degrees. In descending order of goodness, bytes - * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. - * // As expected, the upper and lower bytes are much worse. - * - * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 - * - * Since our algorithm uses a pseudorandom secret to add some variance into the - * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. - * - * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid - * extraction. - * - * Both XXH3_64bits and XXH3_128bits use this subroutine. - */ -XXH_FORCE_INLINE void -XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) -{ -#if (XXH_VECTOR == XXH_AVX2) - - XXH_ASSERT((((size_t)acc) & 31) == 0); - { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ - const __m256i* const xsecret = (const __m256i *) secret; - const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1); - - size_t i; - for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { - /* xacc[i] ^= (xacc[i] >> 47) */ - __m256i const acc_vec = xacc[i]; - __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); - __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); - /* xacc[i] ^= xsecret; */ - __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); - __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); - - /* xacc[i] *= PRIME32_1; */ - __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); - __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); - __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); - xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); - } - } - -#elif (XXH_VECTOR == XXH_SSE2) - - XXH_ASSERT((((size_t)acc) & 15) == 0); - { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ - const __m128i* const xsecret = (const __m128i *) secret; - const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1); - - size_t i; - for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { - /* xacc[i] ^= (xacc[i] >> 47) */ - __m128i const acc_vec = xacc[i]; - __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); - __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); - /* xacc[i] ^= xsecret[i]; */ - __m128i const key_vec = _mm_loadu_si128 (xsecret+i); - __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); - - /* xacc[i] *= PRIME32_1; */ - __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); - __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); - __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); - xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); - } - } - -#elif (XXH_VECTOR == XXH_NEON) - - XXH_ASSERT((((size_t)acc) & 15) == 0); - - { uint64x2_t* xacc = (uint64x2_t*) acc; - uint8_t const* xsecret = (uint8_t const*) secret; - uint32x2_t prime = vdup_n_u32 (PRIME32_1); - - size_t i; - for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) { - /* xacc[i] ^= (xacc[i] >> 47); */ - uint64x2_t acc_vec = xacc[i]; - uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); - uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); - - /* xacc[i] ^= xsecret[i]; */ - uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); - uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec)); - - /* xacc[i] *= PRIME32_1 */ - uint32x2_t data_key_lo, data_key_hi; - /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); - * data_key_hi = (uint32x2_t) (xacc[i] >> 32); - * xacc[i] = UNDEFINED; */ - XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); - { /* - * prod_hi = (data_key >> 32) * PRIME32_1; - * - * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will - * incorrectly "optimize" this: - * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); - * shifted = vshll_n_u32(tmp, 32); - * to this: - * tmp = "vmulq_u64"(a, b); // no such thing! - * shifted = vshlq_n_u64(tmp, 32); - * - * However, unlike SSE, Clang lacks a 64-bit multiply routine - * for NEON, and it scalarizes two 64-bit multiplies instead. - * - * vmull_u32 has the same timing as vmul_u32, and it avoids - * this bug completely. 
- * See https://bugs.llvm.org/show_bug.cgi?id=39967 - */ - uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); - /* xacc[i] = prod_hi << 32; */ - xacc[i] = vshlq_n_u64(prod_hi, 32); - /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */ - xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime); - } - } } - -#elif (XXH_VECTOR == XXH_VSX) - - XXH_ASSERT((((size_t)acc) & 15) == 0); - - { xxh_u64x2* const xacc = (xxh_u64x2*) acc; - const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; - /* constants */ - xxh_u64x2 const v32 = { 32, 32 }; - xxh_u64x2 const v47 = { 47, 47 }; - xxh_u32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 }; - size_t i; - for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) { - /* xacc[i] ^= (xacc[i] >> 47); */ - xxh_u64x2 const acc_vec = xacc[i]; - xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); - - /* xacc[i] ^= xsecret[i]; */ - xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); - xxh_u64x2 const data_key = data_vec ^ key_vec; - - /* xacc[i] *= PRIME32_1 */ - /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ - xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); - /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ - xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); - xacc[i] = prod_odd + (prod_even << v32); - } } - -#else /* scalar variant of Scrambler - universal */ - - XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ - const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ - size_t i; - XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); - for (i=0; i < ACC_NB; i++) { - xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i); - xxh_u64 acc64 = xacc[i]; - acc64 = XXH_xorshift64(acc64, 47); - acc64 ^= key64; - acc64 *= PRIME32_1; - xacc[i] = acc64; - } - -#endif -} - -#define XXH_PREFETCH_DIST 384 - -/* - * XXH3_accumulate() - * Loops over XXH3_accumulate_512(). 
- * Assumption: nbStripes will not overflow the secret size - */ -XXH_FORCE_INLINE void -XXH3_accumulate( xxh_u64* XXH_RESTRICT acc, - const xxh_u8* XXH_RESTRICT input, - const xxh_u8* XXH_RESTRICT secret, - size_t nbStripes, - XXH3_accWidth_e accWidth) -{ - size_t n; - for (n = 0; n < nbStripes; n++ ) { - const xxh_u8* const in = input + n*STRIPE_LEN; - XXH_PREFETCH(in + XXH_PREFETCH_DIST); - XXH3_accumulate_512(acc, - in, - secret + n*XXH_SECRET_CONSUME_RATE, - accWidth); - } -} - -XXH_FORCE_INLINE void -XXH3_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc, - const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH3_accWidth_e accWidth) -{ - size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; - size_t const block_len = STRIPE_LEN * nb_rounds; - size_t const nb_blocks = len / block_len; - - size_t n; - - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); - - for (n = 0; n < nb_blocks; n++) { - XXH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth); - XXH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN); - } - - /* last partial block */ - XXH_ASSERT(len > STRIPE_LEN); - { size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN; - XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); - XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth); - - /* last stripe */ - if (len & (STRIPE_LEN - 1)) { - const xxh_u8* const p = input + len - STRIPE_LEN; - /* Do not align on 8, so that the secret is different from the scrambler */ -#define XXH_SECRET_LASTACC_START 7 - XXH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth); - } } -} - -XXH_FORCE_INLINE xxh_u64 -XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) -{ - return XXH3_mul128_fold64( - acc[0] ^ XXH_readLE64(secret), - acc[1] ^ XXH_readLE64(secret+8) ); -} - -static XXH64_hash_t -XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) -{ - xxh_u64 result64 = start; - - result64 += XXH3_mix2Accs(acc+0, secret + 0); - result64 += XXH3_mix2Accs(acc+2, secret + 16); - result64 += XXH3_mix2Accs(acc+4, secret + 32); - result64 += XXH3_mix2Accs(acc+6, secret + 48); - - return XXH3_avalanche(result64); -} - -#define XXH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \ - PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 }; - -XXH_FORCE_INLINE XXH64_hash_t -XXH3_hashLong_internal(const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize) -{ - XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; - - XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_64bits); - - /* converge into final hash */ - XXH_STATIC_ASSERT(sizeof(acc) == 64); - /* do not align on 8, so that the secret is different from the accumulator */ -#define XXH_SECRET_MERGEACCS_START 11 - XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); - return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1); -} - -/* - * It's important for performance that XXH3_hashLong is not inlined. Not sure - * why (uop cache maybe?), but the difference is large and easily measurable. 
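/*
 * Worked example of the block/stripe arithmetic above. The 192-byte default secret
 * size is an assumption of this sketch (kSecret is defined elsewhere in this file);
 * everything else follows from STRIPE_LEN and XXH_SECRET_CONSUME_RATE.
 */
#include <assert.h>
#include <stddef.h>

int main(void)
{
    size_t const secretSize = 192;                    /* assumed default kSecret size */
    size_t const len        = 5000;                   /* an arbitrary "long" input */
    size_t const nb_rounds  = (secretSize - 64) / 8;  /* stripes per block */
    size_t const block_len  = 64 * nb_rounds;         /* bytes consumed per block */

    assert(nb_rounds == 16 && block_len == 1024);
    assert(len / block_len == 4);                     /* 4 full blocks, each followed by a scramble */
    assert((len - 4 * block_len) / 64 == 14);         /* 14 full stripes remain after the blocks */
    assert((len & 63) != 0);                          /* so the final 64 bytes of input are re-read
                                                         as the last stripe, with a shifted secret */
    return 0;
}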
- */ -XXH_NO_INLINE XXH64_hash_t -XXH3_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len) -{ - return XXH3_hashLong_internal(input, len, kSecret, sizeof(kSecret)); -} - -/* - * It's important for performance that XXH3_hashLong is not inlined. Not sure - * why (uop cache maybe?), but the difference is large and easily measurable. - */ -XXH_NO_INLINE XXH64_hash_t -XXH3_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize) -{ - return XXH3_hashLong_internal(input, len, secret, secretSize); -} - - -XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) -{ - if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); - memcpy(dst, &v64, sizeof(v64)); -} - -/* XXH3_initCustomSecret() : - * destination `customSecret` is presumed allocated and same size as `kSecret`. - */ -XXH_FORCE_INLINE void XXH3_initCustomSecret(xxh_u8* customSecret, xxh_u64 seed64) -{ - int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; - int i; - - XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); - - for (i=0; i < nbRounds; i++) { - XXH_writeLE64(customSecret + 16*i, XXH_readLE64(kSecret + 16*i) + seed64); - XXH_writeLE64(customSecret + 16*i + 8, XXH_readLE64(kSecret + 16*i + 8) - seed64); - } -} - - -/* - * XXH3_hashLong_64b_withSeed(): - * Generate a custom key based on alteration of default kSecret with the seed, - * and then use this key for long mode hashing. - * - * This operation is decently fast but nonetheless costs a little bit of time. - * Try to avoid it whenever possible (typically when seed==0). - * - * It's important for performance that XXH3_hashLong is not inlined. Not sure - * why (uop cache maybe?), but the difference is large and easily measurable. - */ -XXH_NO_INLINE XXH64_hash_t -XXH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) -{ - XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; - if (seed==0) return XXH3_hashLong_64b_defaultSecret(input, len); - XXH3_initCustomSecret(secret, seed); - return XXH3_hashLong_internal(input, len, secret, sizeof(secret)); -} - -/* === Public entry point === */ - -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) -{ - if (len <= 16) return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0); - if (len <= 128) return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); - if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); - return XXH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len); -} - -XXH_PUBLIC_API XXH64_hash_t -XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) -{ - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); - /* - * If an action is to be taken if `secret` conditions are not respected, - * it should be done here. - * For now, it's a contract pre-condition. - * Adding a check and a branch here would cost performance at every hash. 
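/*
 * Usage sketch for the one-shot 64-bit entry points in this section (XXH3_64bits and
 * XXH3_64bits_withSeed). The include path and build setup are assumptions of the
 * example; XXH3 lives in the experimental static section, so XXH_STATIC_LINKING_ONLY
 * must be defined before including the header.
 */
#define XXH_STATIC_LINKING_ONLY
#include "xxhash.h"
#include <stdio.h>
#include <string.h>

int main(void)
{
    const char* msg = "hello, xxh3";
    XXH64_hash_t const h  = XXH3_64bits(msg, strlen(msg));              /* default secret, seed 0 */
    XXH64_hash_t const hs = XXH3_64bits_withSeed(msg, strlen(msg), 42); /* custom seed */
    printf("%016llx %016llx\n", (unsigned long long)h, (unsigned long long)hs);
    return 0;
}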
- */ - if (len <= 16) return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); - if (len <= 128) return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); - if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); - return XXH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); -} - -XXH_PUBLIC_API XXH64_hash_t -XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) -{ - if (len <= 16) return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed); - if (len <= 128) return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); - if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); - return XXH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed); -} - -/* === XXH3 streaming === */ - -XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) -{ - return (XXH3_state_t*)XXH_malloc(sizeof(XXH3_state_t)); -} - -XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) -{ - XXH_free(statePtr); - return XXH_OK; -} - -XXH_PUBLIC_API void -XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state) -{ - memcpy(dst_state, src_state, sizeof(*dst_state)); -} - -static void -XXH3_64bits_reset_internal(XXH3_state_t* statePtr, - XXH64_hash_t seed, - const xxh_u8* secret, size_t secretSize) -{ - XXH_ASSERT(statePtr != NULL); - memset(statePtr, 0, sizeof(*statePtr)); - statePtr->acc[0] = PRIME32_3; - statePtr->acc[1] = PRIME64_1; - statePtr->acc[2] = PRIME64_2; - statePtr->acc[3] = PRIME64_3; - statePtr->acc[4] = PRIME64_4; - statePtr->acc[5] = PRIME32_2; - statePtr->acc[6] = PRIME64_5; - statePtr->acc[7] = PRIME32_1; - statePtr->seed = seed; - XXH_ASSERT(secret != NULL); - statePtr->secret = secret; - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); - statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN); - statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; -} - -XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset(XXH3_state_t* statePtr) -{ - if (statePtr == NULL) return XXH_ERROR; - XXH3_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE); - return XXH_OK; -} - -XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) -{ - if (statePtr == NULL) return XXH_ERROR; - XXH3_64bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize); - if (secret == NULL) return XXH_ERROR; - if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; - return XXH_OK; -} - -XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) -{ - if (statePtr == NULL) return XXH_ERROR; - XXH3_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE); - XXH3_initCustomSecret(statePtr->customSecret, seed); - statePtr->secret = statePtr->customSecret; - return XXH_OK; -} - -XXH_FORCE_INLINE void -XXH3_consumeStripes( xxh_u64* acc, - XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock, - const xxh_u8* input, size_t totalStripes, - const xxh_u8* secret, size_t secretLimit, - XXH3_accWidth_e accWidth) -{ - XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); - if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) { - /* need a scrambling operation */ - size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr; - XXH3_accumulate(acc, input, secret + 
nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth); - XXH3_scrambleAcc(acc, secret + secretLimit); - XXH3_accumulate(acc, input + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth); - *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes); - } else { - XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth); - *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes; - } -} - -/* - * Both XXH3_64bits_update and XXH3_128bits_update use this routine. - */ -XXH_FORCE_INLINE XXH_errorcode -XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_e accWidth) -{ - if (input==NULL) -#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) - return XXH_OK; -#else - return XXH_ERROR; -#endif - - { const xxh_u8* const bEnd = input + len; - - state->totalLen += len; - - if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */ - XXH_memcpy(state->buffer + state->bufferedSize, input, len); - state->bufferedSize += (XXH32_hash_t)len; - return XXH_OK; - } - /* input is now > XXH3_INTERNALBUFFER_SIZE */ - - #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / STRIPE_LEN) - XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % STRIPE_LEN == 0); /* clean multiple */ - - /* - * There is some input left inside the internal buffer. - * Fill it, then consume it. - */ - if (state->bufferedSize) { - size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; - XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); - input += loadSize; - XXH3_consumeStripes(state->acc, - &state->nbStripesSoFar, state->nbStripesPerBlock, - state->buffer, XXH3_INTERNALBUFFER_STRIPES, - state->secret, state->secretLimit, - accWidth); - state->bufferedSize = 0; - } - - /* Consume input by full buffer quantities */ - if (input+XXH3_INTERNALBUFFER_SIZE <= bEnd) { - const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; - do { - XXH3_consumeStripes(state->acc, - &state->nbStripesSoFar, state->nbStripesPerBlock, - input, XXH3_INTERNALBUFFER_STRIPES, - state->secret, state->secretLimit, - accWidth); - input += XXH3_INTERNALBUFFER_SIZE; - } while (input<=limit); - } - - if (input < bEnd) { /* Some remaining input: buffer it */ - XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); - state->bufferedSize = (XXH32_hash_t)(bEnd-input); - } - } - - return XXH_OK; -} - -XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) -{ - return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_64bits); -} - - -XXH_FORCE_INLINE void -XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, XXH3_accWidth_e accWidth) -{ - /* - * Digest on a local copy. This way, the state remains unaltered, and it can - * continue ingesting more input afterwards. 
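/*
 * Streaming usage sketch for the XXH3 64-bit state machine in this section. Chunk
 * sizes and error handling are illustrative only; as in the previous example, the
 * include path and XXH_STATIC_LINKING_ONLY are assumed.
 */
#define XXH_STATIC_LINKING_ONLY
#include "xxhash.h"
#include <stdio.h>
#include <string.h>

int main(void)
{
    XXH3_state_t* const state = XXH3_createState();
    if (state == NULL) return 1;
    if (XXH3_64bits_reset_withSeed(state, 42) != XXH_OK) return 1;

    const char* chunks[3] = { "fed ", "in ", "pieces" };
    for (size_t i = 0; i < 3; i++) {
        if (XXH3_64bits_update(state, chunks[i], strlen(chunks[i])) != XXH_OK) return 1;
    }
    /* digest works on a local copy, so more input could still be appended afterwards */
    printf("%016llx\n", (unsigned long long)XXH3_64bits_digest(state));
    XXH3_freeState(state);
    return 0;
}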
- */ - memcpy(acc, state->acc, sizeof(state->acc)); - if (state->bufferedSize >= STRIPE_LEN) { - size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN; - XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar; - XXH3_consumeStripes(acc, - &nbStripesSoFar, state->nbStripesPerBlock, - state->buffer, totalNbStripes, - state->secret, state->secretLimit, - accWidth); - if (state->bufferedSize % STRIPE_LEN) { /* one last partial stripe */ - XXH3_accumulate_512(acc, - state->buffer + state->bufferedSize - STRIPE_LEN, - state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, - accWidth); - } - } else { /* bufferedSize < STRIPE_LEN */ - if (state->bufferedSize) { /* one last stripe */ - xxh_u8 lastStripe[STRIPE_LEN]; - size_t const catchupSize = STRIPE_LEN - state->bufferedSize; - memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); - memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); - XXH3_accumulate_512(acc, - lastStripe, - state->secret + state->secretLimit - XXH_SECRET_LASTACC_START, - accWidth); - } } -} - -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) -{ - if (state->totalLen > XXH3_MIDSIZE_MAX) { - XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB]; - XXH3_digest_long(acc, state, XXH3_acc_64bits); - return XXH3_mergeAccs(acc, state->secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)state->totalLen * PRIME64_1); - } - /* len <= XXH3_MIDSIZE_MAX : short code */ - if (state->seed) - return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); - return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), state->secret, state->secretLimit + STRIPE_LEN); -} - -/* ========================================== - * XXH3 128 bits (=> XXH128) - * ========================================== */ - -XXH_FORCE_INLINE XXH128_hash_t -XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(input != NULL); - XXH_ASSERT(1 <= len && len <= 3); - XXH_ASSERT(secret != NULL); - /* - * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } - * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } - * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } - */ - { xxh_u8 const c1 = input[0]; - xxh_u8 const c2 = input[len >> 1]; - xxh_u8 const c3 = input[len - 1]; - xxh_u32 const combinedl = ((xxh_u32)c1<<16) | (((xxh_u32)c2) << 24) | (((xxh_u32)c3) << 0) | (((xxh_u32)len) << 8); - xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); - xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; - xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; - xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; - xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; - xxh_u64 const mixedl = keyed_lo * PRIME64_1; - xxh_u64 const mixedh = keyed_hi * PRIME64_5; - XXH128_hash_t const h128 = { XXH3_avalanche(mixedl) /*low64*/, XXH3_avalanche(mixedh) /*high64*/ }; - return h128; - } -} - -XXH_FORCE_INLINE XXH128_hash_t -XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(input != NULL); - XXH_ASSERT(secret != NULL); - XXH_ASSERT(4 <= len && len <= 8); - seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; - { xxh_u32 const input_lo = XXH_readLE32(input); - xxh_u32 const input_hi = XXH_readLE32(input + len - 4); - xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); - xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ 
XXH_readLE64(secret+24)) + seed; - xxh_u64 const keyed = input_64 ^ bitflip; - - /* Shift len to the left to ensure it is even, this avoids even multiplies. */ - XXH128_hash_t m128 = XXH_mult64to128(keyed, PRIME64_1 + (len << 2)); - - m128.high64 += (m128.low64 << 1); - m128.low64 ^= (m128.high64 >> 3); - - m128.low64 = XXH_xorshift64(m128.low64, 35); - m128.low64 *= 0x9FB21C651E98DF25ULL; - m128.low64 = XXH_xorshift64(m128.low64, 28); - m128.high64 = XXH3_avalanche(m128.high64); - return m128; - } -} - -XXH_FORCE_INLINE XXH128_hash_t -XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(input != NULL); - XXH_ASSERT(secret != NULL); - XXH_ASSERT(9 <= len && len <= 16); - { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; - xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; - xxh_u64 const input_lo = XXH_readLE64(input); - xxh_u64 input_hi = XXH_readLE64(input + len - 8); - XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, PRIME64_1); - /* - * Put len in the middle of m128 to ensure that the length gets mixed to - * both the low and high bits in the 128x64 multiply below. - */ - m128.low64 += (xxh_u64)(len - 1) << 54; - input_hi ^= bitfliph; - /* - * Add the high 32 bits of input_hi to the high 32 bits of m128, then - * add the long product of the low 32 bits of input_hi and PRIME32_2 to - * the high 64 bits of m128. - * - * The best approach to this operation is different on 32-bit and 64-bit. - */ - if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ - /* - * 32-bit optimized version, which is more readable. - * - * On 32-bit, it removes an ADC and delays a dependency between the two - * halves of m128.high64, but it generates an extra mask on 64-bit. - */ - m128.high64 += (input_hi & 0xFFFFFFFF00000000) + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2); - } else { - /* - * 64-bit optimized (albeit more confusing) version. 
- * - * Uses some properties of addition and multiplication to remove the mask: - * - * Let: - * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) - * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) - * c = PRIME32_2 - * - * a + (b * c) - * Inverse Property: x + y - x == y - * a + (b * (1 + c - 1)) - * Distributive Property: x * (y + z) == (x * y) + (x * z) - * a + (b * 1) + (b * (c - 1)) - * Identity Property: x * 1 == x - * a + b + (b * (c - 1)) - * - * Substitute a, b, and c: - * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1)) - * - * Since input_hi.hi + input_hi.lo == input_hi, we get this: - * input_hi + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1)) - */ - m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2 - 1); - } - /* m128 ^= XXH_swap64(m128 >> 64); */ - m128.low64 ^= XXH_swap64(m128.high64); - - { /* 128x64 multiply: h128 = m128 * PRIME64_2; */ - XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2); - h128.high64 += m128.high64 * PRIME64_2; - - h128.low64 = XXH3_avalanche(h128.low64); - h128.high64 = XXH3_avalanche(h128.high64); - return h128; - } } -} - -/* Assumption : `secret` size is >= 16 - * Note : it should be >= XXH3_SECRET_SIZE_MIN anyway */ -XXH_FORCE_INLINE XXH128_hash_t -XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(len <= 16); - { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); - if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); - if (len) return XXH3_len_1to3_128b(input, len, secret, seed); - { XXH128_hash_t h128; - xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); - xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); - h128.low64 = XXH3_avalanche((PRIME64_1 + seed) ^ bitflipl); - h128.high64 = XXH3_avalanche((PRIME64_2 - seed) ^ bitfliph); - return h128; - } } -} - -/* - * A bit slower than XXH3_mix16B, but handles multiply by zero better. 
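/*
 * Quick check of the algebraic identity used above in XXH3_len_9to16_128b: for any
 * 64-bit value x and any 32-bit constant c >= 1,
 *     (x & 0xFFFFFFFF00000000) + (uint64_t)(uint32_t)x * c
 *  == x + (uint64_t)(uint32_t)x * (c - 1)        (mod 2^64).
 * The constants below are arbitrary samples; the identity does not depend on them.
 */
#include <assert.h>
#include <stdint.h>

static void identity_check(uint64_t x, uint32_t c)
{
    uint64_t const lo  = (uint32_t)x;
    uint64_t const lhs = (x & 0xFFFFFFFF00000000ULL) + lo * c;
    uint64_t const rhs = x + lo * (uint64_t)(c - 1u);
    assert(lhs == rhs);
}

int main(void)
{
    identity_check(0x0123456789ABCDEFULL, 0x85EBCA77U);
    identity_check(UINT64_MAX, 1U);
    identity_check(0U, 0xFFFFFFFFU);
    return 0;
}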
- */ -XXH_FORCE_INLINE XXH128_hash_t -XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, const xxh_u8* secret, XXH64_hash_t seed) -{ - acc.low64 += XXH3_mix16B (input_1, secret+0, seed); - acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); - acc.high64 += XXH3_mix16B (input_2, secret+16, seed); - acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); - return acc; -} - - -XXH_FORCE_INLINE XXH128_hash_t -XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH64_hash_t seed) -{ - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; - XXH_ASSERT(16 < len && len <= 128); - - { XXH128_hash_t acc; - acc.low64 = len * PRIME64_1; - acc.high64 = 0; - if (len > 32) { - if (len > 64) { - if (len > 96) { - acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); - } - acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); - } - acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); - } - acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); - { xxh_u64 const low64 = acc.low64 + acc.high64; - xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2); - XXH128_hash_t const h128 = { XXH3_avalanche(low64), (XXH64_hash_t)0 - XXH3_avalanche(high64) }; - return h128; - } - } -} - -XXH_NO_INLINE XXH128_hash_t -XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH64_hash_t seed) -{ - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; - XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); - - { XXH128_hash_t acc; - int const nbRounds = (int)len / 32; - int i; - acc.low64 = len * PRIME64_1; - acc.high64 = 0; - for (i=0; i<4; i++) { - acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+(32*i), seed); - } - acc.low64 = XXH3_avalanche(acc.low64); - acc.high64 = XXH3_avalanche(acc.high64); - XXH_ASSERT(nbRounds >= 4); - for (i=4 ; i < nbRounds; i++) { - acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+XXH3_MIDSIZE_STARTOFFSET+(32*(i-4)), seed); - } - /* last bytes */ - acc = XXH128_mix32B(acc, input + len - 16, input + len - 32, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, 0ULL - seed); - - { xxh_u64 const low64 = acc.low64 + acc.high64; - xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2); - XXH128_hash_t const h128 = { XXH3_avalanche(low64), (XXH64_hash_t)0 - XXH3_avalanche(high64) }; - return h128; - } - } -} - -XXH_FORCE_INLINE XXH128_hash_t -XXH3_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize) -{ - XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC; - - XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_128bits); - - /* converge into final hash */ - XXH_STATIC_ASSERT(sizeof(acc) == 64); - XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); - { xxh_u64 const low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1); - xxh_u64 const high64 = XXH3_mergeAccs(acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((xxh_u64)len * PRIME64_2)); - XXH128_hash_t const h128 = { low64, high64 }; - return h128; - } -} - -/* - * It's important for performance that XXH3_hashLong is not inlined. 
Not sure - * why (uop cache maybe?), but the difference is large and easily measurable. - */ -XXH_NO_INLINE XXH128_hash_t -XXH3_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len) -{ - return XXH3_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret)); -} - -/* - * It's important for performance that XXH3_hashLong is not inlined. Not sure - * why (uop cache maybe?), but the difference is large and easily measurable. - */ -XXH_NO_INLINE XXH128_hash_t -XXH3_hashLong_128b_withSecret(const xxh_u8* input, size_t len, - const xxh_u8* secret, size_t secretSize) -{ - return XXH3_hashLong_128b_internal(input, len, secret, secretSize); -} - -/* - * It's important for performance that XXH3_hashLong is not inlined. Not sure - * why (uop cache maybe?), but the difference is large and easily measurable. - */ -XXH_NO_INLINE XXH128_hash_t -XXH3_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed) -{ - XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; - if (seed == 0) return XXH3_hashLong_128b_defaultSecret(input, len); - XXH3_initCustomSecret(secret, seed); - return XXH3_hashLong_128b_internal(input, len, secret, sizeof(secret)); -} - - -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len) -{ - if (len <= 16) return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0); - if (len <= 128) return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); - if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); - return XXH3_hashLong_128b_defaultSecret((const xxh_u8*)input, len); -} - -XXH_PUBLIC_API XXH128_hash_t -XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) -{ - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); - /* - * If an action is to be taken if `secret` conditions are not respected, - * it should be done here. - * For now, it's a contract pre-condition. - * Adding a check and a branch here would cost performance at every hash. 
- */ - if (len <= 16) return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); - if (len <= 128) return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); - if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); - return XXH3_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); -} - -XXH_PUBLIC_API XXH128_hash_t -XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) -{ - if (len <= 16) return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed); - if (len <= 128) return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); - if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); - return XXH3_hashLong_128b_withSeed((const xxh_u8*)input, len, seed); -} - -XXH_PUBLIC_API XXH128_hash_t -XXH128(const void* input, size_t len, XXH64_hash_t seed) -{ - return XXH3_128bits_withSeed(input, len, seed); -} - - -/* === XXH3 128-bit streaming === */ - -/* all the functions are actually the same as for 64-bit streaming variant, - just the reset one is different (different initial acc values for 0,5,6,7), - and near the end of the digest function */ - -static void -XXH3_128bits_reset_internal(XXH3_state_t* statePtr, - XXH64_hash_t seed, - const xxh_u8* secret, size_t secretSize) -{ - XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize); -} - -XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset(XXH3_state_t* statePtr) -{ - if (statePtr == NULL) return XXH_ERROR; - XXH3_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE); - return XXH_OK; -} - -XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) -{ - if (statePtr == NULL) return XXH_ERROR; - XXH3_128bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize); - if (secret == NULL) return XXH_ERROR; - if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; - return XXH_OK; -} - -XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) -{ - if (statePtr == NULL) return XXH_ERROR; - XXH3_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE); - XXH3_initCustomSecret(statePtr->customSecret, seed); - statePtr->secret = statePtr->customSecret; - return XXH_OK; -} - -XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len) -{ - return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_128bits); -} - -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state) -{ - if (state->totalLen > XXH3_MIDSIZE_MAX) { - XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB]; - XXH3_digest_long(acc, state, XXH3_acc_128bits); - XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); - { xxh_u64 const low64 = XXH3_mergeAccs(acc, state->secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)state->totalLen * PRIME64_1); - xxh_u64 const high64 = XXH3_mergeAccs(acc, state->secret + state->secretLimit + STRIPE_LEN - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((xxh_u64)state->totalLen * PRIME64_2)); - XXH128_hash_t const h128 = { low64, high64 }; - return h128; - } - } - /* len <= XXH3_MIDSIZE_MAX : short code */ - if (state->seed) - return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); - return 
XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), state->secret, state->secretLimit + STRIPE_LEN); -} - -/* 128-bit utility functions */ - -#include /* memcmp, memcpy */ - -/* return : 1 is equal, 0 if different */ -XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) -{ - /* note : XXH128_hash_t is compact, it has no padding byte */ - return !(memcmp(&h1, &h2, sizeof(h1))); -} - -/* This prototype is compatible with stdlib's qsort(). - * return : >0 if *h128_1 > *h128_2 - * <0 if *h128_1 < *h128_2 - * =0 if *h128_1 == *h128_2 */ -XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) -{ - XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; - XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; - int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); - /* note : bets that, in most cases, hash values are different */ - if (hcmp) return hcmp; - return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); -} - - -/*====== Canonical representation ======*/ -XXH_PUBLIC_API void -XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash) -{ - XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); - if (XXH_CPU_LITTLE_ENDIAN) { - hash.high64 = XXH_swap64(hash.high64); - hash.low64 = XXH_swap64(hash.low64); - } - memcpy(dst, &hash.high64, sizeof(hash.high64)); - memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); -} - -XXH_PUBLIC_API XXH128_hash_t -XXH128_hashFromCanonical(const XXH128_canonical_t* src) -{ - XXH128_hash_t h; - h.high64 = XXH_readBE64(src); - h.low64 = XXH_readBE64(src->digest + 8); - return h; -} - -/* Pop our optimization override from above */ -#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ - && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ - && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ -# pragma GCC pop_options -#endif - -#endif /* XXH3_H_1397135465 */ diff --git a/xxhash.h b/xxhash.h deleted file mode 100644 index f711b34..0000000 --- a/xxhash.h +++ /dev/null @@ -1,1941 +0,0 @@ -/* - * xxHash - Extremely Fast Hash algorithm - * Header File - * Copyright (C) 2012-present, Yann Collet. - * - * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * You can contact the author at: - * - xxHash homepage: https://www.xxhash.com - * - xxHash source repository: https://github.com/Cyan4973/xxHash - */ - -/* TODO: update */ -/* Notice extracted from xxHash homepage: - -xxHash is an extremely fast hash algorithm, running at RAM speed limits. -It also successfully passes all tests from the SMHasher suite. - -Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) - -Name Speed Q.Score Author -xxHash 5.4 GB/s 10 -CrapWow 3.2 GB/s 2 Andrew -MumurHash 3a 2.7 GB/s 10 Austin Appleby -SpookyHash 2.0 GB/s 10 Bob Jenkins -SBox 1.4 GB/s 9 Bret Mulvey -Lookup3 1.2 GB/s 9 Bob Jenkins -SuperFastHash 1.2 GB/s 1 Paul Hsieh -CityHash64 1.05 GB/s 10 Pike & Alakuijala -FNV 0.55 GB/s 5 Fowler, Noll, Vo -CRC32 0.43 GB/s 9 -MD5-32 0.33 GB/s 10 Ronald L. Rivest -SHA1-32 0.28 GB/s 10 - -Q.Score is a measure of quality of the hash function. -It depends on successfully passing SMHasher test set. -10 is a perfect score. - -Note: SMHasher's CRC32 implementation is not the fastest one. -Other speed-oriented implementations can be faster, -especially in combination with PCLMUL instruction: -https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735 - -A 64-bit version, named XXH64, is available since r35. -It offers much better speed, but for 64-bit applications only. -Name Speed on 64 bits Speed on 32 bits -XXH64 13.8 GB/s 1.9 GB/s -XXH32 6.8 GB/s 6.0 GB/s -*/ - -#if defined (__cplusplus) -extern "C" { -#endif - -/* **************************** - * INLINE mode - ******************************/ -/*! - * XXH_INLINE_ALL (and XXH_PRIVATE_API) - * Use these build macros to inline xxhash into the target unit. - * Inlining improves performance on small inputs, especially when the length is - * expressed as a compile-time constant: - * - * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html - * - * It also keeps xxHash symbols private to the unit, so they are not exported. - * - * Usage: - * #define XXH_INLINE_ALL - * #include "xxhash.h" - * - * Do not compile and link xxhash.o as a separate object, as it is not useful. 
- */ -#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ - && !defined(XXH_INLINE_ALL_31684351384) - /* this section should be traversed only once */ -# define XXH_INLINE_ALL_31684351384 - /* give access to the advanced API, required to compile implementations */ -# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ -# define XXH_STATIC_LINKING_ONLY - /* make all functions private */ -# undef XXH_PUBLIC_API -# if defined(__GNUC__) -# define XXH_PUBLIC_API static __inline __attribute__((unused)) -# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# define XXH_PUBLIC_API static inline -# elif defined(_MSC_VER) -# define XXH_PUBLIC_API static __inline -# else - /* note: this version may generate warnings for unused static functions */ -# define XXH_PUBLIC_API static -# endif - - /* - * This part deals with the special case where a unit wants to inline xxHash, - * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such - * as part of some previously included *.h header file. - * Without further action, the new include would just be ignored, - * and functions would effectively _not_ be inlined (silent failure). - * The following macros solve this situation by prefixing all inlined names, - * avoiding naming collision with previous inclusions. - */ -# ifdef XXH_NAMESPACE -# error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported" - /* - * Note: Alternative: #undef all symbols (it's a pretty large list). - * Without #error: it compiles, but functions are actually not inlined. - */ -# endif -# define XXH_NAMESPACE XXH_INLINE_ - /* - * Some identifiers (enums, type names) are not symbols, but they must - * still be renamed to avoid redeclaration. - * Alternative solution: do not redeclare them. - * However, this requires some #ifdefs, and is a more dispersed action. - * Meanwhile, renaming can be achieved in a single block - */ -# define XXH_IPREF(Id) XXH_INLINE_ ## Id -# define XXH_OK XXH_IPREF(XXH_OK) -# define XXH_ERROR XXH_IPREF(XXH_ERROR) -# define XXH_errorcode XXH_IPREF(XXH_errorcode) -# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) -# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) -# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) -# define XXH32_state_s XXH_IPREF(XXH32_state_s) -# define XXH32_state_t XXH_IPREF(XXH32_state_t) -# define XXH64_state_s XXH_IPREF(XXH64_state_s) -# define XXH64_state_t XXH_IPREF(XXH64_state_t) -# define XXH3_state_s XXH_IPREF(XXH3_state_s) -# define XXH3_state_t XXH_IPREF(XXH3_state_t) -# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) - /* Ensure the header is parsed again, even if it was previously included */ -# undef XXHASH_H_5627135585666179 -# undef XXHASH_H_STATIC_13879238742 -#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ - - - -/* **************************************************************** - * Stable API - *****************************************************************/ -#ifndef XXHASH_H_5627135585666179 -#define XXHASH_H_5627135585666179 1 - -/* specific declaration modes for Windows */ -#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) -# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) -# ifdef XXH_EXPORT -# define XXH_PUBLIC_API __declspec(dllexport) -# elif XXH_IMPORT -# define XXH_PUBLIC_API __declspec(dllimport) -# endif -# else -# define XXH_PUBLIC_API /* do nothing */ -# endif -#endif - -/*! 
- * XXH_NAMESPACE, aka Namespace Emulation: - * - * If you want to include _and expose_ xxHash functions from within your own - * library, but also want to avoid symbol collisions with other libraries which - * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix - * any public symbol from xxhash library with the value of XXH_NAMESPACE - * (therefore, avoid empty or numeric values). - * - * Note that no change is required within the calling program as long as it - * includes `xxhash.h`: Regular symbol names will be automatically translated - * by this header. - */ -#ifdef XXH_NAMESPACE -# define XXH_CAT(A,B) A##B -# define XXH_NAME2(A,B) XXH_CAT(A,B) -# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) -# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) -# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) -# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) -# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) -# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) -# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) -# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) -# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) -# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) -# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) -# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) -# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) -# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) -# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) -# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) -# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) -# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) -# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) -#endif - - -/* ************************************* -* Version -***************************************/ -#define XXH_VERSION_MAJOR 0 -#define XXH_VERSION_MINOR 7 -#define XXH_VERSION_RELEASE 3 -#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) -XXH_PUBLIC_API unsigned XXH_versionNumber (void); - - -/* **************************** -* Definitions -******************************/ -#include /* size_t */ -typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; - - -/*-********************************************************************** -* 32-bit hash -************************************************************************/ -#if !defined (__VMS) \ - && (defined (__cplusplus) \ - || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) -# include - typedef uint32_t XXH32_hash_t; -#else -# include -# if UINT_MAX == 0xFFFFFFFFUL - typedef unsigned int XXH32_hash_t; -# else -# if ULONG_MAX == 0xFFFFFFFFUL - typedef unsigned long XXH32_hash_t; -# else -# error "unsupported platform: need a 32-bit type" -# endif -# endif -#endif - -/*! - * XXH32(): - * Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input". - * The memory between input & input+length must be valid (allocated and read-accessible). - * "seed" can be used to alter the result predictably. 
- * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s - */ -XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); - -/******* Streaming *******/ - -/* - * Streaming functions generate the xxHash value from an incrememtal input. - * This method is slower than single-call functions, due to state management. - * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. - * - * An XXH state must first be allocated using `XXH*_createState()`. - * - * Start a new hash by initializing the state with a seed using `XXH*_reset()`. - * - * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. - * - * The function returns an error code, with 0 meaning OK, and any other value - * meaning there is an error. - * - * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. - * This function returns the nn-bits hash as an int or long long. - * - * It's still possible to continue inserting input into the hash state after a - * digest, and generate new hash values later on by invoking `XXH*_digest()`. - * - * When done, release the state using `XXH*_freeState()`. - */ - -typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */ -XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void); -XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); -XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); - -XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed); -XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); -XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); - -/******* Canonical representation *******/ - -/* - * The default return values from XXH functions are unsigned 32 and 64 bit - * integers. - * This the simplest and fastest format for further post-processing. - * - * However, this leaves open the question of what is the order on the byte level, - * since little and big endian conventions will store the same number differently. - * - * The canonical representation settles this issue by mandating big-endian - * convention, the same convention as human-readable numbers (large digits first). - * - * When writing hash values to storage, sending them over a network, or printing - * them, it's highly recommended to use the canonical representation to ensure - * portability across a wider range of systems, present and future. - * - * The following functions allow transformation of hash values to and from - * canonical format. - */ - -typedef struct { unsigned char digest[4]; } XXH32_canonical_t; -XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); -XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); - - -#ifndef XXH_NO_LONG_LONG -/*-********************************************************************** -* 64-bit hash -************************************************************************/ -#if !defined (__VMS) \ - && (defined (__cplusplus) \ - || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) -# include - typedef uint64_t XXH64_hash_t; -#else - /* the following type must have a width of 64-bit */ - typedef unsigned long long XXH64_hash_t; -#endif - -/*! - * XXH64(): - * Returns the 64-bit hash of sequence of length @length stored at memory - * address @input. - * @seed can be used to alter the result predictably. 
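/*
 * Sketch of the canonical (big-endian) representation described above, shown for the
 * 32-bit hash: the canonical bytes are the form to store or send over a network,
 * independent of host endianness. Build and link setup are assumed as in the earlier
 * examples.
 */
#include "xxhash.h"
#include <stdio.h>
#include <string.h>

int main(void)
{
    const char* msg = "portable bytes";
    XXH32_hash_t const h = XXH32(msg, strlen(msg), 0);

    XXH32_canonical_t canon;
    XXH32_canonicalFromHash(&canon, h);                       /* always big-endian byte order */

    XXH32_hash_t const h2 = XXH32_hashFromCanonical(&canon);  /* round-trip back */
    printf("%08x -> %02x%02x%02x%02x -> %08x\n", (unsigned)h,
           (unsigned)canon.digest[0], (unsigned)canon.digest[1],
           (unsigned)canon.digest[2], (unsigned)canon.digest[3], (unsigned)h2);
    return 0;
}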
- * This function usually runs faster on 64-bit systems, but slower on 32-bit - * systems (see benchmark). - */ -XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, XXH64_hash_t seed); - -/******* Streaming *******/ -typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ -XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); -XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); -XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); - -XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed); -XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); -XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); - -/******* Canonical representation *******/ -typedef struct { unsigned char digest[8]; } XXH64_canonical_t; -XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); -XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); - - -#endif /* XXH_NO_LONG_LONG */ - -#endif /* XXHASH_H_5627135585666179 */ - - - -#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) -#define XXHASH_H_STATIC_13879238742 -/* **************************************************************************** - * This section contains declarations which are not guaranteed to remain stable. - * They may change in future versions, becoming incompatible with a different - * version of the library. - * These declarations should only be used with static linking. - * Never use them in association with dynamic linking! - ***************************************************************************** */ - -/* - * These definitions are only present to allow static allocation of an XXH - * state, for example, on the stack or in a struct. - * Never **ever** access members directly. - */ - -struct XXH32_state_s { - XXH32_hash_t total_len_32; - XXH32_hash_t large_len; - XXH32_hash_t v1; - XXH32_hash_t v2; - XXH32_hash_t v3; - XXH32_hash_t v4; - XXH32_hash_t mem32[4]; - XXH32_hash_t memsize; - XXH32_hash_t reserved; /* never read nor write, might be removed in a future version */ -}; /* typedef'd to XXH32_state_t */ - - -#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ - -struct XXH64_state_s { - XXH64_hash_t total_len; - XXH64_hash_t v1; - XXH64_hash_t v2; - XXH64_hash_t v3; - XXH64_hash_t v4; - XXH64_hash_t mem64[4]; - XXH32_hash_t memsize; - XXH32_hash_t reserved32; /* required for padding anyway */ - XXH64_hash_t reserved64; /* never read nor write, might be removed in a future version */ -}; /* typedef'd to XXH64_state_t */ - - -/*-********************************************************************** -* XXH3 -* New experimental hash -************************************************************************/ - -/* ************************************************************************ - * XXH3 is a new hash algorithm featuring: - * - Improved speed for both small and large inputs - * - True 64-bit and 128-bit outputs - * - SIMD acceleration - * - Improved 32-bit viability - * - * Speed analysis methodology is explained here: - * - * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html - * - * In general, expect XXH3 to run about ~2x faster on large inputs and >3x - * faster on small ones compared to XXH64, though exact differences depend on - * the platform. 
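
A minimal sketch combining two points from this section: with XXH_STATIC_LINKING_ONLY defined, the state struct can live on the stack (its members are never touched directly), and the canonical form yields portable big-endian bytes for storage. The message and zero seed are placeholders:

/* Stack-allocated state plus canonical output for portable storage. */
#define XXH_STATIC_LINKING_ONLY
#include <stdio.h>
#include <string.h>
#include "xxhash.h"

int main(void)
{
    const char msg[] = "example";
    XXH64_state_t state;        /* lives on the stack, no allocation */
    XXH64_canonical_t canon;

    XXH64_reset(&state, 0);
    XXH64_update(&state, msg, strlen(msg));
    XXH64_canonicalFromHash(&canon, XXH64_digest(&state));

    /* big-endian bytes, comparable across platforms */
    fwrite(canon.digest, 1, sizeof canon.digest, stdout);
    return 0;
}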
- * - * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash - * on all platforms. - * - * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it. - * - * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run - * XXH3 at usable speeds, even if XXH64 runs slowly. Further details are - * explained in the implementation. - * - * Optimized implementations are provided for AVX2, SSE2, NEON, POWER8, ZVector, - * and scalar targets. This can be controlled with the XXH_VECTOR macro. - * - * XXH3 offers 2 variants, _64bits and _128bits. - * When only 64 bits are needed, prefer calling the _64bits variant, as it - * reduces the amount of mixing, resulting in faster speed on small inputs. - * - * It's also generally simpler to manipulate a scalar return type than a struct. - * - * The 128-bit version adds additional strength, but it is slightly slower. - * - * The XXH3 algorithm is still in development. - * The results it produces may still change in future versions. - * - * Results produced by v0.7.x are not comparable with results from v0.7.y. - * However, the API is completely stable, and it can safely be used for - * ephemeral data (local sessions). - * - * Avoid storing values in long-term storage until the algorithm is finalized. - * - * The API supports one-shot hashing, streaming mode, and custom secrets. - */ - -#ifdef XXH_NAMESPACE -# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) -# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) -# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) - -# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) -# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) -# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) - -# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) -# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) -# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) -# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) -# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) -#endif - -/* XXH3_64bits(): - * default 64-bit variant, using default secret and default seed of 0. - * It's the fastest variant. */ -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len); - -/* - * XXH3_64bits_withSecret(): - * It's possible to provide any blob of bytes as a "secret" to generate the hash. - * This makes it more difficult for an external actor to prepare an intentional - * collision. - * The secret *must* be large enough (>= XXH3_SECRET_SIZE_MIN). - * It should consist of random bytes. - * Avoid trivial sequences, such as repeating sequences and especially '\0', - * as this can cancel out itself. - * Failure to respect these conditions will result in a poor quality hash. - */ -#define XXH3_SECRET_SIZE_MIN 136 -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); - -/* - * XXH3_64bits_withSeed(): - * This variant generates a custom secret on the fly based on the default - * secret, altered using the `seed` value. - * While this operation is decently fast, note that it's not completely free. - * Note: seed==0 produces the same results as XXH3_64bits(). 
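
A short sketch of the three one-shot XXH3_64bits variants above. The seed value, the zero-initialized secret buffer, and the hash_all_variants helper are illustrative only; as noted, a real secret should be random bytes of at least XXH3_SECRET_SIZE_MIN:

/* One-shot XXH3 variants: default, seeded, and secret-based. */
#define XXH_STATIC_LINKING_ONLY   /* XXH3 is declared in the static-only section */
#include <stddef.h>
#include "xxhash.h"

static unsigned char secret[XXH3_SECRET_SIZE_MIN];  /* fill with random bytes in real use */

XXH64_hash_t hash_all_variants(const void* data, size_t len)
{
    XXH64_hash_t const h_default = XXH3_64bits(data, len);               /* fastest variant */
    XXH64_hash_t const h_seeded  = XXH3_64bits_withSeed(data, len, 42);  /* custom seed */
    XXH64_hash_t const h_secret  = XXH3_64bits_withSecret(data, len, secret, sizeof secret);
    return h_default ^ h_seeded ^ h_secret;  /* combined only to use all three results */
}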
- */
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
-
-
-/* streaming 64-bit */
-
-#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */
-# include <stdalign.h>
-# define XXH_ALIGN(n) alignas(n)
-#elif defined(__GNUC__)
-# define XXH_ALIGN(n) __attribute__ ((aligned(n)))
-#elif defined(_MSC_VER)
-# define XXH_ALIGN(n) __declspec(align(n))
-#else
-# define XXH_ALIGN(n) /* disabled */
-#endif
-
-/* Old GCC versions only accept the attribute after the type in structures. */
-#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \
- && defined(__GNUC__)
-# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
-#else
-# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
-#endif
-
-typedef struct XXH3_state_s XXH3_state_t;
-
-#define XXH3_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */
-#define XXH3_INTERNALBUFFER_SIZE 256
-struct XXH3_state_s {
- XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
- /* used to store a custom secret generated from the seed. Makes state larger.
- * Design might change */
- XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
- XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
- XXH32_hash_t bufferedSize;
- XXH32_hash_t nbStripesPerBlock;
- XXH32_hash_t nbStripesSoFar;
- XXH32_hash_t secretLimit;
- XXH32_hash_t reserved32;
- XXH32_hash_t reserved32_2;
- XXH64_hash_t totalLen;
- XXH64_hash_t seed;
- XXH64_hash_t reserved64;
- /* note: there is some padding after due to alignment on 64 bytes */
- const unsigned char* secret;
-}; /* typedef'd to XXH3_state_t */
-
-#undef XXH_ALIGN_MEMBER
-
-/*
- * Streaming requires state maintenance.
- * This operation costs memory and CPU.
- * As a consequence, streaming is slower than one-shot hashing.
- * For better performance, prefer one-shot functions whenever possible.
- */
-XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
-XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
-XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
-
-
-/*
- * XXH3_64bits_reset():
- * Initialize with the default parameters.
- * The result will be equivalent to `XXH3_64bits()`.
- */
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
-/*
- * XXH3_64bits_reset_withSeed():
- * Generate a custom secret from `seed`, and store it into `statePtr`.
- * digest will be equivalent to `XXH3_64bits_withSeed()`.
- */
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
-/*
- * XXH3_64bits_reset_withSecret():
- * `secret` is referenced, and must outlive the hash streaming session, so
- * be careful when using stack arrays.
- * `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`.
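
The XXH3 streaming calls follow the same pattern as the 32/64-bit streaming APIs. A sketch assuming input arrives on stdin; the seed and chunk size are arbitrary:

/* XXH3 streaming: seeded reset, then the usual update/digest loop. */
#define XXH_STATIC_LINKING_ONLY
#include <stdio.h>
#include "xxhash.h"

int main(void)
{
    char buf[4096];
    size_t n;
    XXH3_state_t* const st = XXH3_createState();
    if (st == NULL) return 1;
    if (XXH3_64bits_reset_withSeed(st, 1234) != XXH_OK) return 1;
    while ((n = fread(buf, 1, sizeof buf, stdin)) > 0)
        XXH3_64bits_update(st, buf, n);
    printf("%016llx\n", (unsigned long long)XXH3_64bits_digest(st));
    XXH3_freeState(st);
    return 0;
}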
- */ -XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); - -XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length); -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr); - - -/* 128-bit */ - -#ifdef XXH_NAMESPACE -# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) -# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) -# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) -# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) - -# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) -# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) -# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) -# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) -# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) - -# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) -# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) -# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) -# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) -#endif - -typedef struct { - XXH64_hash_t low64; - XXH64_hash_t high64; -} XXH128_hash_t; - -XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed); -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len); -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); /* == XXH128() */ -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); - -XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr); -XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); -XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); - -XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length); -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr); - - -/* Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ - -/*! - * XXH128_isEqual(): - * Return: 1 if `h1` and `h2` are equal, 0 if they are not. - */ -XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); - -/*! - * XXH128_cmp(): - * - * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. 
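
XXH128_cmp matches the qsort()/bsearch() comparator signature exactly, so 128-bit hashes can be ordered directly. A sketch with made-up input strings:

/* 128-bit helpers: XXH128_isEqual for equality, XXH128_cmp as a qsort()
 * comparator over XXH128_hash_t values. */
#define XXH_STATIC_LINKING_ONLY
#include <stdlib.h>
#include <string.h>
#include "xxhash.h"

int main(void)
{
    const char* const words[] = { "delta", "alpha", "charlie", "bravo" };
    XXH128_hash_t hashes[4];
    size_t i;

    for (i = 0; i < 4; i++)
        hashes[i] = XXH3_128bits(words[i], strlen(words[i]));

    /* total order over the 128-bit values, e.g. for indexing or deduplication */
    qsort(hashes, 4, sizeof hashes[0], XXH128_cmp);

    /* equality uses the dedicated helper rather than memcmp on the struct */
    return XXH128_isEqual(hashes[0], hashes[1]) ? 1 : 0;
}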
- * - * return: >0 if *h128_1 > *h128_2 - * <0 if *h128_1 < *h128_2 - * =0 if *h128_1 == *h128_2 - */ -XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2); - - -/******* Canonical representation *******/ -typedef struct { unsigned char digest[16]; } XXH128_canonical_t; -XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash); -XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src); - - -#endif /* XXH_NO_LONG_LONG */ - -#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) -# define XXH_IMPLEMENTATION -#endif - -#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */ - - -/* ======================================================================== */ -/* ======================================================================== */ -/* ======================================================================== */ - - -/*-********************************************************************** - * xxHash implementation - *-********************************************************************** - * xxHash's implementation used to be found in xxhash.c. - * - * However, code inlining requires the implementation to be visible to the - * compiler, usually within the header. - * - * As a workaround, xxhash.c used to be included within xxhash.h. This caused - * some issues with some build systems, especially ones which treat .c files - * as source files. - * - * Therefore, the implementation is now directly integrated within xxhash.h. - * Another small advantage is that xxhash.c is no longer needed in /include. - ************************************************************************/ - -#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \ - || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387) -# define XXH_IMPLEM_13a8737387 - -/* ************************************* -* Tuning parameters -***************************************/ -/*! - * XXH_FORCE_MEMORY_ACCESS: - * By default, access to unaligned memory is controlled by `memcpy()`, which is - * safe and portable. - * - * Unfortunately, on some target/compiler combinations, the generated assembly - * is sub-optimal. - * - * The below switch allow to select a different access method for improved - * performance. - * Method 0 (default): - * Use `memcpy()`. Safe and portable. - * Method 1: - * `__attribute__((packed))` statement. It depends on compiler extensions - * and is therefore not portable. - * This method is safe if your compiler supports it, and *generally* as - * fast or faster than `memcpy`. - * Method 2: - * Direct access via cast. This method doesn't depend on the compiler but - * violates the C standard. - * It can generate buggy code on targets which do not support unaligned - * memory accesses. - * But in some circumstances, it's the only known way to get the most - * performance (ie GCC + ARMv6) - * Method 3: - * Byteshift. This can generate the best code on old compilers which don't - * inline small `memcpy()` calls, and it might also be faster on big-endian - * systems which lack a native byteswap instruction. - * See https://stackoverflow.com/a/32095106/646947 for details. 
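
These memory-access methods, like the other tuning switches documented in this section, can also be selected from a translation unit rather than the compiler command line. The values below are merely examples of the documented settings, not recommendations:

/* Override tuning switches before including the header. */
#define XXH_FORCE_MEMORY_ACCESS 3        /* Method 3: byteshift loads */
#define XXH_ACCEPT_NULL_INPUT_POINTER 1  /* treat NULL as zero-length input */
#define XXH_FORCE_ALIGN_CHECK 0          /* skip the alignment branch */
#define XXH_INLINE_ALL                   /* pull the implementation into this unit */
#include "xxhash.h"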
- * Prefer these methods in priority order (0 > 1 > 2 > 3) - */ -#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ -# if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6) -# define XXH_FORCE_MEMORY_ACCESS 2 -# elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ - (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7))) -# define XXH_FORCE_MEMORY_ACCESS 1 -# endif -#endif - -/*! - *XXH_ACCEPT_NULL_INPUT_POINTER: - * If the input pointer is NULL, xxHash's default behavior is to dereference it, - * triggering a segfault. - * When this macro is enabled, xxHash actively checks the input for a null pointer. - * If it is, the result for null input pointers is the same as a zero-length input. - */ -#ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */ -# define XXH_ACCEPT_NULL_INPUT_POINTER 0 -#endif - -/*! - * XXH_FORCE_ALIGN_CHECK: - * This is a minor performance trick, only useful with lots of very small keys. - * It means: check for aligned/unaligned input. - * The check costs one initial branch per hash; - * Set it to 0 when the input is guaranteed to be aligned or when alignment - * doesn't matter for performance. - * - * This option does not affect XXH3. - */ -#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ -# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) -# define XXH_FORCE_ALIGN_CHECK 0 -# else -# define XXH_FORCE_ALIGN_CHECK 1 -# endif -#endif - -/*! - * XXH_NO_INLINE_HINTS: - * - * By default, xxHash tries to force the compiler to inline almost all internal - * functions. - * - * This can usually improve performance due to reduced jumping and improved - * constant folding, but significantly increases the size of the binary which - * might not be favorable. - * - * Additionally, sometimes the forced inlining can be detrimental to performance, - * depending on the architecture. - * - * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the - * compiler full control on whether to inline or not. - * - * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using - * -fno-inline with GCC or Clang, this will automatically be defined. - */ -#ifndef XXH_NO_INLINE_HINTS -# if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \ - || defined(__NO_INLINE__) /* -O0, -fno-inline */ -# define XXH_NO_INLINE_HINTS 1 -# else -# define XXH_NO_INLINE_HINTS 0 -# endif -#endif - -/*! - * XXH_REROLL: - * Whether to reroll XXH32_finalize, and XXH64_finalize, - * instead of using an unrolled jump table/if statement loop. - * - * This is automatically defined on -Os/-Oz on GCC and Clang. - */ -#ifndef XXH_REROLL -# if defined(__OPTIMIZE_SIZE__) -# define XXH_REROLL 1 -# else -# define XXH_REROLL 0 -# endif -#endif - - -/* ************************************* -* Includes & Memory related functions -***************************************/ -/*! - * Modify the local functions below should you wish to use some other memory - * routines for malloc() and free() - */ -#include -static void* XXH_malloc(size_t s) { return malloc(s); } -static void XXH_free (void* p) { free(p); } -/*! 
and for memcpy() */ -#include -static void* XXH_memcpy(void* dest, const void* src, size_t size) -{ - return memcpy(dest,src,size); -} - -#include /* ULLONG_MAX */ - - -/* ************************************* -* Compiler Specific Options -***************************************/ -#ifdef _MSC_VER /* Visual Studio warning fix */ -# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ -#endif - -#if XXH_NO_INLINE_HINTS /* disable inlining hints */ -# define XXH_FORCE_INLINE static -# define XXH_NO_INLINE static -#elif defined(_MSC_VER) /* Visual Studio */ -# define XXH_FORCE_INLINE static __forceinline -# define XXH_NO_INLINE static __declspec(noinline) -#else -# if defined (__cplusplus) \ - || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ -# ifdef __GNUC__ -# define XXH_FORCE_INLINE static inline __attribute__((always_inline)) -# define XXH_NO_INLINE static __attribute__((noinline)) -# else -# define XXH_FORCE_INLINE static inline -# define XXH_NO_INLINE static -# endif -# else -# define XXH_FORCE_INLINE static -# define XXH_NO_INLINE static -# endif /* __STDC_VERSION__ */ -#endif - - - -/* ************************************* -* Debug -***************************************/ -/* - * DEBUGLEVEL is expected to be defined externally, typically via the compiler's - * command line options. The value must be a number. - */ -#ifndef DEBUGLEVEL -# define DEBUGLEVEL 0 -#endif - -#if (DEBUGLEVEL>=1) -# include /* note: can still be disabled with NDEBUG */ -# define XXH_ASSERT(c) assert(c) -#else -# define XXH_ASSERT(c) ((void)0) -#endif - -/* note: use after variable declarations */ -#define XXH_STATIC_ASSERT(c) { enum { XXH_sa = 1/(int)(!!(c)) }; } - - -/* ************************************* -* Basic Types -***************************************/ -#if !defined (__VMS) \ - && (defined (__cplusplus) \ - || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) -# include - typedef uint8_t xxh_u8; -#else - typedef unsigned char xxh_u8; -#endif -typedef XXH32_hash_t xxh_u32; - - -/* *** Memory access *** */ - -#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) -/* - * Manual byteshift. Best for old compilers which don't inline memcpy. - * We actually directly use XXH_readLE32 and XXH_readBE32. - */ -#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) - -/* - * Force direct memory access. Only works on CPU which support unaligned memory - * access in hardware. - */ -static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } - -#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) - -/* - * __pack instructions are safer but compiler specific, hence potentially - * problematic for some compilers. - * - * Currently only defined for GCC and ICC. - */ -typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; -static xxh_u32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } - -#else - -/* - * Portable and safe solution. Generally efficient. - * see: https://stackoverflow.com/a/32095106/646947 - */ -static xxh_u32 XXH_read32(const void* memPtr) -{ - xxh_u32 val; - memcpy(&val, memPtr, sizeof(val)); - return val; -} - -#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ - - -/* *** Endianess *** */ -typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; - -/*! - * XXH_CPU_LITTLE_ENDIAN: - * Defined to 1 if the target is little endian, or 0 if it is big endian. 
- * It can be defined externally, for example on the compiler command line. - * - * If it is not defined, a runtime check (which is usually constant folded) - * is used instead. - */ -#ifndef XXH_CPU_LITTLE_ENDIAN -/* - * Try to detect endianness automatically, to avoid the nonstandard behavior - * in `XXH_isLittleEndian()` - */ -# if defined(_WIN32) /* Windows is always little endian */ \ - || defined(__LITTLE_ENDIAN__) \ - || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) -# define XXH_CPU_LITTLE_ENDIAN 1 -# elif defined(__BIG_ENDIAN__) \ - || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) -# define XXH_CPU_LITTLE_ENDIAN 0 -# else -static int XXH_isLittleEndian(void) -{ - /* - * Nonstandard, but well-defined behavior in practice. - * Don't use static: it is detrimental to performance. - */ - const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; - return one.c[0]; -} -# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() -# endif -#endif - - - - -/* **************************************** -* Compiler-specific Functions and Macros -******************************************/ -#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) - -#ifndef __has_builtin -# define __has_builtin(x) 0 -#endif - -#if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) \ - && __has_builtin(__builtin_rotateleft64) -# define XXH_rotl32 __builtin_rotateleft32 -# define XXH_rotl64 __builtin_rotateleft64 -/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ -#elif defined(_MSC_VER) -# define XXH_rotl32(x,r) _rotl(x,r) -# define XXH_rotl64(x,r) _rotl64(x,r) -#else -# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) -# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) -#endif - -#if defined(_MSC_VER) /* Visual Studio */ -# define XXH_swap32 _byteswap_ulong -#elif XXH_GCC_VERSION >= 403 -# define XXH_swap32 __builtin_bswap32 -#else -static xxh_u32 XXH_swap32 (xxh_u32 x) -{ - return ((x << 24) & 0xff000000 ) | - ((x << 8) & 0x00ff0000 ) | - ((x >> 8) & 0x0000ff00 ) | - ((x >> 24) & 0x000000ff ); -} -#endif - - -/* *************************** -* Memory reads -*****************************/ -typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; - -/* - * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. - * - * This is ideal for older compilers which don't inline memcpy. - */ -#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) - -XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) -{ - const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; - return bytePtr[0] - | ((xxh_u32)bytePtr[1] << 8) - | ((xxh_u32)bytePtr[2] << 16) - | ((xxh_u32)bytePtr[3] << 24); -} - -XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) -{ - const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; - return bytePtr[3] - | ((xxh_u32)bytePtr[2] << 8) - | ((xxh_u32)bytePtr[1] << 16) - | ((xxh_u32)bytePtr[0] << 24); -} - -#else -XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) -{ - return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); -} - -static xxh_u32 XXH_readBE32(const void* ptr) -{ - return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); -} -#endif - -XXH_FORCE_INLINE xxh_u32 -XXH_readLE32_align(const void* ptr, XXH_alignment align) -{ - if (align==XXH_unaligned) { - return XXH_readLE32(ptr); - } else { - return XXH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); - } -} - - -/* ************************************* -* Misc -***************************************/ -XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } - - -/* ******************************************************************* -* 32-bit hash functions -*********************************************************************/ -static const xxh_u32 PRIME32_1 = 0x9E3779B1U; /* 0b10011110001101110111100110110001 */ -static const xxh_u32 PRIME32_2 = 0x85EBCA77U; /* 0b10000101111010111100101001110111 */ -static const xxh_u32 PRIME32_3 = 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */ -static const xxh_u32 PRIME32_4 = 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */ -static const xxh_u32 PRIME32_5 = 0x165667B1U; /* 0b00010110010101100110011110110001 */ - -static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) -{ - acc += input * PRIME32_2; - acc = XXH_rotl32(acc, 13); - acc *= PRIME32_1; -#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE) - /* - * UGLY HACK: - * This inline assembly hack forces acc into a normal register. This is the - * only thing that prevents GCC and Clang from autovectorizing the XXH32 - * loop (pragmas and attributes don't work for some resason) without globally - * disabling SSE4.1. - * - * The reason we want to avoid vectorization is because despite working on - * 4 integers at a time, there are multiple factors slowing XXH32 down on - * SSE4: - * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on - * newer chips!) making it slightly slower to multiply four integers at - * once compared to four integers independently. Even when pmulld was - * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE - * just to multiply unless doing a long operation. - * - * - Four instructions are required to rotate, - * movqda tmp, v // not required with VEX encoding - * pslld tmp, 13 // tmp <<= 13 - * psrld v, 19 // x >>= 19 - * por v, tmp // x |= tmp - * compared to one for scalar: - * roll v, 13 // reliably fast across the board - * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason - * - * - Instruction level parallelism is actually more beneficial here because - * the SIMD actually serializes this operation: While v1 is rotating, v2 - * can load data, while v3 can multiply. SSE forces them to operate - * together. - * - * How this hack works: - * __asm__("" // Declare an assembly block but don't declare any instructions - * : // However, as an Input/Output Operand, - * "+r" // constrain a read/write operand (+) as a general purpose register (r). - * (acc) // and set acc as the operand - * ); - * - * Because of the 'r', the compiler has promised that seed will be in a - * general purpose register and the '+' says that it will be 'read/write', - * so it has to assume it has changed. It is like volatile without all the - * loads and stores. - * - * Since the argument has to be in a normal register (not an SSE register), - * each time XXH32_round is called, it is impossible to vectorize. 
- */ - __asm__("" : "+r" (acc)); -#endif - return acc; -} - -/* mix all bits */ -static xxh_u32 XXH32_avalanche(xxh_u32 h32) -{ - h32 ^= h32 >> 15; - h32 *= PRIME32_2; - h32 ^= h32 >> 13; - h32 *= PRIME32_3; - h32 ^= h32 >> 16; - return(h32); -} - -#define XXH_get32bits(p) XXH_readLE32_align(p, align) - -static xxh_u32 -XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) -{ -#define PROCESS1 \ - h32 += (*ptr++) * PRIME32_5; \ - h32 = XXH_rotl32(h32, 11) * PRIME32_1 ; - -#define PROCESS4 \ - h32 += XXH_get32bits(ptr) * PRIME32_3; \ - ptr+=4; \ - h32 = XXH_rotl32(h32, 17) * PRIME32_4 ; - - /* Compact rerolled version */ - if (XXH_REROLL) { - len &= 15; - while (len >= 4) { - PROCESS4; - len -= 4; - } - while (len > 0) { - PROCESS1; - --len; - } - return XXH32_avalanche(h32); - } else { - switch(len&15) /* or switch(bEnd - p) */ { - case 12: PROCESS4; - /* fallthrough */ - case 8: PROCESS4; - /* fallthrough */ - case 4: PROCESS4; - return XXH32_avalanche(h32); - - case 13: PROCESS4; - /* fallthrough */ - case 9: PROCESS4; - /* fallthrough */ - case 5: PROCESS4; - PROCESS1; - return XXH32_avalanche(h32); - - case 14: PROCESS4; - /* fallthrough */ - case 10: PROCESS4; - /* fallthrough */ - case 6: PROCESS4; - PROCESS1; - PROCESS1; - return XXH32_avalanche(h32); - - case 15: PROCESS4; - /* fallthrough */ - case 11: PROCESS4; - /* fallthrough */ - case 7: PROCESS4; - /* fallthrough */ - case 3: PROCESS1; - /* fallthrough */ - case 2: PROCESS1; - /* fallthrough */ - case 1: PROCESS1; - /* fallthrough */ - case 0: return XXH32_avalanche(h32); - } - XXH_ASSERT(0); - return h32; /* reaching this point is deemed impossible */ - } -} - -XXH_FORCE_INLINE xxh_u32 -XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) -{ - const xxh_u8* bEnd = input + len; - xxh_u32 h32; - -#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) - if (input==NULL) { - len=0; - bEnd=input=(const xxh_u8*)(size_t)16; - } -#endif - - if (len>=16) { - const xxh_u8* const limit = bEnd - 15; - xxh_u32 v1 = seed + PRIME32_1 + PRIME32_2; - xxh_u32 v2 = seed + PRIME32_2; - xxh_u32 v3 = seed + 0; - xxh_u32 v4 = seed - PRIME32_1; - - do { - v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; - v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; - v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; - v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; - } while (input < limit); - - h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) - + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); - } else { - h32 = seed + PRIME32_5; - } - - h32 += (xxh_u32)len; - - return XXH32_finalize(h32, input, len&15, align); -} - - -XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) -{ -#if 0 - /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ - XXH32_state_t state; - XXH32_reset(&state, seed); - XXH32_update(&state, (const xxh_u8*)input, len); - return XXH32_digest(&state); - -#else - - if (XXH_FORCE_ALIGN_CHECK) { - if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ - return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); - } } - - return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); -#endif -} - - - -/******* Hash streaming *******/ - -XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) -{ - return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); -} -XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* 
statePtr) -{ - XXH_free(statePtr); - return XXH_OK; -} - -XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) -{ - memcpy(dstState, srcState, sizeof(*dstState)); -} - -XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) -{ - XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ - memset(&state, 0, sizeof(state)); - state.v1 = seed + PRIME32_1 + PRIME32_2; - state.v2 = seed + PRIME32_2; - state.v3 = seed + 0; - state.v4 = seed - PRIME32_1; - /* do not write into reserved, planned to be removed in a future version */ - memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); - return XXH_OK; -} - - -XXH_PUBLIC_API XXH_errorcode -XXH32_update(XXH32_state_t* state, const void* input, size_t len) -{ - if (input==NULL) -#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) - return XXH_OK; -#else - return XXH_ERROR; -#endif - - { const xxh_u8* p = (const xxh_u8*)input; - const xxh_u8* const bEnd = p + len; - - state->total_len_32 += (XXH32_hash_t)len; - state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); - - if (state->memsize + len < 16) { /* fill in tmp buffer */ - XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); - state->memsize += (XXH32_hash_t)len; - return XXH_OK; - } - - if (state->memsize) { /* some data left from previous update */ - XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); - { const xxh_u32* p32 = state->mem32; - state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++; - state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++; - state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++; - state->v4 = XXH32_round(state->v4, XXH_readLE32(p32)); - } - p += 16-state->memsize; - state->memsize = 0; - } - - if (p <= bEnd-16) { - const xxh_u8* const limit = bEnd - 16; - xxh_u32 v1 = state->v1; - xxh_u32 v2 = state->v2; - xxh_u32 v3 = state->v3; - xxh_u32 v4 = state->v4; - - do { - v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4; - v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4; - v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4; - v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4; - } while (p<=limit); - - state->v1 = v1; - state->v2 = v2; - state->v3 = v3; - state->v4 = v4; - } - - if (p < bEnd) { - XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); - state->memsize = (unsigned)(bEnd-p); - } - } - - return XXH_OK; -} - - -XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* state) -{ - xxh_u32 h32; - - if (state->large_len) { - h32 = XXH_rotl32(state->v1, 1) - + XXH_rotl32(state->v2, 7) - + XXH_rotl32(state->v3, 12) - + XXH_rotl32(state->v4, 18); - } else { - h32 = state->v3 /* == seed */ + PRIME32_5; - } - - h32 += state->total_len_32; - - return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); -} - - -/******* Canonical representation *******/ - -/* - * The default return values from XXH functions are unsigned 32 and 64 bit - * integers. - * - * The canonical representation uses big endian convention, the same convention - * as human-readable numbers (large digits first). - * - * This way, hash values can be written into a file or buffer, remaining - * comparable across different systems. - * - * The following functions allow transformation of hash values to and from their - * canonical format. 
- */ -XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) -{ - XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); - if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); - memcpy(dst, &hash, sizeof(*dst)); -} - -XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) -{ - return XXH_readBE32(src); -} - - -#ifndef XXH_NO_LONG_LONG - -/* ******************************************************************* -* 64-bit hash functions -*********************************************************************/ - -/******* Memory access *******/ - -typedef XXH64_hash_t xxh_u64; - - -/*! - * XXH_REROLL_XXH64: - * Whether to reroll the XXH64_finalize() loop. - * - * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a - * performance gain on 64-bit hosts, as only one jump is required. - * - * However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit - * registers, and 64-bit arithmetic needs to be simulated, it isn't beneficial - * to unroll. The code becomes ridiculously large (the largest function in the - * binary on i386!), and rerolling it saves anywhere from 3kB to 20kB. It is - * also slightly faster because it fits into cache better and is more likely - * to be inlined by the compiler. - * - * If XXH_REROLL is defined, this is ignored and the loop is always rerolled. - */ -#ifndef XXH_REROLL_XXH64 -# if (defined(__ILP32__) || defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \ - || !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) /* x86-64 */ \ - || defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) /* aarch64 */ \ - || defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \ - || defined(__mips64__) || defined(__mips64)) /* mips64 */ \ - || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */ -# define XXH_REROLL_XXH64 1 -# else -# define XXH_REROLL_XXH64 0 -# endif -#endif /* !defined(XXH_REROLL_XXH64) */ - -#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) -/* - * Manual byteshift. Best for old compilers which don't inline memcpy. - * We actually directly use XXH_readLE64 and XXH_readBE64. - */ -#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) - -/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ -static xxh_u64 XXH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; } - -#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) - -/* - * __pack instructions are safer, but compiler specific, hence potentially - * problematic for some compilers. - * - * Currently only defined for GCC and ICC. - */ -typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; -static xxh_u64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } - -#else - -/* - * Portable and safe solution. Generally efficient. 
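
As a counterpart to writing the canonical form, stored big-endian bytes can be read back and compared against a freshly computed hash. The matches_stored_hash helper, the zero seed, and the file layout are made up for illustration:

/* Read back 4 canonical (big-endian) bytes and verify them. */
#include <stdio.h>
#include "xxhash.h"

static int matches_stored_hash(const void* data, size_t len, FILE* f)
{
    XXH32_canonical_t canon;
    if (fread(canon.digest, 1, sizeof canon.digest, f) != sizeof canon.digest)
        return 0;
    return XXH32_hashFromCanonical(&canon) == XXH32(data, len, 0);
}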
- * see: https://stackoverflow.com/a/32095106/646947 - */ -static xxh_u64 XXH_read64(const void* memPtr) -{ - xxh_u64 val; - memcpy(&val, memPtr, sizeof(val)); - return val; -} - -#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ - -#if defined(_MSC_VER) /* Visual Studio */ -# define XXH_swap64 _byteswap_uint64 -#elif XXH_GCC_VERSION >= 403 -# define XXH_swap64 __builtin_bswap64 -#else -static xxh_u64 XXH_swap64 (xxh_u64 x) -{ - return ((x << 56) & 0xff00000000000000ULL) | - ((x << 40) & 0x00ff000000000000ULL) | - ((x << 24) & 0x0000ff0000000000ULL) | - ((x << 8) & 0x000000ff00000000ULL) | - ((x >> 8) & 0x00000000ff000000ULL) | - ((x >> 24) & 0x0000000000ff0000ULL) | - ((x >> 40) & 0x000000000000ff00ULL) | - ((x >> 56) & 0x00000000000000ffULL); -} -#endif - - -/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ -#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) - -XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) -{ - const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; - return bytePtr[0] - | ((xxh_u64)bytePtr[1] << 8) - | ((xxh_u64)bytePtr[2] << 16) - | ((xxh_u64)bytePtr[3] << 24) - | ((xxh_u64)bytePtr[4] << 32) - | ((xxh_u64)bytePtr[5] << 40) - | ((xxh_u64)bytePtr[6] << 48) - | ((xxh_u64)bytePtr[7] << 56); -} - -XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) -{ - const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; - return bytePtr[7] - | ((xxh_u64)bytePtr[6] << 8) - | ((xxh_u64)bytePtr[5] << 16) - | ((xxh_u64)bytePtr[4] << 24) - | ((xxh_u64)bytePtr[3] << 32) - | ((xxh_u64)bytePtr[2] << 40) - | ((xxh_u64)bytePtr[1] << 48) - | ((xxh_u64)bytePtr[0] << 56); -} - -#else -XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) -{ - return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); -} - -static xxh_u64 XXH_readBE64(const void* ptr) -{ - return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); -} -#endif - -XXH_FORCE_INLINE xxh_u64 -XXH_readLE64_align(const void* ptr, XXH_alignment align) -{ - if (align==XXH_unaligned) - return XXH_readLE64(ptr); - else - return XXH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); -} - - -/******* xxh64 *******/ - -static const xxh_u64 PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ -static const xxh_u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ -static const xxh_u64 PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */ -static const xxh_u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ -static const xxh_u64 PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ - -static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) -{ - acc += input * PRIME64_2; - acc = XXH_rotl64(acc, 31); - acc *= PRIME64_1; - return acc; -} - -static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) -{ - val = XXH64_round(0, val); - acc ^= val; - acc = acc * PRIME64_1 + PRIME64_4; - return acc; -} - -static xxh_u64 XXH64_avalanche(xxh_u64 h64) -{ - h64 ^= h64 >> 33; - h64 *= PRIME64_2; - h64 ^= h64 >> 29; - h64 *= PRIME64_3; - h64 ^= h64 >> 32; - return h64; -} - - -#define XXH_get64bits(p) XXH_readLE64_align(p, align) - -static xxh_u64 -XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) -{ -#define PROCESS1_64 \ - h64 ^= (*ptr++) * PRIME64_5; \ - h64 = XXH_rotl64(h64, 11) * PRIME64_1; - -#define PROCESS4_64 \ - h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * PRIME64_1; \ - ptr+=4; \ - h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; - -#define PROCESS8_64 { \ - xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \ - ptr+=8; \ - h64 ^= k1; \ - h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \ -} - - /* Rerolled version for 32-bit targets is faster and much smaller. 
*/ - if (XXH_REROLL || XXH_REROLL_XXH64) { - len &= 31; - while (len >= 8) { - PROCESS8_64; - len -= 8; - } - if (len >= 4) { - PROCESS4_64; - len -= 4; - } - while (len > 0) { - PROCESS1_64; - --len; - } - return XXH64_avalanche(h64); - } else { - switch(len & 31) { - case 24: PROCESS8_64; - /* fallthrough */ - case 16: PROCESS8_64; - /* fallthrough */ - case 8: PROCESS8_64; - return XXH64_avalanche(h64); - - case 28: PROCESS8_64; - /* fallthrough */ - case 20: PROCESS8_64; - /* fallthrough */ - case 12: PROCESS8_64; - /* fallthrough */ - case 4: PROCESS4_64; - return XXH64_avalanche(h64); - - case 25: PROCESS8_64; - /* fallthrough */ - case 17: PROCESS8_64; - /* fallthrough */ - case 9: PROCESS8_64; - PROCESS1_64; - return XXH64_avalanche(h64); - - case 29: PROCESS8_64; - /* fallthrough */ - case 21: PROCESS8_64; - /* fallthrough */ - case 13: PROCESS8_64; - /* fallthrough */ - case 5: PROCESS4_64; - PROCESS1_64; - return XXH64_avalanche(h64); - - case 26: PROCESS8_64; - /* fallthrough */ - case 18: PROCESS8_64; - /* fallthrough */ - case 10: PROCESS8_64; - PROCESS1_64; - PROCESS1_64; - return XXH64_avalanche(h64); - - case 30: PROCESS8_64; - /* fallthrough */ - case 22: PROCESS8_64; - /* fallthrough */ - case 14: PROCESS8_64; - /* fallthrough */ - case 6: PROCESS4_64; - PROCESS1_64; - PROCESS1_64; - return XXH64_avalanche(h64); - - case 27: PROCESS8_64; - /* fallthrough */ - case 19: PROCESS8_64; - /* fallthrough */ - case 11: PROCESS8_64; - PROCESS1_64; - PROCESS1_64; - PROCESS1_64; - return XXH64_avalanche(h64); - - case 31: PROCESS8_64; - /* fallthrough */ - case 23: PROCESS8_64; - /* fallthrough */ - case 15: PROCESS8_64; - /* fallthrough */ - case 7: PROCESS4_64; - /* fallthrough */ - case 3: PROCESS1_64; - /* fallthrough */ - case 2: PROCESS1_64; - /* fallthrough */ - case 1: PROCESS1_64; - /* fallthrough */ - case 0: return XXH64_avalanche(h64); - } - } - /* impossible to reach */ - XXH_ASSERT(0); - return 0; /* unreachable, but some compilers complain without it */ -} - -XXH_FORCE_INLINE xxh_u64 -XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) -{ - const xxh_u8* bEnd = input + len; - xxh_u64 h64; - -#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) - if (input==NULL) { - len=0; - bEnd=input=(const xxh_u8*)(size_t)32; - } -#endif - - if (len>=32) { - const xxh_u8* const limit = bEnd - 32; - xxh_u64 v1 = seed + PRIME64_1 + PRIME64_2; - xxh_u64 v2 = seed + PRIME64_2; - xxh_u64 v3 = seed + 0; - xxh_u64 v4 = seed - PRIME64_1; - - do { - v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; - v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; - v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; - v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; - } while (input<=limit); - - h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); - h64 = XXH64_mergeRound(h64, v1); - h64 = XXH64_mergeRound(h64, v2); - h64 = XXH64_mergeRound(h64, v3); - h64 = XXH64_mergeRound(h64, v4); - - } else { - h64 = seed + PRIME64_5; - } - - h64 += (xxh_u64) len; - - return XXH64_finalize(h64, input, len, align); -} - - -XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed) -{ -#if 0 - /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ - XXH64_state_t state; - XXH64_reset(&state, seed); - XXH64_update(&state, (const xxh_u8*)input, len); - return XXH64_digest(&state); - -#else - - if (XXH_FORCE_ALIGN_CHECK) { - if ((((size_t)input) & 
7)==0) { /* Input is aligned, let's leverage the speed advantage */ - return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); - } } - - return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); - -#endif -} - -/******* Hash Streaming *******/ - -XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) -{ - return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); -} -XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) -{ - XXH_free(statePtr); - return XXH_OK; -} - -XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) -{ - memcpy(dstState, srcState, sizeof(*dstState)); -} - -XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed) -{ - XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */ - memset(&state, 0, sizeof(state)); - state.v1 = seed + PRIME64_1 + PRIME64_2; - state.v2 = seed + PRIME64_2; - state.v3 = seed + 0; - state.v4 = seed - PRIME64_1; - /* do not write into reserved64, might be removed in a future version */ - memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64)); - return XXH_OK; -} - -XXH_PUBLIC_API XXH_errorcode -XXH64_update (XXH64_state_t* state, const void* input, size_t len) -{ - if (input==NULL) -#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) - return XXH_OK; -#else - return XXH_ERROR; -#endif - - { const xxh_u8* p = (const xxh_u8*)input; - const xxh_u8* const bEnd = p + len; - - state->total_len += len; - - if (state->memsize + len < 32) { /* fill in tmp buffer */ - XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); - state->memsize += (xxh_u32)len; - return XXH_OK; - } - - if (state->memsize) { /* tmp buffer is full */ - XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); - state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0)); - state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1)); - state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2)); - state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3)); - p += 32-state->memsize; - state->memsize = 0; - } - - if (p+32 <= bEnd) { - const xxh_u8* const limit = bEnd - 32; - xxh_u64 v1 = state->v1; - xxh_u64 v2 = state->v2; - xxh_u64 v3 = state->v3; - xxh_u64 v4 = state->v4; - - do { - v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8; - v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8; - v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8; - v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8; - } while (p<=limit); - - state->v1 = v1; - state->v2 = v2; - state->v3 = v3; - state->v4 = v4; - } - - if (p < bEnd) { - XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); - state->memsize = (unsigned)(bEnd-p); - } - } - - return XXH_OK; -} - - -XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state) -{ - xxh_u64 h64; - - if (state->total_len >= 32) { - xxh_u64 const v1 = state->v1; - xxh_u64 const v2 = state->v2; - xxh_u64 const v3 = state->v3; - xxh_u64 const v4 = state->v4; - - h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); - h64 = XXH64_mergeRound(h64, v1); - h64 = XXH64_mergeRound(h64, v2); - h64 = XXH64_mergeRound(h64, v3); - h64 = XXH64_mergeRound(h64, v4); - } else { - h64 = state->v3 /*seed*/ + PRIME64_5; - } - - h64 += (xxh_u64) state->total_len; - - return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); -} - - -/******* Canonical representation *******/ - -XXH_PUBLIC_API 
void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) -{ - XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); - if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); - memcpy(dst, &hash, sizeof(*dst)); -} - -XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) -{ - return XXH_readBE64(src); -} - - - -/* ********************************************************************* -* XXH3 -* New generation hash designed for speed on small keys and vectorization -************************************************************************ */ - -#include "xxh3.h" - - -#endif /* XXH_NO_LONG_LONG */ - - -#endif /* XXH_IMPLEMENTATION */ - - -#if defined (__cplusplus) -} -#endif