Skip to content

Commit

Permalink
Update xxHash to v0.8.0
Browse files Browse the repository at this point in the history
Add support for xxHash's dispatch
Cleanup compile warnings
Improve usage message
Add submodules for xxHash and uthash
  • Loading branch information
mterron committed Jul 30, 2020
1 parent 59521c1 commit ad4c2e3
Show file tree
Hide file tree
Showing 11 changed files with 185 additions and 4,501 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/ccpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macOS-latest]

steps:
- uses: actions/checkout@v1
- uses: actions/checkout@v2
with:
submodules: true
- name: make
run: make
6 changes: 2 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
swuniq
*.o
sonar-project.properties
bin/*
bin
benchsuite/corpustest
out/*
out
6 changes: 6 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[submodule "uthash"]
path = uthash
url = https://github.com/troydhanson/uthash
[submodule "xxHash"]
path = xxHash
url = https://github.com/Cyan4973/xxHash
32 changes: 24 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,23 +1,39 @@
# ---------------------------------------------------------------------------
# Build configuration
# ---------------------------------------------------------------------------
SHELL = /bin/sh
TARGET = swuniq
# Translation units passed to the compiler. Headers are not listed here:
# xxHash and utringbuffer are pulled in by #include from swuniq.c.
SRCS = swuniq.c
DESTDIR =
prefix = /usr/local/bin
INSTALL = install
INSTALL_PROGRAM = $(INSTALL)

# Default optimisation flags. `?=` only applies when CFLAGS has not been set
# already (environment or command line), so nothing above may assign CFLAGS
# unconditionally or this default would never take effect.
CFLAGS ?= -O2 -pie
# Warning set appended to whatever CFLAGS ended up being.
DEBUGFLAGS += -Wall -Wextra -Wconversion -Wcast-qual -Wcast-align \
              -Wshadow -Wundef -Wstrict-overflow=5 -Wstrict-prototypes \
              -Wswitch-enum -Wredundant-decls -Wvla -Wnarrowing \
              -Wpointer-arith -Wformat-security -Wformat=2 \
              -Winit-self -Wfloat-equal -Wwrite-strings
CFLAGS += $(DEBUGFLAGS)

# Add support for xxHash dispatch: `make DISPATCH=1` defines XXHSUM_DISPATCH
# and adds the dispatcher source to the build.
# $(strip) tolerates stray whitespace around the value.
ifeq ($(strip $(DISPATCH)),1)
CFLAGS += -DXXHSUM_DISPATCH=1
SRCS += xxHash/xxh_x86dispatch.c
endif

# Build the regular binary as out/$(TARGET).
# Declared phony because the rule name is not the file it produces.
# Every entry of $(SRCS) is compiled in a single invocation so that sources
# added conditionally (e.g. the xxHash dispatcher) are part of the link.
.PHONY: swuniq
swuniq: $(SRCS)
	mkdir -p out
	$(CC) $(CFLAGS) $(SRCS) -o out/$(TARGET)

# Build a statically linked binary as out/$(TARGET)-static.
# Declared phony because no file named `static` is ever created.
# Uses $(SRCS) (not just $(TARGET).c) so conditionally added sources are linked.
.PHONY: static
static: $(SRCS)
	mkdir -p out
	$(CC) $(CFLAGS) -static $(SRCS) -o out/$(TARGET)-static

# Convenience aggregate: build both the regular and the static binary.
.PHONY: all
all: swuniq static

# ## dispatch only works for x86/x64 systems
# dispatch: CPPFLAGS += -DXXHSUM_DISPATCH=1
# dispatch: xxHash/xxhash.o xxHash/xxh_x86dispatch.o swuniq.c
# $(CC) $(CFLAGS) $^ $(LDFLAGS)
# xxh_x86dispatch.o: xxHash/xxh_x86dispatch.c xxHash/xxh_x86dispatch.h xxHash/xxhash.h

install: swuniq
mkdir -p $(DESTDIR)$(prefix)
Expand All @@ -43,7 +59,7 @@ install-strip-all:

.PHONY: check
check:
@if [ "$$({ seq 1 10; seq 1 10; } | out/swuniq -w 10 | wc -l)" -eq 10 ]; then \
@if [ "$$({ seq 1 10; seq 1 10; } | out/swuniq -w 10 | wc -l)" -eq 10 ]; then \
echo 'Test suite result [SUCCESS]' \
exit 0 ;\
else \
Expand Down
235 changes: 147 additions & 88 deletions swuniq.c
Original file line number Diff line number Diff line change
@@ -1,138 +1,197 @@
/*
* swuniq - sliding window uniq
*
* MIT License
*
* Copyright (c) 2018 Miguel Terron
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
* swuniq - sliding window uniq
*
* MIT License
*
* Copyright (c) 2018 Miguel Terron
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
*/

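/* swuniq :
 * Reads lines from INPUT (or stdin) and writes each line to stdout unless its
 * hash matches one of the most recently written lines. The number of
 * remembered lines is the sliding window size (10 by default, set with -w N).
 */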


/* ************************************
* Includes
**************************************/
#include <stdlib.h>
#include <ctype.h>
#include <err.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <err.h>

#include <unistd.h>
#include <inttypes.h>
#include <ctype.h>

#define XXH_PRIVATE_API
#define XXH_STATIC_LINKING_ONLY
#define XXH_INLINE_ALL
#include "xxhash.h"

#include "utringbuffer.h"
#include "xxHash/xxhash.h"

#ifdef XXHSUM_DISPATCH
# include "xxHash/xxh_x86dispatch.h"
#endif

#include "uthash/src/utringbuffer.h"

/*
 * Compile-time description of the target architecture.
 *
 * Defines ARCH as a string literal naming the architecture (and, where it can
 * be detected from predefined macros, the vector extension in use), and
 * g_nbBits as the pointer width in bits.
 */

/* Stringify helpers: EXPAND_AND_QUOTE macro-expands its argument before
 * quoting it. Needed by the 32-bit ARM branch below to embed __ARM_ARCH. */
#ifndef EXPAND_AND_QUOTE
#  define QUOTE(str) #str
#  define EXPAND_AND_QUOTE(str) QUOTE(str)
#endif

/* makes the next part easier */
#if defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
#  define ARCH_X64 1
#  define ARCH_X86 "x86_64"
#elif defined(__i386__) || defined(_M_IX86) || defined(_M_IX86_FP)
#  define ARCH_X86 "i386"
#endif

/* Try to detect the architecture. */
#if defined(ARCH_X86)
#  if defined(XXHSUM_DISPATCH)
#    define ARCH ARCH_X86 " autoVec"
#  elif defined(__AVX512F__)
#    define ARCH ARCH_X86 " + AVX512"
#  elif defined(__AVX2__)
#    define ARCH ARCH_X86 " + AVX2"
#  elif defined(__AVX__)
#    define ARCH ARCH_X86 " + AVX"
#  elif defined(_M_X64) || defined(_M_AMD64) || defined(__x86_64__) \
      || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP == 2)
#    define ARCH ARCH_X86 " + SSE2"
#  else
#    define ARCH ARCH_X86
#  endif
#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
#  define ARCH "aarch64 + NEON"
#elif defined(__arm__) || defined(__thumb__) || defined(__thumb2__) || defined(_M_ARM)
/* ARM has a lot of different features that can change xxHash significantly. */
/* __ARM_ARCH is only compared when defined, so -Wundef stays quiet. */
#  if defined(__thumb2__) \
      || (defined(__thumb__) && (__thumb__ == 2 || (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
#    define ARCH_THUMB " Thumb-2"
#  elif defined(__thumb__)
#    define ARCH_THUMB " Thumb-1"
#  else
#    define ARCH_THUMB ""
#  endif
/* ARMv7 has unaligned by default */
#  if defined(__ARM_FEATURE_UNALIGNED) || (defined(__ARM_ARCH) && __ARM_ARCH >= 7) \
      || defined(_M_ARMV7VE)
#    define ARCH_UNALIGNED " + unaligned"
#  else
#    define ARCH_UNALIGNED ""
#  endif
#  if defined(__ARM_NEON) || defined(__ARM_NEON__)
#    define ARCH_NEON " + NEON"
#  else
#    define ARCH_NEON ""
#  endif
#  define ARCH "ARMv" EXPAND_AND_QUOTE(__ARM_ARCH) ARCH_THUMB ARCH_NEON ARCH_UNALIGNED
#elif defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__)
#  if defined(__GNUC__) && defined(__POWER9_VECTOR__)
#    define ARCH "ppc64 + POWER9 vector"
#  elif defined(__GNUC__) && defined(__POWER8_VECTOR__)
#    define ARCH "ppc64 + POWER8 vector"
#  else
#    define ARCH "ppc64"
#  endif
#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
#  define ARCH "ppc"
#elif defined(__AVR)
#  define ARCH "AVR"
#elif defined(__mips64)
#  define ARCH "mips64"
#elif defined(__mips)
#  define ARCH "mips"
#elif defined(__s390x__)
#  define ARCH "s390x"
#elif defined(__s390__)
#  define ARCH "s390"
#else
#  define ARCH "unknown"
#endif

/* Width of a pointer on the target, in bits. */
static const int g_nbBits = (int)(sizeof(void*)*8);

/********************************************************************************************************************/

/*
 * Hash one line of text with the unseeded 64-bit XXH3 function.
 *
 * data: NUL-terminated string. Only the bytes before the first NUL are
 *       hashed, because the length is taken with strlen().
 * Returns the 64-bit digest as an unsigned long long.
 */
unsigned long long hashString(const char *data) {
  return XXH3_64bits(data, strlen(data));
}

/*
 * Reports whether a hash is currently stored in the ring buffer.
 *
 * hash:    digest to search for.
 * rbuffer: ring buffer whose elements are read as unsigned long long values.
 * Returns true if any stored element equals `hash`, false otherwise
 * (including when the buffer is empty, since the loop body never runs).
 */
bool lookup(unsigned long long hash, const UT_ringbuffer *rbuffer) {
  for (unsigned int i = 0; i < utringbuffer_len(rbuffer); i++) {
    const unsigned long long *item = utringbuffer_eltptr(rbuffer, i);
    if (*item == hash)
      return true;
  }
  return false;
}

/**********************************************************
* Main
**********************************************************/
int main (int argc, char *argv[]){
int wsize = 10; // Default window size
* Main
**********************************************************/
int main(int argc, char *argv[]) {
unsigned int wsize = 10; // Default window size
int c;

while ((c = getopt (argc, argv, "hw:")) != -1)
{
switch (c)
{
case 'w':
wsize = strtoumax(optarg, NULL, 10);
break;
case 'h':
default:
fprintf(stderr,"Usage: swuniq [-w N] INPUT\nFilter matching lines (within a configurable window) from INPUT\n(or stdin), writing to stdout.\n\n\t-w N Size of the sliding window to use for deduplication\nNote: By default swuniq will use a window of 10 lines.\n\n");
exit(1);
while ((c = getopt(argc, argv, "hw:")) != -1) {
switch (c) {
case 'w':
wsize = strtoumax(optarg, NULL, 10);
break;
case 'h':
default:
#define HELP_MESSAGE "swuniq 0.6 by Miguel Terron compiled as %i-bit %s\nFilter matching lines (within a configurable window) from INPUT\n(or stdin), writing to stdout.\n\nUsage: swuniq [-w N] INPUT\n\t-w N Size of the sliding window to use for deduplication\nNote: By default swuniq will use a window of 10 lines.\n", g_nbBits, ARCH
fprintf(stderr, HELP_MESSAGE);
exit(1);
}
}

// Open file if filename is provided
if(optind < argc) {
if ( freopen(argv[optind], "r", stdin) == NULL)
{
fprintf(stderr,"Can't open file %s",argv[optind]);
if (optind < argc) {
if (freopen(argv[optind], "r", stdin) == NULL) {
fprintf(stderr, "Can't open file %s", argv[optind]);
exit(1);
}
}

char *buffer;
size_t bufsize = 6000;

UT_ringbuffer *history;
UT_icd ut_long_long_icd = {sizeof(long long), NULL, NULL, NULL };
UT_icd ut_long_long_icd = {sizeof(long long), NULL, NULL, NULL};
utringbuffer_new(history, wsize, &ut_long_long_icd);
unsigned long long digest;

buffer = (char *)malloc(bufsize * sizeof(char));
if( buffer == NULL )
{
perror("Unable to allocate buffer");
exit(1);
}

while( -1 != getline(&buffer, &bufsize, stdin) )
{
digest = hashString(buffer, strlen(buffer));
if (!lookup(digest,history))
{
char* line;
size_t bufsize = sysconf(_SC_PAGESIZE);
unsigned long long digest;
while (-1 != getline(&line, &bufsize, stdin)) {
digest = hashString(line);
if (!lookup(digest, history)) {
utringbuffer_push_back(history, &digest);
printf("%s",buffer);
printf("%s", line);
fflush(stdout);
}
}

fclose(stdin);
// utringbuffer_free(history);
free(line);
// utringbuffer_free(history);
exit(0);
}
Loading

0 comments on commit ad4c2e3

Please sign in to comment.