Merge pull request #58 from ashvardanian/main-dev

AVX-512, Bindings for C++ & Rust & Swift, Levenshtein distances, Needleman-Wunsch scores, and Fingerprinting
ashvardanian · Feb 6, 2024 · 9fde435 · 9fde435
2 parents d728848 + aa7bbc2
commit 9fde435
Show file tree

Hide file tree

Showing 56 changed files with 18,003 additions and 2,500 deletions.
diff --git a/.clang-format b/.clang-format
@@ -1,8 +1,8 @@
 Language: Cpp
-BasedOnStyle:  LLVM
+BasedOnStyle: LLVM
 IndentWidth: 4
 TabWidth: 4
-NamespaceIndentation: All
+NamespaceIndentation: None
 ColumnLimit: 120
 ReflowComments: true
 UseTab: Never
@@ -44,9 +44,8 @@ BraceWrapping:
   SplitEmptyNamespace: false
   IndentBraces: false
 
-
 SortIncludes: true
-SortUsingDeclarations: true 
+SortUsingDeclarations: true
 
 SpaceAfterCStyleCast: false
 SpaceAfterLogicalNot: false
@@ -65,5 +64,7 @@ SpacesInContainerLiterals: false
 SpacesInParentheses: false
 SpacesInSquareBrackets: false
 
-BinPackArguments: false
-BinPackParameters: false
+BinPackArguments: true
+BinPackParameters: true
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakArgument: 1
diff --git a/.github/workflows/build_tools.sh b/.github/workflows/build_tools.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# Assign arguments to variables
+BUILD_TYPE=$1    # Debug or Release
+COMPILER=$2      # GCC, LLVM, or MSVC
+
+# Set common flags
+COMMON_FLAGS="-DSTRINGZILLA_BUILD_TEST=1 -DSTRINGZILLA_BUILD_BENCHMARK=1 -DSTRINGZILLA_BUILD_SHARED=0"
+
+# Compiler specific settings
+case "$COMPILER" in
+    "GCC")
+        COMPILER_FLAGS="-DCMAKE_CXX_COMPILER=gcc-12 -DCMAKE_CXX_COMPILER=g++-12"
+        ;;
+    "LLVM")
+        COMPILER_FLAGS="-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++"
+        ;;
+    "MSVC")
+        COMPILER_FLAGS="" 
+        ;;
+    *)
+        echo "Unknown compiler: $COMPILER"
+        exit 1
+        ;;
+esac
+
+# Set build type
+case "$BUILD_TYPE" in
+    "Debug")
+        BUILD_DIR="./build_debug"
+        BUILD_FLAGS="-DCMAKE_BUILD_TYPE=Debug"
+        ;;
+    "Release")
+        BUILD_DIR="./build_release"
+        BUILD_FLAGS="-DCMAKE_BUILD_TYPE=RelWithDebInfo"
+        ;;
+    *)
+        echo "Unknown build type: $BUILD_TYPE"
+        exit 1
+        ;;
+esac
+
+# Execute commands
+cmake $COMMON_FLAGS $COMPILER_FLAGS $BUILD_FLAGS -B $BUILD_DIR && cmake --build $BUILD_DIR --config $BUILD_TYPE
diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml
@@ -9,83 +9,256 @@ on:
 env:
   BUILD_TYPE: Release
   GH_TOKEN: ${{ secrets.SEMANTIC_RELEASE_TOKEN }}
+  PYTHON_VERSION: "3.11"  # quoted: a bare 3.10 would be read as the number 3.1
+  SWIFT_VERSION: "5.9"  # quoted for the same reason
   PYTHONUTF8: 1
 
 # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
 permissions:
   contents: read
 
 jobs:
-
-  test_python_311:
-    name: Test Python
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ubuntu-20.04, macOS-11, windows-2022]
-        python-version: ["3.11"]
+  test_ubuntu_gcc:
+    name: Ubuntu (GCC 12)
+    runs-on: ubuntu-22.04
+    env:
+      CC: gcc-12
+      CXX: g++-12
 
     steps:
-      - uses: actions/checkout@v3
-      - run: git submodule update --init --recursive
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
+      - uses: actions/checkout@v4
+
+        # C/C++
+        # If the compilation fails, we want to log the compilation commands in addition to
+        # the standard output.
+      - name: Build C/C++
+        run: |
+          sudo apt update
+          sudo apt install -y cmake build-essential libjemalloc-dev libomp-dev gcc-12 g++-12
+
+          cmake -B build_artifacts \
+            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+            -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
+            -DSTRINGZILLA_BUILD_BENCHMARK=1 \
+            -DSTRINGZILLA_BUILD_TEST=1
+
+          cmake --build build_artifacts --config RelWithDebInfo > build_artifacts/logs.txt 2>&1 || {
+            echo "Compilation failed. Here are the logs:"
+            cat build_artifacts/logs.txt
+            echo "The original compilation commands:"
+            cat build_artifacts/compile_commands.json
+            echo "CPU Features:"
+            lscpu
+            echo "GCC Version:"
+            gcc-12 --version
+            echo "G++ Version:"
+            g++-12 --version
+            exit 1
+          }
+      - name: Test C++
+        run: ./build_artifacts/stringzilla_test_cpp20
+      - name: Test on Real World Data
+        # Disabled: don't overload GitHub with our benchmarks.
+        # The results in such an unstable environment will be meaningless anyway.
+        if: false
+        env:
+          DATASET_PATH: ./README.md
+        run: |
+          ./build_artifacts/stringzilla_bench_search ${DATASET_PATH}     # for substring search
+          ./build_artifacts/stringzilla_bench_token ${DATASET_PATH}      # for hashing, equality comparisons, etc.
+          ./build_artifacts/stringzilla_bench_similarity ${DATASET_PATH} # for edit distances and alignment scores
+          ./build_artifacts/stringzilla_bench_sort ${DATASET_PATH}       # for sorting arrays of strings
+          ./build_artifacts/stringzilla_bench_container ${DATASET_PATH}  # for STL containers with string keys
+
+        # Python
+      - name: Set up Python ${{ env.PYTHON_VERSION }}
+        uses: actions/setup-python@v5
         with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
+          python-version: ${{ env.PYTHON_VERSION }}
+      - name: Build Python
         run: |
-          python -m pip install --no-cache-dir --upgrade pip numpy
-          pip install --no-cache-dir pytest
-      - name: Build locally
-        run: python -m pip install .
-      - name: Test with PyTest
-        run: pytest scripts/
-
-
-  test_python_37:
-    name: Test Python 3.7
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ubuntu-20.04]
-        python-version: ["3.7"]
+          python -m pip install --upgrade pip
+          pip install pytest pytest-repeat numpy
+          python -m pip install .
+      - name: Test Python
+        run: pytest scripts/test.py -s -x
+
+        # JavaScript
+      # - name: Set up Node.js
+      #   uses: actions/setup-node
+      #   with:
+      #     node-version: 18
+      # - name: Build and test JavaScript
+      #   run: npm ci && npm test
+
+        # Rust
+      - name: Set up Rust
+        uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          override: true
+        # The toolchain action only installs Rust; the tests need their own step.
+        # NOTE(review): assumes Cargo.toml sits at the repository root - confirm.
+      - name: Test Rust
+        run: cargo test
+
+  test_ubuntu_clang:
+    name: Ubuntu (Clang 16)
+    runs-on: ubuntu-22.04
+    env:
+      CC: clang-16
+      CXX: clang++-16
 
     steps:
-      - uses: actions/checkout@v3
-      - run: git submodule update --init --recursive
-
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
+      - uses: actions/checkout@v4
         with:
-          python-version: ${{ matrix.python-version }}
+          ref: main-dev
+      - run: git submodule update --init --recursive
 
-      - name: Install dependencies
+        # C/C++
+        # Clang 16 isn't available from default repos on Ubuntu 22.04, so we have to install it manually
+      - name: Build C/C++
         run: |
-          python -m pip install --no-cache-dir --upgrade pip numpy
-          pip install --no-cache-dir pytest
+          sudo apt update
+          sudo apt install -y cmake build-essential libjemalloc-dev
+          wget https://apt.llvm.org/llvm.sh
+          chmod +x llvm.sh
+          sudo ./llvm.sh 16
 
-      - name: Build locally
-        run: python -m pip install .
+          cmake -B build_artifacts \
+            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+            -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
+            -DSTRINGZILLA_BUILD_BENCHMARK=1 \
+            -DSTRINGZILLA_BUILD_TEST=1
 
-      - name: Test with PyTest
-        run: pytest scripts/
+          cmake --build build_artifacts --config RelWithDebInfo > build_artifacts/logs.txt 2>&1 || {
+            echo "Compilation failed. Here are the logs:"
+            cat build_artifacts/logs.txt
+            echo "The original compilation commands:"
+            cat build_artifacts/compile_commands.json
+            echo "CPU Features:"
+            lscpu
+            echo "Clang Version:"
+            clang-16 --version
+            echo "Clang++ Version:"
+            clang++-16 --version
+            exit 1
+          }
+      - name: Test C++
+        run: ./build_artifacts/stringzilla_test_cpp20
+      - name: Test on Real World Data
+        # Disabled: don't overload GitHub with our benchmarks.
+        # The results in such an unstable environment will be meaningless anyway.
+        if: false
+        env:
+          DATASET_PATH: ./README.md
+        run: |
+          ./build_artifacts/stringzilla_bench_search ${DATASET_PATH}     # for substring search
+          ./build_artifacts/stringzilla_bench_token ${DATASET_PATH}      # for hashing, equality comparisons, etc.
+          ./build_artifacts/stringzilla_bench_similarity ${DATASET_PATH} # for edit distances and alignment scores
+          ./build_artifacts/stringzilla_bench_sort ${DATASET_PATH}       # for sorting arrays of strings
+          ./build_artifacts/stringzilla_bench_container ${DATASET_PATH}  # for STL containers with string keys
 
-  test_javascript:
-    name: Test JavaScript
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        node-version: [18.x]
+        # Python
+      - name: Set up Python ${{ env.PYTHON_VERSION }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+      - name: Build Python
+        run: |
+          python -m pip install --upgrade pip
+          pip install pytest pytest-repeat numpy
+          python -m pip install .
+      - name: Test Python
+        run: pytest scripts/test.py -s -x
+
+        # Rust
+      - name: Set up Rust
+        uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          override: true
+        # The toolchain action only installs Rust; the tests need their own step.
+        # NOTE(review): assumes Cargo.toml sits at the repository root - confirm.
+      - name: Test Rust
+        run: cargo test
+
+        # Swift
+        # Fails due to: https://github.com/swift-actions/setup-swift/issues/591
+      # - name: Set up Swift ${{ env.SWIFT_VERSION }}
+      #   uses: swift-actions/setup-swift@v1
+      #   with:
+      #     swift-version: ${{ env.SWIFT_VERSION }}
+      # - name: Build Swift
+      #   run: swift build -c release --static-swift-stdlib
+      # - name: Test Swift
+      #   run: swift test -c release --enable-test-discovery
+
+  # Temporary workaround to run Swift tests on Linux
+  # Based on: https://github.com/swift-actions/setup-swift/issues/591#issuecomment-1685710678
+  test_ubuntu_swift:
+    name: Ubuntu (Swift)
+    runs-on: ubuntu-22.04
+    container: swift:5.9  # image tag is pinned by hand; keep it in sync with SWIFT_VERSION in the top-level env
     steps:
-
       - uses: actions/checkout@v4
-      - name: Set up Node.js
-        uses: actions/setup-node@v3
+      - name: Test Swift
+        run: swift test
+
+  test_macos:
+    name: MacOS
+    runs-on: macos-12
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: main-dev
+      - run: git submodule update --init --recursive
+
+        # C/C++
+      - name: Build C/C++
+        run: |
+          brew update
+          brew install cmake
+          cmake -B build_artifacts \
+            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+            -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
+            -DSTRINGZILLA_BUILD_BENCHMARK=1 \
+            -DSTRINGZILLA_BUILD_TEST=1
+          cmake --build build_artifacts --config RelWithDebInfo
+      - name: Test C++
+        run: ./build_artifacts/stringzilla_test_cpp17
+      - name: Test on Real World Data
+        # Disabled: don't overload GitHub with our benchmarks.
+        # The results in such an unstable environment will be meaningless anyway.
+        if: false
+        env:
+          DATASET_PATH: ./README.md
+        run: |
+          ./build_artifacts/stringzilla_bench_search ${DATASET_PATH}     # for substring search
+          ./build_artifacts/stringzilla_bench_token ${DATASET_PATH}      # for hashing, equality comparisons, etc.
+          ./build_artifacts/stringzilla_bench_similarity ${DATASET_PATH} # for edit distances and alignment scores
+          ./build_artifacts/stringzilla_bench_sort ${DATASET_PATH}       # for sorting arrays of strings
+          ./build_artifacts/stringzilla_bench_container ${DATASET_PATH}  # for STL containers with string keys
+
+        # Python
+      - name: Set up Python ${{ env.PYTHON_VERSION }}
+        uses: actions/setup-python@v5
         with:
-          node-version: '18.x'
-
-      - name: Build locally
-        run: npm i
+          python-version: ${{ env.PYTHON_VERSION }}
+      - name: Build Python
+        run: |
+          python -m pip install --upgrade pip
+          pip install pytest pytest-repeat numpy
+          python -m pip install .
+      - name: Test Python
+        run: pytest scripts/test.py -s -x
+
+        # Swift
+      - name: Set up Swift ${{ env.SWIFT_VERSION }}
+        uses: swift-actions/setup-swift@v1
+        with:
+          swift-version: ${{ env.SWIFT_VERSION }}
+      - name: Build Swift
+        run: swift build
+      - name: Test Swift
+        run: swift test
 
-      - name: Test
-        run: npm test
+        # Rust
+      - name: Test Rust
+        uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          override: true