Skip to content

Commit 7b3e20d

Browse files
authored
Merge: Tokenizing UTF-8 with SIMD ㊗️
2 parents 645407b + da5687d commit 7b3e20d

31 files changed

+7289
-486
lines changed

.github/workflows/prerelease.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -802,7 +802,7 @@ jobs:
802802
strategy:
803803
fail-fast: false
804804
matrix:
805-
os: [ubuntu-24.04, macos-13, windows-2022]
805+
os: [ubuntu-24.04, macos-15, windows-2022]
806806
python-version: ["38", "39", "310", "311", "312", "313"]
807807
steps:
808808
- uses: actions/checkout@v5

.github/workflows/release.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ jobs:
110110

111111
build_wheels_stringzilla_macos:
112112
name: Build StringZilla ${{ matrix.python-version }} for macOS
113-
runs-on: macos-13
113+
runs-on: macos-15
114114
needs: versioning
115115
strategy:
116116
fail-fast: false
@@ -137,7 +137,7 @@ jobs:
137137
- name: Upload wheels
138138
uses: actions/upload-artifact@v4
139139
with:
140-
name: cibw-wheels-stringzilla-macos-13-${{ strategy.job-index }}
140+
name: cibw-wheels-stringzilla-macos-15-${{ strategy.job-index }}
141141
path: ./wheelhouse/*.whl
142142
overwrite: true
143143

@@ -211,7 +211,7 @@ jobs:
211211

212212
build_wheels_stringzillas_cpus_macos:
213213
name: Build StringZillas-CPUs ${{ matrix.python-version }} for macOS
214-
runs-on: macos-13
214+
runs-on: macos-15
215215
needs: versioning
216216
strategy:
217217
fail-fast: false
@@ -239,7 +239,7 @@ jobs:
239239
- name: Upload wheels
240240
uses: actions/upload-artifact@v4
241241
with:
242-
name: cibw-wheels-stringzillas-cpus-macos-13-${{ strategy.job-index }}
242+
name: cibw-wheels-stringzillas-cpus-macos-15-${{ strategy.job-index }}
243243
path: ./wheelhouse/*.whl
244244
overwrite: true
245245

.vscode/build_current.cmake

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
# build_current.cmake
# Cross-platform script to build the current file's target
#
# Usage: cmake -DFILE=<file> -DBUILD_TYPE=<Debug|Release> -P build_current.cmake
#
# Inputs (passed with -D before -P):
#   FILE        Required. Path of the source file whose target should be built.
#               Only its basename (without extension) is used.
#   BUILD_TYPE  Optional. Configuration to build; defaults to "Debug".
#               Also selects the build tree: <repo>/build_<lowercased BUILD_TYPE>.
#
# The script does not configure anything: the build tree must already exist.
# It stops with FATAL_ERROR when FILE is missing, when the file name does not
# map to a known target, when the build tree is absent, or when the build fails.

cmake_minimum_required(VERSION 3.14)

# An empty value (e.g. an editor variable that expanded to nothing) is as
# unusable as a missing one, so both are rejected with the same message.
if (NOT DEFINED FILE OR "${FILE}" STREQUAL "")
    message(FATAL_ERROR "FILE not specified. Usage: cmake -DFILE=path/to/file.cpp -P build_current.cmake")
endif ()

if (NOT DEFINED BUILD_TYPE OR "${BUILD_TYPE}" STREQUAL "")
    set(BUILD_TYPE "Debug")
    message(STATUS "BUILD_TYPE not specified, defaulting to Debug")
endif ()

# Extract basename without extension
get_filename_component(BASENAME "${FILE}" NAME_WE)

# Map filename patterns to CMake targets
if ("${BASENAME}" MATCHES "^bench_(.+)$")
    # Benchmark files: bench_find.cpp -> stringzilla_bench_find_cpp20
    set(TARGET "stringzilla_${BASENAME}_cpp20")
else ()
    # Only patterns that are actually mapped above are advertised here.
    # NOTE(review): test_stringzilla.cpp / test_stringzillas.cpp are not mapped;
    # add branches for them once their target names are confirmed in CMakeLists.txt.
    message(FATAL_ERROR "Unknown file pattern: ${BASENAME}\nSupported patterns:\n - bench_*.cpp")
endif ()

# Determine the repository root (parent of this script's directory) and the
# build directory, normalized so messages and hints do not contain "..".
get_filename_component(REPO_ROOT "${CMAKE_CURRENT_LIST_DIR}/.." ABSOLUTE)
string(TOLOWER "${BUILD_TYPE}" build_type_lower)
set(BUILD_DIR "${REPO_ROOT}/build_${build_type_lower}")

# Verify build directory exists
if (NOT EXISTS "${BUILD_DIR}")
    message(FATAL_ERROR "Build directory not found: ${BUILD_DIR}\nRun: cmake -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -B ${BUILD_DIR}")
endif ()

message(STATUS "Building target: ${TARGET}")
message(STATUS "Build directory: ${BUILD_DIR}")
message(STATUS "Build type: ${BUILD_TYPE}")

# Execute the build with the same CMake executable that runs this script,
# rather than whichever `cmake` happens to be first on PATH. Every argument is
# quoted so that a value containing `;` is not split into several arguments.
execute_process(
    COMMAND "${CMAKE_COMMAND}" --build "${BUILD_DIR}" --config "${BUILD_TYPE}" --target "${TARGET}"
    RESULT_VARIABLE result
    WORKING_DIRECTORY "${REPO_ROOT}"
)

# `result` is the exit code, or an error string if the process could not be
# started; anything other than 0 is reported as a failure.
if (result EQUAL 0)
    message(STATUS "Build succeeded: ${TARGET}")
else ()
    message(FATAL_ERROR "Build failed with exit code: ${result}")
endif ()

.vscode/launch.json

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -106,13 +106,17 @@
106106
"name": "ASAN_OPTIONS",
107107
"value": "detect_leaks=0:atexit=1:strict_init_order=1:strict_string_checks=1"
108108
},
109+
{
110+
"name": "STRINGWARS_TOKENS",
111+
"value": "file"
112+
},
109113
{
110114
"name": "STRINGWARS_DATASET",
111115
"value": "utf8.txt"
112116
}
113117
],
114118
"stopAtEntry": false,
115-
"preLaunchTask": "Build Benchmarks: Debug",
119+
"preLaunchTask": "Build Current File: Debug",
116120
"linux": {
117121
"MIMode": "gdb",
118122
"setupCommands": [
@@ -149,7 +153,7 @@
149153
}
150154
],
151155
"stopAtEntry": false,
152-
"preLaunchTask": "Build Benchmarks: Debug",
156+
"preLaunchTask": "Build Current File: Debug",
153157
"linux": {
154158
"MIMode": "gdb",
155159
"setupCommands": [
@@ -182,7 +186,7 @@
182186
}
183187
],
184188
"stopAtEntry": false,
185-
"preLaunchTask": "Build Benchmarks: Debug"
189+
"preLaunchTask": "Build Current File: Debug"
186190
},
187191
{
188192
"name": "Current Python File",

.vscode/settings.json

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,22 @@
263263
"editor.insertSpaces": true,
264264
"editor.rulers": [120],
265265
"editor.tabSize": 4,
266+
// Rust analyzer memory optimizations
267+
"rust-analyzer.checkOnSave.enable": true,
268+
"rust-analyzer.checkOnSave.allTargets": false,
269+
"rust-analyzer.cargo.allFeatures": false,
270+
"rust-analyzer.cargo.buildScripts.enable": false,
271+
"rust-analyzer.procMacro.enable": false,
272+
"rust-analyzer.diagnostics.enable": true,
273+
"rust-analyzer.diagnostics.experimental.enable": false,
274+
"rust-analyzer.completion.autoimport.enable": false,
275+
"rust-analyzer.lens.enable": false,
276+
"rust-analyzer.inlayHints.enable": false,
277+
"rust-analyzer.hover.actions.enable": false,
278+
"rust-analyzer.assist.emitMustUse": false,
279+
"rust-analyzer.cachePriming.enable": false,
280+
"rust-analyzer.files.watcher": "client",
281+
"rust-analyzer.updates.channel": "stable",
266282
"files.associations": {
267283
"__availability": "cpp",
268284
"__bit_reference": "cpp",

.vscode/tasks.json

Lines changed: 55 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,12 @@
77
"args": [],
88
"type": "shell",
99
"osx": {
10-
"environment": [
11-
{
12-
"name": "CMAKE_CXX_COMPILER",
13-
"value": "$(brew --prefix llvm)/bin/clang++"
14-
},
15-
{
16-
"name": "CMAKE_C_COMPILER",
17-
"value": "$(brew --prefix llvm)/bin/clang"
10+
"options": {
11+
"env": {
12+
"CMAKE_CXX_COMPILER": "$(brew --prefix llvm)/bin/clang++",
13+
"CMAKE_C_COMPILER": "$(brew --prefix llvm)/bin/clang"
1814
}
19-
]
15+
}
2016
}
2117
},
2218
{
@@ -25,16 +21,12 @@
2521
"args": [],
2622
"type": "shell",
2723
"osx": {
28-
"environment": [
29-
{
30-
"name": "CMAKE_CXX_COMPILER",
31-
"value": "$(brew --prefix llvm)/bin/clang++"
32-
},
33-
{
34-
"name": "CMAKE_C_COMPILER",
35-
"value": "$(brew --prefix llvm)/bin/clang"
24+
"options": {
25+
"env": {
26+
"CMAKE_CXX_COMPILER": "$(brew --prefix llvm)/bin/clang++",
27+
"CMAKE_C_COMPILER": "$(brew --prefix llvm)/bin/clang"
3628
}
37-
]
29+
}
3830
}
3931
},
4032
{
@@ -49,16 +41,12 @@
4941
"args": [],
5042
"type": "shell",
5143
"osx": {
52-
"environment": [
53-
{
54-
"name": "CMAKE_CXX_COMPILER",
55-
"value": "$(brew --prefix llvm)/bin/clang++"
56-
},
57-
{
58-
"name": "CMAKE_C_COMPILER",
59-
"value": "$(brew --prefix llvm)/bin/clang"
44+
"options": {
45+
"env": {
46+
"CMAKE_CXX_COMPILER": "$(brew --prefix llvm)/bin/clang++",
47+
"CMAKE_C_COMPILER": "$(brew --prefix llvm)/bin/clang"
6048
}
61-
]
49+
}
6250
}
6351
},
6452
{
@@ -67,16 +55,47 @@
6755
"args": [],
6856
"type": "shell",
6957
"osx": {
70-
"environment": [
71-
{
72-
"name": "CMAKE_CXX_COMPILER",
73-
"value": "$(brew --prefix llvm)/bin/clang++"
74-
},
75-
{
76-
"name": "CMAKE_C_COMPILER",
77-
"value": "$(brew --prefix llvm)/bin/clang"
58+
"options": {
59+
"env": {
60+
"CMAKE_CXX_COMPILER": "$(brew --prefix llvm)/bin/clang++",
61+
"CMAKE_C_COMPILER": "$(brew --prefix llvm)/bin/clang"
7862
}
79-
]
63+
}
64+
}
65+
},
66+
{
67+
"label": "Build Current File: Debug",
68+
"command": "cmake",
69+
"args": [
70+
"-DFILE=${file}",
71+
"-DBUILD_TYPE=Debug",
72+
"-P",
73+
"${workspaceFolder}/.vscode/build_current.cmake"
74+
],
75+
"type": "shell",
76+
"group": {
77+
"kind": "build",
78+
"isDefault": true
79+
},
80+
"presentation": {
81+
"reveal": "always",
82+
"panel": "shared"
83+
}
84+
},
85+
{
86+
"label": "Build Current File: Release",
87+
"command": "cmake",
88+
"args": [
89+
"-DFILE=${file}",
90+
"-DBUILD_TYPE=Release",
91+
"-P",
92+
"${workspaceFolder}/.vscode/build_current.cmake"
93+
],
94+
"type": "shell",
95+
"group": "build",
96+
"presentation": {
97+
"reveal": "always",
98+
"panel": "shared"
8099
}
81100
}
82101
]

CMakeLists.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -702,7 +702,10 @@ function (define_stringzillas_shared target source_file backend_flags)
702702
target_include_directories(${target} PRIVATE fork_union/include)
703703
target_compile_definitions(${target} PRIVATE "SZ_DYNAMIC_DISPATCH=1")
704704
target_compile_definitions(${target} PRIVATE "SZ_AVOID_LIBC=0")
705-
target_compile_definitions(${target} PRIVATE "SZ_DEBUG=0")
705+
# Only define SZ_DEBUG=0 in Release builds; Debug builds inherit from types.h
706+
if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
707+
target_compile_definitions(${target} PRIVATE "SZ_DEBUG=0")
708+
endif ()
706709

707710
# Set backend-specific compilation flags
708711
foreach (flag ${backend_flags})

Cargo.lock

Lines changed: 6 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,10 @@ rocm = ["std", "cpus"] # ROCm GPU backend (includes multi-threaded CPU backend)
3636

3737
[dependencies]
3838
allocator-api2 = { version = "0.3.0", optional = true }
39-
stringtape = { version = "2.0.3", optional = true }
39+
stringtape = { version = "2.4.1", optional = true }
4040

4141
[build-dependencies]
42-
cc = "1.2.40"
42+
cc = "1.2.47"
4343

4444
[lints.clippy]
4545
# Catch platform-specific type issues like `c_char` differences

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1485,6 +1485,13 @@ __`SZ_USE_CUDA`, `SZ_USE_KEPLER`, `SZ_USE_HOPPER`__:
14851485
> One can explicitly disable certain families of PTX instructions for compatibility purposes.
14861486
> Default values are inferred at compile time depending on compiler support (for dynamic dispatch) and the target architecture (for static dispatch).
14871487
1488+
__`SZ_ENFORCE_SVE_OVER_NEON`__:
1489+
1490+
> SVE and SVE2 are expected to supersede NEON on ARM architectures.
1491+
> Still, oftentimes the equivalent SVE kernels are slower due to equally small register files and higher complexity of the instructions.
1492+
> By default, when both SVE and NEON are available, SVE is used selectively only for the algorithms that benefit from it.
1493+
> If you want to enforce SVE usage everywhere, define this flag.
1494+
14881495
__`SZ_DYNAMIC_DISPATCH`__:
14891496

14901497
> By default, StringZilla is a header-only library.

0 commit comments

Comments
 (0)