Changes from 250 commits
Commits
433 commits
ef6d255
flattend
cpegeric Mar 13, 2026
08fc2e4
revert to main
cpegeric Mar 16, 2026
b5060c3
merge fix
cpegeric Mar 16, 2026
4bb6fc3
quantizer
cpegeric Mar 16, 2026
15047f1
bug fix misalign memory
cpegeric Mar 16, 2026
ae15dc3
get/set quantizer
cpegeric Mar 16, 2026
38ea4a9
pairwise
cpegeric Mar 16, 2026
9be0c39
pairwise
cpegeric Mar 16, 2026
4ede16e
hybrid
cpegeric Mar 16, 2026
a816435
pairwise distance in blockio/read.go
cpegeric Mar 16, 2026
622dc5e
bvt fix
cpegeric Mar 16, 2026
dde7275
cagra merged index need explicit call Start() before search
cpegeric Mar 17, 2026
588f756
Merge branch 'gpu_ivfpq' of github.com:cpegeric/matrixone into gpu_ivfpq
cpegeric Mar 17, 2026
d4cd5b5
remove compiler warning
cpegeric Mar 17, 2026
94ab8b4
benchmark
cpegeric Mar 17, 2026
922472c
optimize for replicated mode
cpegeric Mar 17, 2026
d45a0b5
dynamic batching
cpegeric Mar 17, 2026
2849c87
run false and then true
cpegeric Mar 17, 2026
d2a1833
run old path when useBatching = false
cpegeric Mar 17, 2026
bcc938e
go fmt
cpegeric Mar 17, 2026
4b428cc
set_use_batch and set_per_thread_device
cpegeric Mar 17, 2026
1347342
index_base class
cpegeric Mar 17, 2026
016702a
introduce main thread queue to make sure build in main thread
cpegeric Mar 17, 2026
8db3f72
bug fix thread safe queue with capacity limit
cpegeric Mar 18, 2026
db4b2e9
info
cpegeric Mar 18, 2026
c892823
bug fix thread safe queue stopped
cpegeric Mar 18, 2026
b28d746
build_internal refactor
cpegeric Mar 18, 2026
04b0bab
add chunk benchmark
cpegeric Mar 18, 2026
d306e77
tests for cuvs_worker
cpegeric Mar 18, 2026
01b9093
thread safe queue stress test
cpegeric Mar 18, 2026
4158505
search_batch_internal
cpegeric Mar 18, 2026
5f21f0e
clean up include headers
cpegeric Mar 18, 2026
559399b
get centroids return []T
cpegeric Mar 18, 2026
b587cc6
recall rate
cpegeric Mar 18, 2026
87fe7ea
recall rate shown
cpegeric Mar 18, 2026
1fc6dbc
info in JSON
cpegeric Mar 18, 2026
8b26d2b
info in JSON
cpegeric Mar 18, 2026
e01bc66
info test
cpegeric Mar 18, 2026
0533f7f
comment out Info
cpegeric Mar 18, 2026
2305f42
go fmt
cpegeric Mar 18, 2026
2fe384a
readme
cpegeric Mar 18, 2026
64c9acb
auto quantization
cpegeric Mar 18, 2026
172de9b
add blog.md
cpegeric Mar 19, 2026
1b63918
Merge branch 'main' into gpu_ivfpq
cpegeric Mar 19, 2026
bc40d61
more log
cpegeric Mar 19, 2026
44bf29a
more log
cpegeric Mar 19, 2026
daa10b8
bug fix assign wrong device id in single gpu mode
cpegeric Mar 19, 2026
8cd64c5
bug fix device id
cpegeric Mar 19, 2026
2bcfc08
sharded mode use int64 id in cagra
cpegeric Mar 19, 2026
9b2922e
cagra id use int64
cpegeric Mar 19, 2026
9324e56
bug fix deallocate
cpegeric Mar 19, 2026
914352d
inner scope to free temp memory
cpegeric Mar 19, 2026
73718dd
bug
Mar 19, 2026
679084f
uint32 as id cagra
Mar 19, 2026
45d9ee3
bug fix
Mar 19, 2026
1ef798e
repro case
Mar 19, 2026
af0a7c8
Revert "repro case"
Mar 19, 2026
862bdba
repro
Mar 19, 2026
f18a6d1
fixme
Mar 20, 2026
3800bb9
working
Mar 20, 2026
a7930cc
compilable
Mar 20, 2026
93b0bc3
snmg
Mar 20, 2026
73d98d8
test program
Mar 20, 2026
56ab723
all test passed
Mar 20, 2026
c53e9a8
snmg test
Mar 20, 2026
9b069f1
helper
Mar 20, 2026
d7d0519
add nccl library
Mar 20, 2026
1aa69a4
benchmark
Mar 20, 2026
f31fbc8
bug fix single gpu
Mar 21, 2026
ef7e7d0
train quantizer in load() and build()
Mar 21, 2026
e65496d
better sync
Mar 21, 2026
88308b5
rmm_pool
Mar 21, 2026
664afee
per device resource for replicated mode
Mar 21, 2026
5c9f908
remove rmm pool always crash
Mar 21, 2026
b066e7d
fix recall=1.0 when int8
Mar 21, 2026
6be19fe
get local index faster
Mar 21, 2026
df7e9cd
working replicated
Mar 22, 2026
5beb025
optimize with multiple queue, sharded result store
Mar 22, 2026
cf2cf3a
self replication mode working
Mar 22, 2026
57b9fff
bug fix any_cast error in single gpu
Mar 22, 2026
c760a7a
load with replicated mode
Mar 23, 2026
2ed731a
free replicated indexes memory
Mar 23, 2026
9cf56e5
explicate destroy
Mar 23, 2026
51b4399
disable sharded test
Mar 23, 2026
b4f4bbc
disable sharded test
Mar 23, 2026
a7419bc
disable sharded test
Mar 23, 2026
e838956
bug fix go tests
Mar 23, 2026
90ddf20
submit_all_devices
Mar 24, 2026
bec22d0
id mapping
Mar 24, 2026
14f80d3
worker threads
Mar 24, 2026
40aaa3d
ivf_flat sharded
Mar 24, 2026
9e3c6b4
ivfpq and cagra sharded mode
Mar 24, 2026
7c23bfb
benchmark tests
Mar 24, 2026
5f04081
add brute-force index benchmark
Mar 24, 2026
8c5d8d9
go tests
Mar 24, 2026
c486c5c
info
Mar 24, 2026
9a8bcd6
device threads pool and remove worker threads
Mar 24, 2026
69203a3
pinned memory
Mar 24, 2026
26bd7d2
fix f16 failure because of gencode
cpegeric Mar 25, 2026
67cc8b9
add test
cpegeric Mar 25, 2026
60e3e80
cleanup
cpegeric Mar 25, 2026
52a8aae
pinned memory pool
cpegeric Mar 25, 2026
13a13ae
async pairwise
cpegeric Mar 26, 2026
cf81cb7
async pairwise in vectorindex cpu and gpu
cpegeric Mar 26, 2026
632f925
async reader not tested
cpegeric Mar 26, 2026
7d3ad4a
Merge branch 'main' into gpu_async_reader
cpegeric Mar 26, 2026
5dc9885
async pairwise working
cpegeric Mar 26, 2026
8535a30
bug fix empty batch crash
cpegeric Mar 26, 2026
ebf4c57
bug fix prefix function didn't check constant
cpegeric Mar 26, 2026
a6d4338
code review and fix
cpegeric Mar 27, 2026
0483833
remove compiler warning
cpegeric Mar 27, 2026
89ec9c7
expose fromCache flag to reader
cpegeric Mar 27, 2026
14e61ab
brute force search with gpu - distance functions
cpegeric Mar 27, 2026
f095f5d
delete_id
cpegeric Mar 27, 2026
2d59334
save_dir, load_dir, pack and unpack
cpegeric Mar 27, 2026
1d2b2d9
bug fix
cpegeric Mar 27, 2026
27299e8
json refactor
cpegeric Mar 27, 2026
7a05541
bug fix for code review
cpegeric Mar 27, 2026
f63ee3d
remove device id from pairwise distance. automatic assign device id
cpegeric Mar 28, 2026
85ff0ef
extend() with ivf_flat and ivf_pq
cpegeric Mar 28, 2026
64d8c38
bug fix cagra extend and fix go failed tests
cpegeric Mar 28, 2026
0402280
fix New with ids and fix merge with ids
cpegeric Mar 28, 2026
44aaade
add delete_id go interface
cpegeric Mar 28, 2026
a01918c
fix shard bitset
cpegeric Mar 28, 2026
bfa0566
developer guide
cpegeric Mar 28, 2026
9aa9ca2
developer guide
cpegeric Mar 28, 2026
9a68215
overlap IO
cpegeric Mar 28, 2026
5df142b
sharded mode support extend to last shard
cpegeric Mar 29, 2026
9f30704
blog 8192 block size challenge
cpegeric Mar 29, 2026
68b7b9f
bug fix race condition with cusv_worker_t
cpegeric Mar 29, 2026
b9fb792
fix sharded extend() and negative inner product distance
Mar 30, 2026
c9a1036
validate build param with dataset
Mar 30, 2026
ac065fa
store all shard size
Mar 30, 2026
eaf09b8
race condition fix for cagra
Mar 30, 2026
e52758b
fix cagra validation
Mar 30, 2026
0f89af7
race condition fix on ivf_flat and ivf_pq
Mar 30, 2026
f420425
kmeans
cpegeric Mar 30, 2026
7fbab93
kmeans
cpegeric Mar 30, 2026
9199773
balanced kmeans
cpegeric Mar 30, 2026
3ff4c74
merge fix read.go
cpegeric Mar 30, 2026
c3ce145
no overlay
Mar 30, 2026
6d1f454
search async
Mar 31, 2026
b3ed7bc
validate params
Mar 31, 2026
831944c
multi index
Mar 31, 2026
bbbb9cb
revert to main
Mar 31, 2026
d4eda69
add cuvs library
Mar 31, 2026
cb1cb30
get next gpu device id as round robin fashion
cpegeric Mar 31, 2026
1fdd314
Merge branch 'main' into gpu_async_search
mergify[bot] Mar 31, 2026
70e6163
sca
cpegeric Mar 31, 2026
59c3c76
change between pairwise distance
cpegeric Mar 31, 2026
91f2dea
bvt test
cpegeric Mar 31, 2026
b4ab340
update comment
cpegeric Mar 31, 2026
48f961a
remove overlay
cpegeric Mar 31, 2026
385967b
submit_to_rank
cpegeric Mar 31, 2026
44e8049
merge fix
Apr 1, 2026
977a49d
python apis
cpegeric Apr 1, 2026
5e7a0ff
update
cpegeric Apr 1, 2026
314c9ee
python api update
cpegeric Apr 1, 2026
8023e70
remove snmg
Apr 1, 2026
a982bf4
bug fix multiple call of stop_fn and cagra invalid parameter value
cpegeric Apr 2, 2026
df68bd7
SearchFloat32 to avoid escape to heap
cpegeric Apr 2, 2026
086b8e4
update tests and fix sca
cpegeric Apr 2, 2026
43de38c
revert to main
cpegeric Apr 2, 2026
a3d1cdc
update cuda
cpegeric Apr 3, 2026
135c733
cagra
cpegeric Apr 13, 2026
c4f06c7
bug fix
cpegeric Apr 13, 2026
8631451
Pack and Unpack
cpegeric Apr 13, 2026
d6cfdbd
add chunk with ids
cpegeric Apr 13, 2026
0bdfc7e
preallocate memory for host_ids
cpegeric Apr 13, 2026
86ef619
preallocate memory for host_ids
cpegeric Apr 13, 2026
ce61030
sql support cagra and ivfpq
cpegeric Apr 14, 2026
9d17477
merge fix
cpegeric Apr 14, 2026
1623748
cagra cpu create index and search index
cpegeric Apr 14, 2026
bccc784
cagra table function
cpegeric Apr 14, 2026
324e558
fix setting
cpegeric Apr 14, 2026
9430c27
bug fix query type mismatch
cpegeric Apr 14, 2026
a794cae
set batch window
cpegeric Apr 15, 2026
61834ed
buffer data for quantizer
cpegeric Apr 15, 2026
146f208
configurable batch_window
cpegeric Apr 15, 2026
3e856e2
bug fix show create table
cpegeric Apr 15, 2026
0d283a0
merge fix
Apr 15, 2026
812fa9a
bug fix distribution mode
Apr 15, 2026
1acf4f1
cagra int64 ids
cpegeric Apr 15, 2026
0653213
remove debug log
cpegeric Apr 15, 2026
545dcd6
bug fix sharded mode low recall
Apr 16, 2026
0e6e458
fix race condition and set shard size in sharded mode
Apr 16, 2026
c3d49ee
remove start() and use f32 to train cagra
Apr 16, 2026
e586c43
cagra max iteration to 30
Apr 16, 2026
247d6ca
fix topk bigger than itopk_size
Apr 17, 2026
d970dae
itopk_size
Apr 17, 2026
99deade
cleanup index_base.hpp
Apr 18, 2026
0e96bc0
add ivfpq
Apr 20, 2026
ebcbc14
cudf
cpegeric Apr 20, 2026
f1d763d
fine tuning ivfpq with float16
Apr 20, 2026
0f44bbc
kmeans_train_percent for ivfpq
Apr 20, 2026
7d9fc38
nprobe
Apr 20, 2026
8b5648d
fix ivf_pq internal distance type to float32 with float16 quantization
Apr 20, 2026
e64c8c7
pushdown filter with cuvs index
Apr 21, 2026
2bdc06b
revert
Apr 21, 2026
02c8bb6
omp
Apr 21, 2026
99d2df5
fix race condition with shared_ptr for del_bs
Apr 22, 2026
dd3de23
optimize bitset auto-unrolling
Apr 22, 2026
c6ca2c7
bug fix sync_stream
Apr 22, 2026
a91c875
add nullmap for each filter column
Apr 22, 2026
3802043
add nullmap for each filter column
Apr 22, 2026
5365917
integration with MO and include columns
Apr 23, 2026
3da9ccf
fix special character escaped like < >
Apr 23, 2026
67b7ae0
merge fix
Apr 23, 2026
7338956
remove log
Apr 23, 2026
2ebea8f
fix ivfpq with post filtering
Apr 23, 2026
76f4596
validate include columns
Apr 23, 2026
9e712a0
better error handling
Apr 24, 2026
dd13e73
fix l2_distance() replacement with tablefunction in projection and fi…
Apr 24, 2026
4964594
support pkid as filter column
Apr 24, 2026
c182cb0
better error handling with submit_all_devices
Apr 24, 2026
3bc6af8
fix error handling
Apr 24, 2026
45e5855
blog
cpegeric Apr 27, 2026
0b62d58
focus pre-filter
cpegeric Apr 27, 2026
bea153e
lists
cpegeric Apr 27, 2026
0798e2c
update
cpegeric Apr 27, 2026
3a2734a
GPU 48G
cpegeric Apr 27, 2026
48b7317
why L40S but not A10
cpegeric Apr 27, 2026
cc35850
update build time
Apr 28, 2026
ab34f50
update build time
Apr 28, 2026
906f025
use moerr to replace fmt.Errorf
cpegeric Apr 28, 2026
b59f660
merge fix
cpegeric Apr 28, 2026
099bb60
cleanup cpu build
cpegeric Apr 28, 2026
875519f
go fmt
cpegeric Apr 28, 2026
2005414
add UT tests for coverage
cpegeric Apr 28, 2026
031b59a
add test
cpegeric Apr 28, 2026
ac66f01
add UT test
cpegeric Apr 28, 2026
b0b5992
UT Tests
cpegeric Apr 28, 2026
ccc3cff
UT Tests
cpegeric Apr 28, 2026
a50d44c
fix probe_limit propagate to ivfpq
cpegeric Apr 29, 2026
82ea663
hardware configuration
cpegeric Apr 30, 2026
9e0d061
update
cpegeric Apr 30, 2026
f555069
update
cpegeric Apr 30, 2026
348e1ea
update
cpegeric Apr 30, 2026
0875bc5
update stats
cpegeric Apr 30, 2026
b393974
remove auto batching
cpegeric Apr 30, 2026
4d7fdf2
update ivfflat recall
cpegeric Apr 30, 2026
5b9422c
update
cpegeric Apr 30, 2026
ae12105
auto-detect data size when index build
May 1, 2026
f9195ab
merge fix
May 1, 2026
d48dca3
update python library
May 1, 2026
a669306
go fmt
May 1, 2026
3490408
update blog
cpegeric May 6, 2026
7af60cb
update
cpegeric May 6, 2026
7cf1d07
remove usable DB cache
cpegeric May 6, 2026
5d1c194
pre-filter lock optimization
May 7, 2026
9 changes: 5 additions & 4 deletions Makefile
@@ -178,6 +178,7 @@ pb: vendor-build generate-pb fmt

VERSION_INFO :=-X '$(GO_MODULE)/pkg/version.GoVersion=$(GO_VERSION)' -X '$(GO_MODULE)/pkg/version.BranchName=$(BRANCH_NAME)' -X '$(GO_MODULE)/pkg/version.CommitID=$(LAST_COMMIT_ID)' -X '$(GO_MODULE)/pkg/version.BuildTime=$(BUILD_TIME)' -X '$(GO_MODULE)/pkg/version.Version=$(MO_VERSION)'
THIRDPARTIES_INSTALL_DIR=$(ROOT_DIR)/thirdparties/install
CGO_DIR=$(ROOT_DIR)/cgo
RACE_OPT :=
DEBUG_OPT :=
CGO_DEBUG_OPT :=
@@ -188,7 +189,7 @@ ifeq ($(MO_CL_CUDA),1)
$(error CONDA_PREFIX env variable not found.)
endif
CUVS_CFLAGS := -I$(CONDA_PREFIX)/include
CUVS_LDFLAGS := -L$(CONDA_PREFIX)/envs/go/lib -lcuvs -lcuvs_c
CUVS_LDFLAGS := -L$(CONDA_PREFIX)/lib -lcuvs -lcuvs_c -lnccl -lucxx -lucp -luct -lucs -lucm
CUDA_CFLAGS := -I/usr/local/cuda/include $(CUVS_CFLAGS)
CUDA_LDFLAGS := -L/usr/local/cuda/lib64/stubs -lcuda -L/usr/local/cuda/lib64 -lcudart $(CUVS_LDFLAGS) -lstdc++
TAGS += -tags "gpu"
@@ -198,11 +199,11 @@ ifeq ($(TYPECHECK),1)
TAGS += -tags "typecheck"
endif

CGO_OPTS :=CGO_CFLAGS="-I$(THIRDPARTIES_INSTALL_DIR)/include $(CUDA_CFLAGS)"
GOLDFLAGS=-ldflags="-extldflags '$(CUDA_LDFLAGS) -L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,\$${ORIGIN}/lib -fopenmp' $(VERSION_INFO)"
CGO_OPTS :=CGO_CFLAGS="-I$(CGO_DIR) -I$(THIRDPARTIES_INSTALL_DIR)/include $(CUDA_CFLAGS)"
GOLDFLAGS=-ldflags="-extldflags '$(CUDA_LDFLAGS) -L$(CGO_DIR) -lmo -L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,\$${ORIGIN}/lib -fopenmp' $(VERSION_INFO)"

ifeq ("$(UNAME_S)","darwin")
GOLDFLAGS:=-ldflags="-extldflags '-L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,@executable_path/lib' $(VERSION_INFO)"
GOLDFLAGS:=-ldflags="-extldflags '-L$(CGO_DIR) -lmo -L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,@executable_path/lib' $(VERSION_INFO)"
endif

ifeq ($(GOBUILD_OPT),)
65 changes: 47 additions & 18 deletions cgo/Makefile
@@ -1,48 +1,77 @@
DEBUG_OPT :=
UNAME_M := $(shell uname -m)
UNAME_S := $(shell uname -s)
CC ?= gcc

# Yeah, fast math. We want it to be fast, for all xcall,
# IEEE compliance should not be an issue.
OPT_LV := -O3 -ffast-math -ftree-vectorize -funroll-loops
CFLAGS=-std=c99 -g ${OPT_LV} -Wall -Werror -I../thirdparties/install/include
OBJS=mo.o arith.o compare.o logic.o xcall.o usearchex.o bloom.o
CUDA_OBJS=
COMMON_CFLAGS := -g $(OPT_LV) -Wall -Werror -fPIC -I../thirdparties/install/include
CFLAGS := -std=c99 $(COMMON_CFLAGS)
OBJS := mo.o arith.o compare.o logic.o xcall.o usearchex.o bloom.o
CUDA_OBJS :=
LDFLAGS := -L../thirdparties/install/lib -lusearch_c
TARGET_LIB := libmo.so

ifeq ($(UNAME_S),Darwin)
TARGET_LIB := libmo.dylib
LDFLAGS += -dynamiclib -undefined dynamic_lookup -install_name @rpath/$(TARGET_LIB)
else
LDFLAGS += -shared
endif

ifeq ($(UNAME_M), x86_64)
CFLAGS+= -march=haswell
CFLAGS += -march=haswell
endif

ifeq ($(MO_CL_CUDA),1)
ifeq ($(CONDA_PREFIX),)
$(error CONDA_PREFIX env variable not found. Please activate your conda environment.)
endif
CC = /usr/local/cuda/bin/nvcc
CFLAGS = -ccbin g++ -m64 --shared -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90
CFLAGS = -ccbin g++ -m64 -Xcompiler -fPIC -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90
CFLAGS += -I../thirdparties/install/include -DMO_CL_CUDA
CUDA_OBJS += cuda/cuda.o
CUDA_LDFLAGS := -L/usr/local/cuda/lib64/stubs -lcuda -L/usr/local/cuda/lib64 -lcudart -lstdc++
# Explicitly include all needed libraries for shared library linking
CUDA_LDFLAGS := -L/usr/local/cuda/lib64/stubs -lcuda -L/usr/local/cuda/lib64 -lcudart -L$(CONDA_PREFIX)/lib -lcuvs -lcuvs_c -ldl -lrmm -lstdc++
LDFLAGS += $(CUDA_LDFLAGS)
endif

all: libmo.a
.PHONY: all clean test debug

all: $(TARGET_LIB) libmo.a

libmo.a: $(OBJS)
$(TARGET_LIB): $(OBJS)
ifeq ($(MO_CL_CUDA),1)
make -C cuda
$(MAKE) -C cuda
$(MAKE) -C cuvs
$(CC) $(LDFLAGS) -o $@ $(OBJS) $(CUDA_OBJS) cuvs/*.o
else
$(CC) $(LDFLAGS) -o $@ $(OBJS)
endif
ar -rcs libmo.a $(OBJS) $(CUDA_OBJS)

#
# $(CC) -o libmo.a $(OBJS) $(CUDA_OBJS) $(CUDA_LDFLAGS)
libmo.a: $(OBJS)
ifeq ($(MO_CL_CUDA),1)
$(MAKE) -C cuda
$(MAKE) -C cuvs
ar -rcs $@ $(OBJS) $(CUDA_OBJS) cuvs/*.o
else
ar -rcs $@ $(OBJS)
endif

%.o: %.c
$(CC) $(CFLAGS) -c $< -o $@

test: libmo.a
make -C test
test: $(TARGET_LIB)
$(MAKE) -C test

.PHONY: debug
debug: override OPT_LV := -O0
debug: override DEBUG_OPT := debug
debug: all

.PHONY: clean
clean:
rm -f *.o *.a *.so
rm -f *.o *.a *.so *.dylib
ifeq ($(MO_CL_CUDA),1)
make -C cuda clean
$(MAKE) -C cuda clean
$(MAKE) -C cuvs clean
endif
33 changes: 18 additions & 15 deletions cgo/README.md
@@ -1,25 +1,28 @@
MatrixOne CGO Kernel
===============================

This directory contains cgo source code for MO. Running
make should produce two files to be used by go code.
On go side, go will `include "mo.h"` and `-lmo`.
This directory contains CGO source code for MatrixOne. Running `make` produces the core library files used by Go code.

On the Go side, the integration typically uses `mo.h` and links against the generated libraries:
```
mo.h
libmo.a
libmo.a / libmo.so
```

`mo.h` should be pristine, meaning it only contains C function
prototype used by go. The only datatypes that can be passed
between go and c code are int and float/double and pointer.
Always explicitly specify int size such as `int32_t`, `uint64_t`.
Do not use `int`, `long`, etc.
`mo.h` should remain pristine, containing only C function prototypes for Go to consume. Data passed between Go and C should be limited to standard types (int, float, double, pointers). Always specify explicit integer sizes (e.g., `int32_t`, `uint64_t`) and avoid platform-dependent types like `int` or `long`.

GPU Support (CUDA & cuVS)
-------------------------
The kernel supports GPU acceleration for certain operations (e.g., vector search) via NVIDIA CUDA and the cuVS library.

- **Build Flag:** GPU support is enabled by setting `MO_CL_CUDA=1` during the build.
- **Environment:** Requires a working CUDA installation and a Conda environment with `cuvs` and `rmm` installed.
- **Source Code:** GPU-specific code resides in the `cuda/` and `cuvs/` subdirectories.

Implementation Notes
--------------------------------
--------------------

1. Pure C.
2. Use memory passed from go. Try not allocate memory in C code.
3. Only depends on libc and libm.
4. If 3rd party lib is absolutely necessary, import source code
and build from source. If 3rd party lib is C++, wrap it completely in C.
1. **Language:** Core kernel is Pure C. GPU extensions use C++ and CUDA, wrapped in a C-compatible interface.
2. **Memory Management:** Prefer using memory allocated and passed from Go. Minimize internal allocations in C/C++ code.
3. **Dependencies:** The base kernel depends only on `libc`, `libm`, and `libusearch`. GPU builds introduce dependencies on CUDA, `cuvs`, and `rmm`.
4. **Third-party Libraries:** If a third-party library is necessary, it should be built from source (see `thirdparties/` directory). C++ libraries must be fully wrapped in C before being exposed to Go.
2 changes: 1 addition & 1 deletion cgo/cuda/Makefile
@@ -395,7 +395,7 @@ $(FATBIN_FILE): mocl.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -fatbin $<

cuda.o: cuda.cpp
$(EXEC) $(NVCC) $(INCLUDES) -O3 --shared $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
$(EXEC) $(NVCC) $(INCLUDES) -O3 --shared -Xcompiler -fPIC $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

mytest.o: cuda.cpp $(FATBIN_FILE)
$(EXEC) $(NVCC) $(INCLUDES) -DTEST_RUN -g -O0 $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
84 changes: 84 additions & 0 deletions cgo/cuvs/Makefile
@@ -0,0 +1,84 @@
# Copyright 2021 Matrix Origin
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

NVCC := /usr/local/cuda/bin/nvcc
CC := gcc
CXX := g++

# Libraries
LIBS := -L/usr/local/cuda/lib64/stubs -lcuda -L/usr/local/cuda/lib64/cudart -L$(CONDA_PREFIX)/lib -lcuvs -lcuvs_c -lnccl -lucxx -lucp -luct -lucs -lucm -ldl -lrmm -lrapids_logger -Xlinker -lpthread -Xlinker -lm

INCLUDES := -I. -I/usr/local/cuda/include -I$(CONDA_PREFIX)/include -I$(CONDA_PREFIX)/include/rapids -I$(CONDA_PREFIX)/include/raft -I$(CONDA_PREFIX)/include/cuvs

# NVCC_FLAGS are for compilation only. -x cu tells nvcc to treat .cpp as .cu
NVCC_FLAGS := -O3 -std=c++17 -x cu -Xcompiler "-Wall -Wextra -fPIC" --extended-lambda --expt-relaxed-constexpr $(INCLUDES) -DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE -DRAFT_SYSTEM_LITTLE_ENDIAN=1 \
-gencode arch=compute_75,code=sm_75 \
-gencode arch=compute_80,code=sm_80 \
-gencode arch=compute_86,code=sm_86 \
-gencode arch=compute_89,code=sm_89 \
-gencode arch=compute_90,code=sm_90 \
-gencode arch=compute_90,code=compute_90

# LDFLAGS for linking only. DO NOT include -x cu here.
LDFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -Wextra -fPIC"

# Source files
C_SRCS := brute_force_c.cpp ivf_flat_c.cpp ivf_pq_c.cpp cagra_c.cpp kmeans_c.cpp adhoc_c.cpp distance_c.cpp
CPP_SRCS := helper.cpp
TEST_SRCS := test/main_test.cu test/brute_force_test.cu test/ivf_flat_test.cu test/ivf_pq_test.cu test/cagra_test.cu test/kmeans_test.cu test/quantize_test.cu test/distance_test.cu test/batching_test.cu test/snmg_test.cu test/verify_half_conversion.cu

# Object files
OBJS := $(C_SRCS:.cpp=.o) $(CPP_SRCS:.cpp=.o)
TEST_OBJS := $(patsubst test/%.cu,obj/test/%.o,$(TEST_SRCS))

.PHONY: all clean debug release test

all: libmocuvs.so

test: test_cuvs_worker benchmark_cuvs test_kmeans

release: all

debug: NVCC_FLAGS := $(filter-out -O3,$(NVCC_FLAGS)) -O0 -g -lineinfo
debug: LDFLAGS := $(filter-out -O3,$(LDFLAGS)) -g
debug: all

libmocuvs.so: $(OBJS)
$(NVCC) $(LDFLAGS) -shared -o $@ $^ $(LIBS)

%.o: %.cpp
@echo "Compiling $< with NVCC"
$(NVCC) $(NVCC_FLAGS) -c $< -o $@

obj/test/%.o: test/%.cu
@mkdir -p $(@D)
@echo "NVCC $<"
$(NVCC) $(NVCC_FLAGS) -c $< -o $@

test_cuvs_worker: $(TEST_OBJS) $(OBJS)
@echo "Linking $@"
$(NVCC) $(LDFLAGS) $^ $(LIBS) -o $@

benchmark_cuvs: obj/test/benchmark_cuvs.o $(OBJS)
@echo "Linking $@"
$(NVCC) $(LDFLAGS) $^ $(LIBS) -o $@

test_kmeans: obj/test/test_kmeans.o $(OBJS)
@echo "Linking $@"
$(NVCC) $(LDFLAGS) $^ $(LIBS) -o $@

clean:
@echo "Cleaning up..."
rm -f libmocuvs.so *.o test_cuvs_worker benchmark_cuvs test_kmeans
rm -rf obj
119 changes: 119 additions & 0 deletions cgo/cuvs/README.md
@@ -0,0 +1,119 @@
Architecture Design: cuVS-Accelerated Vector Indexing

1. Overview
The MatrixOne cuvs package provides a high-performance, GPU-accelerated vector search and clustering infrastructure. It acts as
a bridge between the Go-based database kernel and NVIDIA's cuVS and RAFT libraries. The architecture is designed to solve three
primary challenges:
1. Impedance Mismatch: Reconciling Go’s concurrent goroutine scheduler with CUDA’s thread-specific resource requirements.
2. Scalability: Supporting datasets that exceed single-GPU memory (Sharding) or high-concurrency search requirements
(Replicated).
3. Efficiency: Minimizing CUDA kernel launch overhead via dynamic query batching.

---

2. Core Component: cuvs_worker_t
The cuvs_worker_t is the foundational engine of the architecture.

Implementation Details:
* Persistent C++ Thread Pool: Instead of executing CUDA calls directly from CGO (which could be scheduled on any OS thread),
the worker maintains a dedicated pool of long-lived C++ threads. Each thread is pinned to a specific GPU device.
* Job Queuing: Requests from the Go layer are submitted as "Jobs" to an internal thread-safe queue. The worker returns a
std::future, allowing the Go layer to perform other tasks while the GPU processes the request.
* Context Stability: By using dedicated threads, we ensure that CUDA context and RAFT resource handles remain stable and
cached, avoiding the expensive overhead of context creation or handle re-initialization.
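
The dedicated-thread/job-queue pattern described above can be sketched as follows. This is a minimal, CPU-only illustration with hypothetical names: the real cuvs_worker_t maintains a pool of such threads, each pinned to a GPU device, and the jobs wrap cuVS/RAFT calls.

```cpp
#include <condition_variable>
#include <functional>
#include <future>
#include <mutex>
#include <queue>
#include <thread>

// Minimal single-thread job queue: callers submit jobs and receive a
// std::future; one long-lived thread (in the real worker, pinned to a
// GPU device) drains the queue, keeping the CUDA context stable.
class worker_t {
public:
    worker_t() : stop_(false), thr_([this] { run(); }) {}
    ~worker_t() {
        { std::lock_guard<std::mutex> lk(mu_); stop_ = true; }
        cv_.notify_all();
        thr_.join();
    }
    std::future<int> submit(std::function<int()> job) {
        auto task = std::make_shared<std::packaged_task<int()>>(std::move(job));
        std::future<int> fut = task->get_future();
        { std::lock_guard<std::mutex> lk(mu_); jobs_.push([task] { (*task)(); }); }
        cv_.notify_one();
        return fut;
    }
private:
    void run() {
        for (;;) {
            std::function<void()> job;
            {
                std::unique_lock<std::mutex> lk(mu_);
                cv_.wait(lk, [this] { return stop_ || !jobs_.empty(); });
                if (stop_ && jobs_.empty()) return;
                job = std::move(jobs_.front());
                jobs_.pop();
            }
            job();  // in the real worker: a CUDA/cuVS call on this thread
        }
    }
    std::mutex mu_;
    std::condition_variable cv_;
    std::queue<std::function<void()>> jobs_;
    bool stop_;
    std::thread thr_;
};
```

The key property is that every job executes on the same long-lived thread, so per-thread CUDA state is created once and reused.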

---

3. Distribution Modes
The system supports three distinct modes to leverage multi-GPU hardware:

A. Single GPU Mode
* Design: The index resides entirely on one device.
* Use Case: Small to medium datasets where latency is the priority.

B. Replicated Mode (Scaling Throughput)
* Design: The full index is loaded onto multiple GPUs simultaneously.
* Mechanism: The cuvs_worker implements a load-balancing strategy (typically round-robin). Incoming queries are dispatched to
the next available GPU.
* Benefit: Linearly scales the Queries Per Second (QPS) by utilizing the compute power of all available GPUs.
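
The round-robin dispatch can be sketched with an atomic counter (hypothetical helper, not the actual class name in the worker):

```cpp
#include <atomic>
#include <cstddef>

// Round-robin device selection for replicated mode: an atomic counter
// makes the choice safe under concurrent callers without a lock.
class round_robin {
public:
    explicit round_robin(size_t n_devices) : n_(n_devices), next_(0) {}
    size_t pick() { return next_.fetch_add(1, std::memory_order_relaxed) % n_; }
private:
    size_t n_;
    std::atomic<size_t> next_;
};
```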

C. Sharded Mode (Scaling Capacity)
* Design: The dataset is partitioned into $N$ shards across $N$ GPUs.
* Mechanism:
1. Broadcast: A search request is sent to all GPUs.
2. Local Search: Each GPU searches its local shard independently using RAFT resources.
3. Top-K Merge: The worker aggregates the results ($N \times K$ candidates) and performs a final merge-sort (often on the
CPU or via a fast GPU kernel) to return the global top-K.
* Benefit: Enables indexing of massive datasets (e.g., 100M+ vectors) that would not fit in the memory of a single GPU.
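
The CPU-side variant of the top-K merge step can be sketched like this (illustrative only; names and the (id, distance) result shape are assumptions, with smaller L2 distance treated as better):

```cpp
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Merge per-shard results into a global top-k: concatenate the N*k
// candidates, then keep only the k best ordered by distance.
std::vector<std::pair<int64_t, float>>
merge_topk(const std::vector<std::vector<std::pair<int64_t, float>>>& shard_results,
           size_t k) {
    std::vector<std::pair<int64_t, float>> all;
    for (const auto& r : shard_results)
        all.insert(all.end(), r.begin(), r.end());
    size_t n = std::min(k, all.size());
    // partial_sort orders only the first n elements -- O(N*k log k)-ish work.
    std::partial_sort(all.begin(), all.begin() + n, all.end(),
                      [](const auto& a, const auto& b) { return a.second < b.second; });
    all.resize(n);
    return all;
}
```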

---

4. RAFT Resource Management
The package relies on RAFT (raft::resources) for all CUDA-accelerated operations.

* Resource Caching: raft::resources objects (containing CUDA streams, cuBLAS handles, and workspace memory) are held within the
cuvs_worker threads. They are created once at Start() and reused for the lifetime of the index.
* Stream-Based Parallelism: Every index operation is executed asynchronously on a RAFT-managed CUDA stream. This allows the
system to overlap data transfers (Host-to-Device) with kernel execution, maximizing hardware utilization.
* Memory Layout: Leveraging raft::mdspan and raft::mdarray ensures that memory is handled in a layout-aware manner
(C-contiguous or Fortran-contiguous), matching the requirements of optimized BLAS and LAPACK kernels.

---

5. Dynamic Batching: The Throughput Key
In a database environment, queries often arrive one by one from different users. Processing these as individual CUDA kernels is
inefficient due to launch overhead and under-utilization of GPU warps.

The Dynamic Batching Mechanism:
* Aggregation Window: When multiple search requests arrive at the worker within a small time window (microseconds), the worker
stalls briefly to aggregate them.
* Matrix Consolidation: Individual query vectors are packed into a single large query matrix.
* Consolidated Search: A single cuvs::neighbors::search call is made. GPUs are significantly more efficient at processing one
$64 \times D$ matrix than 64 individual $1 \times D$ vectors.
* Automatic Fulfillment: Once the batch search completes, the worker de-multiplexes the results and fulfills the specific
std::future for each individual Go request.
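
The consolidation and de-multiplexing steps can be sketched as two helpers (hypothetical names; the real worker operates on device buffers):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Packing step of dynamic batching: n individual dim-D query vectors
// are consolidated into one row-major (n x dim) matrix so a single
// search call can process them together.
std::vector<float> pack_queries(const std::vector<std::vector<float>>& queries,
                                size_t dim) {
    std::vector<float> batch;
    batch.reserve(queries.size() * dim);
    for (const auto& q : queries)
        batch.insert(batch.end(), q.begin(), q.end());
    return batch;
}

// De-multiplexing step: the flat (n x k) id buffer from the batched
// search is split back into one top-k slice per original request.
std::vector<std::vector<int64_t>> unpack_results(const std::vector<int64_t>& ids,
                                                 size_t n, size_t k) {
    std::vector<std::vector<int64_t>> per_query(n);
    for (size_t i = 0; i < n; ++i)
        per_query[i].assign(ids.begin() + i * k, ids.begin() + (i + 1) * k);
    return per_query;
}
```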

---

6. Automatic Type Quantization
To optimize memory footprint and search speed, the architecture features an automated quantization pipeline that converts
high-precision float32 vectors into compressed formats.

* Transparent Conversion: The Go layer can consistently provide float32 data. The system automatically handles the conversion
to the index's internal type (half, int8, or uint8) directly on the GPU.
* FP16 (Half Precision):
* Mechanism: Uses raft::copy to perform bit-level conversion from 32-bit to 16-bit floating point.
* Benefit: 2x memory reduction with negligible impact on search recall.
* 8-Bit Integer (int8/uint8):
* Mechanism: Implements a learned Scalar Quantizer. The system samples the dataset to determine optimal min and max
clipping bounds.
* Training: Before building, the quantizer is "trained" on a subset of the data to ensure the 256 available integer levels
are mapped to the most significant range of the distribution.
* Benefit: 4x memory reduction, enabling massive datasets to reside in VRAM.
* GPU-Accelerated: All quantization kernels are executed on the device. This minimizes CPU usage and avoids the latency of
converting data before sending it over the PCIe bus.
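
A host-side sketch of the learned scalar quantizer idea (the production kernels run on the GPU; the clipping strategy here is a simplified min/max version and the struct name is hypothetical):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// "Training" finds clipping bounds from a data sample; quantize() then
// maps values linearly onto the 256 available int8 levels.
// Assumes the trained range is non-degenerate (hi > lo).
struct scalar_quantizer {
    float lo = 0.f, hi = 1.f;

    void train(const std::vector<float>& sample) {
        auto mm = std::minmax_element(sample.begin(), sample.end());
        lo = *mm.first;
        hi = *mm.second;
    }
    int8_t quantize(float v) const {
        float clamped = std::min(std::max(v, lo), hi);      // clip outliers
        float scaled = (clamped - lo) / (hi - lo);          // 0..1
        return static_cast<int8_t>(std::lround(scaled * 255.f) - 128);  // -128..127
    }
};
```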

7. Supported Index Types
The following indexes are fully integrated into the MatrixOne GPU architecture:


┌──────────┬──────────────────────┬───────────────────────────────────────────────────────────────────────────────┐
│ Index │ Algorithm │ Strengths │
├──────────┼──────────────────────┼───────────────────────────────────────────────────────────────────────────────┤
│ CAGRA │ Hardware-accelerated │ Best-in-class search speed and high recall. Optimized for hardware graph │
│ │ Graph │ traversal. │
│ IVF-Flat │ Inverted File Index │ High accuracy and fast search. Excellent for general-purpose use. │
│ IVF-PQ │ Product Quantization │ Extreme compression. Supports billions of vectors via lossy code compression. │
│ Brute │ Exact Flat Search │ 100% recall. Ideal for small datasets or generating ground-truth for │
│ Force │ │ benchmarks. │
│ K-Means │ Clustering │ High-performance centroid calculation for data partitioning and unsupervised │
│ │ │ learning. │
└──────────┴──────────────────────┴───────────────────────────────────────────────────────────────────────────────┘


8. Operational Telemetry
All indexes implement a unified Info() method that returns a JSON-formatted string. This allows the database to programmatically
verify:
* Hardware Mapping: Which GPU devices are holding which shards.
* Data Layout: Element sizes, dimensions, and current vector counts.
* Hyper-parameters: Internal tuning values like NLists, GraphDegree, or PQBits.