diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml index 1f574d690..d93997892 100644 --- a/.github/workflows/_build.yml +++ b/.github/workflows/_build.yml @@ -12,7 +12,7 @@ on: description: 'JSON string of Python versions' required: false type: string - default: '["3.10", "3.11", "3.12", "3.13", "3.14"]' + default: '["3.10"]' build_sdist: description: 'Whether to build source distribution' required: false @@ -42,7 +42,7 @@ on: python_json: description: 'JSON string of Python versions' required: false - default: '["3.10", "3.11", "3.12", "3.13", "3.14"]' + default: '["3.10"]' jobs: build-sdist: @@ -218,7 +218,7 @@ jobs: run: uv sync --frozen - name: Install build dependencies - run: uv pip install setuptools setuptools_scm pybind11 cmake wheel build + run: uv pip install setuptools setuptools_scm cmake wheel build - name: Resolve OpenViking version for Rust CLI (Linux) shell: bash @@ -375,7 +375,7 @@ jobs: run: uv sync --frozen - name: Install build dependencies - run: uv pip install setuptools setuptools_scm pybind11 cmake wheel build + run: uv pip install setuptools setuptools_scm cmake wheel build - name: Resolve OpenViking version for Rust CLI (macOS/Windows) shell: bash @@ -442,53 +442,3 @@ jobs: VERSION=$(ls dist/*.whl | head -n 1 | xargs basename | cut -d- -f2) echo "Build Version: $VERSION" echo "::notice::Build Wheel Version (${{ matrix.os == 'macos-14' && 'macOS arm64 (macos-14)' || matrix.os == 'macos-15-intel' && 'macOS x86_64 (macos-15-intel)' || matrix.os == 'windows-latest' && 'Windows x86_64 (windows-latest)' || matrix.os }} py${{ matrix.python-version }}): $VERSION" - - verify-macos-14-wheel-on-macos-15: - name: Verify macOS 14 arm64 wheel installs on macOS 15 - needs: [build-other] - if: >- - inputs.build_wheels && - contains(inputs.os_json, 'macos-14') && - contains(inputs.python_json, '3.12') - runs-on: macos-15 - steps: - - name: Set up Python 3.12 - uses: actions/setup-python@v6 - with: - python-version: '3.12' - - - name: Download macOS arm64 wheel artifact - uses: actions/download-artifact@v8 - with: - name: python-package-distributions-macos-arm64-3.12 - path: dist/ - - - name: Install built wheel - shell: bash - run: | - python -m pip install --upgrade pip - python -m pip install dist/*.whl - - - name: Smoke test native extension loading - shell: bash - run: | - python - <<'PY' - import importlib - import importlib.util - - import openviking.storage.vectordb.engine as engine - - native_spec = importlib.util.find_spec("openviking.storage.vectordb.engine._native") - if native_spec is None or native_spec.origin is None: - raise SystemExit("openviking storage native backend extension was not installed") - - native_module = importlib.import_module("openviking.storage.vectordb.engine._native") - if engine.ENGINE_VARIANT != "native": - raise SystemExit( - f"expected native engine variant on macOS arm64 wheel, got {engine.ENGINE_VARIANT}" - ) - - print(f"Loaded runtime engine variant {engine.ENGINE_VARIANT}") - print(f"Loaded native extension from {native_spec.origin}") - print(f"Imported backend module {native_module.__name__}") - PY diff --git a/.github/workflows/_codeql.yml b/.github/workflows/_codeql.yml index 395b179fc..ca007e316 100644 --- a/.github/workflows/_codeql.yml +++ b/.github/workflows/_codeql.yml @@ -47,7 +47,7 @@ jobs: - name: Install dependencies run: | uv sync --frozen - uv pip install setuptools setuptools_scm pybind11 cmake wheel + uv pip install setuptools setuptools_scm cmake wheel - name: Initialize CodeQL uses: github/codeql-action/init@v4 diff --git a/.github/workflows/_test_full.yml b/.github/workflows/_test_full.yml index f7730ca1d..30a58c9ec 100644 --- a/.github/workflows/_test_full.yml +++ b/.github/workflows/_test_full.yml @@ -78,7 +78,7 @@ jobs: run: uv sync --frozen --extra test - name: Install build dependencies - run: uv pip install setuptools setuptools_scm pybind11 cmake wheel + run: uv pip install setuptools setuptools_scm cmake wheel - name: Build C++ extensions run: uv run python setup.py build_ext --inplace diff --git a/.github/workflows/_test_lite.yml b/.github/workflows/_test_lite.yml index 25525af37..52e6a7097 100644 --- a/.github/workflows/_test_lite.yml +++ b/.github/workflows/_test_lite.yml @@ -78,7 +78,7 @@ jobs: run: uv sync --frozen --extra test - name: Install build dependencies - run: uv pip install setuptools setuptools_scm pybind11 cmake wheel + run: uv pip install setuptools setuptools_scm cmake wheel - name: Build C++ extensions run: uv run python setup.py build_ext --inplace diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 89b5f8003..2f8fad30e 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -34,7 +34,7 @@ on: description: 'JSON string of Python versions (Manual only)' required: false type: string - default: '["3.10", "3.11", "3.12", "3.13", "3.14"]' + default: '["3.10"]' permissions: contents: write @@ -48,7 +48,7 @@ jobs: uses: ./.github/workflows/_build.yml with: os_json: ${{ inputs.os_json || '["ubuntu-24.04", "ubuntu-24.04-arm", "macos-14", "macos-15-intel", "windows-latest"]' }} - python_json: ${{ inputs.python_json || '["3.10", "3.11", "3.12", "3.13", "3.14"]' }} + python_json: ${{ inputs.python_json || '["3.10"]' }} build_sdist: ${{ github.event_name == 'release' || inputs.build_sdist != false }} build_wheels: ${{ github.event_name == 'release' || inputs.build_wheels != false }} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8feafacd7..088cb3606 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -163,7 +163,7 @@ openviking/ │ ├── src/ # CLI source code │ └── install.sh # Quick install script │ -├── src/ # C++ extensions (pybind11) +├── src/ # C++ extension sources (Python abi3) │ ├── tests/ # Test suite │ ├── client/ # Client tests diff --git a/CONTRIBUTING_CN.md b/CONTRIBUTING_CN.md index ad5b973ff..7073d6c03 100644 --- a/CONTRIBUTING_CN.md +++ b/CONTRIBUTING_CN.md @@ -163,7 +163,7 @@ openviking/ │ ├── src/ # CLI 源码 │ └── install.sh # 一键安装脚本 │ -├── src/ # C++ 扩展 (pybind11) +├── src/ # C++ 扩展源码(Python abi3) │ ├── tests/ # 测试套件 │ ├── client/ # 客户端测试 diff --git a/CONTRIBUTING_JA.md b/CONTRIBUTING_JA.md index 70fc9100e..2011ce75f 100644 --- a/CONTRIBUTING_JA.md +++ b/CONTRIBUTING_JA.md @@ -165,7 +165,7 @@ openviking/ │ ├── src/ # CLIソースコード │ └── install.sh # クイックインストールスクリプト │ -├── src/ # C++拡張(pybind11) +├── src/ # C++拡張ソース(Python abi3) │ ├── tests/ # テストスイート │ ├── client/ # クライアントテスト diff --git a/examples/cloud/bob.py b/examples/cloud/bob.py index e2ae59fcb..d73418d67 100644 --- a/examples/cloud/bob.py +++ b/examples/cloud/bob.py @@ -128,7 +128,7 @@ def main(): "本地开发步骤:1) 安装 Python 3.10+ 和 uv " "2) git clone 后执行 uv sync 安装依赖 " "3) 复制 examples/ov.conf.example 为 ~/.openviking/ov.conf 填入 API Key " - "4) 运行 openviking-server 启动开发服务。C++ 扩展需要 cmake 和 pybind11。", + "4) 运行 openviking-server 启动开发服务。构建 abi3 C++ 扩展需要 cmake。", ), ("user", "测试怎么跑?"), ( diff --git a/openviking/storage/vectordb/README.md b/openviking/storage/vectordb/README.md index 3f5f26c66..009050d40 100644 --- a/openviking/storage/vectordb/README.md +++ b/openviking/storage/vectordb/README.md @@ -11,7 +11,7 @@ OpenViking 项目的高性能向量数据库模块,专为 AI Agent 场景设 - **灵活的存储模式**:支持内存模式(Volatile)和持久化模式(Persistent) - **TTL 自动过期**:支持数据生存时间管理,自动清理过期数据 - **索引自动重建**:后台任务自动检测和重建索引 -- **高性能**:核心引擎基于 C++ 实现,使用 pybind11 绑定 +- **高性能**:核心引擎基于 C++ 实现,使用 Python abi3 扩展绑定 - **线程安全**:关键数据结构支持并发访问 ## 架构原理 @@ -30,7 +30,7 @@ Collection Layer (集合管理、数据操作、索引协调) Index Layer Storage Layer (向量检索) (三表存储) ↓ ↓ -C++ Engine (pybind11 绑定) +C++ Engine (abi3 绑定) ``` ### 三表存储模型 diff --git a/openviking/storage/vectordb/engine/__init__.py b/openviking/storage/vectordb/engine/__init__.py index b1075f35d..f87b45e82 100644 --- a/openviking/storage/vectordb/engine/__init__.py +++ b/openviking/storage/vectordb/engine/__init__.py @@ -8,8 +8,12 @@ import importlib.util import os import platform +import sys +from pathlib import Path from types import ModuleType +from ._python_api import build_abi3_exports + _BACKEND_MODULES = { "x86_sse3": "_x86_sse3", "x86_avx2": "_x86_avx2", @@ -115,10 +119,37 @@ def _select_variant() -> tuple[str | None, tuple[str, ...], str | None]: def _load_backend(variant: str) -> ModuleType: - return importlib.import_module(f".{_BACKEND_MODULES[variant]}", __name__) + module_name = _BACKEND_MODULES[variant] + module_path = Path(__file__).resolve().parent + qualified_name = f"{__name__}.{module_name}" + + if qualified_name in sys.modules: + return sys.modules[qualified_name] + + for suffix in importlib.machinery.EXTENSION_SUFFIXES: + if "abi3" not in suffix: + continue + candidate = module_path / f"{module_name}{suffix}" + if not candidate.exists(): + continue + spec = importlib.util.spec_from_file_location(qualified_name, candidate) + if spec is None or spec.loader is None: + continue + module = importlib.util.module_from_spec(spec) + sys.modules[qualified_name] = module + spec.loader.exec_module(module) + return module + + return importlib.import_module(f".{module_name}", __name__) def _export_backend(module: ModuleType) -> tuple[str, ...]: + if getattr(module, "_ENGINE_BACKEND_API", None) == "abi3-v1": + exports = build_abi3_exports(module) + for name, value in exports.items(): + globals()[name] = value + return tuple(exports) + names = getattr(module, "__all__", None) if names is None: names = tuple(name for name in dir(module) if not name.startswith("_")) diff --git a/openviking/storage/vectordb/engine/_python_api.py b/openviking/storage/vectordb/engine/_python_api.py new file mode 100644 index 000000000..c82d0ea76 --- /dev/null +++ b/openviking/storage/vectordb/engine/_python_api.py @@ -0,0 +1,499 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import struct +from dataclasses import dataclass +from enum import IntEnum +from typing import Any, Dict, Iterable, List + +INT64_SIZE = 8 +UINT64_SIZE = 8 +FLOAT32_SIZE = 4 +UINT32_SIZE = 4 +UINT16_SIZE = 2 +BOOL_SIZE = 1 + + +class FieldType(IntEnum): + int64 = 0 + uint64 = 1 + float32 = 2 + string = 3 + binary = 4 + boolean = 5 + list_int64 = 6 + list_string = 7 + list_float32 = 8 + + +class StorageOpType(IntEnum): + PUT = 0 + DELETE = 1 + + +@dataclass +class FieldMeta: + name: str + data_type: FieldType + offset: int + id: int + default_value: Any = None + + +def _default_value_for_type(data_type: FieldType) -> Any: + return { + FieldType.int64: 0, + FieldType.uint64: 0, + FieldType.float32: 0.0, + FieldType.string: "", + FieldType.binary: b"", + FieldType.boolean: False, + FieldType.list_int64: [], + FieldType.list_string: [], + FieldType.list_float32: [], + }[data_type] + + +def _field_type_size(data_type: FieldType) -> int: + return { + FieldType.int64: INT64_SIZE, + FieldType.uint64: UINT64_SIZE, + FieldType.float32: FLOAT32_SIZE, + FieldType.string: UINT32_SIZE, + FieldType.binary: UINT32_SIZE, + FieldType.boolean: BOOL_SIZE, + FieldType.list_int64: UINT32_SIZE, + FieldType.list_string: UINT32_SIZE, + FieldType.list_float32: UINT32_SIZE, + }[data_type] + + +class Schema: + def __init__(self, fields: list[dict[str, Any]]): + if not isinstance(fields, list) or not fields: + raise ValueError("Schema fields must be a non-empty list") + + self.field_metas: Dict[str, FieldMeta] = {} + self.field_orders: List[FieldMeta] = [None] * len(fields) # type: ignore[list-item] + current_offset = 1 + seen_ids: set[int] = set() + + for field in fields: + if not isinstance(field, dict): + raise TypeError("Each schema field must be a dict") + try: + name = str(field["name"]) + data_type = FieldType(field["data_type"]) + field_id = int(field["id"]) + except KeyError as exc: + raise ValueError(f"Missing schema field key: {exc.args[0]}") from exc + except Exception as exc: # pragma: no cover - defensive + raise ValueError("Invalid schema field definition") from exc + + if not name: + raise ValueError("Schema field name cannot be empty") + if field_id < 0 or field_id >= len(fields): + raise ValueError("Schema field ids must be contiguous and zero-based") + if field_id in seen_ids: + raise ValueError("Schema field ids must be unique") + if name in self.field_metas: + raise ValueError(f"Duplicate schema field name: {name}") + + seen_ids.add(field_id) + default_value = field.get("default_value", _default_value_for_type(data_type)) + meta = FieldMeta( + name=name, + data_type=data_type, + offset=current_offset, + id=field_id, + default_value=default_value, + ) + self.field_metas[name] = meta + self.field_orders[field_id] = meta + current_offset += _field_type_size(data_type) + + if any(meta is None for meta in self.field_orders): + raise ValueError("Schema field ids must be contiguous and zero-based") + + self.total_byte_length = current_offset + + def get_field_meta(self, field_name: str) -> FieldMeta: + if field_name not in self.field_metas: + raise KeyError(f"Field {field_name} does not exist in schema") + return self.field_metas[field_name] + + def get_field_order(self) -> list[FieldMeta]: + return self.field_orders + + def get_total_byte_length(self) -> int: + return self.total_byte_length + + +def _get_row_value(row_data: Any, field_name: str, default_value: Any) -> Any: + if isinstance(row_data, dict): + return row_data.get(field_name, default_value) + return getattr(row_data, field_name, default_value) + + +class BytesRow: + def __init__(self, schema: Schema): + self.schema = schema + self.field_order = schema.get_field_order() + + def serialize(self, row_data: Any) -> bytes: + fixed_formats: list[str] = [] + fixed_values: list[Any] = [] + variable_formats: list[str] = [] + variable_values: list[Any] = [] + variable_region_offset = self.schema.total_byte_length + + for field_meta in self.field_order: + value = _get_row_value(row_data, field_meta.name, field_meta.default_value) + + if field_meta.data_type == FieldType.int64: + fixed_formats.append("q") + fixed_values.append(int(value)) + elif field_meta.data_type == FieldType.uint64: + fixed_formats.append("Q") + fixed_values.append(int(value)) + elif field_meta.data_type == FieldType.float32: + fixed_formats.append("f") + fixed_values.append(float(value)) + elif field_meta.data_type == FieldType.boolean: + fixed_formats.append("B") + fixed_values.append(1 if value else 0) + elif field_meta.data_type == FieldType.string: + encoded = str(value).encode("utf-8") + fixed_formats.append("I") + fixed_values.append(variable_region_offset) + variable_formats.append("H") + variable_values.append(len(encoded)) + variable_formats.append(f"{len(encoded)}s") + variable_values.append(encoded) + variable_region_offset += UINT16_SIZE + len(encoded) + elif field_meta.data_type == FieldType.binary: + blob = bytes(value) + fixed_formats.append("I") + fixed_values.append(variable_region_offset) + variable_formats.append("I") + variable_values.append(len(blob)) + variable_formats.append(f"{len(blob)}s") + variable_values.append(blob) + variable_region_offset += UINT32_SIZE + len(blob) + elif field_meta.data_type == FieldType.list_int64: + items = [int(item) for item in value] + fixed_formats.append("I") + fixed_values.append(variable_region_offset) + variable_formats.append("H") + variable_values.append(len(items)) + variable_formats.append(f"{len(items)}q") + variable_values.extend(items) + variable_region_offset += UINT16_SIZE + len(items) * INT64_SIZE + elif field_meta.data_type == FieldType.list_float32: + items = [float(item) for item in value] + fixed_formats.append("I") + fixed_values.append(variable_region_offset) + variable_formats.append("H") + variable_values.append(len(items)) + variable_formats.append(f"{len(items)}f") + variable_values.extend(items) + variable_region_offset += UINT16_SIZE + len(items) * FLOAT32_SIZE + elif field_meta.data_type == FieldType.list_string: + items = [str(item) for item in value] + fixed_formats.append("I") + fixed_values.append(variable_region_offset) + variable_formats.append("H") + variable_values.append(len(items)) + variable_region_offset += UINT16_SIZE + for item in items: + encoded = item.encode("utf-8") + variable_formats.append("H") + variable_values.append(len(encoded)) + variable_formats.append(f"{len(encoded)}s") + variable_values.append(encoded) + variable_region_offset += UINT16_SIZE + len(encoded) + + fmt = "<" + "".join(fixed_formats) + "".join(variable_formats) + buffer = bytearray(1 + struct.calcsize(fmt)) + buffer[0] = len(self.field_order) + struct.pack_into(fmt, buffer, 1, *(fixed_values + variable_values)) + return bytes(buffer) + + def serialize_batch(self, rows_data: Iterable[Any]) -> list[bytes]: + return [self.serialize(row_data) for row_data in rows_data] + + def deserialize_field(self, serialized_data: bytes, field_name: str) -> Any: + field_meta = self.schema.get_field_meta(field_name) + if field_meta.id >= serialized_data[0]: + return field_meta.default_value + + if field_meta.data_type == FieldType.int64: + return struct.unpack_from(" dict[str, Any]: + return { + field_meta.name: self.deserialize_field(serialized_data, field_meta.name) + for field_meta in self.field_order + } + + +class _RequestBase: + __slots__ = () + + def to_backend(self) -> dict[str, Any]: + return {name: getattr(self, name) for name in self.__slots__} + + +class AddDataRequest(_RequestBase): + __slots__ = ( + "label", + "vector", + "sparse_raw_terms", + "sparse_values", + "fields_str", + "old_fields_str", + ) + + def __init__(self): + self.label = 0 + self.vector = [] + self.sparse_raw_terms = [] + self.sparse_values = [] + self.fields_str = "" + self.old_fields_str = "" + + +class DeleteDataRequest(_RequestBase): + __slots__ = ("label", "old_fields_str") + + def __init__(self): + self.label = 0 + self.old_fields_str = "" + + +class SearchRequest(_RequestBase): + __slots__ = ("query", "sparse_raw_terms", "sparse_values", "topk", "dsl") + + def __init__(self): + self.query = [] + self.sparse_raw_terms = [] + self.sparse_values = [] + self.topk = 0 + self.dsl = "" + + +class SearchResult: + __slots__ = ("result_num", "labels", "scores", "extra_json") + + def __init__( + self, + *, + result_num: int = 0, + labels: list[int] | None = None, + scores: list[float] | None = None, + extra_json: str = "", + ): + self.result_num = result_num + self.labels = labels or [] + self.scores = scores or [] + self.extra_json = extra_json + + @classmethod + def from_backend(cls, payload: dict[str, Any]) -> "SearchResult": + return cls( + result_num=int(payload.get("result_num", 0)), + labels=list(payload.get("labels", [])), + scores=list(payload.get("scores", [])), + extra_json=str(payload.get("extra_json", "")), + ) + + +class StateResult: + __slots__ = ("update_timestamp", "element_count") + + def __init__(self, *, update_timestamp: int = 0, element_count: int = 0): + self.update_timestamp = update_timestamp + self.element_count = element_count + + @property + def data_count(self) -> int: + return self.element_count + + @classmethod + def from_backend(cls, payload: dict[str, Any]) -> "StateResult": + count = int(payload.get("element_count", payload.get("data_count", 0))) + return cls( + update_timestamp=int(payload.get("update_timestamp", 0)), + element_count=count, + ) + + +class StorageOp(_RequestBase): + __slots__ = ("type", "key", "value") + + def __init__(self): + self.type = StorageOpType.PUT + self.key = "" + self.value = b"" + + +def _request_list_to_backend(items: Iterable[_RequestBase]) -> list[Any]: + return list(items) + + +def _build_native_bytes_row_exports(backend: Any): + class Schema: + def __init__(self, fields: list[dict[str, Any]]): + try: + self._handle = backend._new_schema(fields) + except RuntimeError as exc: + raise ValueError(str(exc)) from exc + + def get_total_byte_length(self) -> int: + return int(backend._schema_get_total_byte_length(self._handle)) + + class BytesRow: + def __init__(self, schema: Schema): + self.schema = schema + self._handle = backend._new_bytes_row(schema._handle) + + def serialize(self, row_data: Any) -> bytes: + return bytes(backend._bytes_row_serialize(self._handle, row_data)) + + def serialize_batch(self, rows_data: Iterable[Any]) -> list[bytes]: + return list(backend._bytes_row_serialize_batch(self._handle, list(rows_data))) + + def deserialize(self, serialized_data: bytes) -> dict[str, Any]: + return dict(backend._bytes_row_deserialize(self._handle, serialized_data)) + + def deserialize_field(self, serialized_data: bytes, field_name: str) -> Any: + return backend._bytes_row_deserialize_field(self._handle, serialized_data, field_name) + + return Schema, BytesRow + + +def build_abi3_exports(backend: Any) -> dict[str, Any]: + if hasattr(backend, "_new_schema") and hasattr(backend, "_new_bytes_row"): + SchemaExport, BytesRowExport = _build_native_bytes_row_exports(backend) + else: # pragma: no cover - temporary fallback + SchemaExport, BytesRowExport = Schema, BytesRow + + class IndexEngine: + def __init__(self, path_or_json: str): + self._backend = backend + self._handle = backend._new_index_engine(path_or_json) + + def add_data(self, data_list: list[AddDataRequest]) -> int: + return int( + self._backend._index_engine_add_data( + self._handle, _request_list_to_backend(data_list) + ) + ) + + def delete_data(self, data_list: list[DeleteDataRequest]) -> int: + return int( + self._backend._index_engine_delete_data( + self._handle, _request_list_to_backend(data_list) + ) + ) + + def search(self, req: SearchRequest) -> SearchResult: + return SearchResult.from_backend(self._backend._index_engine_search(self._handle, req)) + + def dump(self, path: str) -> int: + return int(self._backend._index_engine_dump(self._handle, path)) + + def get_state(self) -> StateResult: + return StateResult.from_backend(self._backend._index_engine_get_state(self._handle)) + + class _StoreBase: + def __init__(self, handle: Any): + self._backend = backend + self._handle = handle + + def exec_op(self, ops: list[StorageOp]) -> int: + return int(self._backend._store_exec_op(self._handle, _request_list_to_backend(ops))) + + def get_data(self, keys: list[str]) -> list[bytes]: + return list(self._backend._store_get_data(self._handle, keys)) + + def put_data(self, keys: list[str], values: list[bytes]) -> int: + return int(self._backend._store_put_data(self._handle, keys, values)) + + def delete_data(self, keys: list[str]) -> int: + return int(self._backend._store_delete_data(self._handle, keys)) + + def clear_data(self) -> int: + return int(self._backend._store_clear_data(self._handle)) + + def seek_range(self, start_key: str, end_key: str) -> list[tuple[str, bytes]]: + return list(self._backend._store_seek_range(self._handle, start_key, end_key)) + + class PersistStore(_StoreBase): + def __init__(self, path: str): + super().__init__(backend._new_persist_store(path)) + + class VolatileStore(_StoreBase): + def __init__(self): + super().__init__(backend._new_volatile_store()) + + def init_logging( + log_level: str, log_output: str, log_format: str = "[%Y-%m-%d %H:%M:%S.%e] [%l] %v" + ): + return backend._init_logging(log_level, log_output, log_format) + + return { + "FieldType": FieldType, + "StorageOpType": StorageOpType, + "Schema": SchemaExport, + "BytesRow": BytesRowExport, + "AddDataRequest": AddDataRequest, + "DeleteDataRequest": DeleteDataRequest, + "SearchRequest": SearchRequest, + "SearchResult": SearchResult, + "StateResult": StateResult, + "StorageOp": StorageOp, + "IndexEngine": IndexEngine, + "PersistStore": PersistStore, + "VolatileStore": VolatileStore, + "init_logging": init_logging, + } diff --git a/pyproject.toml b/pyproject.toml index 226368bf8..a60fc73ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,6 @@ requires = [ "setuptools>=61.0", "setuptools-scm>=8.0", - "pybind11>=2.13.0", "cmake>=3.15", "wheel", ] @@ -112,7 +111,6 @@ gemini-async = [ build = [ "setuptools>=61.0", "setuptools-scm>=8.0", - "pybind11>=2.13.0", "cmake>=3.15", "wheel", "build", @@ -189,7 +187,7 @@ openviking = [ "lib/libagfsbinding.dll", "bin/ov", "bin/ov.exe", - "storage/vectordb/engine/*.so", + "storage/vectordb/engine/*.abi3.so", "storage/vectordb/engine/*.pyd", ] vikingbot = [ diff --git a/setup.py b/setup.py index 31977fb2e..91be3535b 100644 --- a/setup.py +++ b/setup.py @@ -8,10 +8,14 @@ import sysconfig from pathlib import Path -import pybind11 from setuptools import Extension, setup from setuptools.command.build_ext import build_ext +try: + from wheel.bdist_wheel import bdist_wheel +except ImportError: # pragma: no cover - local build_ext may not have wheel installed + bdist_wheel = None + SETUP_DIR = Path(__file__).resolve().parent if str(SETUP_DIR) not in sys.path: sys.path.insert(0, str(SETUP_DIR)) @@ -344,6 +348,7 @@ def build_extension(self, ext): ext_dir = ext_fullpath.parent.resolve() build_dir = Path(self.build_temp) / "cmake_build" build_dir.mkdir(parents=True, exist_ok=True) + self._clean_stale_engine_artifacts(ext_dir) self._run_stage_with_artifact_checks( "CMake build", @@ -352,9 +357,23 @@ def build_extension(self, ext): ) self._engine_extensions_built = True + def _clean_stale_engine_artifacts(self, ext_dir: Path): + """Remove stale non-abi3 engine binaries from wheel build output directories.""" + source_engine_dir = (SETUP_DIR / "openviking" / "storage" / "vectordb" / "engine").resolve() + if ext_dir == source_engine_dir: + return + + for pattern in ("*.so", "*.pyd"): + for artifact in ext_dir.glob(pattern): + artifact.unlink() + def _build_extension_impl(self, ext_fullpath, ext_dir, build_dir): """Invoke CMake to build the Python native extension.""" - py_ext_suffix = sysconfig.get_config_var("EXT_SUFFIX") or ext_fullpath.suffix + ext_basename = ext_fullpath.stem.split(".")[0] + built_filename = Path(self.get_ext_filename(self.extensions[0].name)).name + py_ext_suffix = built_filename.removeprefix(ext_basename) + if not py_ext_suffix: + py_ext_suffix = sysconfig.get_config_var("EXT_SUFFIX") or ext_fullpath.suffix cmake_args = [ f"-S{Path(ENGINE_SOURCE_DIR).resolve()}", @@ -368,7 +387,6 @@ def _build_extension_impl(self, ext_fullpath, ext_dir, build_dir): f"-DPython3_EXECUTABLE={sys.executable}", f"-DPython3_INCLUDE_DIRS={sysconfig.get_path('include')}", f"-DPython3_LIBRARIES={sysconfig.get_config_vars().get('LIBRARY')}", - f"-Dpybind11_DIR={pybind11.get_cmake_dir()}", f"-DCMAKE_C_COMPILER={C_COMPILER_PATH}", f"-DCMAKE_CXX_COMPILER={CXX_COMPILER_PATH}", ] @@ -387,6 +405,23 @@ def _build_extension_impl(self, ext_fullpath, ext_dir, build_dir): self.spawn([self.cmake_executable] + build_args) +if bdist_wheel is not None: + + class OpenVikingBdistWheel(bdist_wheel): + def finalize_options(self): + super().finalize_options() + self.py_limited_api = "cp310" +else: + OpenVikingBdistWheel = None + + +cmdclass = { + "build_ext": OpenVikingBuildExt, +} +if OpenVikingBdistWheel is not None: + cmdclass["bdist_wheel"] = OpenVikingBdistWheel + + setup( # install_requires=[ # f"pyagfs @ file://localhost/{os.path.abspath('third_party/agfs/agfs-sdk/python')}" @@ -395,11 +430,10 @@ def _build_extension_impl(self, ext_fullpath, ext_dir, build_dir): Extension( name=ENGINE_BUILD_CONFIG.primary_extension, sources=[], + py_limited_api=True, ) ], - cmdclass={ - "build_ext": OpenVikingBuildExt, - }, + cmdclass=cmdclass, package_data={ "openviking": [ "bin/agfs-server", @@ -409,7 +443,7 @@ def _build_extension_impl(self, ext_fullpath, ext_dir, build_dir): "lib/libagfsbinding.dll", "bin/ov", "bin/ov.exe", - "storage/vectordb/engine/*.so", + "storage/vectordb/engine/*.abi3.so", "storage/vectordb/engine/*.pyd", ], }, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 77d7ce214..ed1663e75 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.12) +cmake_minimum_required(VERSION 3.15) project(openviking_cpp) @@ -48,7 +48,6 @@ if(UNIX AND NOT APPLE) set(Python3_LIBRARIES "") endif() -find_package(pybind11 REQUIRED) find_package(Threads REQUIRED) set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE) @@ -190,13 +189,14 @@ function(ov_add_python_backend backend_suffix module_name) cmake_parse_arguments(OV_BACKEND "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(MODULE_TARGET "engine_module_${backend_suffix}") - pybind11_add_module(${MODULE_TARGET} MODULE pybind11_interface.cpp) + add_library(${MODULE_TARGET} MODULE abi3_engine_backend.cpp) target_include_directories(${MODULE_TARGET} PRIVATE ${Python3_INCLUDE_DIRS}) target_compile_options(${MODULE_TARGET} PRIVATE -fPIC ${OV_BACKEND_COMPILE_OPTIONS}) target_compile_definitions( ${MODULE_TARGET} PRIVATE + Py_LIMITED_API=0x030A0000 OV_PY_MODULE_NAME=${module_name} ${OV_BACKEND_COMPILE_DEFINITIONS} ) @@ -207,6 +207,9 @@ function(ov_add_python_backend backend_suffix module_name) ${OV_BACKEND_INDEX_LIBRARY} Threads::Threads ) + if(WIN32 AND TARGET Python3::Python) + target_link_libraries(${MODULE_TARGET} PRIVATE Python3::Python) + endif() ov_link_filesystem_libs(${MODULE_TARGET}) if(MINGW) @@ -225,9 +228,14 @@ function(ov_add_python_backend backend_suffix module_name) PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${OV_PY_OUTPUT_DIR}" RUNTIME_OUTPUT_DIRECTORY "${OV_PY_OUTPUT_DIR}" + PREFIX "" OUTPUT_NAME "${module_name}" SUFFIX "${OV_PY_EXT_SUFFIX}" ) + + if(APPLE) + target_link_options(${MODULE_TARGET} PRIVATE "-undefined" "dynamic_lookup") + endif() endfunction() set(OV_ENGINE_IMPL_TARGET "") @@ -259,16 +267,24 @@ if(OV_PLATFORM_X86) list(APPEND OV_BUILT_X86_VARIANTS "${OV_VARIANT}") endforeach() - pybind11_add_module(engine_module_x86_caps MODULE cpu_feature_probe.cpp) + add_library(engine_module_x86_caps MODULE abi3_x86_caps.cpp) target_include_directories(engine_module_x86_caps PRIVATE ${Python3_INCLUDE_DIRS}) + target_compile_definitions(engine_module_x86_caps PRIVATE Py_LIMITED_API=0x030A0000) + if(WIN32 AND TARGET Python3::Python) + target_link_libraries(engine_module_x86_caps PRIVATE Python3::Python) + endif() set_target_properties( engine_module_x86_caps PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${OV_PY_OUTPUT_DIR}" RUNTIME_OUTPUT_DIRECTORY "${OV_PY_OUTPUT_DIR}" + PREFIX "" OUTPUT_NAME "_x86_caps" SUFFIX "${OV_PY_EXT_SUFFIX}" ) + if(APPLE) + target_link_options(engine_module_x86_caps PRIVATE "-undefined" "dynamic_lookup") + endif() if(TARGET engine_index_sse3) add_library(engine_impl INTERFACE) diff --git a/src/abi3_engine_backend.cpp b/src/abi3_engine_backend.cpp new file mode 100644 index 000000000..9eacc0795 --- /dev/null +++ b/src/abi3_engine_backend.cpp @@ -0,0 +1,1545 @@ +// Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +// SPDX-License-Identifier: Apache-2.0 +#define Py_LIMITED_API 0x030A0000 +#include + +#include +#include +#include +#include +#include + +#include "common/log_utils.h" +#include "index/common_structs.h" +#include "index/index_engine.h" +#include "store/bytes_row.h" +#include "store/kv_store.h" +#include "store/persist_store.h" +#include "store/volatile_store.h" + +namespace vdb = vectordb; + +namespace { + +constexpr const char* kIndexCapsuleName = "openviking.vectordb.IndexEngine"; +constexpr const char* kStoreCapsuleName = "openviking.vectordb.KVStore"; +constexpr const char* kSchemaCapsuleName = "openviking.vectordb.Schema"; +constexpr const char* kBytesRowCapsuleName = "openviking.vectordb.BytesRow"; + +struct SchemaHandle { + std::shared_ptr schema; +}; + +struct BytesRowHandle { + std::shared_ptr schema; + std::shared_ptr bytes_row; +}; + +void raise_type_error(const std::string& message) { + PyErr_SetString(PyExc_TypeError, message.c_str()); +} + +void raise_value_error(const std::string& message) { + PyErr_SetString(PyExc_ValueError, message.c_str()); +} + +void raise_runtime_error(const std::string& message) { + PyErr_SetString(PyExc_RuntimeError, message.c_str()); +} + +template +auto call_without_gil(Func&& func) -> decltype(func()) { + PyThreadState* save = PyEval_SaveThread(); + try { + if constexpr (std::is_void_v) { + func(); + PyEval_RestoreThread(save); + } else { + auto result = func(); + PyEval_RestoreThread(save); + return result; + } + } catch (...) { + PyEval_RestoreThread(save); + throw; + } +} + +bool get_named_value(PyObject* obj, const char* name, PyObject** out, bool* found) { + *out = nullptr; + *found = false; + + if (PyDict_Check(obj)) { + PyObject* value = PyDict_GetItemString(obj, name); + if (value == nullptr) { + return true; + } + Py_INCREF(value); + *out = value; + *found = true; + return true; + } + + PyObject* value = PyObject_GetAttrString(obj, name); + if (value == nullptr) { + if (PyErr_ExceptionMatches(PyExc_AttributeError)) { + PyErr_Clear(); + return true; + } + return false; + } + + *out = value; + *found = true; + return true; +} + +bool py_to_string(PyObject* obj, std::string* out, bool allow_bytes) { + if (PyUnicode_Check(obj)) { + Py_ssize_t size = 0; + const char* data = PyUnicode_AsUTF8AndSize(obj, &size); + if (data == nullptr) { + return false; + } + out->assign(data, static_cast(size)); + return true; + } + + if (allow_bytes && PyBytes_Check(obj)) { + char* data = nullptr; + Py_ssize_t size = 0; + if (PyBytes_AsStringAndSize(obj, &data, &size) < 0) { + return false; + } + out->assign(data, static_cast(size)); + return true; + } + + raise_type_error("Expected str" + std::string(allow_bytes ? " or bytes" : "")); + return false; +} + +bool py_to_uint64(PyObject* obj, uint64_t* out) { + const unsigned long long value = PyLong_AsUnsignedLongLong(obj); + if (PyErr_Occurred() != nullptr) { + return false; + } + *out = static_cast(value); + return true; +} + +bool py_to_uint32(PyObject* obj, uint32_t* out) { + const unsigned long value = PyLong_AsUnsignedLong(obj); + if (PyErr_Occurred() != nullptr) { + return false; + } + *out = static_cast(value); + return true; +} + +bool py_to_float(PyObject* obj, float* out) { + const double value = PyFloat_AsDouble(obj); + if (PyErr_Occurred() != nullptr) { + return false; + } + *out = static_cast(value); + return true; +} + +bool py_to_int64(PyObject* obj, int64_t* out) { + const long long value = PyLong_AsLongLong(obj); + if (PyErr_Occurred() != nullptr) { + return false; + } + *out = static_cast(value); + return true; +} + +bool py_to_bool(PyObject* obj, bool* out) { + const int value = PyObject_IsTrue(obj); + if (value < 0) { + return false; + } + *out = value != 0; + return true; +} + +bool py_to_float_vector(PyObject* obj, std::vector* out) { + const Py_ssize_t size = PySequence_Size(obj); + if (size < 0) { + raise_type_error("Expected a sequence of floats"); + return false; + } + + out->clear(); + out->reserve(static_cast(size)); + for (Py_ssize_t i = 0; i < size; ++i) { + PyObject* item = PySequence_GetItem(obj, i); + if (item == nullptr) { + return false; + } + float value = 0.0f; + const bool ok = py_to_float(item, &value); + Py_DECREF(item); + if (!ok) { + return false; + } + out->push_back(value); + } + return true; +} + +bool py_to_string_vector(PyObject* obj, std::vector* out) { + const Py_ssize_t size = PySequence_Size(obj); + if (size < 0) { + raise_type_error("Expected a sequence of strings"); + return false; + } + + out->clear(); + out->reserve(static_cast(size)); + for (Py_ssize_t i = 0; i < size; ++i) { + PyObject* item = PySequence_GetItem(obj, i); + if (item == nullptr) { + return false; + } + std::string value; + const bool ok = py_to_string(item, &value, false); + Py_DECREF(item); + if (!ok) { + return false; + } + out->push_back(std::move(value)); + } + return true; +} + +bool py_to_int64_vector(PyObject* obj, std::vector* out) { + const Py_ssize_t size = PySequence_Size(obj); + if (size < 0) { + raise_type_error("Expected a sequence of integers"); + return false; + } + + out->clear(); + out->reserve(static_cast(size)); + for (Py_ssize_t i = 0; i < size; ++i) { + PyObject* item = PySequence_GetItem(obj, i); + if (item == nullptr) { + return false; + } + int64_t value = 0; + const bool ok = py_to_int64(item, &value); + Py_DECREF(item); + if (!ok) { + return false; + } + out->push_back(value); + } + return true; +} + +PyObject* float_vector_to_py(const std::vector& values) { + PyObject* list = PyList_New(static_cast(values.size())); + if (list == nullptr) { + return nullptr; + } + for (Py_ssize_t i = 0; i < static_cast(values.size()); ++i) { + PyObject* item = PyFloat_FromDouble(values[static_cast(i)]); + if (item == nullptr) { + Py_DECREF(list); + return nullptr; + } + PyList_SetItem(list, i, item); + } + return list; +} + +PyObject* string_vector_to_py(const std::vector& values) { + PyObject* list = PyList_New(static_cast(values.size())); + if (list == nullptr) { + return nullptr; + } + for (Py_ssize_t i = 0; i < static_cast(values.size()); ++i) { + const auto& value = values[static_cast(i)]; + PyObject* item = PyUnicode_FromStringAndSize(value.data(), static_cast(value.size())); + if (item == nullptr) { + Py_DECREF(list); + return nullptr; + } + PyList_SetItem(list, i, item); + } + return list; +} + +PyObject* uint64_vector_to_py(const std::vector& values) { + PyObject* list = PyList_New(static_cast(values.size())); + if (list == nullptr) { + return nullptr; + } + for (Py_ssize_t i = 0; i < static_cast(values.size()); ++i) { + PyObject* item = PyLong_FromUnsignedLongLong(values[static_cast(i)]); + if (item == nullptr) { + Py_DECREF(list); + return nullptr; + } + PyList_SetItem(list, i, item); + } + return list; +} + +PyObject* value_to_py(const vdb::Value& value, vdb::FieldType field_type) { + switch (field_type) { + case vdb::FieldType::INT64: + if (std::holds_alternative(value)) { + return PyLong_FromLongLong(std::get(value)); + } + break; + case vdb::FieldType::UINT64: + if (std::holds_alternative(value)) { + return PyLong_FromUnsignedLongLong(std::get(value)); + } + break; + case vdb::FieldType::FLOAT32: + if (std::holds_alternative(value)) { + return PyFloat_FromDouble(std::get(value)); + } + break; + case vdb::FieldType::BOOLEAN: + if (std::holds_alternative(value)) { + if (std::get(value)) { + Py_RETURN_TRUE; + } + Py_RETURN_FALSE; + } + break; + case vdb::FieldType::STRING: + if (std::holds_alternative(value)) { + const auto& text = std::get(value); + return PyUnicode_FromStringAndSize(text.data(), static_cast(text.size())); + } + break; + case vdb::FieldType::BINARY: + if (std::holds_alternative(value)) { + const auto& blob = std::get(value); + return PyBytes_FromStringAndSize(blob.data(), static_cast(blob.size())); + } + break; + case vdb::FieldType::LIST_INT64: + if (std::holds_alternative>(value)) { + const auto& items = std::get>(value); + PyObject* list = PyList_New(static_cast(items.size())); + if (list == nullptr) { + return nullptr; + } + for (Py_ssize_t i = 0; i < static_cast(items.size()); ++i) { + PyObject* item = PyLong_FromLongLong(items[static_cast(i)]); + if (item == nullptr) { + Py_DECREF(list); + return nullptr; + } + PyList_SetItem(list, i, item); + } + return list; + } + break; + case vdb::FieldType::LIST_STRING: + if (std::holds_alternative>(value)) { + return string_vector_to_py(std::get>(value)); + } + break; + case vdb::FieldType::LIST_FLOAT32: + if (std::holds_alternative>(value)) { + return float_vector_to_py(std::get>(value)); + } + break; + } + + Py_RETURN_NONE; +} + +bool py_to_field_value(PyObject* obj, vdb::FieldType data_type, vdb::Value* out) { + switch (data_type) { + case vdb::FieldType::INT64: { + int64_t value = 0; + if (!py_to_int64(obj, &value)) { + return false; + } + *out = value; + return true; + } + case vdb::FieldType::UINT64: { + uint64_t value = 0; + if (!py_to_uint64(obj, &value)) { + return false; + } + *out = value; + return true; + } + case vdb::FieldType::FLOAT32: { + float value = 0.0f; + if (!py_to_float(obj, &value)) { + return false; + } + *out = value; + return true; + } + case vdb::FieldType::STRING: + case vdb::FieldType::BINARY: { + std::string value; + if (!py_to_string(obj, &value, true)) { + return false; + } + *out = std::move(value); + return true; + } + case vdb::FieldType::BOOLEAN: { + bool value = false; + if (!py_to_bool(obj, &value)) { + return false; + } + *out = value; + return true; + } + case vdb::FieldType::LIST_INT64: { + std::vector values; + if (!py_to_int64_vector(obj, &values)) { + return false; + } + *out = std::move(values); + return true; + } + case vdb::FieldType::LIST_STRING: { + std::vector values; + if (!py_to_string_vector(obj, &values)) { + return false; + } + *out = std::move(values); + return true; + } + case vdb::FieldType::LIST_FLOAT32: { + std::vector values; + if (!py_to_float_vector(obj, &values)) { + return false; + } + *out = std::move(values); + return true; + } + } + + raise_type_error("Unsupported field type"); + return false; +} + +template +T* capsule_to_ptr(PyObject* capsule, const char* capsule_name) { + if (!PyCapsule_CheckExact(capsule)) { + raise_type_error("Expected capsule handle"); + return nullptr; + } + return static_cast(PyCapsule_GetPointer(capsule, capsule_name)); +} + +void schema_capsule_destructor(PyObject* capsule) { + auto* ptr = static_cast(PyCapsule_GetPointer(capsule, kSchemaCapsuleName)); + delete ptr; +} + +void bytes_row_capsule_destructor(PyObject* capsule) { + auto* ptr = static_cast(PyCapsule_GetPointer(capsule, kBytesRowCapsuleName)); + delete ptr; +} + +void index_capsule_destructor(PyObject* capsule) { + auto* ptr = static_cast(PyCapsule_GetPointer(capsule, kIndexCapsuleName)); + delete ptr; +} + +void store_capsule_destructor(PyObject* capsule) { + auto* ptr = static_cast(PyCapsule_GetPointer(capsule, kStoreCapsuleName)); + delete ptr; +} + +bool py_to_field_type(PyObject* obj, vdb::FieldType* out) { + const long value = PyLong_AsLong(obj); + if (PyErr_Occurred() != nullptr) { + return false; + } + + switch (value) { + case 0: + *out = vdb::FieldType::INT64; + return true; + case 1: + *out = vdb::FieldType::UINT64; + return true; + case 2: + *out = vdb::FieldType::FLOAT32; + return true; + case 3: + *out = vdb::FieldType::STRING; + return true; + case 4: + *out = vdb::FieldType::BINARY; + return true; + case 5: + *out = vdb::FieldType::BOOLEAN; + return true; + case 6: + *out = vdb::FieldType::LIST_INT64; + return true; + case 7: + *out = vdb::FieldType::LIST_STRING; + return true; + case 8: + *out = vdb::FieldType::LIST_FLOAT32; + return true; + default: + raise_value_error("Unsupported field type"); + return false; + } +} + +bool parse_schema_fields(PyObject* fields_obj, std::vector* fields) { + const Py_ssize_t size = PySequence_Size(fields_obj); + if (size < 0) { + raise_type_error("Schema fields must be a sequence"); + return false; + } + + fields->clear(); + fields->reserve(static_cast(size)); + + for (Py_ssize_t i = 0; i < size; ++i) { + PyObject* item = PySequence_GetItem(fields_obj, i); + if (item == nullptr) { + return false; + } + if (!PyDict_Check(item)) { + Py_DECREF(item); + raise_type_error("Each schema field must be a dict"); + return false; + } + + vdb::FieldDef field; + PyObject* name_obj = PyDict_GetItemString(item, "name"); + PyObject* type_obj = PyDict_GetItemString(item, "data_type"); + PyObject* id_obj = PyDict_GetItemString(item, "id"); + if (name_obj == nullptr || type_obj == nullptr || id_obj == nullptr) { + Py_DECREF(item); + raise_value_error("Schema field definition must contain name, data_type, and id"); + return false; + } + + if (!py_to_string(name_obj, &field.name, false) || !py_to_field_type(type_obj, &field.data_type)) { + Py_DECREF(item); + return false; + } + + const long field_id = PyLong_AsLong(id_obj); + if (PyErr_Occurred() != nullptr) { + Py_DECREF(item); + return false; + } + field.id = static_cast(field_id); + + PyObject* default_value = PyDict_GetItemString(item, "default_value"); + if (default_value != nullptr && default_value != Py_None) { + if (!py_to_field_value(default_value, field.data_type, &field.default_value)) { + PyErr_Clear(); + field.default_value = std::monostate{}; + } + } else { + field.default_value = std::monostate{}; + } + + fields->push_back(std::move(field)); + Py_DECREF(item); + } + + return true; +} + +bool row_object_to_values(PyObject* obj, + const std::vector& field_order, + std::vector* out) { + out->assign(field_order.size(), std::monostate{}); + for (size_t i = 0; i < field_order.size(); ++i) { + const auto& meta = field_order[i]; + PyObject* value = nullptr; + bool found = false; + if (!get_named_value(obj, meta.name.c_str(), &value, &found)) { + return false; + } + if (!found || value == Py_None) { + Py_XDECREF(value); + continue; + } + + vdb::Value parsed; + const bool ok = py_to_field_value(value, meta.data_type, &parsed); + Py_DECREF(value); + if (!ok) { + return false; + } + (*out)[i] = std::move(parsed); + } + return true; +} + +bool parse_add_request(PyObject* obj, vdb::AddDataRequest* request) { + PyObject* value = nullptr; + bool found = false; + + if (!get_named_value(obj, "label", &value, &found)) { + return false; + } + if (found && !py_to_uint64(value, &request->label)) { + Py_DECREF(value); + return false; + } + Py_XDECREF(value); + + if (!get_named_value(obj, "vector", &value, &found)) { + return false; + } + if (found && !py_to_float_vector(value, &request->vector)) { + Py_DECREF(value); + return false; + } + Py_XDECREF(value); + + if (!get_named_value(obj, "sparse_raw_terms", &value, &found)) { + return false; + } + if (found && !py_to_string_vector(value, &request->sparse_raw_terms)) { + Py_DECREF(value); + return false; + } + Py_XDECREF(value); + + if (!get_named_value(obj, "sparse_values", &value, &found)) { + return false; + } + if (found && !py_to_float_vector(value, &request->sparse_values)) { + Py_DECREF(value); + return false; + } + Py_XDECREF(value); + + if (!get_named_value(obj, "fields_str", &value, &found)) { + return false; + } + if (found && !py_to_string(value, &request->fields_str, true)) { + Py_DECREF(value); + return false; + } + Py_XDECREF(value); + + if (!get_named_value(obj, "old_fields_str", &value, &found)) { + return false; + } + if (found && !py_to_string(value, &request->old_fields_str, true)) { + Py_DECREF(value); + return false; + } + Py_XDECREF(value); + + return true; +} + +bool parse_delete_request(PyObject* obj, vdb::DeleteDataRequest* request) { + PyObject* value = nullptr; + bool found = false; + + if (!get_named_value(obj, "label", &value, &found)) { + return false; + } + if (found && !py_to_uint64(value, &request->label)) { + Py_DECREF(value); + return false; + } + Py_XDECREF(value); + + if (!get_named_value(obj, "old_fields_str", &value, &found)) { + return false; + } + if (found && !py_to_string(value, &request->old_fields_str, true)) { + Py_DECREF(value); + return false; + } + Py_XDECREF(value); + + return true; +} + +bool parse_search_request(PyObject* obj, vdb::SearchRequest* request) { + PyObject* value = nullptr; + bool found = false; + + if (!get_named_value(obj, "query", &value, &found)) { + return false; + } + if (found && !py_to_float_vector(value, &request->query)) { + Py_DECREF(value); + return false; + } + Py_XDECREF(value); + + if (!get_named_value(obj, "sparse_raw_terms", &value, &found)) { + return false; + } + if (found && !py_to_string_vector(value, &request->sparse_raw_terms)) { + Py_DECREF(value); + return false; + } + Py_XDECREF(value); + + if (!get_named_value(obj, "sparse_values", &value, &found)) { + return false; + } + if (found && !py_to_float_vector(value, &request->sparse_values)) { + Py_DECREF(value); + return false; + } + Py_XDECREF(value); + + if (!get_named_value(obj, "topk", &value, &found)) { + return false; + } + if (found && !py_to_uint32(value, &request->topk)) { + Py_DECREF(value); + return false; + } + Py_XDECREF(value); + + if (!get_named_value(obj, "dsl", &value, &found)) { + return false; + } + if (found && !py_to_string(value, &request->dsl, false)) { + Py_DECREF(value); + return false; + } + Py_XDECREF(value); + + return true; +} + +bool parse_storage_op(PyObject* obj, vdb::StorageOp* op) { + PyObject* value = nullptr; + bool found = false; + + if (!get_named_value(obj, "type", &value, &found)) { + return false; + } + long type_value = 0; + if (found) { + type_value = PyLong_AsLong(value); + if (PyErr_Occurred() != nullptr) { + Py_DECREF(value); + return false; + } + if (type_value != static_cast(vdb::StorageOp::PUT_OP) && + type_value != static_cast(vdb::StorageOp::DELETE_OP)) { + Py_DECREF(value); + raise_value_error("Invalid storage op type"); + return false; + } + } + Py_XDECREF(value); + op->type = type_value == static_cast(vdb::StorageOp::DELETE_OP) + ? vdb::StorageOp::DELETE_OP + : vdb::StorageOp::PUT_OP; + + if (!get_named_value(obj, "key", &value, &found)) { + return false; + } + if (!found) { + raise_value_error("Storage op key is required"); + return false; + } + if (!py_to_string(value, &op->key, false)) { + Py_XDECREF(value); + return false; + } + Py_DECREF(value); + + if (!get_named_value(obj, "value", &value, &found)) { + return false; + } + if (found && !py_to_string(value, &op->value, true)) { + Py_DECREF(value); + return false; + } + Py_XDECREF(value); + + return true; +} + +template +bool parse_request_list(PyObject* obj, + bool (*parse_item)(PyObject*, RequestT*), + std::vector* out) { + const Py_ssize_t size = PySequence_Size(obj); + if (size < 0) { + raise_type_error("Expected a sequence"); + return false; + } + + out->clear(); + out->reserve(static_cast(size)); + for (Py_ssize_t i = 0; i < size; ++i) { + PyObject* item = PySequence_GetItem(obj, i); + if (item == nullptr) { + return false; + } + RequestT request; + const bool ok = parse_item(item, &request); + Py_DECREF(item); + if (!ok) { + return false; + } + out->push_back(std::move(request)); + } + + return true; +} + +bool parse_string_list(PyObject* obj, std::vector* out) { + return py_to_string_vector(obj, out); +} + +bool parse_binary_list(PyObject* obj, std::vector* out) { + const Py_ssize_t size = PySequence_Size(obj); + if (size < 0) { + raise_type_error("Expected a sequence of bytes"); + return false; + } + out->clear(); + out->reserve(static_cast(size)); + for (Py_ssize_t i = 0; i < size; ++i) { + PyObject* item = PySequence_GetItem(obj, i); + if (item == nullptr) { + return false; + } + std::string value; + const bool ok = py_to_string(item, &value, true); + Py_DECREF(item); + if (!ok) { + return false; + } + out->push_back(std::move(value)); + } + return true; +} + +PyObject* py_new_schema(PyObject*, PyObject* args) { + PyObject* fields_obj = nullptr; + if (!PyArg_ParseTuple(args, "O", &fields_obj)) { + return nullptr; + } + + std::vector fields; + if (!parse_schema_fields(fields_obj, &fields)) { + return nullptr; + } + + try { + auto* handle = new SchemaHandle{std::make_shared(fields)}; + return PyCapsule_New(handle, kSchemaCapsuleName, schema_capsule_destructor); + } catch (const std::exception& exc) { + raise_runtime_error(exc.what()); + return nullptr; + } +} + +PyObject* py_schema_get_total_byte_length(PyObject*, PyObject* args) { + PyObject* capsule = nullptr; + if (!PyArg_ParseTuple(args, "O", &capsule)) { + return nullptr; + } + + auto* handle = capsule_to_ptr(capsule, kSchemaCapsuleName); + if (handle == nullptr) { + return nullptr; + } + + return PyLong_FromLong(handle->schema->get_total_byte_length()); +} + +PyObject* py_new_bytes_row(PyObject*, PyObject* args) { + PyObject* schema_capsule = nullptr; + if (!PyArg_ParseTuple(args, "O", &schema_capsule)) { + return nullptr; + } + + auto* schema_handle = capsule_to_ptr(schema_capsule, kSchemaCapsuleName); + if (schema_handle == nullptr) { + return nullptr; + } + + auto* handle = new BytesRowHandle{ + schema_handle->schema, + std::make_shared(schema_handle->schema), + }; + return PyCapsule_New(handle, kBytesRowCapsuleName, bytes_row_capsule_destructor); +} + +PyObject* py_bytes_row_serialize(PyObject*, PyObject* args) { + PyObject* capsule = nullptr; + PyObject* row_obj = nullptr; + if (!PyArg_ParseTuple(args, "OO", &capsule, &row_obj)) { + return nullptr; + } + + auto* handle = capsule_to_ptr(capsule, kBytesRowCapsuleName); + if (handle == nullptr) { + return nullptr; + } + + std::vector row_values; + if (!row_object_to_values(row_obj, handle->schema->get_field_order(), &row_values)) { + return nullptr; + } + + try { + const std::string payload = + call_without_gil([&]() { return handle->bytes_row->serialize(row_values); }); + return PyBytes_FromStringAndSize(payload.data(), static_cast(payload.size())); + } catch (const std::exception& exc) { + raise_runtime_error(exc.what()); + return nullptr; + } +} + +PyObject* py_bytes_row_serialize_batch(PyObject*, PyObject* args) { + PyObject* capsule = nullptr; + PyObject* rows_obj = nullptr; + if (!PyArg_ParseTuple(args, "OO", &capsule, &rows_obj)) { + return nullptr; + } + + auto* handle = capsule_to_ptr(capsule, kBytesRowCapsuleName); + if (handle == nullptr) { + return nullptr; + } + + const Py_ssize_t size = PySequence_Size(rows_obj); + if (size < 0) { + raise_type_error("Expected a sequence of rows"); + return nullptr; + } + + PyObject* list = PyList_New(size); + if (list == nullptr) { + return nullptr; + } + + for (Py_ssize_t i = 0; i < size; ++i) { + PyObject* row_obj = PySequence_GetItem(rows_obj, i); + if (row_obj == nullptr) { + Py_DECREF(list); + return nullptr; + } + + std::vector row_values; + const bool ok = row_object_to_values(row_obj, handle->schema->get_field_order(), &row_values); + Py_DECREF(row_obj); + if (!ok) { + Py_DECREF(list); + return nullptr; + } + + try { + const std::string payload = + call_without_gil([&]() { return handle->bytes_row->serialize(row_values); }); + PyObject* item = PyBytes_FromStringAndSize(payload.data(), static_cast(payload.size())); + if (item == nullptr) { + Py_DECREF(list); + return nullptr; + } + PyList_SetItem(list, i, item); + } catch (const std::exception& exc) { + Py_DECREF(list); + raise_runtime_error(exc.what()); + return nullptr; + } + } + + return list; +} + +PyObject* py_bytes_row_deserialize(PyObject*, PyObject* args) { + PyObject* capsule = nullptr; + PyObject* payload_obj = nullptr; + if (!PyArg_ParseTuple(args, "OO", &capsule, &payload_obj)) { + return nullptr; + } + + auto* handle = capsule_to_ptr(capsule, kBytesRowCapsuleName); + if (handle == nullptr) { + return nullptr; + } + + std::string payload; + if (!py_to_string(payload_obj, &payload, true)) { + return nullptr; + } + + PyObject* result = PyDict_New(); + if (result == nullptr) { + return nullptr; + } + + try { + for (const auto& meta : handle->schema->get_field_order()) { + const vdb::Value value = + call_without_gil([&]() { return handle->bytes_row->deserialize_field(payload, meta.name); }); + if (std::holds_alternative(value)) { + continue; + } + + PyObject* py_value = value_to_py(value, meta.data_type); + if (py_value == nullptr) { + Py_DECREF(result); + return nullptr; + } + if (PyDict_SetItemString(result, meta.name.c_str(), py_value) < 0) { + Py_DECREF(py_value); + Py_DECREF(result); + return nullptr; + } + Py_DECREF(py_value); + } + } catch (const std::exception& exc) { + Py_DECREF(result); + raise_runtime_error(exc.what()); + return nullptr; + } + + return result; +} + +PyObject* py_bytes_row_deserialize_field(PyObject*, PyObject* args) { + PyObject* capsule = nullptr; + PyObject* payload_obj = nullptr; + const char* field_name = nullptr; + if (!PyArg_ParseTuple(args, "OOs", &capsule, &payload_obj, &field_name)) { + return nullptr; + } + + auto* handle = capsule_to_ptr(capsule, kBytesRowCapsuleName); + if (handle == nullptr) { + return nullptr; + } + + std::string payload; + if (!py_to_string(payload_obj, &payload, true)) { + return nullptr; + } + + const auto* meta = handle->schema->get_field_meta(field_name); + if (meta == nullptr) { + raise_value_error("Field does not exist in schema"); + return nullptr; + } + + try { + const vdb::Value value = + call_without_gil([&]() { return handle->bytes_row->deserialize_field(payload, field_name); }); + return value_to_py(value, meta->data_type); + } catch (const std::exception& exc) { + raise_runtime_error(exc.what()); + return nullptr; + } +} + +PyObject* build_search_result(const vdb::SearchResult& result) { + PyObject* payload = PyDict_New(); + if (payload == nullptr) { + return nullptr; + } + + PyObject* result_num = PyLong_FromUnsignedLong(result.result_num); + PyObject* labels = uint64_vector_to_py(result.labels); + PyObject* scores = float_vector_to_py(result.scores); + PyObject* extra_json = PyUnicode_FromStringAndSize( + result.extra_json.data(), static_cast(result.extra_json.size())); + if (result_num == nullptr || labels == nullptr || scores == nullptr || + extra_json == nullptr) { + Py_XDECREF(result_num); + Py_XDECREF(labels); + Py_XDECREF(scores); + Py_XDECREF(extra_json); + Py_DECREF(payload); + return nullptr; + } + + PyDict_SetItemString(payload, "result_num", result_num); + PyDict_SetItemString(payload, "labels", labels); + PyDict_SetItemString(payload, "scores", scores); + PyDict_SetItemString(payload, "extra_json", extra_json); + Py_DECREF(result_num); + Py_DECREF(labels); + Py_DECREF(scores); + Py_DECREF(extra_json); + return payload; +} + +PyObject* build_state_result(const vdb::StateResult& result) { + PyObject* payload = PyDict_New(); + if (payload == nullptr) { + return nullptr; + } + + PyObject* update_ts = PyLong_FromUnsignedLongLong(result.update_timestamp); + PyObject* element_count = PyLong_FromUnsignedLongLong(result.element_count); + if (update_ts == nullptr || element_count == nullptr) { + Py_XDECREF(update_ts); + Py_XDECREF(element_count); + Py_DECREF(payload); + return nullptr; + } + + PyDict_SetItemString(payload, "update_timestamp", update_ts); + PyDict_SetItemString(payload, "element_count", element_count); + Py_DECREF(update_ts); + Py_DECREF(element_count); + return payload; +} + +PyObject* py_init_logging(PyObject*, PyObject* args, PyObject* kwargs) { + const char* log_level = nullptr; + const char* log_output = nullptr; + const char* log_format = "[%Y-%m-%d %H:%M:%S.%e] [%l] %v"; + static const char* keywords[] = {"log_level", "log_output", "log_format", nullptr}; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ss|s", const_cast(keywords), + &log_level, &log_output, &log_format)) { + return nullptr; + } + + try { + vdb::init_logging(log_level, log_output, log_format); + } catch (const std::exception& exc) { + raise_runtime_error(exc.what()); + return nullptr; + } + + Py_RETURN_NONE; +} + +PyObject* py_new_index_engine(PyObject*, PyObject* args) { + const char* path_or_json = nullptr; + if (!PyArg_ParseTuple(args, "s", &path_or_json)) { + return nullptr; + } + + try { + return PyCapsule_New(new vdb::IndexEngine(path_or_json), kIndexCapsuleName, + index_capsule_destructor); + } catch (const std::exception& exc) { + raise_runtime_error(exc.what()); + return nullptr; + } +} + +PyObject* py_index_engine_add_data(PyObject*, PyObject* args) { + PyObject* capsule = nullptr; + PyObject* items = nullptr; + if (!PyArg_ParseTuple(args, "OO", &capsule, &items)) { + return nullptr; + } + + auto* engine = capsule_to_ptr(capsule, kIndexCapsuleName); + if (engine == nullptr) { + return nullptr; + } + + std::vector requests; + if (!parse_request_list(items, parse_add_request, &requests)) { + return nullptr; + } + + try { + const int result = call_without_gil([&]() { return engine->add_data(requests); }); + return PyLong_FromLong(result); + } catch (const std::exception& exc) { + raise_runtime_error(exc.what()); + return nullptr; + } +} + +PyObject* py_index_engine_delete_data(PyObject*, PyObject* args) { + PyObject* capsule = nullptr; + PyObject* items = nullptr; + if (!PyArg_ParseTuple(args, "OO", &capsule, &items)) { + return nullptr; + } + + auto* engine = capsule_to_ptr(capsule, kIndexCapsuleName); + if (engine == nullptr) { + return nullptr; + } + + std::vector requests; + if (!parse_request_list(items, parse_delete_request, &requests)) { + return nullptr; + } + + try { + const int result = call_without_gil([&]() { return engine->delete_data(requests); }); + return PyLong_FromLong(result); + } catch (const std::exception& exc) { + raise_runtime_error(exc.what()); + return nullptr; + } +} + +PyObject* py_index_engine_search(PyObject*, PyObject* args) { + PyObject* capsule = nullptr; + PyObject* request_obj = nullptr; + if (!PyArg_ParseTuple(args, "OO", &capsule, &request_obj)) { + return nullptr; + } + + auto* engine = capsule_to_ptr(capsule, kIndexCapsuleName); + if (engine == nullptr) { + return nullptr; + } + + vdb::SearchRequest request; + if (!parse_search_request(request_obj, &request)) { + return nullptr; + } + + try { + const vdb::SearchResult result = call_without_gil([&]() { return engine->search(request); }); + return build_search_result(result); + } catch (const std::exception& exc) { + raise_runtime_error(exc.what()); + return nullptr; + } +} + +PyObject* py_index_engine_dump(PyObject*, PyObject* args) { + PyObject* capsule = nullptr; + const char* path = nullptr; + if (!PyArg_ParseTuple(args, "Os", &capsule, &path)) { + return nullptr; + } + + auto* engine = capsule_to_ptr(capsule, kIndexCapsuleName); + if (engine == nullptr) { + return nullptr; + } + + try { + const int64_t result = call_without_gil([&]() { return engine->dump(path); }); + return PyLong_FromLongLong(result); + } catch (const std::exception& exc) { + raise_runtime_error(exc.what()); + return nullptr; + } +} + +PyObject* py_index_engine_get_state(PyObject*, PyObject* args) { + PyObject* capsule = nullptr; + if (!PyArg_ParseTuple(args, "O", &capsule)) { + return nullptr; + } + + auto* engine = capsule_to_ptr(capsule, kIndexCapsuleName); + if (engine == nullptr) { + return nullptr; + } + + try { + const vdb::StateResult result = call_without_gil([&]() { return engine->get_state(); }); + return build_state_result(result); + } catch (const std::exception& exc) { + raise_runtime_error(exc.what()); + return nullptr; + } +} + +PyObject* py_new_persist_store(PyObject*, PyObject* args) { + const char* path = nullptr; + if (!PyArg_ParseTuple(args, "s", &path)) { + return nullptr; + } + + try { + return PyCapsule_New(new vdb::PersistStore(path), kStoreCapsuleName, + store_capsule_destructor); + } catch (const std::exception& exc) { + raise_runtime_error(exc.what()); + return nullptr; + } +} + +PyObject* py_new_volatile_store(PyObject*, PyObject*) { + try { + return PyCapsule_New(new vdb::VolatileStore(), kStoreCapsuleName, + store_capsule_destructor); + } catch (const std::exception& exc) { + raise_runtime_error(exc.what()); + return nullptr; + } +} + +PyObject* py_store_exec_op(PyObject*, PyObject* args) { + PyObject* capsule = nullptr; + PyObject* ops_obj = nullptr; + if (!PyArg_ParseTuple(args, "OO", &capsule, &ops_obj)) { + return nullptr; + } + + auto* store = capsule_to_ptr(capsule, kStoreCapsuleName); + if (store == nullptr) { + return nullptr; + } + + std::vector ops; + if (!parse_request_list(ops_obj, parse_storage_op, &ops)) { + return nullptr; + } + + try { + const int result = call_without_gil([&]() { return store->exec_op(ops); }); + return PyLong_FromLong(result); + } catch (const std::exception& exc) { + raise_runtime_error(exc.what()); + return nullptr; + } +} + +PyObject* py_store_get_data(PyObject*, PyObject* args) { + PyObject* capsule = nullptr; + PyObject* keys_obj = nullptr; + if (!PyArg_ParseTuple(args, "OO", &capsule, &keys_obj)) { + return nullptr; + } + + auto* store = capsule_to_ptr(capsule, kStoreCapsuleName); + if (store == nullptr) { + return nullptr; + } + + std::vector keys; + if (!parse_string_list(keys_obj, &keys)) { + return nullptr; + } + + try { + const auto values = call_without_gil([&]() { return store->get_data(keys); }); + PyObject* list = PyList_New(static_cast(values.size())); + if (list == nullptr) { + return nullptr; + } + for (Py_ssize_t i = 0; i < static_cast(values.size()); ++i) { + const auto& value = values[static_cast(i)]; + PyObject* item = PyBytes_FromStringAndSize(value.data(), + static_cast(value.size())); + if (item == nullptr) { + Py_DECREF(list); + return nullptr; + } + PyList_SetItem(list, i, item); + } + return list; + } catch (const std::exception& exc) { + raise_runtime_error(exc.what()); + return nullptr; + } +} + +PyObject* py_store_put_data(PyObject*, PyObject* args) { + PyObject* capsule = nullptr; + PyObject* keys_obj = nullptr; + PyObject* values_obj = nullptr; + if (!PyArg_ParseTuple(args, "OOO", &capsule, &keys_obj, &values_obj)) { + return nullptr; + } + + auto* store = capsule_to_ptr(capsule, kStoreCapsuleName); + if (store == nullptr) { + return nullptr; + } + + std::vector keys; + std::vector values; + if (!parse_string_list(keys_obj, &keys) || !parse_binary_list(values_obj, &values)) { + return nullptr; + } + if (keys.size() != values.size()) { + raise_value_error("keys and values must have the same length"); + return nullptr; + } + + try { + const int result = call_without_gil([&]() { return store->put_data(keys, values); }); + return PyLong_FromLong(result); + } catch (const std::exception& exc) { + raise_runtime_error(exc.what()); + return nullptr; + } +} + +PyObject* py_store_delete_data(PyObject*, PyObject* args) { + PyObject* capsule = nullptr; + PyObject* keys_obj = nullptr; + if (!PyArg_ParseTuple(args, "OO", &capsule, &keys_obj)) { + return nullptr; + } + + auto* store = capsule_to_ptr(capsule, kStoreCapsuleName); + if (store == nullptr) { + return nullptr; + } + + std::vector keys; + if (!parse_string_list(keys_obj, &keys)) { + return nullptr; + } + + try { + const int result = call_without_gil([&]() { return store->delete_data(keys); }); + return PyLong_FromLong(result); + } catch (const std::exception& exc) { + raise_runtime_error(exc.what()); + return nullptr; + } +} + +PyObject* py_store_clear_data(PyObject*, PyObject* args) { + PyObject* capsule = nullptr; + if (!PyArg_ParseTuple(args, "O", &capsule)) { + return nullptr; + } + + auto* store = capsule_to_ptr(capsule, kStoreCapsuleName); + if (store == nullptr) { + return nullptr; + } + + try { + const int result = call_without_gil([&]() { return store->clear_data(); }); + return PyLong_FromLong(result); + } catch (const std::exception& exc) { + raise_runtime_error(exc.what()); + return nullptr; + } +} + +PyObject* py_store_seek_range(PyObject*, PyObject* args) { + PyObject* capsule = nullptr; + const char* start_key = nullptr; + const char* end_key = nullptr; + if (!PyArg_ParseTuple(args, "Oss", &capsule, &start_key, &end_key)) { + return nullptr; + } + + auto* store = capsule_to_ptr(capsule, kStoreCapsuleName); + if (store == nullptr) { + return nullptr; + } + + try { + const auto items = call_without_gil([&]() { return store->seek_range(start_key, end_key); }); + PyObject* list = PyList_New(static_cast(items.size())); + if (list == nullptr) { + return nullptr; + } + for (Py_ssize_t i = 0; i < static_cast(items.size()); ++i) { + const auto& item = items[static_cast(i)]; + PyObject* tuple = PyTuple_New(2); + PyObject* key = PyUnicode_FromStringAndSize(item.first.data(), + static_cast(item.first.size())); + PyObject* value = PyBytes_FromStringAndSize(item.second.data(), + static_cast(item.second.size())); + if (tuple == nullptr || key == nullptr || value == nullptr) { + Py_XDECREF(tuple); + Py_XDECREF(key); + Py_XDECREF(value); + Py_DECREF(list); + return nullptr; + } + PyTuple_SetItem(tuple, 0, key); + PyTuple_SetItem(tuple, 1, value); + PyList_SetItem(list, i, tuple); + } + return list; + } catch (const std::exception& exc) { + raise_runtime_error(exc.what()); + return nullptr; + } +} + +PyMethodDef kModuleMethods[] = { + {"_new_schema", py_new_schema, METH_VARARGS, "Create a schema handle."}, + {"_schema_get_total_byte_length", py_schema_get_total_byte_length, METH_VARARGS, + "Read total schema byte length."}, + {"_new_bytes_row", py_new_bytes_row, METH_VARARGS, "Create a BytesRow handle."}, + {"_bytes_row_serialize", py_bytes_row_serialize, METH_VARARGS, + "Serialize a row using the native BytesRow implementation."}, + {"_bytes_row_serialize_batch", py_bytes_row_serialize_batch, METH_VARARGS, + "Serialize a batch of rows using the native BytesRow implementation."}, + {"_bytes_row_deserialize", py_bytes_row_deserialize, METH_VARARGS, + "Deserialize a row using the native BytesRow implementation."}, + {"_bytes_row_deserialize_field", py_bytes_row_deserialize_field, METH_VARARGS, + "Deserialize a single field using the native BytesRow implementation."}, + {"_init_logging", reinterpret_cast(py_init_logging), + METH_VARARGS | METH_KEYWORDS, "Initialize vectordb logging."}, + {"_new_index_engine", py_new_index_engine, METH_VARARGS, "Create an index engine handle."}, + {"_index_engine_add_data", py_index_engine_add_data, METH_VARARGS, + "Add data to the index engine."}, + {"_index_engine_delete_data", py_index_engine_delete_data, METH_VARARGS, + "Delete data from the index engine."}, + {"_index_engine_search", py_index_engine_search, METH_VARARGS, + "Search the index engine."}, + {"_index_engine_dump", py_index_engine_dump, METH_VARARGS, "Dump index state to disk."}, + {"_index_engine_get_state", py_index_engine_get_state, METH_VARARGS, + "Read index engine state."}, + {"_new_persist_store", py_new_persist_store, METH_VARARGS, "Create a persistent store."}, + {"_new_volatile_store", py_new_volatile_store, METH_NOARGS, "Create a volatile store."}, + {"_store_exec_op", py_store_exec_op, METH_VARARGS, "Execute store operations."}, + {"_store_get_data", py_store_get_data, METH_VARARGS, "Read values from the store."}, + {"_store_put_data", py_store_put_data, METH_VARARGS, "Write values to the store."}, + {"_store_delete_data", py_store_delete_data, METH_VARARGS, "Delete keys from the store."}, + {"_store_clear_data", py_store_clear_data, METH_VARARGS, "Clear the store."}, + {"_store_seek_range", py_store_seek_range, METH_VARARGS, + "Read a key range from the store."}, + {nullptr, nullptr, 0, nullptr}, +}; + +PyModuleDef kModuleDef = { + PyModuleDef_HEAD_INIT, + "_ov_engine_backend", + "OpenViking abi3 vectordb backend.", + -1, + kModuleMethods, +}; + +} // namespace + +#ifndef OV_PY_MODULE_NAME +#define OV_PY_MODULE_NAME _native +#endif + +#define OV_CONCAT_IMPL(a, b) a##b +#define OV_CONCAT(a, b) OV_CONCAT_IMPL(a, b) + +PyMODINIT_FUNC OV_CONCAT(PyInit_, OV_PY_MODULE_NAME)(void) { + PyObject* module = PyModule_Create(&kModuleDef); + if (module == nullptr) { + return nullptr; + } + + if (PyModule_AddStringConstant(module, "_ENGINE_BACKEND_API", "abi3-v1") < 0) { + Py_DECREF(module); + return nullptr; + } + + return module; +} diff --git a/src/cpu_feature_probe.cpp b/src/abi3_x86_caps.cpp similarity index 69% rename from src/cpu_feature_probe.cpp rename to src/abi3_x86_caps.cpp index f1df44f1a..e2e522016 100644 --- a/src/cpu_feature_probe.cpp +++ b/src/abi3_x86_caps.cpp @@ -1,8 +1,9 @@ // Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. // SPDX-License-Identifier: Apache-2.0 -#include -#include +#define Py_LIMITED_API 0x030A0000 +#include +#include #include #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) @@ -15,8 +16,6 @@ #endif #endif -namespace py = pybind11; - namespace { struct CpuFeatures { @@ -93,10 +92,9 @@ CpuFeatures detect_cpu_features() { CpuFeatures detect_cpu_features() { return CpuFeatures{}; } #endif -std::vector get_supported_variants() { +std::vector get_supported_variants_impl() { std::vector variants; const auto features = detect_cpu_features(); - if (features.sse3) { variants.emplace_back("x86_sse3"); } @@ -110,9 +108,41 @@ std::vector get_supported_variants() { return variants; } +PyObject* py_get_supported_variants(PyObject*, PyObject*) { + const auto variants = get_supported_variants_impl(); + PyObject* list = PyList_New(static_cast(variants.size())); + if (list == nullptr) { + return nullptr; + } + for (Py_ssize_t i = 0; i < static_cast(variants.size()); ++i) { + const auto& variant = variants[static_cast(i)]; + PyObject* item = PyUnicode_FromStringAndSize( + variant.data(), static_cast(variant.size())); + if (item == nullptr) { + Py_DECREF(list); + return nullptr; + } + PyList_SetItem(list, i, item); + } + return list; +} + +PyMethodDef kMethods[] = { + {"get_supported_variants", py_get_supported_variants, METH_NOARGS, + "Return CPU-supported x86 engine variants."}, + {nullptr, nullptr, 0, nullptr}, +}; + +PyModuleDef kModuleDef = { + PyModuleDef_HEAD_INIT, + "_x86_caps", + "OpenViking abi3 x86 capability probe.", + -1, + kMethods, +}; + } // namespace -PYBIND11_MODULE(_x86_caps, m) { - m.def("get_supported_variants", &get_supported_variants, - "Return CPU-supported x86 engine variants"); +PyMODINIT_FUNC PyInit__x86_caps(void) { + return PyModule_Create(&kModuleDef); } diff --git a/src/index/common_structs.h b/src/index/common_structs.h index 8f93b83c4..d68bfce9c 100644 --- a/src/index/common_structs.h +++ b/src/index/common_structs.h @@ -38,13 +38,9 @@ struct SearchResult { std::string extra_json; }; -struct FetchDataResult { - std::vector embedding; -}; - struct StateResult { uint64_t update_timestamp = 0; uint64_t element_count = 0; }; -} // namespace vectordb \ No newline at end of file +} // namespace vectordb diff --git a/src/py_accessors.h b/src/py_accessors.h deleted file mode 100644 index 7712263a9..000000000 --- a/src/py_accessors.h +++ /dev/null @@ -1,276 +0,0 @@ -// Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include -#include "store/bytes_row.h" - -namespace py = pybind11; -namespace vdb = vectordb; - -// Accessor for Python Dict -class PyDictAccessor { - public: - PyDictAccessor(const vdb::Schema& schema) - : field_order_(schema.get_field_order()) { - } - - bool has_value(const py::dict& row, int field_idx) const { - const auto& name = field_order_[field_idx].name; - if (!row.contains(name.c_str())) - return false; - return !row[name.c_str()].is_none(); - } - - int64_t get_int64(const py::dict& row, int field_idx) const { - return row[field_order_[field_idx].name.c_str()].cast(); - } - - uint64_t get_uint64(const py::dict& row, int field_idx) const { - return row[field_order_[field_idx].name.c_str()].cast(); - } - - float get_float(const py::dict& row, int field_idx) const { - return row[field_order_[field_idx].name.c_str()].cast(); - } - - bool get_bool(const py::dict& row, int field_idx) const { - return row[field_order_[field_idx].name.c_str()].cast(); - } - - int get_string_len(const py::dict& row, int field_idx) const { - py::object val = row[field_order_[field_idx].name.c_str()]; - // Assume it's string or bytes - if (py::isinstance(val)) { - return PyBytes_Size(val.ptr()); - } - return val.cast().length(); - } - - int get_binary_len(const py::dict& row, int field_idx) const { - // Same as string for length - return get_string_len(row, field_idx); - } - - int get_list_len(const py::dict& row, int field_idx) const { - py::list l = row[field_order_[field_idx].name.c_str()].cast(); - return static_cast(l.size()); - } - - int get_list_string_content_len(const py::dict& row, int field_idx) const { - py::list l = row[field_order_[field_idx].name.c_str()].cast(); - int total = 0; - for (auto item : l) { - total += item.cast().length(); - } - return total; - } - - // Writers - void write_string(const py::dict& row, int field_idx, char* dest) const { - std::string s = - row[field_order_[field_idx].name.c_str()].cast(); - uint16_t len = static_cast(s.length()); - std::memcpy(dest, &len, 2); - if (len > 0) - std::memcpy(dest + 2, s.data(), len); - } - - void write_binary(const py::dict& row, int field_idx, char* dest) const { - std::string s = - row[field_order_[field_idx].name.c_str()].cast(); - uint32_t len = static_cast(s.length()); - std::memcpy(dest, &len, 4); - if (len > 0) - std::memcpy(dest + 4, s.data(), len); - } - - void write_list_int64(const py::dict& row, int field_idx, char* dest) const { - py::list l = row[field_order_[field_idx].name.c_str()].cast(); - uint16_t len = static_cast(l.size()); - std::memcpy(dest, &len, 2); - int64_t* data_ptr = reinterpret_cast(dest + 2); - for (size_t i = 0; i < len; ++i) { - data_ptr[i] = l[i].cast(); - } - } - - void write_list_float32(const py::dict& row, int field_idx, - char* dest) const { - py::list l = row[field_order_[field_idx].name.c_str()].cast(); - uint16_t len = static_cast(l.size()); - std::memcpy(dest, &len, 2); - float* data_ptr = reinterpret_cast(dest + 2); - for (size_t i = 0; i < len; ++i) { - data_ptr[i] = l[i].cast(); - } - } - - void write_list_string(const py::dict& row, int field_idx, char* dest) const { - py::list l = row[field_order_[field_idx].name.c_str()].cast(); - uint16_t len = static_cast(l.size()); - std::memcpy(dest, &len, 2); - char* cur = dest + 2; - for (size_t i = 0; i < len; ++i) { - std::string s = l[i].cast(); - uint16_t slen = static_cast(s.length()); - std::memcpy(cur, &slen, 2); - cur += 2; - if (slen > 0) - std::memcpy(cur, s.data(), slen); - cur += slen; - } - } - - private: - const std::vector& field_order_; -}; - -// Accessor for Python Object -class PyObjectAccessor { - public: - PyObjectAccessor(const vdb::Schema& schema) - : field_order_(schema.get_field_order()) { - } - - bool has_value(const py::handle& row, int field_idx) const { - const char* name = field_order_[field_idx].name.c_str(); - if (!py::hasattr(row, name)) - return false; - return !row.attr(name).is_none(); - } - - int64_t get_int64(const py::handle& row, int field_idx) const { - return row.attr(field_order_[field_idx].name.c_str()).cast(); - } - - uint64_t get_uint64(const py::handle& row, int field_idx) const { - return row.attr(field_order_[field_idx].name.c_str()).cast(); - } - - float get_float(const py::handle& row, int field_idx) const { - return row.attr(field_order_[field_idx].name.c_str()).cast(); - } - - bool get_bool(const py::handle& row, int field_idx) const { - return row.attr(field_order_[field_idx].name.c_str()).cast(); - } - - int get_string_len(const py::handle& row, int field_idx) const { - // See comments in PyDictAccessor about encoding efficiency - return row.attr(field_order_[field_idx].name.c_str()) - .cast() - .length(); - } - - int get_binary_len(const py::handle& row, int field_idx) const { - return get_string_len(row, field_idx); - } - - int get_list_len(const py::handle& row, int field_idx) const { - py::list l = - row.attr(field_order_[field_idx].name.c_str()).cast(); - return static_cast(l.size()); - } - - int get_list_string_content_len(const py::handle& row, int field_idx) const { - py::list l = - row.attr(field_order_[field_idx].name.c_str()).cast(); - int total = 0; - for (auto item : l) { - total += item.cast().length(); - } - return total; - } - - void write_string(const py::handle& row, int field_idx, char* dest) const { - std::string s = - row.attr(field_order_[field_idx].name.c_str()).cast(); - uint16_t len = static_cast(s.length()); - std::memcpy(dest, &len, 2); - if (len > 0) - std::memcpy(dest + 2, s.data(), len); - } - - void write_binary(const py::handle& row, int field_idx, char* dest) const { - std::string s = - row.attr(field_order_[field_idx].name.c_str()).cast(); - uint32_t len = static_cast(s.length()); - std::memcpy(dest, &len, 4); - if (len > 0) - std::memcpy(dest + 4, s.data(), len); - } - - void write_list_int64(const py::handle& row, int field_idx, - char* dest) const { - py::list l = - row.attr(field_order_[field_idx].name.c_str()).cast(); - uint16_t len = static_cast(l.size()); - std::memcpy(dest, &len, 2); - int64_t* data_ptr = reinterpret_cast(dest + 2); - for (size_t i = 0; i < len; ++i) { - data_ptr[i] = l[i].cast(); - } - } - - void write_list_float32(const py::handle& row, int field_idx, - char* dest) const { - py::list l = - row.attr(field_order_[field_idx].name.c_str()).cast(); - uint16_t len = static_cast(l.size()); - std::memcpy(dest, &len, 2); - float* data_ptr = reinterpret_cast(dest + 2); - for (size_t i = 0; i < len; ++i) { - data_ptr[i] = l[i].cast(); - } - } - - void write_list_string(const py::handle& row, int field_idx, - char* dest) const { - py::list l = - row.attr(field_order_[field_idx].name.c_str()).cast(); - uint16_t len = static_cast(l.size()); - std::memcpy(dest, &len, 2); - char* cur = dest + 2; - for (size_t i = 0; i < len; ++i) { - std::string s = l[i].cast(); - uint16_t slen = static_cast(s.length()); - std::memcpy(cur, &slen, 2); - cur += 2; - if (slen > 0) - std::memcpy(cur, s.data(), slen); - cur += slen; - } - } - - private: - const std::vector& field_order_; -}; - -// Helper to convert C++ Value to Python object -inline py::object value_to_py(const vdb::Value& val) { - if (std::holds_alternative(val)) - return py::none(); - if (std::holds_alternative(val)) - return py::cast(std::get(val)); - if (std::holds_alternative(val)) - return py::cast(std::get(val)); - if (std::holds_alternative(val)) - return py::cast(std::get(val)); - if (std::holds_alternative(val)) - return py::cast(std::get(val)); - if (std::holds_alternative(val)) { - return py::cast(std::get(val)); - } - if (std::holds_alternative>(val)) - return py::cast(std::get>(val)); - if (std::holds_alternative>(val)) - return py::cast(std::get>(val)); - if (std::holds_alternative>(val)) - return py::cast(std::get>(val)); - - return py::none(); -} diff --git a/src/pybind11_interface.cpp b/src/pybind11_interface.cpp deleted file mode 100644 index 6533cd2fd..000000000 --- a/src/pybind11_interface.cpp +++ /dev/null @@ -1,351 +0,0 @@ -// Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. -// SPDX-License-Identifier: Apache-2.0 -#include -#include -#include -#include -#include -#include "index/index_engine.h" -#include "store/persist_store.h" -#include "store/volatile_store.h" -#include "common/log_utils.h" -#include "store/bytes_row.h" -#include "py_accessors.h" - -namespace py = pybind11; -namespace vdb = vectordb; - -#ifndef OV_PY_MODULE_NAME -#define OV_PY_MODULE_NAME engine -#endif - -#define OV_EXPAND_MACRO(name) name - -PYBIND11_MODULE(OV_EXPAND_MACRO(OV_PY_MODULE_NAME), m) { - m.def("init_logging", &vdb::init_logging, "Initialize logging"); - - py::enum_(m, "FieldType") - .value("int64", vdb::FieldType::INT64) - .value("uint64", vdb::FieldType::UINT64) - .value("float32", vdb::FieldType::FLOAT32) - .value("string", vdb::FieldType::STRING) - .value("binary", vdb::FieldType::BINARY) - .value("boolean", vdb::FieldType::BOOLEAN) - .value("list_int64", vdb::FieldType::LIST_INT64) - .value("list_string", vdb::FieldType::LIST_STRING) - .value("list_float32", vdb::FieldType::LIST_FLOAT32); - - py::class_>(m, "Schema") - .def(py::init([](const py::list& fields_py) { - std::vector fields; - for (const auto& item : fields_py) { - py::dict d = item.cast(); - vdb::FieldDef fd; - fd.name = d["name"].cast(); - fd.data_type = d["data_type"].cast(); - fd.id = d["id"].cast(); - if (d.contains("default_value")) { - try { - switch (fd.data_type) { - case vdb::FieldType::INT64: - fd.default_value = d["default_value"].cast(); - break; - case vdb::FieldType::UINT64: - fd.default_value = d["default_value"].cast(); - break; - case vdb::FieldType::FLOAT32: - fd.default_value = d["default_value"].cast(); - break; - case vdb::FieldType::BOOLEAN: - fd.default_value = d["default_value"].cast(); - break; - case vdb::FieldType::STRING: - fd.default_value = d["default_value"].cast(); - break; - case vdb::FieldType::BINARY: - fd.default_value = d["default_value"].cast(); - break; - case vdb::FieldType::LIST_INT64: - fd.default_value = - d["default_value"].cast>(); - break; - case vdb::FieldType::LIST_FLOAT32: - fd.default_value = - d["default_value"].cast>(); - break; - case vdb::FieldType::LIST_STRING: - fd.default_value = - d["default_value"].cast>(); - break; - } - } catch (...) { - fd.default_value = std::monostate{}; - } - } else { - fd.default_value = std::monostate{}; - } - fields.push_back(fd); - } - return std::make_shared(fields); - })) - .def("get_total_byte_length", &vdb::Schema::get_total_byte_length); - - py::class_(m, "BytesRow") - .def(py::init>()) - .def("serialize", - [](vdb::BytesRow& self, const py::dict& row_data) { - PyDictAccessor accessor(self.get_schema()); - std::string serialized = - self.serialize_template(row_data, accessor); - return py::bytes(serialized); - }) - .def("serialize_batch", - [](vdb::BytesRow& self, const py::list& objects) { - py::list results; - const auto& schema = self.get_schema(); - - PyDictAccessor dict_accessor(schema); - PyObjectAccessor obj_accessor(schema); - - for (const auto& obj : objects) { - std::string serialized; - if (py::isinstance(obj)) { - serialized = self.serialize_template(obj.cast(), - dict_accessor); - } else { - serialized = self.serialize_template(obj, obj_accessor); - } - results.append(py::bytes(serialized)); - } - return results; - }) - .def("deserialize", - [](vdb::BytesRow& self, const std::string& data) { - py::dict res_dict; - const auto& schema = self.get_schema(); - - const auto& field_order = schema.get_field_order(); - for (const auto& meta : field_order) { - vdb::Value val = self.deserialize_field(data, meta.name); - - if (std::holds_alternative(val)) - continue; - - if (meta.data_type == vdb::FieldType::BINARY) { - if (std::holds_alternative(val)) { - res_dict[meta.name.c_str()] = - py::bytes(std::get(val)); - continue; - } - } - res_dict[meta.name.c_str()] = value_to_py(val); - } - return res_dict; - }) - .def("deserialize_field", - [](vdb::BytesRow& self, const std::string& data, - const std::string& field_name) -> py::object { - vdb::Value val = self.deserialize_field(data, field_name); - const auto& schema = self.get_schema(); - const auto* meta = schema.get_field_meta(field_name); - - if (meta && meta->data_type == vdb::FieldType::BINARY) { - if (std::holds_alternative(val)) { - const auto& s = std::get(val); - return py::bytes(s); - } - } - return value_to_py(val); - }); - - py::class_(m, "AddDataRequest") - .def(py::init<>()) - .def_readwrite("label", &vdb::AddDataRequest::label) - .def_readwrite("vector", &vdb::AddDataRequest::vector) - .def_readwrite("sparse_raw_terms", &vdb::AddDataRequest::sparse_raw_terms) - .def_readwrite("sparse_values", &vdb::AddDataRequest::sparse_values) - .def_readwrite("fields_str", &vdb::AddDataRequest::fields_str) - .def_readwrite("old_fields_str", &vdb::AddDataRequest::old_fields_str) - .def("__repr__", [](const vdb::AddDataRequest& p) { - return ""; - }); - - py::class_(m, "DeleteDataRequest") - .def(py::init<>()) - .def_readwrite("label", &vdb::DeleteDataRequest::label) - .def_readwrite("old_fields_str", &vdb::DeleteDataRequest::old_fields_str) - .def("__repr__", [](const vdb::DeleteDataRequest& p) { - return ""; - }); - - py::class_(m, "SearchRequest") - .def(py::init<>()) - .def_readwrite("query", &vdb::SearchRequest::query) - .def_readwrite("sparse_raw_terms", &vdb::SearchRequest::sparse_raw_terms) - .def_readwrite("sparse_values", &vdb::SearchRequest::sparse_values) - .def_readwrite("topk", &vdb::SearchRequest::topk) - .def_readwrite("dsl", &vdb::SearchRequest::dsl) - .def("__repr__", [](const vdb::SearchRequest& p) { - return ""; - }); - - py::class_(m, "SearchResult") - .def(py::init<>()) - .def_readwrite("result_num", &vdb::SearchResult::result_num) - .def_readwrite("labels", &vdb::SearchResult::labels) - .def_readwrite("scores", &vdb::SearchResult::scores) - .def_readwrite("extra_json", &vdb::SearchResult::extra_json) - .def("__repr__", [](const vdb::SearchResult& p) { - return ""; - }); - - py::class_(m, "FetchDatahResult") - .def(py::init<>()) - .def_readwrite("embedding", &vdb::FetchDataResult::embedding) - .def("__repr__", [](const vdb::FetchDataResult& p) { - return ""; - }); - - py::class_(m, "StateResult") - .def(py::init<>()) - .def_readwrite("update_timestamp", &vdb::StateResult::update_timestamp) - .def_readwrite("element_count", &vdb::StateResult::element_count) - .def("__repr__", [](const vdb::StateResult& p) { - return ""; - }); - - py::class_(m, "IndexEngine") - .def(py::init()) - .def( - "add_data", - [](vdb::IndexEngine& self, - const std::vector& data_list) { - pybind11::gil_scoped_release release; - return self.add_data(data_list); - }, - "add data to index") - .def( - "delete_data", - [](vdb::IndexEngine& self, - const std::vector& data_list) { - pybind11::gil_scoped_release release; - return self.delete_data(data_list); - }, - "delete data from index") - .def( - "search", - [](vdb::IndexEngine& self, const vdb::SearchRequest& req) { - pybind11::gil_scoped_release release; - return self.search(req); - }, - "search") - .def( - "dump", - [](vdb::IndexEngine& self, const std::string& dir) { - pybind11::gil_scoped_release release; - return self.dump(dir); - }, - "dump index") - .def("get_state", &vdb::IndexEngine::get_state, "get index state"); - - py::class_(m, "VolatileStore") - .def(py::init<>()) - .def("exec_op", &vdb::VolatileStore::exec_op, "exec op") - .def( - "get_data", - [](vdb::VolatileStore& self, const std::vector& keys) { - std::vector cxx_bin_list = self.get_data(keys); - - py::list py_bytes_list; - for (auto& cxx_bin : cxx_bin_list) { - py_bytes_list.append(py::bytes(cxx_bin.data(), cxx_bin.size())); - } - return py_bytes_list; - }, - "get data") - .def("delete_data", &vdb::VolatileStore::delete_data, "delete data") - .def("put_data", &vdb::VolatileStore::put_data, "put data") - .def("clear_data", &vdb::VolatileStore::clear_data, "clear data") - .def( - "seek_range", - [](vdb::VolatileStore& self, const std::string& start_key, - const std::string& end_key) { - std::vector> cxx_kv_list = - self.seek_range(start_key, end_key); - py::list py_kv_list; - for (const auto& cxx_pair : cxx_kv_list) { - py::tuple py_pair(2); - py_pair[0] = cxx_pair.first; - py_pair[1] = - py::bytes(cxx_pair.second.data(), cxx_pair.second.size()); - py_kv_list.append(py_pair); - } - return py_kv_list; - }, - "seek range"); - - py::class_(m, "PersistStore") - .def(py::init()) - .def( - "exec_op", - [](vdb::PersistStore& self, const std::vector& ops) { - pybind11::gil_scoped_release release; - return self.exec_op(ops); - }, - "exec op") - .def( - "get_data", - [](vdb::PersistStore& self, const std::vector& keys) { - std::vector cxx_bin_list; - { - pybind11::gil_scoped_release release; - cxx_bin_list = self.get_data(keys); - } - - py::list py_bytes_list; - for (auto& cxx_bin : cxx_bin_list) { - py_bytes_list.append(py::bytes(cxx_bin.data(), cxx_bin.size())); - } - return py_bytes_list; - }, - "get data") - .def("delete_data", &vdb::PersistStore::delete_data, "delete data") - .def("put_data", &vdb::PersistStore::put_data, "put data") - .def("clear_data", &vdb::PersistStore::clear_data, "clear data") - .def( - "seek_range", - [](vdb::PersistStore& self, const std::string& start_key, - const std::string& end_key) { - std::vector> cxx_kv_list = - self.seek_range(start_key, end_key); - py::list py_kv_list; - - for (const auto& cxx_pair : cxx_kv_list) { - py::tuple py_pair(2); - py_pair[0] = cxx_pair.first; - py_pair[1] = - py::bytes(cxx_pair.second.data(), cxx_pair.second.size()); - py_kv_list.append(py_pair); - } - return py_kv_list; - }, - "seek range"); - - py::enum_(m, "StorageOpType") - .value("PUT", vdb::StorageOp::OpType::PUT_OP) - .value("DELETE", vdb::StorageOp::OpType::DELETE_OP); - - py::class_(m, "StorageOp") - .def(py::init<>()) - .def_readwrite("type", &vdb::StorageOp::type) - .def_readwrite("key", &vdb::StorageOp::key) - .def_readwrite("value", &vdb::StorageOp::value); -} diff --git a/tests/misc/test_abi3_packaging_config.py b/tests/misc/test_abi3_packaging_config.py new file mode 100644 index 000000000..2638c1c8b --- /dev/null +++ b/tests/misc/test_abi3_packaging_config.py @@ -0,0 +1,66 @@ +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def _read_text(relative_path: str) -> str: + return (REPO_ROOT / relative_path).read_text(encoding="utf-8") + + +def test_packaging_only_includes_abi3_engine_extensions(): + setup_py = _read_text("setup.py") + pyproject = _read_text("pyproject.toml") + + assert "storage/vectordb/engine/*.abi3.so" in setup_py + assert "storage/vectordb/engine/*.abi3.so" in pyproject + assert "storage/vectordb/engine/*.so" not in setup_py + assert "storage/vectordb/engine/*.so" not in pyproject + + +def test_release_workflows_default_to_single_cp310_and_drop_pybind11(): + build_workflow = _read_text(".github/workflows/_build.yml") + release_workflow = _read_text(".github/workflows/release.yml") + lite_workflow = _read_text(".github/workflows/_test_lite.yml") + full_workflow = _read_text(".github/workflows/_test_full.yml") + codeql_workflow = _read_text(".github/workflows/_codeql.yml") + uv_lock = _read_text("uv.lock") + + assert "default: '[\"3.10\"]'" in build_workflow + assert "default: '[\"3.10\"]'" in release_workflow + assert "python_json: ${{ inputs.python_json || '[\"3.10\"]' }}" in release_workflow + + for workflow_text in ( + build_workflow, + lite_workflow, + full_workflow, + codeql_workflow, + ): + assert "pybind11" not in workflow_text + assert 'name = "pybind11"' not in uv_lock + + +def test_release_build_workflow_no_longer_defines_extra_wheel_verify_jobs(): + build_workflow = _read_text(".github/workflows/_build.yml") + + assert "verify-linux-abi3-wheel:" not in build_workflow + assert "verify-macos-14-wheel-on-macos-15:" not in build_workflow + + +def test_abi3_backend_releases_gil_and_rejects_invalid_storage_op_type(): + backend_source = _read_text("src/abi3_engine_backend.cpp") + + assert "PyEval_SaveThread" in backend_source + assert "PyEval_RestoreThread" in backend_source + assert "Invalid storage op type" in backend_source + + +def test_repo_no_longer_contains_pybind11_engine_bindings(): + assert not (REPO_ROOT / "src" / "pybind11_interface.cpp").exists() + assert not (REPO_ROOT / "src" / "cpu_feature_probe.cpp").exists() + assert not (REPO_ROOT / "src" / "py_accessors.h").exists() + + +def test_python_engine_exports_only_live_abi3_api(): + import openviking.storage.vectordb.engine as engine + + assert not hasattr(engine, "FetchDataResult") diff --git a/tests/misc/test_vectordb_engine_loader.py b/tests/misc/test_vectordb_engine_loader.py index a3d9442be..7386727a6 100644 --- a/tests/misc/test_vectordb_engine_loader.py +++ b/tests/misc/test_vectordb_engine_loader.py @@ -27,6 +27,23 @@ def _load_engine_module( monkeypatch, *, machine, available_backends, cpu_variants, env_variant=None ): _install_package_stubs(monkeypatch) + for backend_name in available_backends: + monkeypatch.setitem( + sys.modules, + f"openviking.storage.vectordb.engine._{backend_name}", + types.SimpleNamespace( + BACKEND_NAME=backend_name, + IndexEngine=f"IndexEngine:{backend_name}", + PersistStore=f"PersistStore:{backend_name}", + VolatileStore=f"VolatileStore:{backend_name}", + ), + ) + + monkeypatch.setitem( + sys.modules, + "openviking.storage.vectordb.engine._x86_caps", + types.SimpleNamespace(get_supported_variants=lambda: list(cpu_variants)), + ) monkeypatch.setattr(platform, "machine", lambda: machine) if env_variant is None: @@ -38,22 +55,11 @@ def _load_engine_module( original_find_spec = importlib.util.find_spec def fake_import_module(name, package=None): - if package == "openviking.storage.vectordb.engine" and name == "._x86_caps": - caps = types.SimpleNamespace( - get_supported_variants=lambda: list(cpu_variants), - ) - return caps - if package == "openviking.storage.vectordb.engine" and name.startswith("._"): - backend_name = name[2:].lstrip("_") - if backend_name not in available_backends: - raise ModuleNotFoundError(name) - return types.SimpleNamespace( - BACKEND_NAME=backend_name, - IndexEngine=f"IndexEngine:{backend_name}", - PersistStore=f"PersistStore:{backend_name}", - VolatileStore=f"VolatileStore:{backend_name}", - ) + qualified_name = importlib.util.resolve_name(name, package) + if qualified_name in sys.modules: + return sys.modules[qualified_name] + raise ModuleNotFoundError(name) return original_import_module(name, package) @@ -83,6 +89,45 @@ def fake_find_spec(name, package=None): return module +def _load_engine_module_with_backend(monkeypatch, *, machine, backend_name, backend_module): + _install_package_stubs(monkeypatch) + monkeypatch.setattr(platform, "machine", lambda: machine) + monkeypatch.delenv("OV_ENGINE_VARIANT", raising=False) + monkeypatch.setitem( + sys.modules, + f"openviking.storage.vectordb.engine._{backend_name}", + backend_module, + ) + + original_import_module = importlib.import_module + original_find_spec = importlib.util.find_spec + + def fake_import_module(name, package=None): + if package == "openviking.storage.vectordb.engine" and name == f"._{backend_name}": + return backend_module + return original_import_module(name, package) + + def fake_find_spec(name, package=None): + fullname = importlib.util.resolve_name(name, package) if name.startswith(".") else name + if fullname == f"openviking.storage.vectordb.engine._{backend_name}": + return object() + return original_find_spec(name, package) + + monkeypatch.setattr(importlib, "import_module", fake_import_module) + monkeypatch.setattr(importlib.util, "find_spec", fake_find_spec) + + spec = importlib.util.spec_from_file_location( + "openviking.storage.vectordb.engine", + ENGINE_INIT, + submodule_search_locations=[str(ENGINE_INIT.parent)], + ) + module = importlib.util.module_from_spec(spec) + monkeypatch.setitem(sys.modules, "openviking.storage.vectordb.engine", module) + assert spec.loader is not None + spec.loader.exec_module(module) + return module + + def test_engine_loader_auto_selects_best_supported_x86_backend(monkeypatch): module = _load_engine_module( monkeypatch, @@ -118,3 +163,146 @@ def test_engine_loader_rejects_forced_unsupported_variant(monkeypatch): cpu_variants={"x86_sse3", "x86_avx2"}, env_variant="x86_avx512", ) + + +def test_engine_loader_wraps_abi3_backend_with_python_api(monkeypatch): + calls = [] + engine_handle = object() + store_handle = object() + schema_handle = object() + bytes_row_handle = object() + + backend = types.SimpleNamespace( + BACKEND_NAME="native", + _ENGINE_BACKEND_API="abi3-v1", + _new_schema=lambda fields: calls.append(("new_schema", fields)) or schema_handle, + _schema_get_total_byte_length=lambda handle: calls.append(("schema_total", handle)) or 123, + _new_bytes_row=lambda handle: calls.append(("new_bytes_row", handle)) or bytes_row_handle, + _bytes_row_serialize=lambda handle, row: calls.append(("bytes_row_serialize", handle, row)) + or b"blob", + _bytes_row_serialize_batch=lambda handle, rows: calls.append( + ("bytes_row_serialize_batch", handle, rows) + ) + or [b"blob-a", b"blob-b"], + _bytes_row_deserialize=lambda handle, payload: calls.append( + ("bytes_row_deserialize", handle, payload) + ) + or {"id": 42, "name": "viking"}, + _bytes_row_deserialize_field=lambda handle, payload, field_name: calls.append( + ("bytes_row_deserialize_field", handle, payload, field_name) + ) + or (42 if field_name == "id" else "viking"), + _new_index_engine=lambda config: calls.append(("new_index_engine", config)) + or engine_handle, + _index_engine_add_data=lambda handle, items: calls.append(("add_data", handle, items)) or 3, + _index_engine_delete_data=lambda handle, items: calls.append(("delete_data", handle, items)) + or 2, + _index_engine_search=lambda handle, req: calls.append(("search", handle, req)) + or { + "result_num": 2, + "labels": [101, 202], + "scores": [0.9, 0.8], + "extra_json": '{"ok":true}', + }, + _index_engine_dump=lambda handle, path: calls.append(("dump", handle, path)) or 11, + _index_engine_get_state=lambda handle: calls.append(("get_state", handle)) + or {"update_timestamp": 123, "element_count": 7}, + _new_persist_store=lambda path: calls.append(("new_persist_store", path)) or store_handle, + _new_volatile_store=lambda: calls.append(("new_volatile_store",)) or store_handle, + _store_exec_op=lambda handle, ops: calls.append(("store_exec_op", handle, ops)) or 1, + _store_get_data=lambda handle, keys: calls.append(("store_get_data", handle, keys)) + or [b"one", b"two"], + _store_put_data=lambda handle, keys, values: calls.append( + ("store_put_data", handle, keys, values) + ) + or 0, + _store_delete_data=lambda handle, keys: calls.append(("store_delete_data", handle, keys)) + or 0, + _store_clear_data=lambda handle: calls.append(("store_clear_data", handle)) or 0, + _store_seek_range=lambda handle, start, end: calls.append( + ("store_seek_range", handle, start, end) + ) + or [("aa", b"11"), ("ab", b"22")], + _init_logging=lambda level, output, fmt: calls.append(("init_logging", level, output, fmt)), + ) + + module = _load_engine_module_with_backend( + monkeypatch, + machine="arm64", + backend_name="native", + backend_module=backend, + ) + + assert module.ENGINE_VARIANT == "native" + assert module.AVAILABLE_ENGINE_VARIANTS == ("native",) + + schema = module.Schema( + [ + {"name": "id", "data_type": module.FieldType.int64, "id": 0}, + {"name": "name", "data_type": module.FieldType.string, "id": 1}, + ] + ) + assert schema.get_total_byte_length() == 123 + row = module.BytesRow(schema) + blob = row.serialize({"id": 42, "name": "viking"}) + assert blob == b"blob" + assert row.serialize_batch([{"id": 1}, {"id": 2}]) == [b"blob-a", b"blob-b"] + assert row.deserialize_field(blob, "id") == 42 + assert row.deserialize(blob)["name"] == "viking" + + module.init_logging("INFO", "stdout") + index = module.IndexEngine("config-json") + + add_req = module.AddDataRequest() + add_req.label = 9 + add_req.vector = [0.1, 0.2] + add_req.fields_str = '{"name":"x"}' + assert index.add_data([add_req]) == 3 + + search_req = module.SearchRequest() + search_req.query = [0.5, 0.4] + search_req.topk = 5 + search_req.dsl = "{}" + result = index.search(search_req) + assert result.labels == [101, 202] + assert result.scores == [0.9, 0.8] + assert result.extra_json == '{"ok":true}' + + state = index.get_state() + assert state.update_timestamp == 123 + assert state.element_count == 7 + assert state.data_count == 7 + + store = module.PersistStore("/tmp/test-store") + assert store.get_data(["k1", "k2"]) == [b"one", b"two"] + assert store.seek_range("a", "b") == [("aa", b"11"), ("ab", b"22")] + + op = module.StorageOp() + op.type = module.StorageOpType.PUT + op.key = "abc" + op.value = b"payload" + assert store.exec_op([op]) == 1 + + assert ("new_index_engine", "config-json") in calls + assert ("new_persist_store", "/tmp/test-store") in calls + search_calls = [entry for entry in calls if entry[0] == "search"] + assert len(search_calls) == 1 + _, handle, payload = search_calls[0] + assert handle is engine_handle + assert payload is search_req + assert payload.query == [0.5, 0.4] + assert payload.topk == 5 + assert payload.dsl == "{}" + + add_calls = [entry for entry in calls if entry[0] == "add_data"] + assert len(add_calls) == 1 + _, handle, payload = add_calls[0] + assert handle is engine_handle + assert payload == [add_req] + assert payload[0].fields_str == '{"name":"x"}' + + new_schema_calls = [entry for entry in calls if entry[0] == "new_schema"] + assert len(new_schema_calls) == 1 + assert ("schema_total", schema_handle) in calls + assert ("new_bytes_row", schema_handle) in calls + assert ("bytes_row_serialize", bytes_row_handle, {"id": 42, "name": "viking"}) in calls diff --git a/tests/vectordb/test_recall.py b/tests/vectordb/test_recall.py index fbc8de0a5..a38761ebe 100644 --- a/tests/vectordb/test_recall.py +++ b/tests/vectordb/test_recall.py @@ -503,9 +503,6 @@ def test_complex_schema_missing_fields(self): # For LocalCollection, non-vector fields are often serialized into a 'fields' JSON string or accessible directly if mapped. # We need to check if the data came back. - # NOTE: FetchDataResult structure: result_num, labels, scores, extra_json? - # Actually fetch_data returns a list of results. - print(f"Full Record Fetch: {res_full.items[0]}") res_min = collection.fetch_data(["2"]) diff --git a/uv.lock b/uv.lock index b5ca28fad..8b51bfe30 100644 --- a/uv.lock +++ b/uv.lock @@ -3466,7 +3466,6 @@ bot-telegram = [ build = [ { name = "build" }, { name = "cmake" }, - { name = "pybind11" }, { name = "setuptools" }, { name = "setuptools-scm" }, { name = "wheel" }, @@ -3563,7 +3562,6 @@ requires-dist = [ { name = "prompt-toolkit", marker = "extra == 'bot'", specifier = ">=3.0.0" }, { name = "protobuf", specifier = ">=6.33.5" }, { name = "py-machineid", marker = "extra == 'bot'", specifier = ">=1.0.0" }, - { name = "pybind11", marker = "extra == 'build'", specifier = ">=2.13.0" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "pydantic-settings", marker = "extra == 'bot'", specifier = ">=2.0.0" }, { name = "pygments", marker = "extra == 'bot'", specifier = ">=2.16.0" }, @@ -4324,15 +4322,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" }, ] -[[package]] -name = "pybind11" -version = "3.0.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a5/98/9118a0659646f1628c592ef9bb48e0056efa6bf27c951fd12a178e0136fb/pybind11-3.0.2.tar.gz", hash = "sha256:432f01aeb68e361a3a7fc7575c2c7f497595bf640f747acd909ff238dd766e06", size = 577131, upload-time = "2026-02-17T04:46:52.556Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/88/c5/e98d9c51f3d5300d5e40ad9037dd6b3b60736fd02ab68dcc98c96be7592d/pybind11-3.0.2-py3-none-any.whl", hash = "sha256:f8a6500548919cc33bcd220d5f984688326f574fa97f1107f2f4fdb4c6fb019f", size = 310158, upload-time = "2026-02-17T04:46:49.91Z" }, -] - [[package]] name = "pycparser" version = "3.0"