From a958f00b4b533244000b4d69e0c9b3d062434c80 Mon Sep 17 00:00:00 2001 From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com> Date: Mon, 30 Mar 2026 23:51:09 -0700 Subject: [PATCH] feat(embedding): surface non-symmetric embedding config for VikingDB provider VikingDB embedders accepted is_query but ignored it. Now VikingDBDenseEmbedder and VikingDBHybridEmbedder accept query_param/document_param and pass input_type to the API when non-symmetric mode is configured. - Add query_param/document_param to VikingDB Dense and Hybrid constructors - Add _resolve_input_type() to select query vs document param - Pass input_type in _call_api data items when set - Wire factory entries to pass config params through - Sparse embedder unchanged (sparse models are symmetric) Closes #655 Co-Authored-By: Claude Opus 4.6 (1M context) --- .../models/embedder/vikingdb_embedders.py | 45 ++++++++++++-- .../utils/config/embedding_config.py | 4 ++ .../embedder/test_vikingdb_nonsymmetric.py | 60 +++++++++++++++++++ 3 files changed, 104 insertions(+), 5 deletions(-) create mode 100644 tests/unit/embedder/test_vikingdb_nonsymmetric.py diff --git a/openviking/models/embedder/vikingdb_embedders.py b/openviking/models/embedder/vikingdb_embedders.py index a6042316d..a8183461d 100644 --- a/openviking/models/embedder/vikingdb_embedders.py +++ b/openviking/models/embedder/vikingdb_embedders.py @@ -39,11 +39,15 @@ def _call_api( texts: List[str], dense_model: Dict[str, Any] = None, sparse_model: Optional[Dict[str, Any]] = None, + input_type: Optional[str] = None, ) -> List[Dict[str, Any]]: """Call VikingDB Embedding API""" path = "/api/vikingdb/embedding" data_items = [{"text": text} for text in texts] + if input_type is not None: + for item in data_items: + item["input_type"] = input_type req_body = {"data": data_items} if dense_model: @@ -115,6 +119,8 @@ def __init__( dimension: Optional[int] = None, embedding_type: str = "text", config: Optional[Dict[str, Any]] = None, + query_param: Optional[str] = None, + document_param: Optional[str] = None, ): DenseEmbedderBase.__init__(self, model_name, config) self._init_vikingdb_client(ak, sk, region, host) @@ -122,10 +128,22 @@ def __init__( self.dimension = dimension self.embedding_type = embedding_type self.dense_model = {"name": model_name, "version": model_version, "dim": dimension} + self.query_param = query_param + self.document_param = document_param + + def _resolve_input_type(self, is_query: bool) -> Optional[str]: + """Return the input_type value for query or document side, or None for symmetric mode.""" + if is_query and self.query_param is not None: + return self.query_param + if not is_query and self.document_param is not None: + return self.document_param + return None def embed(self, text: str, is_query: bool = False) -> EmbedResult: + input_type = self._resolve_input_type(is_query) + def _call() -> EmbedResult: - results = self._call_api([text], dense_model=self.dense_model) + results = self._call_api([text], dense_model=self.dense_model, input_type=input_type) if not results: return EmbedResult(dense_vector=[]) @@ -154,9 +172,10 @@ def _call() -> EmbedResult: def embed_batch(self, texts: List[str], is_query: bool = False) -> List[EmbedResult]: if not texts: return [] + input_type = self._resolve_input_type(is_query) def _call() -> List[EmbedResult]: - raw_results = self._call_api(texts, dense_model=self.dense_model) + raw_results = self._call_api(texts, dense_model=self.dense_model, input_type=input_type) return [ EmbedResult( dense_vector=self._truncate_and_normalize( @@ -277,6 +296,8 @@ def __init__( dimension: Optional[int] = None, embedding_type: str = "text", config: Optional[Dict[str, Any]] = None, + query_param: Optional[str] = None, + document_param: Optional[str] = None, ): HybridEmbedderBase.__init__(self, model_name, config) self._init_vikingdb_client(ak, sk, region, host) @@ -288,11 +309,24 @@ def __init__( "name": model_name, "version": model_version, } + self.query_param = query_param + self.document_param = document_param + + def _resolve_input_type(self, is_query: bool) -> Optional[str]: + """Return the input_type value for query or document side, or None for symmetric mode.""" + if is_query and self.query_param is not None: + return self.query_param + if not is_query and self.document_param is not None: + return self.document_param + return None def embed(self, text: str, is_query: bool = False) -> EmbedResult: + input_type = self._resolve_input_type(is_query) + def _call() -> EmbedResult: results = self._call_api( - [text], dense_model=self.dense_model, sparse_model=self.sparse_model + [text], dense_model=self.dense_model, sparse_model=self.sparse_model, + input_type=input_type, ) if not results: return EmbedResult(dense_vector=[], sparse_vector={}) @@ -300,7 +334,6 @@ def _call() -> EmbedResult: item = results[0] dense_vector = [] sparse_vector = {} - if "dense" in item: dense_vector = self._truncate_and_normalize(item["dense"], self.dimension) if "sparse" in item: @@ -326,10 +359,12 @@ def _call() -> EmbedResult: def embed_batch(self, texts: List[str], is_query: bool = False) -> List[EmbedResult]: if not texts: return [] + input_type = self._resolve_input_type(is_query) def _call() -> List[EmbedResult]: raw_results = self._call_api( - texts, dense_model=self.dense_model, sparse_model=self.sparse_model + texts, dense_model=self.dense_model, sparse_model=self.sparse_model, + input_type=input_type, ) results = [] for item in raw_results: diff --git a/openviking_cli/utils/config/embedding_config.py b/openviking_cli/utils/config/embedding_config.py index 2392198c9..c922eefa3 100644 --- a/openviking_cli/utils/config/embedding_config.py +++ b/openviking_cli/utils/config/embedding_config.py @@ -408,6 +408,8 @@ def _create_embedder( "dimension": cfg.dimension, "input_type": cfg.input, "config": {"max_retries": self.max_retries}, + **({"query_param": cfg.query_param} if cfg.query_param else {}), + **({"document_param": cfg.document_param} if cfg.document_param else {}), }, ), ("vikingdb", "sparse"): ( @@ -434,6 +436,8 @@ def _create_embedder( "dimension": cfg.dimension, "input_type": cfg.input, "config": {"max_retries": self.max_retries}, + **({"query_param": cfg.query_param} if cfg.query_param else {}), + **({"document_param": cfg.document_param} if cfg.document_param else {}), }, ), ("jina", "dense"): ( diff --git a/tests/unit/embedder/test_vikingdb_nonsymmetric.py b/tests/unit/embedder/test_vikingdb_nonsymmetric.py new file mode 100644 index 000000000..3ce630b41 --- /dev/null +++ b/tests/unit/embedder/test_vikingdb_nonsymmetric.py @@ -0,0 +1,60 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +"""Tests for VikingDB non-symmetric embedding support.""" + +from unittest.mock import patch + +import pytest + +from openviking.models.embedder.vikingdb_embedders import ( + VikingDBDenseEmbedder, + VikingDBHybridEmbedder, +) + + +@pytest.fixture +def mock_vikingdb_client(): + """Patch VikingDB client initialization.""" + with patch.object( + VikingDBDenseEmbedder, "_init_vikingdb_client", return_value=None + ) as mock_init: + mock_init.side_effect = lambda *args, **kwargs: None + yield mock_init + + +def test_dense_resolve_input_type_symmetric(): + """When no query_param/document_param, input_type is None (symmetric).""" + embedder = VikingDBDenseEmbedder.__new__(VikingDBDenseEmbedder) + embedder.query_param = None + embedder.document_param = None + assert embedder._resolve_input_type(is_query=True) is None + assert embedder._resolve_input_type(is_query=False) is None + + +def test_dense_resolve_input_type_nonsymmetric(): + """When query_param/document_param set, return correct value for is_query.""" + embedder = VikingDBDenseEmbedder.__new__(VikingDBDenseEmbedder) + embedder.query_param = "query" + embedder.document_param = "passage" + assert embedder._resolve_input_type(is_query=True) == "query" + assert embedder._resolve_input_type(is_query=False) == "passage" + + +def test_hybrid_resolve_input_type_nonsymmetric(): + """Hybrid embedder also resolves input_type correctly.""" + embedder = VikingDBHybridEmbedder.__new__(VikingDBHybridEmbedder) + embedder.query_param = "search_query" + embedder.document_param = "search_document" + assert embedder._resolve_input_type(is_query=True) == "search_query" + assert embedder._resolve_input_type(is_query=False) == "search_document" + + +def test_dense_backward_compat_no_params(): + """VikingDBDenseEmbedder without query_param/document_param works.""" + embedder = VikingDBDenseEmbedder.__new__(VikingDBDenseEmbedder) + embedder.query_param = None + embedder.document_param = None + embedder.model_name = "test" + embedder.dimension = 1024 + # Should not raise + assert embedder._resolve_input_type(is_query=True) is None