-
Notifications
You must be signed in to change notification settings - Fork 5.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Inconsistent Embedding Reuse Behavior in VectorStoreIndex with DenseXRetrievalPack #14417
Comments
The discrepancy between To configure def monkey_query(
self,
query: VectorStoreQuery,
**kwargs: Any,
):
"""Get nodes for response."""
# Prevent metadata filtering on stores that were persisted without metadata.
if (
query.filters is not None
and self._data.embedding_dict
and not self._data.metadata_dict
):
raise ValueError(
"Cannot filter stores that were persisted without metadata. "
"Please rebuild the store with metadata to enable filtering."
)
# Prefilter nodes based on the query filter and node ID restrictions.
query_filter_fn = _build_metadata_filter_fn(
lambda node_id: self._data.metadata_dict[node_id], query.filters
)
if query.node_ids is not None:
available_ids = set(query.node_ids)
def node_filter_fn(node_id: str) -> bool:
return node_id in available_ids
else:
def node_filter_fn(node_id: str) -> bool:
return True
node_ids = []
embeddings = []
# TODO: consolidate with get_query_text_embedding_similarities
for node_id, embedding in self._data.embedding_dict.items():
if node_filter_fn(node_id) and query_filter_fn(node_id):
node_ids.append(node_id)
embeddings.append(embedding)
query_embedding = cast(List[float], query.query_embedding)
if query.mode in LEARNER_MODES:
top_similarities, top_ids = get_top_k_embeddings_learner(
query_embedding,
embeddings,
similarity_top_k=query.similarity_top_k,
embedding_ids=node_ids,
)
elif query.mode == MMR_MODE:
mmr_threshold = kwargs.get("mmr_threshold", None)
top_similarities, top_ids = get_top_k_mmr_embeddings(
query_embedding,
embeddings,
similarity_top_k=query.similarity_top_k,
embedding_ids=node_ids,
mmr_threshold=mmr_threshold,
)
elif query.mode == VectorStoreQueryMode.DEFAULT:
top_similarities, top_ids = get_top_k_embeddings(
query_embedding,
embeddings,
similarity_top_k=query.similarity_top_k,
embedding_ids=node_ids,
)
elif query.mode == VectorStoreQueryMode.HYBRID: # where I made my changes
top_similarities, top_ids = get_top_k_embeddings(
query_embedding,
embeddings,
similarity_top_k=query.similarity_top_k,
embedding_ids=node_ids,
)
else:
raise ValueError(f"Invalid query mode: {query.mode}")
return VectorStoreQueryResult(similarities=top_similarities, ids=top_ids) Ensure that the Additionally, the class VectorIndexRetriever(BaseRetriever):
def __init__(
    self,
    index: VectorStoreIndex,
    similarity_top_k: int = DEFAULT_SIMILARITY_TOP_K,
    vector_store_query_mode: VectorStoreQueryMode = VectorStoreQueryMode.DEFAULT,
    filters: Optional[MetadataFilters] = None,
    alpha: Optional[float] = None,
    node_ids: Optional[List[str]] = None,
    doc_ids: Optional[List[str]] = None,
    sparse_top_k: Optional[int] = None,
    callback_manager: Optional[CallbackManager] = None,
    object_map: Optional[dict] = None,
    verbose: bool = False,
    **kwargs: Any,
) -> None:
    """Set up a retriever bound to ``index``.

    Args:
        index: Index whose vector store, service context and docstore are used.
        similarity_top_k: Stored as the top-k setting for queries.
        vector_store_query_mode: Query mode; coerced to ``VectorStoreQueryMode``.
        filters: Optional metadata filters stored for queries.
        alpha: Optional alpha value stored for queries.
        node_ids: Optional node ID restriction stored for queries.
        doc_ids: Optional document ID restriction stored for queries.
        sparse_top_k: Optional sparse top-k setting stored for queries.
        callback_manager: Forwarded to the base retriever.
        object_map: Forwarded to the base retriever.
        verbose: Forwarded to the base retriever.
        **kwargs: Only ``vector_store_kwargs`` is read (default: empty dict);
            it is kept as the extra keyword arguments for vector store calls.
    """
    # Handles taken from the index.
    self._index = index
    self._vector_store = index.vector_store
    self._service_context = index.service_context
    self._docstore = index.docstore

    # Query settings.
    self._similarity_top_k = similarity_top_k
    self._vector_store_query_mode = VectorStoreQueryMode(vector_store_query_mode)
    self._alpha = alpha
    self._node_ids = node_ids
    self._doc_ids = doc_ids
    self._filters = filters
    self._sparse_top_k = sparse_top_k

    vector_store_kwargs: Dict[str, Any] = kwargs.get("vector_store_kwargs", {})
    self._kwargs = vector_store_kwargs

    super().__init__(
        callback_manager=callback_manager,
        object_map=object_map,
        verbose=verbose,
    )
def _build_vector_store_query(
    self, query_bundle_with_embeddings: QueryBundle
) -> VectorStoreQuery:
    """Assemble a ``VectorStoreQuery`` from the bundle and the retriever settings.

    The embedding and query string come from ``query_bundle_with_embeddings``;
    every other field is taken from the values stored on this retriever.
    """
    bundle = query_bundle_with_embeddings
    query_args = {
        "query_embedding": bundle.embedding,
        "similarity_top_k": self._similarity_top_k,
        "node_ids": self._node_ids,
        "doc_ids": self._doc_ids,
        "query_str": bundle.query_str,
        "mode": self._vector_store_query_mode,
        "alpha": self._alpha,
        "filters": self._filters,
        "sparse_top_k": self._sparse_top_k,
    }
    return VectorStoreQuery(**query_args)
def _build_node_list_from_query_result(
    self, query_result: VectorStoreQueryResult
) -> List[NodeWithScore]:
    """Convert a raw vector store result into a list of scored nodes.

    If the result carries no nodes, they are loaded from the docstore using
    the returned ids, mapped through the index struct's ``nodes_dict``. If it
    does carry nodes, an entry is replaced by its docstore copy when the
    vector store does not store text or the entry's source node is not a text
    node, provided the docstore has a document with that node ID.
    ``query_result.nodes`` is updated in place either way.

    Args:
        query_result: Result returned by the vector store query.

    Returns:
        One ``NodeWithScore`` per node, scored from ``query_result.similarities``
        when present and ``None`` otherwise.

    Raises:
        ValueError: If the result has neither nodes nor ids.
    """
    if query_result.nodes is not None:
        for pos, candidate in enumerate(query_result.nodes):
            source_node = candidate.source_node
            needs_docstore_copy = (not self._vector_store.stores_text) or (
                source_node is not None and source_node.node_type != ObjectType.TEXT
            )
            if not needs_docstore_copy:
                continue
            node_id = candidate.node_id
            if self._docstore.document_exists(node_id):
                query_result.nodes[pos] = self._docstore.get_node(
                    node_id
                )  # type: ignore[index]
    elif query_result.ids is None:
        raise ValueError(
            "Vector store query result should return at "
            "least one of nodes or ids."
        )
    else:
        assert isinstance(self._index.index_struct, IndexDict)
        node_ids = [
            self._index.index_struct.nodes_dict[idx] for idx in query_result.ids
        ]
        query_result.nodes = self._docstore.get_nodes(node_ids)

    log_vector_store_query_result(query_result)

    similarities = query_result.similarities
    # Index into similarities (rather than zip) so a length mismatch still raises.
    return [
        NodeWithScore(
            node=node,
            score=None if similarities is None else similarities[ind],
        )
        for ind, node in enumerate(query_result.nodes)
    ]
def _get_nodes_with_embeddings(
    self, query_bundle_with_embeddings: QueryBundle
) -> List[NodeWithScore]:
    """Query the vector store synchronously and return the scored nodes.

    The stored ``vector_store_kwargs`` are forwarded to the store's ``query``.
    """
    store_query = self._build_vector_store_query(query_bundle_with_embeddings)
    raw_result = self._vector_store.query(store_query, **self._kwargs)
    return self._build_node_list_from_query_result(raw_result)
async def _aget_nodes_with_embeddings(
self, query_bundle_with_embeddings: QueryBundle
) -> List[NodeWithScore]:
query = self._build_vector_store_query(query_bundle_with_embeddings)
query_result = await self._vector_store.aquery(query, **self._kwargs)
return self._build_node_list_from_query_result(query_result) By using the |
Question Validation
Question
In the DenseXRetrievalPack class, I'm encountering an issue with VectorStoreIndex when using different sets of nodes (nodes versus all_nodes).
Relevant Code Lines:
Issue:
The issue occurs with both ChromaDB and Elasticsearch.
When using nodes alone, the embeddings can be reused without any issue.
The problem arises specifically when all_nodes is used, which includes both nodes and sub_nodes, resulting in the error:
ValueError: Query id d28e3de4-1c4f-420a-9184-97bf8556b11b not found in either retriever_dict or query_engine_dict.
Expected Behavior:
I expect VectorStoreIndex to handle querying using embeddings from all_nodes (nodes + sub_nodes) without errors, similar to when using nodes.
Questions:
Why does the error occur when using all_nodes but not with nodes alone?
How can I modify the code to successfully reuse embeddings with all_nodes?
Is there a specific way to handle all_nodes with ChromaDB and Elasticsearch to avoid this issue?
The text was updated successfully, but these errors were encountered: