diff --git a/src/lightspeed_rag_content/document_processor.py b/src/lightspeed_rag_content/document_processor.py index 557da8c..90e09a0 100644 --- a/src/lightspeed_rag_content/document_processor.py +++ b/src/lightspeed_rag_content/document_processor.py @@ -540,10 +540,11 @@ async def upload_file(chunk_indices: list[int]) -> str: ) embedding = embedding_response.data[0].embedding + metadata = {**doc.get("metadata", {}), "source": index} chunk = { "content": doc["content"], "chunk_id": doc["chunk_id"], - "metadata": doc.get("metadata", {}), + "metadata": metadata, "chunk_metadata": doc["chunk_metadata"], "embedding": embedding, "embedding_model": embedding_model, @@ -613,6 +614,7 @@ async def _upload_and_process_files( # noqa: C901 # pylint: disable=R0912,R091 attributes = { **rag_doc.metadata, # type: ignore[union-attr] "document_id": doc_uuid, + "source": index, } vs_file = await client.vector_stores.files.create( vector_store_id=vector_store.id, diff --git a/tests/test_document_processor_llama_stack.py b/tests/test_document_processor_llama_stack.py index 84aefa6..b2b234b 100644 --- a/tests/test_document_processor_llama_stack.py +++ b/tests/test_document_processor_llama_stack.py @@ -557,6 +557,12 @@ def test_save_manual_chunking(self, mocker, llama_stack_processor): assert call_kwargs["vector_store_id"] == "vs_123" assert "chunks" in call_kwargs assert len(call_kwargs["chunks"]) == 2 + # Verify index name is embedded in chunk metadata as "source" + # and existing metadata keys are preserved + for chunk in call_kwargs["chunks"]: + assert chunk["metadata"]["source"] == mock.sentinel.index + assert "title" in chunk["metadata"] + assert "docs_url" in chunk["metadata"] def test_save_auto_chunking(self, mocker, llama_stack_processor): """Test saving documents with automatic chunking workflow.""" @@ -566,3 +572,9 @@ def test_save_auto_chunking(self, mocker, llama_stack_processor): # Verify files.create was called for each document (single file upload) assert client.files.create.await_count == 2 assert client.vector_stores.files.create.await_count == 2 + # Verify index name is embedded in file attributes as "source" + # and existing metadata keys are preserved + for call in client.vector_stores.files.create.await_args_list: + assert call.kwargs["attributes"]["source"] == mock.sentinel.index + assert "title" in call.kwargs["attributes"] + assert "document_id" in call.kwargs["attributes"]