Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 85 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,86 @@
to run the scripts make sure you have installed all deps with:
`pip install behave chromadb python-dotenv`
## Vector database utilities

to run tests run:
`behave`
This project provides a tiny toolkit for managing a [Chroma](https://docs.trychroma.com/)
vector store. The utilities support persisting embeddings locally, executing
semantic search queries and cleaning up stored documents.

### Installation

Install the required Python dependencies:

```bash
pip install chromadb python-dotenv pytest
```

### Configuration

The behaviour of the service can be tuned through environment variables or CLI
flags:

| Variable | Description | Default |
| --- | --- | --- |
| `VECTOR_PERSIST_DIRECTORY` | Directory that stores the Chroma database | `db` |
| `VECTOR_COLLECTION_NAME` | Target collection name | `lake` |
| `VECTOR_EMBEDDING_BACKEND` | `openai` (requires `OPENAI_KEY`) or `simple` | `openai` |
| `VECTOR_OPENAI_MODEL` | Optional override of the OpenAI embedding model | `text-embedding-ada-002` |

Create a `.env` file if you prefer storing the configuration locally. For
example:

```
VECTOR_PERSIST_DIRECTORY=./db
VECTOR_COLLECTION_NAME=my_collection
VECTOR_EMBEDDING_BACKEND=simple
```

### Command line usage

All features are exposed via the `vector_service` CLI:

```bash
# Ingest a document
python -m vector_service ingest \
    --document "Chocolate chip cookies" \
    --metadata '{"category": "dessert", "rating": 5}' \
    --id recipe-1

# Run a semantic query
python -m vector_service query --text "cookie recipe" --top-k 3

# Delete stored documents
python -m vector_service delete recipe-1 recipe-2
```

Metadata values can be provided as JSON (shown above) or as comma-separated
`key=value` pairs such as `--metadata category=dessert,rating=5`.

The legacy ingestion entry-point still exists for compatibility:

```bash
python add_documents.py "Chocolate chip cookies" '{"category": "dessert"}' recipe-1
```

### Programmatic usage

The `vector_service` module exposes helpers that can be imported from Python
code:

```python
from vector_service import add_documents, query_collection, delete_documents

add_documents(
    ["Chocolate chip cookies"],
    [{"category": "dessert"}],
    ["recipe-1"],
)
results = query_collection("cookies", top_k=2)
delete_documents(["recipe-1"])
```

### Tests

Run the integration tests with:

```bash
pytest
```
98 changes: 37 additions & 61 deletions add_documents.py
Original file line number Diff line number Diff line change
@@ -1,70 +1,46 @@
import chromadb
from chromadb.utils import embedding_functions
from dotenv import load_dotenv
import os
"""Backward compatible ingestion script.

This wrapper keeps the historical ``add_documents.py`` entry-point functional
while delegating the heavy lifting to :mod:`vector_service`. Newer workflows
should prefer the richer CLI available via ``python -m vector_service``.
"""

import json
import sys
from typing import Any, Dict

def load_openai_key():
    """Return the OpenAI API key from the environment.

    Reads ``OPENAI_KEY`` after loading any ``.env`` file found by
    :func:`dotenv.load_dotenv`.

    Returns:
        The non-empty API key string.

    Raises:
        ValueError: If ``OPENAI_KEY`` is unset or empty.
    """
    # Load variables from .env file into environment
    load_dotenv()
    openai_key = os.environ.get('OPENAI_KEY')
    if not openai_key:
        raise ValueError("OPENAI_KEY is not set in the .env file.")
    return openai_key
from vector_service import VectorConfig, add_documents, get_config

def create_openai_ef(api_key):
    """Build a Chroma embedding function backed by the OpenAI API.

    Args:
        api_key: OpenAI API key used to authenticate embedding requests.

    Returns:
        An ``OpenAIEmbeddingFunction`` configured with the
        ``text-embedding-ada-002`` model.
    """
    # Using OpenAI Embeddings. This assumes you have the openai package installed
    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=api_key,
        model_name="text-embedding-ada-002"
    )
    return openai_ef

def create_or_get_collection(client):
    """Return the "lake" collection from *client*, creating it if absent.

    Args:
        client: A Chroma client (e.g. ``chromadb.PersistentClient``).
    """
    # Create a new chroma collection
    collection_name = "lake"
    return client.get_or_create_collection(name=collection_name)
def _parse_metadata(raw: str) -> Dict[str, Any]:
raw = raw.strip()
if not raw:
return {}
if raw.startswith("{"):
return json.loads(raw)
metadata: Dict[str, Any] = {}
for item in raw.split(","):
if not item:
continue
key, _, value = item.partition("=")
metadata[key.strip()] = value.strip()
return metadata

def add_to_openai_collection(collection, documents, metadatas, ids):
    """Add documents to *collection*, reporting the outcome via ``print``.

    Args:
        collection: Target Chroma collection.
        documents: Document texts to embed and store.
        metadatas: Metadata dicts aligned with ``documents``.
        ids: Unique document ids aligned with ``documents``.
    """
    try:
        collection.add(
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )
        print("Documents added to the collection successfully.")
    except Exception as e:
        # NOTE(review): the broad catch turns any failure into a printed
        # message, so callers cannot detect errors programmatically.
        print(f"Error occurred while adding documents: {e}")

if __name__ == "__main__":
try:
# Check if three command-line arguments are provided
if len(sys.argv) != 4:
raise ValueError("Usage: python script.py <documents> <metadatas> <ids>")

# Extract the command-line arguments as strings
documents = sys.argv[1]
metadatas = sys.argv[2]
ids = sys.argv[3]

# Create a new Chroma client with persistence enabled.
persist_directory = "db" # this path for the db could be an arg
client = chromadb.PersistentClient(path=persist_directory)

# Load the OpenAI key
openai_key = load_openai_key()

# Create/Open OpenAI Embedding Function
openai_ef = create_openai_ef(api_key=openai_key)

# Create or get the Chroma collection
openai_collection = create_or_get_collection(client)

# Call the function with the provided arguments
add_to_openai_collection(openai_collection, documents, metadatas, ids)
except ValueError as ve:
print(ve)
except chromadb.ChromaDBError as cde:
print(f"ChromaDBError: {cde}")
except Exception as e:
print(f"An unexpected error occurred: {e}")
raise ValueError(
"Usage: python add_documents.py <document> <metadata> <id>"
)

document, metadata_raw, identifier = sys.argv[1:4]
metadata = _parse_metadata(metadata_raw)
config: VectorConfig = get_config()
add_documents([document], [metadata], [identifier], config=config)
print(
f"Document '{identifier}' ingested into collection"
f" '{config.collection_name}'."
)
except Exception as exc:
print(f"Error: {exc}")
46 changes: 46 additions & 0 deletions tests/test_vector_service_integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import pathlib
import sys

import pytest

sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[1]))

import vector_service


@pytest.fixture(autouse=True)
def _reset_env(monkeypatch):
    """Strip vector-service environment overrides before every test."""
    managed_variables = (
        "VECTOR_PERSIST_DIRECTORY",
        "VECTOR_COLLECTION_NAME",
        "VECTOR_EMBEDDING_BACKEND",
        "OPENAI_KEY",
    )
    for variable in managed_variables:
        monkeypatch.delenv(variable, raising=False)


def test_ingest_query_and_delete(tmp_path, monkeypatch):
    """End-to-end flow: ingest two documents, query them, delete one."""
    # Point the service at an isolated on-disk store and select the cheap
    # deterministic "simple" embedding backend (no OpenAI key needed).
    storage_dir = tmp_path / "chromadb"
    monkeypatch.setenv("VECTOR_PERSIST_DIRECTORY", str(storage_dir))
    monkeypatch.setenv("VECTOR_COLLECTION_NAME", "integration_tests")
    monkeypatch.setenv("VECTOR_EMBEDDING_BACKEND", "simple")

    documents = ["Chocolate chip cookies", "Freshly baked bread"]
    document_metadata = [
        {"category": "dessert", "rating": 5},
        {"category": "bakery", "rating": 4},
    ]
    document_ids = ["doc_1", "doc_2"]
    vector_service.add_documents(documents, document_metadata, document_ids)

    # The dessert document should be the closest match for "chocolate".
    hits = vector_service.query_collection("chocolate", top_k=1)
    assert "metadatas" in hits
    assert hits["ids"][0][0] == "doc_1"
    assert hits["metadatas"][0][0]["category"] == "dessert"

    # After deleting doc_1, only doc_2 should remain retrievable.
    vector_service.delete_documents(["doc_1"])
    remaining = vector_service.query_collection("bread", top_k=2)
    surviving_ids = remaining["ids"][0]
    assert "doc_1" not in surviving_ids
    assert "doc_2" in surviving_ids
Loading