Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 85 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,86 @@
to run the scripts make sure you have installed all deps with:
`pip install behave chromadb python-dotenv`
## Vector database utilities

to run tests run:
`behave`
This project provides a tiny toolkit for managing a [Chroma](https://docs.trychroma.com/)
vector store. The utilities support persisting embeddings locally, executing
semantic search queries and cleaning up stored documents.

### Installation

Install the required Python dependencies:

```bash
pip install chromadb python-dotenv pytest
```

### Configuration

The behaviour of the service can be tuned through environment variables or CLI
flags:

| Variable | Description | Default |
| --- | --- | --- |
| `VECTOR_PERSIST_DIRECTORY` | Directory that stores the Chroma database | `db` |
| `VECTOR_COLLECTION_NAME` | Target collection name | `lake` |
| `VECTOR_EMBEDDING_BACKEND` | `openai` (requires `OPENAI_KEY`) or `simple` | `openai` |
| `VECTOR_OPENAI_MODEL` | Optional override of the OpenAI embedding model | `text-embedding-ada-002` |

Create a `.env` file if you prefer storing the configuration locally. For
example:

```
VECTOR_PERSIST_DIRECTORY=./db
VECTOR_COLLECTION_NAME=my_collection
VECTOR_EMBEDDING_BACKEND=simple
```

### Command line usage

All features are exposed via the `vector_service` CLI:

```bash
# Ingest a document
python -m vector_service ingest \
    --document "Chocolate chip cookies" \
    --metadata '{"category": "dessert", "rating": 5}' \
    --id recipe-1

# Run a semantic query
python -m vector_service query --text "cookie recipe" --top-k 3

# Delete stored documents
python -m vector_service delete recipe-1 recipe-2
```

Metadata values can be provided as JSON (shown above) or as comma-separated
`key=value` pairs such as `--metadata category=dessert,rating=5`.

The legacy ingestion entry-point still exists for compatibility:

```bash
python add_documents.py "Chocolate chip cookies" '{"category": "dessert"}' recipe-1
```

### Programmatic usage

The `vector_service` module exposes helpers that can be imported from Python
code:

```python
from vector_service import add_documents, query_collection, delete_documents

add_documents(
    ["Chocolate chip cookies"],
    [{"category": "dessert"}],
    ["recipe-1"],
)
results = query_collection("cookies", top_k=2)
delete_documents(["recipe-1"])
```

### Tests

Run the integration tests with:

```bash
pytest
```
98 changes: 37 additions & 61 deletions add_documents.py
Original file line number Diff line number Diff line change
@@ -1,70 +1,46 @@
import chromadb
from chromadb.utils import embedding_functions
from dotenv import load_dotenv
import os
"""Backward compatible ingestion script.

This wrapper keeps the historical ``add_documents.py`` entry-point functional
while delegating the heavy lifting to :mod:`vector_service`. Newer workflows
should prefer the richer CLI available via ``python -m vector_service``.
"""

import json
import sys
from typing import Any, Dict

def load_openai_key():
    """Return the OpenAI API key from the environment.

    Reads ``OPENAI_KEY`` after loading any ``.env`` file found by
    :func:`dotenv.load_dotenv`.

    Returns:
        The non-empty API key string.

    Raises:
        ValueError: If ``OPENAI_KEY`` is unset or empty.
    """
    # Load variables from .env file into environment
    load_dotenv()
    openai_key = os.environ.get('OPENAI_KEY')
    if not openai_key:
        raise ValueError("OPENAI_KEY is not set in the .env file.")
    return openai_key
from vector_service import VectorConfig, add_documents, get_config

def create_openai_ef(api_key):
    """Build a Chroma embedding function backed by the OpenAI API.

    Args:
        api_key: OpenAI API key used to authenticate embedding requests.

    Returns:
        An ``OpenAIEmbeddingFunction`` configured with the
        ``text-embedding-ada-002`` model.
    """
    # Using OpenAI Embeddings. This assumes you have the openai package installed
    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=api_key,
        model_name="text-embedding-ada-002"
    )
    return openai_ef

def create_or_get_collection(client):
    """Return the "lake" collection from *client*, creating it if absent.

    Args:
        client: A Chroma client (e.g. ``chromadb.PersistentClient``).
    """
    # Create a new chroma collection
    collection_name = "lake"
    return client.get_or_create_collection(name=collection_name)
def _parse_metadata(raw: str) -> Dict[str, Any]:
raw = raw.strip()
if not raw:
return {}
if raw.startswith("{"):
return json.loads(raw)
metadata: Dict[str, Any] = {}
for item in raw.split(","):
if not item:
continue
key, _, value = item.partition("=")
metadata[key.strip()] = value.strip()
return metadata

def add_to_openai_collection(collection, documents, metadatas, ids):
    """Add documents to *collection*, reporting the outcome via ``print``.

    Args:
        collection: Target Chroma collection.
        documents: Document texts to embed and store.
        metadatas: Metadata dicts aligned with ``documents``.
        ids: Unique document ids aligned with ``documents``.
    """
    try:
        collection.add(
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )
        print("Documents added to the collection successfully.")
    except Exception as e:
        # NOTE(review): the broad catch turns any failure into a printed
        # message, so callers cannot detect errors programmatically.
        print(f"Error occurred while adding documents: {e}")

if __name__ == "__main__":
try:
# Check if three command-line arguments are provided
if len(sys.argv) != 4:
raise ValueError("Usage: python script.py <documents> <metadatas> <ids>")

# Extract the command-line arguments as strings
documents = sys.argv[1]
metadatas = sys.argv[2]
ids = sys.argv[3]

# Create a new Chroma client with persistence enabled.
persist_directory = "db" # this path for the db could be an arg
client = chromadb.PersistentClient(path=persist_directory)

# Load the OpenAI key
openai_key = load_openai_key()

# Create/Open OpenAI Embedding Function
openai_ef = create_openai_ef(api_key=openai_key)

# Create or get the Chroma collection
openai_collection = create_or_get_collection(client)

# Call the function with the provided arguments
add_to_openai_collection(openai_collection, documents, metadatas, ids)
except ValueError as ve:
print(ve)
except chromadb.ChromaDBError as cde:
print(f"ChromaDBError: {cde}")
except Exception as e:
print(f"An unexpected error occurred: {e}")
raise ValueError(
"Usage: python add_documents.py <document> <metadata> <id>"
)

document, metadata_raw, identifier = sys.argv[1:4]
metadata = _parse_metadata(metadata_raw)
config: VectorConfig = get_config()
add_documents([document], [metadata], [identifier], config=config)
print(
f"Document '{identifier}' ingested into collection"
f" '{config.collection_name}'."
)
except Exception as exc:
print(f"Error: {exc}")
46 changes: 46 additions & 0 deletions tests/test_vector_service_integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import pathlib
import sys

import pytest

sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[1]))

import vector_service


@pytest.fixture(autouse=True)
def _reset_env(monkeypatch):
    """Strip vector-service environment overrides before every test."""
    managed_variables = (
        "VECTOR_PERSIST_DIRECTORY",
        "VECTOR_COLLECTION_NAME",
        "VECTOR_EMBEDDING_BACKEND",
        "OPENAI_KEY",
    )
    for variable in managed_variables:
        monkeypatch.delenv(variable, raising=False)


def test_ingest_query_and_delete(tmp_path, monkeypatch):
    """End-to-end flow: ingest two documents, query them, delete one."""
    # Point the service at an isolated on-disk store and select the cheap
    # deterministic "simple" embedding backend (no OpenAI key needed).
    storage_dir = tmp_path / "chromadb"
    monkeypatch.setenv("VECTOR_PERSIST_DIRECTORY", str(storage_dir))
    monkeypatch.setenv("VECTOR_COLLECTION_NAME", "integration_tests")
    monkeypatch.setenv("VECTOR_EMBEDDING_BACKEND", "simple")

    documents = ["Chocolate chip cookies", "Freshly baked bread"]
    document_metadata = [
        {"category": "dessert", "rating": 5},
        {"category": "bakery", "rating": 4},
    ]
    document_ids = ["doc_1", "doc_2"]
    vector_service.add_documents(documents, document_metadata, document_ids)

    # The dessert document should be the closest match for "chocolate".
    hits = vector_service.query_collection("chocolate", top_k=1)
    assert "metadatas" in hits
    assert hits["ids"][0][0] == "doc_1"
    assert hits["metadatas"][0][0]["category"] == "dessert"

    # After deleting doc_1, only doc_2 should remain retrievable.
    vector_service.delete_documents(["doc_1"])
    remaining = vector_service.query_collection("bread", top_k=2)
    surviving_ids = remaining["ids"][0]
    assert "doc_1" not in surviving_ids
    assert "doc_2" in surviving_ids
Loading