Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
11a3850
feat: add self-query functionality
jirastorza Oct 9, 2025
f954bca
fix: modified self_query_prompt
jirastorza Oct 10, 2025
f0e66da
fix: modified self_query_prompt
jirastorza Oct 10, 2025
2e6c436
fix: code simplification
jirastorza Oct 14, 2025
3507ad5
fix: test rag
jirastorza Oct 14, 2025
238d3a1
fix: add self_query option to config and update tool calling logic.
jirastorza Oct 14, 2025
b8055da
fix: correct logger
jirastorza Oct 15, 2025
9e32790
fix: linting
jirastorza Oct 15, 2025
c8e4fa9
fix: simplify rag test.
jirastorza Oct 15, 2025
ff97cd2
fix: remove repetitive self_query call.
jirastorza Oct 15, 2025
e12ed5b
fix: move self_query to _search.py
jirastorza Oct 16, 2025
752ea2b
fix: modify test structure.
jirastorza Oct 16, 2025
b0b46a6
fix: allow list metadata values.
jirastorza Oct 16, 2025
5d575e9
fix: allow list type metadata handling.
jirastorza Oct 16, 2025
b32f070
fix: reduce MetadataValues to hashable types, modify document metadat…
jirastorza Oct 17, 2025
f937fe6
fix: adapt test.
jirastorza Oct 17, 2025
f68d1c7
fix: adapt test case to changes.
jirastorza Oct 17, 2025
ecbcae2
fix: additional test fix.
jirastorza Oct 17, 2025
fb5a01b
fix: database chunk and document metadata.
jirastorza Oct 17, 2025
15a6000
fix: update README.
jirastorza Oct 22, 2025
f20c512
Merge remote-tracking branch 'origin/main' into self-query
jirastorza Oct 28, 2025
1e10550
fix: ensure metadata is stored as proper JSON without escape characters
jirastorza Oct 29, 2025
723931d
fix: handle hex byte escape sequences in metadata filter values
jirastorza Oct 29, 2025
775cae3
fix: sanitize LLM metadata output to remove NULs and decode escaped c…
jirastorza Oct 30, 2025
1e3cb2d
docs: clarify comment explaining why LLM output is cleaned after extr…
jirastorza Oct 30, 2025
ed8558e
fix: remove metadata filter decoding
jirastorza Oct 30, 2025
5586c08
fix: decode escaped Unicode sequences in metadata_filter
jirastorza Oct 30, 2025
f83e57a
fix: encode query with ensure_ascii for consistent Unicode handling i…
jirastorza Oct 30, 2025
dc4e62a
feat: use ID-based metadata mapping for more reliable self-query extr…
jirastorza Oct 30, 2025
1e2c9a0
feat: use ID-based metadata mapping for more reliable self-query extr…
jirastorza Oct 30, 2025
f8225f5
fix: update self_query template for small model extraction
jirastorza Oct 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/raglite/_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
FloatMatrix,
FloatVector,
IndexId,
MetadataValue,
PickledObject,
)

Expand Down Expand Up @@ -438,6 +439,15 @@ def get(id_: str = "default", *, config: RAGLiteConfig | None = None) -> dict[st
return metadata


class Metadata(SQLModel, table=True):
    """A table for metadata values, linked to field names.

    Each row pairs one metadata field name with the list of values stored for that field.
    """

    __tablename__ = "metadata"

    # Metadata field name. It is the primary key, so there is at most one row per field name.
    name: str = Field(..., primary_key=True)
    # Values stored for this field, kept together in a single JSON column (empty list by default).
    values: list[MetadataValue] = Field(default_factory=list, sa_column=Column(JSON))


class Eval(SQLModel, table=True):
"""A RAG evaluation example."""

Expand Down
55 changes: 54 additions & 1 deletion src/raglite/_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,71 @@
from contextlib import nullcontext
from functools import partial
from pathlib import Path
from typing import TYPE_CHECKING

from filelock import FileLock
from sqlalchemy import text
from sqlalchemy.engine import make_url
from sqlalchemy.orm.attributes import flag_modified
from sqlmodel import Session, col, select
from tqdm.auto import tqdm

from raglite._config import RAGLiteConfig
from raglite._database import Chunk, ChunkEmbedding, Document, create_database_engine
from raglite._database import Chunk, ChunkEmbedding, Document, Metadata, create_database_engine
from raglite._embed import embed_strings, embed_strings_without_late_chunking, embedding_type
from raglite._split_chunklets import split_chunklets
from raglite._split_chunks import split_chunks
from raglite._split_sentences import split_sentences

if TYPE_CHECKING:
from raglite._typing import MetadataValue

METADATA_EXCLUDED_FIELDS = ["filename", "uri", "url", "size", "created", "modified"]


def _update_metadata_from_documents( # noqa: C901
documents: list[Document],
*,
session: Session,
metadata_excluded_fields: list[str] = METADATA_EXCLUDED_FIELDS,
) -> None:
"""Update the metadata table with new metadata from documents metadata."""
if not documents:
return
# Aggregate metadata values from all documents.
metadata: dict[str, list[MetadataValue]] = {}
for doc in documents:
for key, value in doc.metadata_.items():
if key in metadata_excluded_fields:
continue
if key not in metadata:
metadata[key] = []
if value not in metadata[key]:
metadata[key].append(value)
# Fetch all existing database metadata records
existing_metadata = {
record.name: record
for record in session.exec(
select(Metadata).where(col(Metadata.name).in_(list(metadata.keys())))
).all()
}
# Update or add metadata records.
for key, values in metadata.items():
# Update
if key in existing_metadata:
result = existing_metadata[key]
modified = False
for value in values:
if value not in result.values:
result.values.append(value)
modified = True
if modified:
flag_modified(result, "values") # Notify SQLAlchemy of the change
session.add(result)
# Add
else:
session.add(Metadata(name=key, values=values))


def _create_chunk_records(
document: Document, config: RAGLiteConfig
Expand Down Expand Up @@ -171,6 +222,8 @@ def insert_documents( # noqa: C901
session.expunge_all() # Release memory of flushed changes.
num_unflushed_embeddings = 0
pbar.update()
# Update metadata table.
_update_metadata_from_documents(documents=documents, session=session)
session.commit()
if engine.dialect.name == "duckdb":
# DuckDB does not automatically update its keyword search index [1], so we do it
Expand Down
94 changes: 92 additions & 2 deletions src/raglite/_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@
stream_chunk_builder,
supports_function_calling,
)
from sqlmodel import Session, select

from raglite._config import RAGLiteConfig
from raglite._database import Chunk, ChunkSpan
from raglite._database import Chunk, ChunkSpan, Metadata, create_database_engine
from raglite._litellm import get_context_size
from raglite._search import retrieve_chunk_spans
from raglite._typing import MetadataFilter
from raglite._typing import MetadataFilter, MetadataValue

# The default RAG instruction template follows Anthropic's best practices [1].
# [1] https://docs.anthropic.com/en/docs/build-with-claude/prompt-engineering/long-context-tips
Expand All @@ -36,17 +37,47 @@
{user_prompt}
""".strip()

# Prompt template used to extract metadata filters from a user query. It is filled in with
# `str.format` and expects three fields: `metadata_dict` (the available metadata as JSON),
# `query` (the user query), and `no_match` (the sentinel to output when a field does not apply).
SELF_QUERY_PROMPT = """
---
You extract metadata filters from a user query.
Rules:
- Return ONE JSON object containing EVERY metadata field as a key.
- For each field:
- Only set a value if the user query explicitly and unambiguously mentions it, using exactly one value from the allowed list for that field.
- If the user query is broad, ambiguous, or does NOT explicitly mention a single
allowed value for that field, you MUST output {no_match}.
- Do NOT infer values from other fields, common knowledge, or popularity.
- Output ONLY the JSON object, with no extra text before or after.
---

<available_metadata>
{metadata_dict}
</available_metadata>

User query: "{query}"
""".strip()

# Sentinel value the LLM outputs for a metadata field that the query does not mention.
NO_MATCH = "<<no_match>>"


def retrieve_context(
query: str,
*,
self_query: bool = False,
num_chunks: int = 10,
metadata_filter: MetadataFilter | None = None,
config: RAGLiteConfig | None = None,
) -> list[ChunkSpan]:
"""Retrieve context for RAG."""
# Call the search method.
config = config or RAGLiteConfig()
# If self_query is enabled, extract metadata filters from the query.
if self_query:
self_query_filter = _self_query(query, config=config)
if metadata_filter is not None:
metadata_filter = {**self_query_filter, **metadata_filter}
else:
metadata_filter = self_query_filter
results = config.search_method(
query, num_results=num_chunks, metadata_filter=metadata_filter, config=config
)
Expand Down Expand Up @@ -173,6 +204,65 @@ def _run_tools(
return tool_messages


def _self_query(
    query: str,
    *,
    self_query_prompt: str = SELF_QUERY_PROMPT,
    no_match: str = NO_MATCH,
    config: RAGLiteConfig | None = None,
) -> MetadataFilter:
    """Extract metadata filters from a natural language query.

    The metadata fields and their values are read from the database and offered to the LLM as a
    strict JSON schema in which every field is an enum of its values plus the `no_match`
    sentinel. The LLM's answer is then reduced to the fields for which it picked a real value.

    Parameters
    ----------
    query
        The user query to extract metadata filters from.
    self_query_prompt
        Prompt template with `metadata_dict`, `query`, and `no_match` format fields.
    no_match
        Sentinel the LLM must output for a field that the query does not mention.
    config
        The RAGLite config; a default `RAGLiteConfig` is used when omitted.

    Returns
    -------
    MetadataFilter
        A mapping of metadata field name to the selected value. It is empty when there is no
        metadata in the database, when the LLM response cannot be parsed as a JSON object, or
        when no field matched.
    """
    config = config or RAGLiteConfig()
    # Retrieve the available metadata from the database. The values are copied while the session
    # is still open so that nothing below depends on session-bound records.
    with Session(create_database_engine(config)) as session:
        available_metadata: dict[str, list[MetadataValue]] = {
            record.name: list(record.values) for record in session.exec(select(Metadata)).all()
        }
    if not available_metadata:
        return {}
    # Generate the response format JSON schema: one enum per metadata field.
    properties: dict[str, dict[str, list[MetadataValue]]] = {
        name: {"enum": [*values, no_match]} for name, values in available_metadata.items()
    }
    response_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "metadata_filter",
            "schema": {
                "type": "object",
                "additionalProperties": False,
                "properties": properties,
                "required": list(properties.keys()),
            },
            "strict": True,
        },
    }
    # Format the prompt with the user's query and available metadata.
    formatted_prompt = self_query_prompt.format(
        metadata_dict=json.dumps(available_metadata, indent=2),
        query=query,
        no_match=no_match,
    )
    # Call the LLM to extract metadata filters.
    response = completion(
        model=config.llm,
        messages=[
            {"role": "user", "content": formatted_prompt},
        ],
        response_format=response_format,
        temperature=0,
    )
    # Parse the response. A missing or empty choice, missing content (None), or malformed JSON
    # all mean that no filter could be extracted.
    try:
        raw_filter = json.loads(response["choices"][0]["message"]["content"])
    except (json.JSONDecodeError, KeyError, IndexError, TypeError):
        return {}
    if not isinstance(raw_filter, dict):
        return {}
    # Keep only known fields whose value is a real, known metadata value. This drops the
    # no_match sentinel as well as anything the LLM made up despite the schema, which would
    # otherwise filter out every search result.
    return {
        key: value
        for key, value in raw_filter.items()
        if key in available_metadata and value != no_match and value in available_metadata[key]
    }


def rag(
messages: list[dict[str, str]],
*,
Expand Down
13 changes: 12 additions & 1 deletion tests/test_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from tqdm import tqdm

from raglite._config import RAGLiteConfig
from raglite._database import Chunk, Document, create_database_engine
from raglite._database import Chunk, Document, Metadata, create_database_engine
from raglite._markdown import document_to_markdown


Expand Down Expand Up @@ -43,3 +43,14 @@ def test_insert(raglite_test_config: RAGLiteConfig) -> None:
doc = document_to_markdown(doc_path)
doc = doc.replace("\n", "").strip()
assert restored_document == doc, "Restored document does not match the original input."
# Verify that the document metadata matches.
metadata = session.exec(select(Metadata)).all()
assert len(metadata) > 0, "No metadata found for the document"
# Check that the metadata values match the original document metadata.
for meta in metadata:
assert meta.name in document.metadata_, (
f"Metadata {meta.name} not found in document metadata"
)
assert document.metadata_[meta.name] in meta.values, (
f"Metadata value {document.metadata_[meta.name]} for {meta.name} not found in metadata values {meta.values}"
)
33 changes: 32 additions & 1 deletion tests/test_rag.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
"""Test RAGLite's RAG functionality."""

import json
from typing import Any

from raglite import (
RAGLiteConfig,
add_context,
retrieve_context,
)
from raglite._database import ChunkSpan
from raglite._rag import rag
from raglite._rag import _self_query, rag


def test_rag_manual(raglite_test_config: RAGLiteConfig) -> None:
Expand Down Expand Up @@ -60,3 +61,33 @@ def test_rag_auto_without_retrieval(raglite_test_config: RAGLiteConfig) -> None:
# Verify that no RAG context was retrieved.
assert [message["role"] for message in messages] == ["user", "assistant"]
assert not chunk_spans


def test_self_query(raglite_test_config: RAGLiteConfig) -> None:
    """Check that `_self_query` turns a query into the expected metadata filter."""
    cases: list[tuple[str, dict[str, Any]]] = [
        # The topic "Physics" is mentioned explicitly and should be extracted.
        ("I want to learn more about Physics", {"topic": "Physics"}),
        # A query with non-existent metadata values should produce an empty filter.
        ("What did Shakespeare write about chemistry?", {}),
    ]
    for query, expected in cases:
        actual = _self_query(query, config=raglite_test_config)
        assert actual == expected, f"Expected {expected}, got {actual}"


def test_retrieve_context_self_query(raglite_test_config: RAGLiteConfig) -> None:
    """Check that `retrieve_context` with self-query only returns matching documents."""
    results = retrieve_context(
        query="What does Albert Einstein's paper say about time dilation?",
        self_query=True,
        num_chunks=5,
        config=raglite_test_config,
    )
    # Every result must be a chunk span.
    assert all(isinstance(item, ChunkSpan) for item in results)
    # Every result must come from a document of type "Paper" by "Albert Einstein".
    for item in results:
        doc_metadata = item.document.metadata_
        doc_type = doc_metadata.get("type")
        assert doc_type == "Paper", f"Expected type='Paper', got {doc_type}"
        doc_author = doc_metadata.get("author")
        assert doc_author == "Albert Einstein", (
            f"Expected author='Albert Einstein', got {doc_author}"
        )
Loading