superlinear-ai · Robbe-Superlinear · Oct 9, 2025 · Oct 10, 2025 · Oct 10, 2025 · Oct 14, 2025
diff --git a/src/raglite/_database.py b/src/raglite/_database.py
@@ -41,6 +41,7 @@
     FloatMatrix,
     FloatVector,
     IndexId,
+    MetadataValue,
     PickledObject,
 )
 
@@ -438,6 +439,15 @@ def get(id_: str = "default", *, config: RAGLiteConfig | None = None) -> dict[st
         return metadata
 
 
+class Metadata(SQLModel, table=True):
+    """A table that maps each metadata field name to the list of values recorded for it."""
+
+    __tablename__ = "metadata"
+    # One row per field: `name` is the primary key and `values` is stored in a JSON column.
+    name: str = Field(..., primary_key=True)
+    values: list[MetadataValue] = Field(default_factory=list, sa_column=Column(JSON))
+
+
 class Eval(SQLModel, table=True):
     """A RAG evaluation example."""
 

diff --git a/src/raglite/_insert.py b/src/raglite/_insert.py
@@ -4,20 +4,71 @@
 from contextlib import nullcontext
 from functools import partial
 from pathlib import Path
+from typing import TYPE_CHECKING
 
 from filelock import FileLock
 from sqlalchemy import text
 from sqlalchemy.engine import make_url
+from sqlalchemy.orm.attributes import flag_modified
 from sqlmodel import Session, col, select
 from tqdm.auto import tqdm
 
 from raglite._config import RAGLiteConfig
-from raglite._database import Chunk, ChunkEmbedding, Document, create_database_engine
+from raglite._database import Chunk, ChunkEmbedding, Document, Metadata, create_database_engine
 from raglite._embed import embed_strings, embed_strings_without_late_chunking, embedding_type
 from raglite._split_chunklets import split_chunklets
 from raglite._split_chunks import split_chunks
 from raglite._split_sentences import split_sentences
 
+if TYPE_CHECKING:
+    from raglite._typing import MetadataValue
+
+METADATA_EXCLUDED_FIELDS = ["filename", "uri", "url", "size", "created", "modified"]
+
+
+def _update_metadata_from_documents(  # noqa: C901
+    documents: list[Document],
+    *,
+    session: Session,
+    metadata_excluded_fields: list[str] = METADATA_EXCLUDED_FIELDS,
+) -> None:
+    """Merge the metadata values of `documents` into the metadata table.
+
+    For each metadata field that is not excluded, the values found on the documents are added to
+    that field's `Metadata` record, which is created if it does not exist yet. Records are only
+    added to `session`; this function does not commit.
+    """
+    # Collect the unique values per field across all documents.
+    metadata: dict[str, list[MetadataValue]] = {}
+    for doc in documents:
+        for key, value in doc.metadata_.items():
+            if key in metadata_excluded_fields:
+                continue
+            seen = metadata.setdefault(key, [])
+            if value not in seen:
+                seen.append(value)
+    if not metadata:
+        return  # Nothing to record, so skip the database round trip.
+    # Fetch the existing records for these fields with a single query.
+    existing_metadata = {
+        record.name: record
+        for record in session.exec(
+            select(Metadata).where(col(Metadata.name).in_(list(metadata)))
+        ).all()
+    }
+    for key, values in metadata.items():
+        record = existing_metadata.get(key)
+        if record is None:
+            # First time this field is seen: create its record.
+            session.add(Metadata(name=key, values=values))
+            continue
+        new_values = [value for value in values if value not in record.values]
+        if new_values:
+            record.values.extend(new_values)
+            # The list was mutated in place, so tell SQLAlchemy that `values` changed.
+            flag_modified(record, "values")
+            session.add(record)
+
 
 def _create_chunk_records(
     document: Document, config: RAGLiteConfig
@@ -171,6 +222,8 @@ def insert_documents(  # noqa: C901
                 session.expunge_all()  # Release memory of flushed changes.
                 num_unflushed_embeddings = 0
             pbar.update()
+        # Update metadata table.
+        _update_metadata_from_documents(documents=documents, session=session)
         session.commit()
         if engine.dialect.name == "duckdb":
             # DuckDB does not automatically update its keyword search index [1], so we do it

diff --git a/src/raglite/_rag.py b/src/raglite/_rag.py
@@ -12,12 +12,13 @@
     stream_chunk_builder,
     supports_function_calling,
 )
+from sqlmodel import Session, select
 
 from raglite._config import RAGLiteConfig
-from raglite._database import Chunk, ChunkSpan
+from raglite._database import Chunk, ChunkSpan, Metadata, create_database_engine
 from raglite._litellm import get_context_size
 from raglite._search import retrieve_chunk_spans
-from raglite._typing import MetadataFilter
+from raglite._typing import MetadataFilter, MetadataValue
 
 # The default RAG instruction template follows Anthropic's best practices [1].
 # [1] https://docs.anthropic.com/en/docs/build-with-claude/prompt-engineering/long-context-tips
@@ -36,17 +37,47 @@
 {user_prompt}
 """.strip()
 
+SELF_QUERY_PROMPT = """
+---
+You extract metadata filters from a user query.
+Rules:
+- Return ONE JSON object containing EVERY metadata field as a key.
+- For each field:
+    - Only set a value if the user query explicitly and unambiguously mentions it, using exactly one value from the allowed list for that field.
+    - If the user query is broad, ambiguous, or does NOT explicitly mention a single
+      allowed value for that field, you MUST output {no_match}.
+- Do NOT infer values from other fields, common knowledge, or popularity.
+- Output ONLY the JSON object, with no extra text before or after.
+---
+
+<available_metadata>
+{metadata_dict}
+</available_metadata>
+
+User query: "{query}"
+""".strip()  # Rendered with str.format: {metadata_dict}, {query}, {no_match}.
+
+NO_MATCH = "<<no_match>>"  # Sentinel the LLM outputs for fields the query does not mention.
+
 
 def retrieve_context(
     query: str,
     *,
+    self_query: bool = False,
     num_chunks: int = 10,
     metadata_filter: MetadataFilter | None = None,
     config: RAGLiteConfig | None = None,
 ) -> list[ChunkSpan]:
     """Retrieve context for RAG."""
     # Call the search method.
     config = config or RAGLiteConfig()
+    # If self_query is enabled, extract metadata filters from the query.
+    if self_query:
+        self_query_filter = _self_query(query, config=config)
+        if metadata_filter is not None:
+            metadata_filter = {**self_query_filter, **metadata_filter}
+        else:
+            metadata_filter = self_query_filter
     results = config.search_method(
         query, num_results=num_chunks, metadata_filter=metadata_filter, config=config
     )
@@ -173,6 +204,65 @@ def _run_tools(
     return tool_messages
 
 
+def _self_query(
+    query: str,
+    *,
+    self_query_prompt: str = SELF_QUERY_PROMPT,
+    no_match: str = NO_MATCH,
+    config: RAGLiteConfig | None = None,
+) -> MetadataFilter:
+    """Extract metadata filters from a natural language query.
+
+    The LLM is prompted with every field in the metadata table and its allowed values, and is
+    asked to return one allowed value per field, or `no_match` for fields the query does not
+    mention. Returns the fields that were set to an allowed value, or an empty filter if the
+    metadata table is empty or the LLM's answer is missing or malformed.
+    """
+    config = config or RAGLiteConfig()
+    # Retrieve the available metadata from the database.
+    with Session(create_database_engine(config)) as session:
+        metadata_records = session.exec(select(Metadata)).all()
+    if not metadata_records:
+        return {}
+    allowed: dict[str, list[MetadataValue]] = {r.name: r.values for r in metadata_records}
+    # Generate a response format JSON schema that limits each field to its allowed values.
+    properties: dict[str, dict[str, list[MetadataValue]]] = {
+        name: {"enum": [*values, no_match]} for name, values in allowed.items()
+    }
+    response_format = {
+        "type": "json_schema",
+        "json_schema": {
+            "name": "metadata_filter",
+            "schema": {
+                "type": "object",
+                "additionalProperties": False,
+                "properties": properties,
+                "required": list(properties),
+            },
+            "strict": True,
+        },
+    }
+    # Format the prompt with the user's query and available metadata.
+    formatted_prompt = self_query_prompt.format(
+        metadata_dict=json.dumps(allowed, indent=2), query=query, no_match=no_match
+    )
+    # Call the LLM outside the `try` below so that API errors still propagate to the caller.
+    response = completion(
+        model=config.llm,
+        messages=[{"role": "user", "content": formatted_prompt}],
+        response_format=response_format,
+        temperature=0,
+    )
+    try:
+        content = response["choices"][0]["message"]["content"]
+        answer = json.loads(content).items()  # Raises AttributeError if not a JSON object.
+    except (json.JSONDecodeError, KeyError, IndexError, TypeError, AttributeError):
+        # No choices, empty content, invalid JSON, or not an object: fall back to no filter.
+        return {}
+    # Keep only known fields set to one of their allowed values, dropping `no_match` fields.
+    return {k: v for k, v in answer if v != no_match and v in allowed.get(k, [])}
+
+
 def rag(
     messages: list[dict[str, str]],
     *,

diff --git a/tests/test_insert.py b/tests/test_insert.py
@@ -6,7 +6,7 @@
 from tqdm import tqdm
 
 from raglite._config import RAGLiteConfig
-from raglite._database import Chunk, Document, create_database_engine
+from raglite._database import Chunk, Document, Metadata, create_database_engine
 from raglite._markdown import document_to_markdown
 
 
@@ -43,3 +43,14 @@ def test_insert(raglite_test_config: RAGLiteConfig) -> None:
         doc = document_to_markdown(doc_path)
         doc = doc.replace("\n", "").strip()
         assert restored_document == doc, "Restored document does not match the original input."
+        # Verify that the document metadata matches.
+        metadata = session.exec(select(Metadata)).all()
+        assert len(metadata) > 0, "No metadata found for the document"
+        # Check that the metadata values match the original document metadata.
+        for meta in metadata:
+            assert meta.name in document.metadata_, (
+                f"Metadata {meta.name} not found in document metadata"
+            )
+            assert document.metadata_[meta.name] in meta.values, (
+                f"Metadata value {document.metadata_[meta.name]} for {meta.name} not found in metadata values {meta.values}"
+            )
diff --git a/tests/test_rag.py b/tests/test_rag.py
@@ -1,14 +1,15 @@
 """Test RAGLite's RAG functionality."""
 
 import json
+from typing import Any
 
 from raglite import (
     RAGLiteConfig,
     add_context,
     retrieve_context,
 )
 from raglite._database import ChunkSpan
-from raglite._rag import rag
+from raglite._rag import _self_query, rag
 
 
 def test_rag_manual(raglite_test_config: RAGLiteConfig) -> None:
@@ -60,3 +61,33 @@ def test_rag_auto_without_retrieval(raglite_test_config: RAGLiteConfig) -> None:
     # Verify that no RAG context was retrieved.
     assert [message["role"] for message in messages] == ["user", "assistant"]
     assert not chunk_spans
+
+
+def test_self_query(raglite_test_config: RAGLiteConfig) -> None:
+    """Test that self-query turns a query into the expected metadata filter."""
+    cases: list[tuple[str, dict[str, Any]]] = [
+        # A query that should extract "Physics" from the topic field.
+        ("I want to learn more about Physics", {"topic": "Physics"}),
+        # A query with non-existent metadata values should return an empty filter.
+        ("What did Shakespeare write about chemistry?", {}),
+    ]
+    # Run the cases in order; each one calls the LLM configured in the test config.
+    for query, expected in cases:
+        actual = _self_query(query, config=raglite_test_config)
+        assert actual == expected, f"Expected {expected}, got {actual}"
+
+
+def test_retrieve_context_self_query(raglite_test_config: RAGLiteConfig) -> None:
+    """Test that retrieve_context with self_query only returns chunks matching the query."""
+    query = "What does Albert Einstein's paper say about time dilation?"
+    chunk_spans = retrieve_context(
+        query=query, self_query=True, num_chunks=5, config=raglite_test_config
+    )
+    assert chunk_spans, "No chunk spans retrieved, so the metadata checks would pass vacuously"
+    for chunk_span in chunk_spans:
+        assert isinstance(chunk_span, ChunkSpan)
+        meta = chunk_span.document.metadata_
+        assert meta.get("type") == "Paper", f"Expected type='Paper', got {meta.get('type')}"
+        assert meta.get("author") == "Albert Einstein", (
+            f"Expected author='Albert Einstein', got {meta.get('author')}"
+        )