137 changes: 137 additions & 0 deletions examples/basic/pdf.py
@@ -0,0 +1,137 @@
"""
Use CrateDB Vector Search on PDF documents with Sentence Transformers from Hugging Face.

- https://huggingface.co/sentence-transformers
- https://python.langchain.com/docs/integrations/text_embedding/sentence_transformers/

As input data, the example uses a patent document in PDF format, `EP0666666B1.pdf`.

Synopsis::

# Install prerequisites.
    pip install --upgrade langchain-community langchain-cratedb langchain-huggingface langchain-text-splitters 'pypdf!=5.1.0'

# Start database.
docker run --rm -it --publish=4200:4200 crate/crate:nightly

# Optionally set environment variable to configure CrateDB connection URL.
export CRATEDB_SQLALCHEMY_URL="crate://crate@localhost/?schema=doc"

# Run program.
python examples/basic/pdf.py
""" # noqa: E501
# /// script
# requires-python = ">=3.9"
# dependencies = [
# "langchain-cratedb",
# "langchain-huggingface",
# "langchain-text-splitters",
# "pypdf!=5.1.0",
# ]
# ///

import os
import typing as t

from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_cratedb import CrateDBVectorStore

CRATEDB_SQLALCHEMY_URL = os.environ.get(
"CRATEDB_SQLALCHEMY_URL", "crate://crate@localhost/?schema=testdrive"
)
RESOURCE_URL = "https://patentimages.storage.googleapis.com/1e/f5/93/346d19e0e43e92/EP0666666B1.pdf"


def get_documents() -> t.List[Document]:
"""
Acquire data, return as LangChain documents.
"""

# Define text splitter.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

# Define resource loader.
loader = PyPDFLoader(RESOURCE_URL)

# Load PDF pages and split into fragments.
fragments = []
pages = loader.load()
for page in pages:
fragments += text_splitter.create_documents([page.page_content])
return fragments


def main() -> None:
    # Set up embeddings.
    # Alternatively, use OpenAI: `embeddings = OpenAIEmbeddings()`.
embeddings = HuggingFaceEmbeddings(
        # A small sentence-transformers model which maps sentences and
        # paragraphs to a 384-dimensional dense vector space, and can be used
        # for tasks like clustering or semantic search.
#
# The model is intended to be used as a sentence and short paragraph encoder.
# Given an input text, it outputs a vector which captures the semantic
# information. The sentence vector may be used for information retrieval,
# clustering or sentence similarity tasks.
#
# By default, input text longer than 256 word pieces is truncated.
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
#
model_name="all-MiniLM-L6-v2", # noqa: ERA001
#
#
# Every Byte Matters: Introducing mxbai-embed-xsmall-v1
# https://www.mixedbread.ai/blog/mxbai-embed-xsmall-v1
#
        # An open-source English embedding model optimized for retrieval
        # tasks, developed by Mixedbread. It is built upon
        # `sentence-transformers/all-MiniLM-L6-v2` and trained with the
        # AnglE loss and Espresso.
#
# https://huggingface.co/mixedbread-ai/mxbai-embed-xsmall-v1
#
# model_name="mixedbread-ai/mxbai-embed-xsmall-v1", # noqa: ERA001
)
# """

# Acquire documents.
print("Acquiring data")
documents = get_documents()

# Embed each chunk, and load them into the vector store.
print("Indexing data")
vector_store = CrateDBVectorStore.from_documents(
documents=documents,
embedding=embeddings,
connection=CRATEDB_SQLALCHEMY_URL,
)

    # Invoke queries, and display the top-3 results for each.
print("Querying data")
queries = [
"What is the invention about?",
"What does the patent describe?",
"Give me a summary, please.",
"Which kind of system is it?",
"Was ist das für ein System?",
"De quel type de système s'agit-il?",
]
for query in queries:
print("=" * 42)
print("Query:", query)
print("=" * 42)
docs = vector_store.similarity_search(query, k=3)
for doc in docs:
print(doc.page_content)
print()
print()

vector_store.delete_collection()


if __name__ == "__main__":
main()
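A note on the example above: `CrateDBVectorStore.from_documents` embeds and indexes the PDF from scratch on every run. For repeated querying, it can be cheaper to index once and reconnect to the existing collection afterwards. A minimal sketch, assuming the `CrateDBVectorStore` constructor and its `collection_name` parameter mirror the `langchain-postgres` API the adapter derives from; this is an assumption, not verified against this changeset::

    import os

    from langchain_huggingface import HuggingFaceEmbeddings

    from langchain_cratedb import CrateDBVectorStore

    CRATEDB_SQLALCHEMY_URL = os.environ.get(
        "CRATEDB_SQLALCHEMY_URL", "crate://crate@localhost/?schema=testdrive"
    )

    # Reconnect to a previously populated collection instead of re-indexing.
    # NOTE: The constructor signature and `collection_name` are assumptions,
    # modeled after the langchain-postgres API.
    store = CrateDBVectorStore(
        embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
        connection=CRATEDB_SQLALCHEMY_URL,
        collection_name="langchain",
    )
    for doc in store.similarity_search("What is the invention about?", k=3):
        print(doc.page_content)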
110 changes: 110 additions & 0 deletions examples/basic/vector_huggingface.py
@@ -0,0 +1,110 @@
"""
Use CrateDB Vector Search with Sentence Transformers from Hugging Face.

- https://huggingface.co/sentence-transformers
- https://python.langchain.com/docs/integrations/text_embedding/sentence_transformers/

As input data, the example uses the Universal Declaration of Human Rights.

Synopsis::

# Install prerequisites.
    pip install --upgrade langchain-cratedb langchain-huggingface langchain-text-splitters requests

# Start database.
docker run --rm -it --publish=4200:4200 crate/crate:nightly

# Optionally set environment variable to configure CrateDB connection URL.
export CRATEDB_SQLALCHEMY_URL="crate://crate@localhost/?schema=doc"

# Run program.
python examples/basic/vector_huggingface.py
""" # noqa: E501
# /// script
# requires-python = ">=3.9"
# dependencies = [
# "langchain-huggingface",
# "langchain-cratedb",
# ]
# ///

import os
import typing as t

import requests
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_cratedb import CrateDBVectorStore

CRATEDB_SQLALCHEMY_URL = os.environ.get(
"CRATEDB_SQLALCHEMY_URL", "crate://crate@localhost/?schema=testdrive"
)
# TODO: Change URL to repository after merging.
RESOURCE_URL = "https://gist.github.com/amotl/a5dd9814d1865b14248ca97eb8075f96/raw/Universal_Declaration_of_Human_Rights.md"


def get_documents() -> t.List[Document]:
"""
Acquire data, return as LangChain documents.
"""

# Define text splitter.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=350, chunk_overlap=0)

# Load a document, and split it into chunks.
text = requests.get(RESOURCE_URL, timeout=10).text
return text_splitter.create_documents([text])


def main() -> None:
    # Set up embeddings.
embeddings = HuggingFaceEmbeddings(
        # A small sentence-transformers model which maps sentences and
        # paragraphs to a 384-dimensional dense vector space, and can be used
        # for tasks like clustering or semantic search.
#
# The model is intended to be used as a sentence and short paragraph encoder.
# Given an input text, it outputs a vector which captures the semantic
# information. The sentence vector may be used for information retrieval,
# clustering or sentence similarity tasks.
#
# By default, input text longer than 256 word pieces is truncated.
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
#
# model_name="all-MiniLM-L6-v2", # noqa: ERA001
#
#
# Every Byte Matters: Introducing mxbai-embed-xsmall-v1
# https://www.mixedbread.ai/blog/mxbai-embed-xsmall-v1
#
        # An open-source English embedding model optimized for retrieval
        # tasks, developed by Mixedbread. It is built upon
        # `sentence-transformers/all-MiniLM-L6-v2` and trained with the
        # AnglE loss and Espresso.
#
# https://huggingface.co/mixedbread-ai/mxbai-embed-xsmall-v1
#
model_name="mixedbread-ai/mxbai-embed-xsmall-v1",
)

# Acquire documents.
documents = get_documents()

# Embed each chunk, and load them into the vector store.
vector_store = CrateDBVectorStore.from_documents(
documents=documents,
embedding=embeddings,
connection=CRATEDB_SQLALCHEMY_URL,
)

    # Invoke a query, and display the results.
query = "What does the declaration say about freedom?"
docs = vector_store.similarity_search(query)
for doc in docs:
print("=" * 42)
print(doc.page_content)


if __name__ == "__main__":
main()
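When tuning chunk sizes or comparing embedding models, it helps to inspect relevance scores rather than only the ranked text. A short sketch, assuming `CrateDBVectorStore` implements `similarity_search_with_score` from the standard LangChain `VectorStore` interface (an assumption, not verified against this changeset)::

    from langchain_huggingface import HuggingFaceEmbeddings

    from langchain_cratedb import CrateDBVectorStore

    # Build the store exactly as in `main()` above, then query with scores.
    vector_store = CrateDBVectorStore.from_documents(
        documents=get_documents(),  # as defined in the example above
        embedding=HuggingFaceEmbeddings(
            model_name="mixedbread-ai/mxbai-embed-xsmall-v1"
        ),
        connection="crate://crate@localhost/?schema=testdrive",
    )
    # NOTE: `similarity_search_with_score` is assumed to be available here.
    query = "What does the declaration say about freedom?"
    for doc, score in vector_store.similarity_search_with_score(query, k=4):
        print(f"score={score:.4f}  {doc.page_content[:80]!r}")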
@@ -34,12 +34,18 @@
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_core.documents import Document
 from langchain_openai import OpenAIEmbeddings
+from langchain_text_splitters import (
+    ExperimentalMarkdownSyntaxTextSplitter,
+    MarkdownTextSplitter,
+)
 
 from langchain_cratedb import CrateDBVectorStore
 
 CRATEDB_SQLALCHEMY_URL = os.environ.get(
     "CRATEDB_SQLALCHEMY_URL", "crate://crate@localhost/?schema=testdrive"
 )
+# TODO: Change URL to repository after merging.
+RESOURCE_URL = "https://gist.github.com/amotl/a5dd9814d1865b14248ca97eb8075f96/raw/Universal_Declaration_of_Human_Rights.md"
 
 
 def get_documents() -> t.List[Document]:
@@ -48,27 +54,33 @@ def get_documents() -> t.List[Document]:
"""

# Define text splitter.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
text_splitter = MarkdownTextSplitter(chunk_size=350, chunk_overlap=0)

# Load a document, and split it into chunks.
url = "https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt"
text = requests.get(url, timeout=10).text
text = requests.get(RESOURCE_URL, timeout=10).text
return text_splitter.create_documents([text])


def main() -> None:
# Set up LLM.
embeddings = OpenAIEmbeddings()

# Acquire documents.
documents = get_documents()

# Embed each chunk, and load them into the vector store.
vector_store = CrateDBVectorStore.from_documents(
documents, OpenAIEmbeddings(), connection=CRATEDB_SQLALCHEMY_URL
documents=documents,
embedding=embeddings,
connection=CRATEDB_SQLALCHEMY_URL,
)

# Invoke a query, and display the first result.
query = "What did the president say about Ketanji Brown Jackson"
query = "What does the declaration say about freedom?"
docs = vector_store.similarity_search(query)
print(docs[0].page_content)
for doc in docs:
print("=" * 42)
print(doc.page_content)


if __name__ == "__main__":
Expand Down
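The hunk above swaps the generic `RecursiveCharacterTextSplitter` (chunk_size=1000) for `MarkdownTextSplitter` (chunk_size=350). The Markdown splitter prefers structural boundaries such as headings and paragraphs before falling back to plain character splits, so chunks tend to align with individual articles of the declaration. A standalone sketch of that behavior; the sample text and chunk size are chosen for illustration only::

    import textwrap

    from langchain_text_splitters import MarkdownTextSplitter

    text = textwrap.dedent("""
        # Article 1

        All human beings are born free and equal in dignity and rights.

        # Article 2

        Everyone is entitled to all the rights and freedoms set forth in
        this Declaration, without distinction of any kind.
    """)

    # Chunks break at Markdown headings first, then at paragraphs.
    splitter = MarkdownTextSplitter(chunk_size=160, chunk_overlap=0)
    for chunk in splitter.split_text(text):
        print("-" * 42)
        print(chunk)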
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -172,9 +172,12 @@ optional = true
 [tool.poetry.group.dev.dependencies]
 
 [tool.poetry.group.test.dependencies]
+langchain-huggingface = "<0.2"
 langchain-openai = "<0.3"
 langchain-tests = "==0.3.7"
+langchain-text-splitters = "<0.4"
 notebook = "<7.4"
+pypdf = "!=5.1.0"
 pytest = "<9"
 pytest-asyncio = "<0.26"
 pytest-cov = "<7"