137 changes: 137 additions & 0 deletions examples/basic/pdf.py
@@ -0,0 +1,137 @@
"""
Use CrateDB Vector Search on PDF documents with Sentence Transformers from Hugging Face.

- https://huggingface.co/sentence-transformers
- https://python.langchain.com/docs/integrations/text_embedding/sentence_transformers/

As input data, the example uses a patent document in PDF format, `EP0666666B1.pdf`.

Synopsis::

# Install prerequisites.
    pip install --upgrade langchain-community langchain-cratedb langchain-huggingface langchain-text-splitters 'pypdf!=5.1.0'

# Start database.
docker run --rm -it --publish=4200:4200 crate/crate:nightly

# Optionally set environment variable to configure CrateDB connection URL.
export CRATEDB_SQLALCHEMY_URL="crate://crate@localhost/?schema=doc"

# Run program.
python examples/basic/pdf.py
""" # noqa: E501
# /// script
# requires-python = ">=3.9"
# dependencies = [
# "langchain-cratedb",
# "langchain-huggingface",
# "langchain-text-splitters",
# "pypdf!=5.1.0",
# ]
# ///

import os
import typing as t

from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_cratedb import CrateDBVectorStore

CRATEDB_SQLALCHEMY_URL = os.environ.get(
"CRATEDB_SQLALCHEMY_URL", "crate://crate@localhost/?schema=testdrive"
)
RESOURCE_URL = "https://patentimages.storage.googleapis.com/1e/f5/93/346d19e0e43e92/EP0666666B1.pdf"


def get_documents() -> t.List[Document]:
"""
Acquire data, return as LangChain documents.
"""

# Define text splitter.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

# Define resource loader.
loader = PyPDFLoader(RESOURCE_URL)

# Load PDF pages and split into fragments.
fragments = []
pages = loader.load()
for page in pages:
fragments += text_splitter.create_documents([page.page_content])
return fragments


def main() -> None:
    # Set up embeddings.
    # Alternatively, use OpenAI: `embeddings = OpenAIEmbeddings()`.
embeddings = HuggingFaceEmbeddings(
        # A small sentence-transformers model which maps sentences and
        # paragraphs to a 384-dimensional dense vector space, and can be used
        # for tasks like clustering or semantic search.
#
# The model is intended to be used as a sentence and short paragraph encoder.
# Given an input text, it outputs a vector which captures the semantic
# information. The sentence vector may be used for information retrieval,
# clustering or sentence similarity tasks.
#
# By default, input text longer than 256 word pieces is truncated.
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
#
model_name="all-MiniLM-L6-v2", # noqa: ERA001
#
#
# Every Byte Matters: Introducing mxbai-embed-xsmall-v1
# https://www.mixedbread.ai/blog/mxbai-embed-xsmall-v1
#
        # An open-source English embedding model optimized for retrieval
        # tasks, developed by Mixedbread. It is built upon
        # `sentence-transformers/all-MiniLM-L6-v2` and trained with the
        # AnglE loss and Espresso.
#
# https://huggingface.co/mixedbread-ai/mxbai-embed-xsmall-v1
#
# model_name="mixedbread-ai/mxbai-embed-xsmall-v1", # noqa: ERA001
)
# """

# Acquire documents.
print("Acquiring data")
documents = get_documents()

# Embed each chunk, and load them into the vector store.
print("Indexing data")
vector_store = CrateDBVectorStore.from_documents(
documents=documents,
embedding=embeddings,
connection=CRATEDB_SQLALCHEMY_URL,
)

    # Invoke queries, and display the top-3 results for each.
print("Querying data")
queries = [
"What is the invention about?",
"What does the patent describe?",
"Give me a summary, please.",
"Which kind of system is it?",
"Was ist das für ein System?",
"De quel type de système s'agit-il?",
]
for query in queries:
print("=" * 42)
print("Query:", query)
print("=" * 42)
docs = vector_store.similarity_search(query, k=3)
for doc in docs:
print(doc.page_content)
print()
print()

vector_store.delete_collection()


if __name__ == "__main__":
main()
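A note on the example above: `CrateDBVectorStore.from_documents` embeds and indexes the PDF from scratch on every run. For repeated querying, it can be cheaper to index once and reconnect to the existing collection afterwards. A minimal sketch, assuming the `CrateDBVectorStore` constructor and its `collection_name` parameter mirror the `langchain-postgres` API the adapter derives from; this is an assumption, not verified against this changeset::

    import os

    from langchain_huggingface import HuggingFaceEmbeddings

    from langchain_cratedb import CrateDBVectorStore

    CRATEDB_SQLALCHEMY_URL = os.environ.get(
        "CRATEDB_SQLALCHEMY_URL", "crate://crate@localhost/?schema=testdrive"
    )

    # Reconnect to a previously populated collection instead of re-indexing.
    # NOTE: The constructor signature and `collection_name` are assumptions,
    # modeled after the langchain-postgres API.
    store = CrateDBVectorStore(
        embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
        connection=CRATEDB_SQLALCHEMY_URL,
        collection_name="langchain",
    )
    for doc in store.similarity_search("What is the invention about?", k=3):
        print(doc.page_content)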
110 changes: 110 additions & 0 deletions examples/basic/vector_huggingface.py
@@ -0,0 +1,110 @@
"""
Use CrateDB Vector Search with Sentence Transformers from Hugging Face.

- https://huggingface.co/sentence-transformers
- https://python.langchain.com/docs/integrations/text_embedding/sentence_transformers/

As input data, the example uses the Universal Declaration of Human Rights.

Synopsis::

# Install prerequisites.
    pip install --upgrade langchain-cratedb langchain-huggingface langchain-text-splitters requests

# Start database.
docker run --rm -it --publish=4200:4200 crate/crate:nightly

# Optionally set environment variable to configure CrateDB connection URL.
export CRATEDB_SQLALCHEMY_URL="crate://crate@localhost/?schema=doc"

# Run program.
python examples/basic/vector_huggingface.py
""" # noqa: E501
# /// script
# requires-python = ">=3.9"
# dependencies = [
# "langchain-huggingface",
# "langchain-cratedb",
# ]
# ///

import os
import typing as t

import requests
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_cratedb import CrateDBVectorStore

CRATEDB_SQLALCHEMY_URL = os.environ.get(
"CRATEDB_SQLALCHEMY_URL", "crate://crate@localhost/?schema=testdrive"
)
# TODO: Change URL to repository after merging.
RESOURCE_URL = "https://gist.github.com/amotl/a5dd9814d1865b14248ca97eb8075f96/raw/Universal_Declaration_of_Human_Rights.md"


def get_documents() -> t.List[Document]:
"""
Acquire data, return as LangChain documents.
"""

# Define text splitter.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=350, chunk_overlap=0)

# Load a document, and split it into chunks.
text = requests.get(RESOURCE_URL, timeout=10).text
return text_splitter.create_documents([text])


def main() -> None:
    # Set up embeddings.
embeddings = HuggingFaceEmbeddings(
        # A small sentence-transformers model which maps sentences and
        # paragraphs to a 384-dimensional dense vector space, and can be used
        # for tasks like clustering or semantic search.
#
# The model is intended to be used as a sentence and short paragraph encoder.
# Given an input text, it outputs a vector which captures the semantic
# information. The sentence vector may be used for information retrieval,
# clustering or sentence similarity tasks.
#
# By default, input text longer than 256 word pieces is truncated.
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
#
# model_name="all-MiniLM-L6-v2", # noqa: ERA001
#
#
# Every Byte Matters: Introducing mxbai-embed-xsmall-v1
# https://www.mixedbread.ai/blog/mxbai-embed-xsmall-v1
#
        # An open-source English embedding model optimized for retrieval
        # tasks, developed by Mixedbread. It is built upon
        # `sentence-transformers/all-MiniLM-L6-v2` and trained with the
        # AnglE loss and Espresso.
#
# https://huggingface.co/mixedbread-ai/mxbai-embed-xsmall-v1
#
model_name="mixedbread-ai/mxbai-embed-xsmall-v1",
)

# Acquire documents.
documents = get_documents()

# Embed each chunk, and load them into the vector store.
vector_store = CrateDBVectorStore.from_documents(
documents=documents,
embedding=embeddings,
connection=CRATEDB_SQLALCHEMY_URL,
)

    # Invoke a query, and display the results.
query = "What does the declaration say about freedom?"
docs = vector_store.similarity_search(query)
for doc in docs:
print("=" * 42)
print(doc.page_content)


if __name__ == "__main__":
main()
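When tuning chunk sizes or comparing embedding models, it helps to inspect relevance scores rather than only the ranked text. A short sketch, assuming `CrateDBVectorStore` implements `similarity_search_with_score` from the standard LangChain `VectorStore` interface (an assumption, not verified against this changeset)::

    from langchain_huggingface import HuggingFaceEmbeddings

    from langchain_cratedb import CrateDBVectorStore

    # Build the store exactly as in `main()` above, then query with scores.
    vector_store = CrateDBVectorStore.from_documents(
        documents=get_documents(),  # as defined in the example above
        embedding=HuggingFaceEmbeddings(
            model_name="mixedbread-ai/mxbai-embed-xsmall-v1"
        ),
        connection="crate://crate@localhost/?schema=testdrive",
    )
    # NOTE: `similarity_search_with_score` is assumed to be available here.
    query = "What does the declaration say about freedom?"
    for doc, score in vector_store.similarity_search_with_score(query, k=4):
        print(f"score={score:.4f}  {doc.page_content[:80]!r}")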
@@ -34,12 +34,18 @@
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_core.documents import Document
 from langchain_openai import OpenAIEmbeddings
+from langchain_text_splitters import (
+    ExperimentalMarkdownSyntaxTextSplitter,
+    MarkdownTextSplitter,
+)
 
 from langchain_cratedb import CrateDBVectorStore
 
 CRATEDB_SQLALCHEMY_URL = os.environ.get(
     "CRATEDB_SQLALCHEMY_URL", "crate://crate@localhost/?schema=testdrive"
 )
+# TODO: Change URL to repository after merging.
+RESOURCE_URL = "https://gist.github.com/amotl/a5dd9814d1865b14248ca97eb8075f96/raw/Universal_Declaration_of_Human_Rights.md"
 
 
 def get_documents() -> t.List[Document]:
@@ -48,27 +54,33 @@ def get_documents() -> t.List[Document]:
"""

# Define text splitter.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
text_splitter = MarkdownTextSplitter(chunk_size=350, chunk_overlap=0)

# Load a document, and split it into chunks.
url = "https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt"
text = requests.get(url, timeout=10).text
text = requests.get(RESOURCE_URL, timeout=10).text
return text_splitter.create_documents([text])


def main() -> None:
# Set up LLM.
embeddings = OpenAIEmbeddings()

# Acquire documents.
documents = get_documents()

# Embed each chunk, and load them into the vector store.
vector_store = CrateDBVectorStore.from_documents(
documents, OpenAIEmbeddings(), connection=CRATEDB_SQLALCHEMY_URL
documents=documents,
embedding=embeddings,
connection=CRATEDB_SQLALCHEMY_URL,
)

# Invoke a query, and display the first result.
query = "What did the president say about Ketanji Brown Jackson"
query = "What does the declaration say about freedom?"
docs = vector_store.similarity_search(query)
print(docs[0].page_content)
for doc in docs:
print("=" * 42)
print(doc.page_content)


if __name__ == "__main__":
Expand Down
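The hunk above swaps the generic `RecursiveCharacterTextSplitter` (chunk_size=1000) for `MarkdownTextSplitter` (chunk_size=350). The Markdown splitter prefers structural boundaries such as headings and paragraphs before falling back to plain character splits, so chunks tend to align with individual articles of the declaration. A standalone sketch of that behavior; the sample text and chunk size are chosen for illustration only::

    import textwrap

    from langchain_text_splitters import MarkdownTextSplitter

    text = textwrap.dedent("""
        # Article 1

        All human beings are born free and equal in dignity and rights.

        # Article 2

        Everyone is entitled to all the rights and freedoms set forth in
        this Declaration, without distinction of any kind.
    """)

    # Chunks break at Markdown headings first, then at paragraphs.
    splitter = MarkdownTextSplitter(chunk_size=160, chunk_overlap=0)
    for chunk in splitter.split_text(text):
        print("-" * 42)
        print(chunk)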
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -172,9 +172,12 @@ optional = true
 [tool.poetry.group.dev.dependencies]
 
 [tool.poetry.group.test.dependencies]
+langchain-huggingface = "<0.2"
 langchain-openai = "<0.3"
 langchain-tests = "==0.3.7"
+langchain-text-splitters = "<0.4"
 notebook = "<7.4"
+pypdf = "!=5.1.0"
 pytest = "<9"
 pytest-asyncio = "<0.26"
 pytest-cov = "<7"