From 0b29c61ee4fb4f9320fd07e046940db61aed5212 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 29 Jan 2025 00:51:37 +0100 Subject: [PATCH] feat: Add LangChain vector store adapter for CrateDB --- .../Components/components-vector-stores.md | 25 ++ pyproject.toml | 1 + .../components/vectorstores/__init__.py | 2 + .../components/vectorstores/cratedb.py | 90 ++++++ .../components/cratedb/__init__.py | 0 .../cratedb/test_cratedb_component.py | 283 ++++++++++++++++++ src/frontend/src/icons/CrateDB/CrateDB.jsx | 76 +++++ src/frontend/src/icons/CrateDB/cratedb.svg | 55 ++++ src/frontend/src/icons/CrateDB/index.tsx | 9 + src/frontend/src/icons/eagerIconImports.ts | 2 + src/frontend/src/icons/lazyIconImports.ts | 2 + 11 files changed, 545 insertions(+) create mode 100644 src/backend/base/langflow/components/vectorstores/cratedb.py create mode 100644 src/backend/tests/integration/components/cratedb/__init__.py create mode 100644 src/backend/tests/integration/components/cratedb/test_cratedb_component.py create mode 100644 src/frontend/src/icons/CrateDB/CrateDB.jsx create mode 100644 src/frontend/src/icons/CrateDB/cratedb.svg create mode 100644 src/frontend/src/icons/CrateDB/index.tsx diff --git a/docs/docs/Components/components-vector-stores.md b/docs/docs/Components/components-vector-stores.md index de9dbdf4b16f..bd12203058bf 100644 --- a/docs/docs/Components/components-vector-stores.md +++ b/docs/docs/Components/components-vector-stores.md @@ -418,6 +418,31 @@ For more information, see the [Chroma documentation](https://docs.trychroma.com/ +## CrateDB + +This component creates a CrateDB Vector Store with search capabilities. +For more information, see the documentation about the +[CrateDB LangChain adapter](https://cratedb.com/docs/guide/integrate/langchain/). 
+ +### Inputs + +| Name | Type | Description | +|----------------------------------|---------------|------------------------------------------------------------------| +| collection_name | String | The name of the collection (table). Required; no default. | +| search_query | String | The query to search for in the vector store. | +| ingest_data | Data | The data to ingest into the vector store (list of Data objects). | +| embedding | Embeddings | The embedding function to use for the vector store. | +| server_url | String | SQLAlchemy URL to connect to CrateDB. | +| search_type | String | Type of search to perform: "Similarity" or "MMR". | +| number_of_results | Integer | Number of results to return from the search. Default: 4. | + +### Outputs + +| Name | Type | Description | +|----------------|--------------------|-------------------------------| +| vector_store | CrateDBVectorStore | CrateDB vector store instance | +| search_results | List[Data] | Results of similarity search | + ## Elasticsearch This component creates an Elasticsearch Vector Store with search capabilities. 
diff --git a/pyproject.toml b/pyproject.toml index e65930e21e45..2d0d23982209 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,6 +97,7 @@ dependencies = [ "langchain-ollama==0.2.1", "langchain-sambanova==0.1.0", "langchain-community~=0.3.20", + "langchain-cratedb<0.2", "sqlalchemy[aiosqlite]>=2.0.38,<3.0.0", "atlassian-python-api==3.41.16", "mem0ai==0.1.34", diff --git a/src/backend/base/langflow/components/vectorstores/__init__.py b/src/backend/base/langflow/components/vectorstores/__init__.py index ef1da82272d5..aefa28b2a0a8 100644 --- a/src/backend/base/langflow/components/vectorstores/__init__.py +++ b/src/backend/base/langflow/components/vectorstores/__init__.py @@ -5,6 +5,7 @@ from .chroma import ChromaVectorStoreComponent from .clickhouse import ClickhouseVectorStoreComponent from .couchbase import CouchbaseVectorStoreComponent +from .cratedb import CrateDBVectorStoreComponent from .elasticsearch import ElasticsearchVectorStoreComponent from .faiss import FaissVectorStoreComponent from .graph_rag import GraphRAGComponent @@ -31,6 +32,7 @@ "ChromaVectorStoreComponent", "ClickhouseVectorStoreComponent", "CouchbaseVectorStoreComponent", + "CrateDBVectorStoreComponent", "ElasticsearchVectorStoreComponent", "FaissVectorStoreComponent", "GraphRAGComponent", diff --git a/src/backend/base/langflow/components/vectorstores/cratedb.py b/src/backend/base/langflow/components/vectorstores/cratedb.py new file mode 100644 index 000000000000..b7423329f9ed --- /dev/null +++ b/src/backend/base/langflow/components/vectorstores/cratedb.py @@ -0,0 +1,90 @@ +import typing as t + +from langchain_cratedb import CrateDBVectorStore + +from langflow.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store +from langflow.helpers import docs_to_data +from langflow.io import HandleInput, IntInput, SecretStrInput, StrInput +from langflow.schema import Data + + +class CrateDBVectorStoreComponent(LCVectorStoreComponent): + display_name = "CrateDBVector" + 
+ description = "CrateDB Vector Store with search capabilities" + name = "CrateDB" + icon = "CrateDB" + + inputs = [ + SecretStrInput(name="server_url", display_name="CrateDB SQLAlchemy URL", required=True), + StrInput(name="collection_name", display_name="Table", required=True), + *LCVectorStoreComponent.inputs, + HandleInput(name="embedding", display_name="Embedding", input_types=["Embeddings"], required=True), + IntInput( + name="number_of_results", + display_name="Number of Results", + info="Number of results to return.", + value=4, + advanced=True, + ), + ] + + @check_cached_vector_store + def build_vector_store(self) -> CrateDBVectorStore: + documents = [] + for _input in self.ingest_data or []: + if isinstance(_input, Data): + documents.append(_input.to_lc_document()) + else: + documents.append(_input) + + connection_string = self.server_url or "crate://" + + if documents: + store = CrateDBVectorStore.from_documents( + embedding=self.embedding, + documents=documents, + collection_name=self.collection_name, + connection=connection_string, + ) + else: + store = CrateDBVectorStore.from_existing_index( + embedding=self.embedding, + collection_name=self.collection_name, + connection=connection_string, + ) + + return store + + def search_documents(self) -> list[Data]: + vector_store = self.build_vector_store() + + if self.search_query and isinstance(self.search_query, str) and self.search_query.strip(): + docs = vector_store.similarity_search( + query=self.search_query, + k=self.number_of_results, + ) + + data = docs_to_data(docs) + self.status = data + return data + return [] + + +def cratedb_collection_to_data(embedding_documents: list[t.Any]): + """Converts a collection of CrateDB vectors into a list of data. + + Args: + embedding_documents (list): A list of EmbeddingStore instances. + + Returns: + list: A list of data, where each record represents a document in the collection. 
+ """ + data = [] + for doc in embedding_documents: + data_dict = { + "id": doc.id, + "text": doc.document, + } + data_dict.update(doc.cmetadata) + data.append(Data(**data_dict)) + return data diff --git a/src/backend/tests/integration/components/cratedb/__init__.py b/src/backend/tests/integration/components/cratedb/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/src/backend/tests/integration/components/cratedb/test_cratedb_component.py b/src/backend/tests/integration/components/cratedb/test_cratedb_component.py new file mode 100644 index 000000000000..3d43c23ede6c --- /dev/null +++ b/src/backend/tests/integration/components/cratedb/test_cratedb_component.py @@ -0,0 +1,283 @@ +"""Invoke CrateDB using Docker. + +docker run --rm -it --name=cratedb \ + --publish=4200:4200 --publish=5432:5432 \ + --env=CRATE_HEAP_SIZE=2g crate:latest \ + -Cdiscovery.type=single-node \ + -Ccluster.routing.allocation.disk.threshold_enabled=false +""" + +import os +from typing import Any + +import pytest +import sqlalchemy as sa +from langflow.components.vectorstores.cratedb import CrateDBVectorStoreComponent, cratedb_collection_to_data +from langflow.schema.data import Data + +from tests.base import ComponentTestBaseWithoutClient, VersionComponentMapping + +CRATEDB_SQLALCHEMY_URL = os.getenv("CRATEDB_SQLALCHEMY_URL", "crate://") + + +@pytest.fixture(autouse=True) +def cratedb_reset() -> None: + """Cleanup: Drop all collections before tests.""" + engine = sa.create_engine(CRATEDB_SQLALCHEMY_URL) + with engine.connect() as connection: + connection.execute(sa.text("DROP TABLE IF EXISTS langchain_collection")) + connection.execute(sa.text("DROP TABLE IF EXISTS langchain_embedding")) + + +@pytest.mark.api_key_required +class TestCrateDBVectorStoreComponent(ComponentTestBaseWithoutClient): + @pytest.fixture + def component_class(self) -> type[Any]: + """Return the component class to test.""" + return CrateDBVectorStoreComponent + + @pytest.fixture + def 
default_kwargs(self) -> dict[str, Any]: + """Return the default kwargs for the component.""" + from langflow.components.embeddings.openai import OpenAIEmbeddingsComponent + + if os.getenv("OPENAI_API_KEY") is None: + pytest.skip("OPENAI_API_KEY is not set") + + api_key = os.getenv("OPENAI_API_KEY") + + return { + "server_url": CRATEDB_SQLALCHEMY_URL, + "embedding": OpenAIEmbeddingsComponent(openai_api_key=api_key).build_embeddings(), + "collection_name": "test_collection", + } + + @pytest.fixture + def file_names_mapping(self) -> list[VersionComponentMapping]: + """Return the file names mapping for different versions.""" + return [] + + def test_create_db( + self, component_class: type[CrateDBVectorStoreComponent], default_kwargs: dict[str, Any] + ) -> None: + """Test the create_collection method.""" + component: CrateDBVectorStoreComponent = component_class().set(**default_kwargs) + vector_store = component.build_vector_store() + vector_store._init_models_with_dimensionality(3) + vector_store.create_tables_if_not_exists() + vector_store.create_collection() + + engine = sa.create_engine(CRATEDB_SQLALCHEMY_URL) + with engine.connect() as connection: + connection.execute(sa.text("SELECT * FROM langchain_collection")) + connection.execute(sa.text("SELECT * FROM langchain_embedding")) + + def test_create_collection_with_data( + self, component_class: type[CrateDBVectorStoreComponent], default_kwargs: dict[str, Any] + ) -> None: + """Test the create_collection method with data.""" + # set ingest_data in default_kwargs to a list of Data objects + test_texts = ["test data 1", "test data 2", "something completely different"] + default_kwargs["ingest_data"] = [Data(text=text) for text in test_texts] + + component: CrateDBVectorStoreComponent = component_class().set(**default_kwargs) + vector_store = component.build_vector_store() + + # Verify collection exists and has the correct data + collection = vector_store.get_collection(vector_store.session_maker()) + assert 
collection.name == default_kwargs["collection_name"] + assert len(collection.embeddings) == len(test_texts) + + def test_similarity_search( + self, component_class: type[CrateDBVectorStoreComponent], default_kwargs: dict[str, Any] + ) -> None: + """Test the similarity search functionality through the component.""" + # Create test data with distinct topics + test_data = [ + "The quick brown fox jumps over the lazy dog", + "Python is a popular programming language", + "Machine learning models process data", + "The lazy dog sleeps all day long", + ] + default_kwargs["ingest_data"] = [Data(text=text) for text in test_data] + default_kwargs["search_type"] = "Similarity" + default_kwargs["number_of_results"] = 2 + + component: CrateDBVectorStoreComponent = component_class().set(**default_kwargs) + component.build_vector_store() + + # Test similarity search through the component + component.set(search_query="dog sleeping") + results = component.search_documents() + + assert len(results) == 2 + # The most relevant results should be about dogs + assert any("dog" in result.text.lower() for result in results) + + # Test with different number of results + component.set(number_of_results=3) + results = component.search_documents() + assert len(results) == 3 + + def test_mmr_search( + self, component_class: type[CrateDBVectorStoreComponent], default_kwargs: dict[str, Any] + ) -> None: + """Test the MMR search functionality through the component.""" + # Create test data with some similar documents + test_data = [ + "The quick brown fox jumps", + "The quick brown fox leaps", + "The quick brown fox hops", + "Something completely different about cats", + ] + default_kwargs["ingest_data"] = [Data(text=text) for text in test_data] + default_kwargs["search_type"] = "MMR" + default_kwargs["number_of_results"] = 3 + + component: CrateDBVectorStoreComponent = component_class().set(**default_kwargs) + component.build_vector_store() + + # Test MMR search through the component + 
component.set(search_query="quick fox") + results = component.search_documents() + + assert len(results) == 3 + # Results should be diverse but relevant + assert any("fox" in result.text.lower() for result in results) + + # Test with different settings + component.set(number_of_results=2) + diverse_results = component.search_documents() + assert len(diverse_results) == 2 + + def test_search_with_different_types( + self, component_class: type[CrateDBVectorStoreComponent], default_kwargs: dict[str, Any] + ) -> None: + """Test search with different search types.""" + test_data = [ + "The quick brown fox jumps over the lazy dog", + "Python is a popular programming language", + "Machine learning models process data", + ] + default_kwargs["ingest_data"] = [Data(text=text) for text in test_data] + default_kwargs["number_of_results"] = 2 + + component: CrateDBVectorStoreComponent = component_class().set(**default_kwargs) + component.build_vector_store() + + # Test similarity search + component.set(search_type="Similarity", search_query="programming languages") + similarity_results = component.search_documents() + assert len(similarity_results) == 2 + assert any("python" in result.text.lower() for result in similarity_results) + + # Test MMR search + component.set(search_type="MMR", search_query="programming languages") + mmr_results = component.search_documents() + assert len(mmr_results) == 2 + + # Test with empty query + component.set(search_query="") + empty_results = component.search_documents() + assert len(empty_results) == 0 + + def test_search_with_score( + self, component_class: type[CrateDBVectorStoreComponent], default_kwargs: dict[str, Any] + ) -> None: + """Test the search with score functionality through the component.""" + test_data = [ + "The quick brown fox jumps over the lazy dog", + "Python is a popular programming language", + "Machine learning models process data", + ] + default_kwargs["ingest_data"] = [Data(text=text) for text in test_data] + 
default_kwargs["number_of_results"] = 2 + + component: CrateDBVectorStoreComponent = component_class().set(**default_kwargs) + component.build_vector_store() + + # Test search with score through the component + component.set( + search_type="similarity_score_threshold", search_query="programming languages", number_of_results=2 + ) + results = component.search_documents() + + assert len(results) == 2 + # Results should be sorted by relevance + assert any("python" in result.text.lower() for result in results) + assert any("programming" in result.text.lower() for result in results) + + # Test with different number of results + component.set(number_of_results=3) + results = component.search_documents() + assert len(results) == 3 + + def test_cratedb_collection_to_data( + self, component_class: type[CrateDBVectorStoreComponent], default_kwargs: dict[str, Any] + ) -> None: + """Test the cratedb_collection_to_data function.""" + # Create a collection with documents and metadata + test_data = [ + Data(data={"text": "Document 1", "metadata_field": "value1"}), + Data(data={"text": "Document 2", "metadata_field": "value2"}), + ] + default_kwargs["ingest_data"] = test_data + component: CrateDBVectorStoreComponent = component_class().set(**default_kwargs) + vector_store = component.build_vector_store() + + # Get the collection data + collection = vector_store.get_collection(vector_store.session_maker()) + collection_dict = collection.embeddings + data_objects = cratedb_collection_to_data(collection_dict) + + # Verify the conversion + assert len(data_objects) == 2 + for data_obj in data_objects: + assert isinstance(data_obj, Data) + assert "id" in data_obj.data + assert "text" in data_obj.data + assert data_obj.data["text"] in ["Document 1", "Document 2"] + assert "metadata_field" in data_obj.data + assert data_obj.data["metadata_field"] in ["value1", "value2"] + + def test_cratedb_collection_to_data_without_metadata( + self, component_class: type[CrateDBVectorStoreComponent], 
default_kwargs: dict[str, Any] + ) -> None: + """Test the cratedb_collection_to_data function with documents that have no metadata.""" + # Create a collection with documents but no metadata + test_data = [ + Data(data={"text": "Simple document 1"}), + Data(data={"text": "Simple document 2"}), + ] + default_kwargs["ingest_data"] = test_data + component: CrateDBVectorStoreComponent = component_class().set(**default_kwargs) + vector_store = component.build_vector_store() + + # Get the collection data + collection = vector_store.get_collection(vector_store.session_maker()) + collection_dict = collection.embeddings + data_objects = cratedb_collection_to_data(collection_dict) + + # Verify the conversion + assert len(data_objects) == 2 + for data_obj in data_objects: + assert isinstance(data_obj, Data) + assert "id" in data_obj.data + assert "text" in data_obj.data + assert data_obj.data["text"] in ["Simple document 1", "Simple document 2"] + + def test_cratedb_collection_to_data_empty_collection( + self, component_class: type[CrateDBVectorStoreComponent], default_kwargs: dict[str, Any] + ) -> None: + """Test the cratedb_collection_to_data function with an empty collection.""" + # Create an empty collection + component: CrateDBVectorStoreComponent = component_class().set(**default_kwargs) + vector_store = component.build_vector_store() + + # Get the collection data + with pytest.raises(RuntimeError) as ex: + vector_store.get_collection(vector_store.session_maker()) + assert ex.match("Collection can't be accessed without specifying dimension size of embedding vectors") + + def test_component_versions(self, *args, **kwargs) -> None: # noqa: ARG002 + pytest.skip("Component versions can't be tested for new components") diff --git a/src/frontend/src/icons/CrateDB/CrateDB.jsx b/src/frontend/src/icons/CrateDB/CrateDB.jsx new file mode 100644 index 000000000000..705bde1e0992 --- /dev/null +++ b/src/frontend/src/icons/CrateDB/CrateDB.jsx @@ -0,0 +1,76 @@ +const SvgCrateDBIcon = 
(props) => ( + + CrateDB-Favicon + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +); + +export default SvgCrateDBIcon; diff --git a/src/frontend/src/icons/CrateDB/cratedb.svg b/src/frontend/src/icons/CrateDB/cratedb.svg new file mode 100644 index 000000000000..8209c16b4f1f --- /dev/null +++ b/src/frontend/src/icons/CrateDB/cratedb.svg @@ -0,0 +1,55 @@ + + CrateDB-Favicon + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/frontend/src/icons/CrateDB/index.tsx b/src/frontend/src/icons/CrateDB/index.tsx new file mode 100644 index 000000000000..7f296ef29398 --- /dev/null +++ b/src/frontend/src/icons/CrateDB/index.tsx @@ -0,0 +1,9 @@ +import React, { forwardRef } from "react"; +import SvgCrateDBIcon from "./CrateDB"; + +export const CrateDBIcon = forwardRef< + SVGSVGElement, + React.PropsWithChildren<{}> +>((props, ref) => { + return ; +}); diff --git a/src/frontend/src/icons/eagerIconImports.ts b/src/frontend/src/icons/eagerIconImports.ts index 5182adc54b36..46f044de197d 100644 --- a/src/frontend/src/icons/eagerIconImports.ts +++ b/src/frontend/src/icons/eagerIconImports.ts @@ -22,6 +22,7 @@ import { CohereIcon } from "@/icons/Cohere"; import { ComposioIcon } from "@/icons/Composio"; import { ConfluenceIcon } from "@/icons/Confluence"; import { CouchbaseIcon } from "@/icons/Couchbase"; +import { CrateDBIcon } from "@/icons/CrateDB"; import { CrewAiIcon } from "@/icons/CrewAI"; import { DeepSeekIcon } from "@/icons/DeepSeek"; import { DropboxIcon } from "@/icons/Dropbox"; @@ -139,6 +140,7 @@ export const eagerIconsMapping = { Composio: ComposioIcon, Confluence: ConfluenceIcon, Couchbase: CouchbaseIcon, + CrateDB: CrateDBIcon, CrewAI: CrewAiIcon, DeepSeek: DeepSeekIcon, Dropbox: DropboxIcon, diff --git a/src/frontend/src/icons/lazyIconImports.ts b/src/frontend/src/icons/lazyIconImports.ts index 
72947c66afba..3f4c33e8c0c4 100644 --- a/src/frontend/src/icons/lazyIconImports.ts +++ b/src/frontend/src/icons/lazyIconImports.ts @@ -64,6 +64,8 @@ export const lazyIconsMapping = { import("@/icons/Couchbase").then((mod) => ({ default: mod.CouchbaseIcon })), Claude: () => import("@/icons/Claude").then((mod) => ({ default: mod.ClaudeIcon })), + CrateDB: () => + import("@/icons/CrateDB").then((mod) => ({ default: mod.CrateDBIcon })), CrewAI: () => import("@/icons/CrewAI").then((mod) => ({ default: mod.CrewAiIcon })), Cursor: () =>