@register_deserializable
class ExcelFileChunker(BaseChunker):
    """Chunker used for text extracted from Excel (xlsx/xls) files.

    Builds a ``RecursiveCharacterTextSplitter`` from the supplied
    ``ChunkerConfig`` and hands it to ``BaseChunker``.
    """

    def __init__(self, config: Optional[ChunkerConfig] = None):
        """Create the text splitter and initialise the base chunker.

        Args:
            config: Chunking settings. When ``None``, a default config with
                ``chunk_size=1000``, ``chunk_overlap=0`` and
                ``length_function=len`` is used.
        """
        settings = config
        if settings is None:
            settings = ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)

        splitter_options = {
            "chunk_size": settings.chunk_size,
            "chunk_overlap": settings.chunk_overlap,
            "length_function": settings.length_function,
        }
        super().__init__(RecursiveCharacterTextSplitter(**splitter_options))
loader: Optiona DataType.SLACK: "embedchain.loaders.slack.SlackLoader", DataType.DROPBOX: "embedchain.loaders.dropbox.DropboxLoader", DataType.TEXT_FILE: "embedchain.loaders.text_file.TextFileLoader", + DataType.EXCEL_FILE: "embedchain.loaders.excel_file.ExcelFileLoader", } if data_type == DataType.CUSTOM or loader is not None: @@ -127,6 +128,7 @@ def _get_chunker(self, data_type: DataType, config: ChunkerConfig, chunker: Opti DataType.SLACK: "embedchain.chunkers.common_chunker.CommonChunker", DataType.DROPBOX: "embedchain.chunkers.common_chunker.CommonChunker", DataType.TEXT_FILE: "embedchain.chunkers.common_chunker.CommonChunker", + DataType.EXCEL_FILE: "embedchain.chunkers.excel_file.ExcelFileChunker", } if chunker is not None: diff --git a/embedchain/loaders/excel_file.py b/embedchain/loaders/excel_file.py new file mode 100644 index 0000000000..beab18dd57 --- /dev/null +++ b/embedchain/loaders/excel_file.py @@ -0,0 +1,40 @@ +import hashlib +import importlib.util + +try: + from langchain_community.document_loaders import UnstructuredExcelLoader +except ImportError: + raise ImportError( + 'Excel file requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`' + ) from None + +if importlib.util.find_spec("openpyxl") is None and importlib.util.find_spec("xlrd") is None: + raise ImportError("Excel file requires extra dependencies. 
@register_deserializable
class ExcelFileLoader(BaseLoader):
    """Loader that extracts text from Excel (xlsx/xls) files."""

    def load_data(self, excel_url):
        """Load data from an Excel file.

        Args:
            excel_url: Location of the workbook; passed unchanged to
                ``UnstructuredExcelLoader``.

        Returns:
            A dict with two keys:
                ``doc_id``: SHA-256 hex digest of all cleaned content (joined
                    with single spaces) followed by ``excel_url``.
                ``data``: one ``{"content", "meta_data"}`` entry per document
                    returned by ``load_and_split()``; empty when the loader
                    yields nothing.
        """
        loader = UnstructuredExcelLoader(excel_url)
        pages = loader.load_and_split()

        data = []
        all_content = []
        for page in pages:
            content = clean_string(page.page_content)

            # The loader's own metadata dict is reused (and mutated) so any
            # fields it already carries are preserved alongside the url.
            metadata = page.metadata
            metadata["url"] = excel_url

            data.append({"content": content, "meta_data": metadata})
            all_content.append(content)

        # Hash every page, not just the last one, so the id reflects the whole
        # document and an empty result no longer hits an unbound `content`.
        # For a single page this yields the same digest as hashing that page.
        doc_id = hashlib.sha256((" ".join(all_content) + excel_url).encode()).hexdigest()
        return {
            "doc_id": doc_id,
            "data": data,
        }
@pytest.fixture
def excel_file_loader():
    """Provide a fresh ``ExcelFileLoader`` instance for each test."""
    return ExcelFileLoader()


def test_load_data(excel_file_loader):
    """Run the real ``load_data`` against a stubbed ``UnstructuredExcelLoader``.

    Only the third-party loader is replaced; patching ``load_data`` itself
    would make the assertions check nothing but the mock's canned return value.
    """
    # Local import: the module-level imports only bring in `patch`.
    from unittest.mock import MagicMock

    mock_url = "mock_excel_file.xlsx"
    # NOTE(review): assumes clean_string leaves this plain text unchanged — confirm.
    expected_content = "Sample Excel Content"

    # metadata is a real dict so load_data can add the "url" key to it.
    fake_page = MagicMock(page_content=expected_content, metadata={"source": mock_url})

    # Patch the name where the loader module looks it up, not where it is defined.
    with patch("embedchain.loaders.excel_file.UnstructuredExcelLoader") as mock_loader_cls:
        mock_loader_cls.return_value.load_and_split.return_value = [fake_page]
        result = excel_file_loader.load_data(mock_url)

    mock_loader_cls.assert_called_once_with(mock_url)

    assert len(result["data"]) == 1
    assert result["data"][0]["content"] == expected_content
    assert result["data"][0]["meta_data"]["url"] == mock_url

    expected_doc_id = hashlib.sha256((expected_content + mock_url).encode()).hexdigest()
    assert result["doc_id"] == expected_doc_id