-
Notifications
You must be signed in to change notification settings - Fork 2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
120 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
--- | ||
title: '📄 Excel file' | ||
--- | ||
|
||
### Excel file | ||
|
||
To add an `.xlsx` or `.xls` file, set `data_type` to `excel_file`. Both remote URLs and local file paths are supported. For example: | ||
|
||
```python | ||
from embedchain import App | ||
|
||
app = App() | ||
app.add('https://example.com/content/intro.xlsx', data_type="excel_file") | ||
# Or add file using the local file path on your system | ||
# app.add('content/intro.xls', data_type="excel_file") | ||
|
||
app.query("Give brief information about data.") | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
from typing import Optional | ||
|
||
from langchain.text_splitter import RecursiveCharacterTextSplitter | ||
|
||
from embedchain.chunkers.base_chunker import BaseChunker | ||
from embedchain.config.add_config import ChunkerConfig | ||
from embedchain.helpers.json_serializable import register_deserializable | ||
|
||
|
||
@register_deserializable
class ExcelFileChunker(BaseChunker):
    """Chunker for text extracted from Excel files.

    Wraps a ``RecursiveCharacterTextSplitter`` whose settings come from a
    ``ChunkerConfig``.
    """

    def __init__(self, config: Optional[ChunkerConfig] = None):
        """Build the text splitter and hand it to the base chunker.

        Args:
            config: Chunking settings. When omitted, a default of
                ``chunk_size=1000``, ``chunk_overlap=0`` and
                ``length_function=len`` is used.
        """
        settings = (
            ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)
            if config is None
            else config
        )
        splitter_options = {
            "chunk_size": settings.chunk_size,
            "chunk_overlap": settings.chunk_overlap,
            "length_function": settings.length_function,
        }
        super().__init__(RecursiveCharacterTextSplitter(**splitter_options))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import hashlib | ||
import importlib.util | ||
|
||
try: | ||
from langchain_community.document_loaders import UnstructuredExcelLoader | ||
except ImportError: | ||
raise ImportError( | ||
'Excel file requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`' | ||
) from None | ||
|
||
if importlib.util.find_spec("openpyxl") is None and importlib.util.find_spec("xlrd") is None: | ||
raise ImportError("Excel file requires extra dependencies. Install with `pip install openpyxl xlrd`") from None | ||
|
||
from embedchain.helpers.json_serializable import register_deserializable | ||
from embedchain.loaders.base_loader import BaseLoader | ||
from embedchain.utils.misc import clean_string | ||
|
||
|
||
@register_deserializable
class ExcelFileLoader(BaseLoader):
    """Loader that turns an Excel workbook into cleaned text records."""

    def load_data(self, excel_url):
        """Load data from an Excel file.

        Args:
            excel_url: Location of the workbook. It is passed unchanged to
                ``UnstructuredExcelLoader`` and stored under the ``"url"``
                key of every record's metadata.

        Returns:
            A dict with two keys:
            ``"doc_id"`` - SHA-256 hex digest of the last record's cleaned
            content concatenated with ``excel_url`` (just ``excel_url`` when
            nothing was loaded);
            ``"data"`` - a list of ``{"content", "meta_data"}`` dicts, one
            per page returned by the underlying loader.
        """
        loader = UnstructuredExcelLoader(excel_url)
        pages = loader.load_and_split()

        # Pre-bind so a workbook that yields no pages still produces a
        # deterministic doc_id instead of raising UnboundLocalError below.
        content = ""
        data = []
        for page in pages:
            content = clean_string(page.page_content)

            # The page's own metadata dict is updated in place and reused.
            metadata = page.metadata
            metadata["url"] = excel_url

            data.append({"content": content, "meta_data": metadata})

        # Hash "last page content + url", exactly as before for non-empty
        # inputs, so previously computed ids remain the same.
        doc_id = hashlib.sha256((content + excel_url).encode()).hexdigest()
        return {
            "doc_id": doc_id,
            "data": data,
        }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import hashlib | ||
from unittest.mock import patch | ||
|
||
import pytest | ||
|
||
from embedchain.loaders.excel_file import ExcelFileLoader | ||
|
||
|
||
@pytest.fixture
def excel_file_loader():
    """Provide an ``ExcelFileLoader`` instance to tests that request it."""
    loader = ExcelFileLoader()
    return loader
|
||
|
||
def test_load_data(excel_file_loader):
    """Run the real ``load_data`` against a stubbed Excel reader and check its output."""
    # Local import: only needed here to fake a loaded page object.
    from types import SimpleNamespace

    mock_url = "mock_excel_file.xlsx"
    expected_content = "Sample Excel Content"
    fake_page = SimpleNamespace(page_content=expected_content, metadata={})

    # Patch the loader's collaborators, not load_data itself: patching the
    # method under test would only assert on the mock's canned return value.
    # clean_string is stubbed to identity so the expected content is exact.
    with patch("embedchain.loaders.excel_file.UnstructuredExcelLoader") as mock_reader_cls, patch(
        "embedchain.loaders.excel_file.clean_string", side_effect=lambda text: text
    ):
        mock_reader_cls.return_value.load_and_split.return_value = [fake_page]
        result = excel_file_loader.load_data(mock_url)

    # The reader must be constructed with the location given to load_data.
    mock_reader_cls.assert_called_once_with(mock_url)

    assert len(result["data"]) == 1
    assert result["data"][0]["content"] == expected_content
    assert result["data"][0]["meta_data"]["url"] == mock_url

    expected_doc_id = hashlib.sha256((expected_content + mock_url).encode()).hexdigest()
    assert result["doc_id"] == expected_doc_id