11 changes: 9 additions & 2 deletions .github/workflows/_comps-workflow.yml
@@ -74,10 +74,17 @@ jobs:
mode: ${{ inputs.mode }}
run: |
build_list=$(bash ${{ github.workspace }}/.github/workflows/scripts/get_cicd_list.sh "${mode}" ${docker_compose_path})
echo "build_list=${build_list}" >> $GITHUB_OUTPUT
echo "${build_list}"
if [ -z "${build_list}" ]; then
echo "empty=true" >> $GITHUB_OUTPUT
echo "${{ inputs.service }} have no ${mode} part."
else
echo "empty=false" >> $GITHUB_OUTPUT
echo "build_list=${build_list}" >> $GITHUB_OUTPUT
fi

- name: Build Image
if: ${{ fromJSON(inputs.build) && steps.get-yaml-path.outputs.file_exists == 'true' }}
if: ${{ fromJSON(inputs.build) && steps.get-yaml-path.outputs.file_exists == 'true' && steps.get-build-list.outputs.empty == 'false' }}
uses: opea-project/validation/actions/image-build@main
with:
work_dir: ${{ github.workspace }}
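In the hunk above, the get-build-list step now writes an "empty" flag to $GITHUB_OUTPUT, and the Build Image step is additionally gated on steps.get-build-list.outputs.empty == 'false', so the build is skipped when get_cicd_list.sh returns nothing for the selected mode. For reference, a step publishes outputs by appending key=value lines to the file named by $GITHUB_OUTPUT; below is a minimal Python sketch of that same pattern (build_list is a placeholder value, not taken from the workflow):

import os

build_list = ""  # placeholder; the real workflow gets this from get_cicd_list.sh

# Fall back to a local file so the sketch also runs outside a GitHub runner.
output_file = os.environ.get("GITHUB_OUTPUT", "github_output.txt")
with open(output_file, "a") as fh:
    if not build_list:
        fh.write("empty=true\n")
    else:
        fh.write("empty=false\n")
        fh.write(f"build_list={build_list}\n")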
5 changes: 2 additions & 3 deletions .github/workflows/_run-docker-compose.yml
@@ -54,9 +54,8 @@ jobs:
cd ${{ github.workspace }}/tests
test_cases=$(find . -type f -name "test_${service_l}*.sh")
for script in $test_cases; do
echo $script
if echo "$script" | grep -q "on"; then
hardware=$(echo $script | cut -d'/' -f3 | cut -d'.' -f1 | awk -F'on_' '{print $2}')
if echo "$script" | grep -q "_on"; then
hardware=$(echo $script | cut -d'/' -f3 | cut -d'.' -f1 | awk -F'_on_' '{print $2}')
else
hardware="intel_cpu"
fi
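The test-discovery fix above tightens the filename match from "on" to "_on" and the awk separator from 'on_' to '_on_', presumably so that service names which merely contain "on" (for example "pinecone") no longer trip the hardware-suffix parsing. The same logic, sketched in Python with hypothetical script paths:

def hardware_for(script_path: str) -> str:
    # Mirrors: cut -d'/' -f3 | cut -d'.' -f1 | awk -F'_on_' '{print $2}'
    name = script_path.split("/")[2].split(".")[0]
    if "_on_" in name:  # the workflow greps for "_on"; checking "_on_" here is a simplification
        return name.split("_on_")[1]
    return "intel_cpu"

print(hardware_for("./dataprep/test_dataprep_redis_on_intel_hpu.sh"))  # -> intel_hpu
print(hardware_for("./dataprep/test_dataprep_pinecone.sh"))            # -> intel_cpu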
15 changes: 7 additions & 8 deletions comps/dataprep/milvus/langchain/prepare_doc_milvus.py
@@ -30,7 +30,7 @@
encode_filename,
get_separators,
get_tables_result,
parse_html,
parse_html_new,
remove_folder_with_ignore,
save_content_to_local_disk,
)
@@ -39,17 +39,16 @@
logflag = os.getenv("LOGFLAG", False)

# workaround notes: cp comps/dataprep/utils.py ./milvus/utils.py
# from utils import document_loader, get_tables_result, parse_html
index_params = {"index_type": "FLAT", "metric_type": "IP", "params": {}}
partition_field_name = "filename"
upload_folder = "./uploaded_files/"
milvus_uri = f"http://{MILVUS_HOST}:{MILVUS_PORT}"


class MosecEmbeddings(OpenAIEmbeddings):
def _get_len_safe_embeddings(
self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
) -> List[List[float]]:
_chunk_size = chunk_size or self.chunk_size
batched_embeddings: List[List[float]] = []
response = self.client.create(input=texts, **self._invocation_params)
if not isinstance(response, dict):
@@ -93,7 +92,7 @@ def ingest_chunks_to_milvus(file_name: str, chunks: List):
batch_docs,
embeddings,
collection_name=COLLECTION_NAME,
connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
connection_args={"uri": milvus_uri},
partition_key_field=partition_field_name,
)
except Exception as e:
@@ -211,7 +210,7 @@ async def ingest_documents(
my_milvus = Milvus(
embedding_function=embeddings,
collection_name=COLLECTION_NAME,
connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
connection_args={"uri": milvus_uri},
index_params=index_params,
auto_id=True,
)
@@ -318,7 +317,7 @@ async def ingest_documents(
)

save_path = upload_folder + encoded_link + ".txt"
content = parse_html([link])[0][0]
content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
await save_content_to_local_disk(save_path, content)
ingest_data_to_milvus(
DocPath(
@@ -347,7 +346,7 @@ async def rag_get_file_structure():
my_milvus = Milvus(
embedding_function=embeddings,
collection_name=COLLECTION_NAME,
connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
connection_args={"uri": milvus_uri},
index_params=index_params,
auto_id=True,
)
@@ -405,7 +404,7 @@ async def delete_single_file(file_path: str = Body(..., embed=True)):
my_milvus = Milvus(
embedding_function=embeddings,
collection_name=COLLECTION_NAME,
connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
connection_args={"uri": milvus_uri},
index_params=index_params,
auto_id=True,
)
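The prepare_doc_milvus.py hunks above all make the same connection change: the separate "host"/"port" entries in connection_args are replaced by a single pre-built URI (milvus_uri). A minimal sketch of the resulting pattern, assuming MILVUS_HOST/MILVUS_PORT come from the environment and langchain_community is installed; the embedding model and collection name are placeholders, not taken from this PR:

import os

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Milvus

MILVUS_HOST = os.getenv("MILVUS_HOST", "localhost")
MILVUS_PORT = int(os.getenv("MILVUS_PORT", 19530))
milvus_uri = f"http://{MILVUS_HOST}:{MILVUS_PORT}"  # one URI instead of host/port pairs

index_params = {"index_type": "FLAT", "metric_type": "IP", "params": {}}
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")  # placeholder model

store = Milvus(
    embedding_function=embeddings,
    collection_name="rag_milvus",  # placeholder collection
    connection_args={"uri": milvus_uri},
    index_params=index_params,
    auto_id=True,
)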
1 change: 1 addition & 0 deletions comps/dataprep/milvus/langchain/requirements.txt
@@ -4,6 +4,7 @@ docarray[full]
docx2txt
easyocr
fastapi
html2text
huggingface_hub
langchain
langchain-community
4 changes: 2 additions & 2 deletions comps/dataprep/neo4j/llama_index/extract_graph_neo4j.py
@@ -48,7 +48,7 @@
encode_filename,
get_separators,
get_tables_result,
parse_html,
parse_html_new,
save_content_to_local_disk,
)

@@ -654,7 +654,7 @@ async def ingest_documents(
for link in link_list:
encoded_link = encode_filename(link)
save_path = upload_folder + encoded_link + ".txt"
content = parse_html([link])[0][0]
content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
try:
await save_content_to_local_disk(save_path, content)
index = ingest_data_to_neo4j(
1 change: 1 addition & 0 deletions comps/dataprep/neo4j/llama_index/requirements.txt
@@ -6,6 +6,7 @@ easyocr
fastapi
future
graspologic
html2text
huggingface_hub
ipython
langchain
4 changes: 2 additions & 2 deletions comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
@@ -21,7 +21,7 @@
encode_filename,
get_file_structure,
get_separators,
parse_html,
parse_html_new,
remove_folder_with_ignore,
save_content_to_local_disk,
)
@@ -158,7 +158,7 @@ async def ingest_link_to_pgvector(link_list: List[str]):

for link in link_list:
texts = []
content = parse_html([link])[0][0]
content = parse_html_new([link], chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
if logflag:
logger.info(f"[ ingest link ] link: {link} content: {content}")
encoded_link = encode_filename(link)
1 change: 1 addition & 0 deletions comps/dataprep/pgvector/langchain/requirements.txt
@@ -4,6 +4,7 @@ docarray[full]
docx2txt
easyocr
fastapi
html2text
huggingface_hub
langchain
langchain-community
8 changes: 4 additions & 4 deletions comps/dataprep/pinecone/langchain/prepare_doc_pinecone.py
@@ -24,7 +24,7 @@
get_file_structure,
get_separators,
get_tables_result,
parse_html,
parse_html_new,
remove_folder_with_ignore,
save_content_to_local_disk,
)
@@ -158,7 +158,7 @@ def ingest_data_to_pinecone(doc_path: DocPath):
pc = Pinecone(api_key=PINECONE_API_KEY)


async def ingest_link_to_pinecone(link_list: List[str]):
async def ingest_link_to_pinecone(link_list: List[str], chunk_size, chunk_overlap):
# Create embedding obj
if tei_embedding_endpoint:
# create embeddings using TEI endpoint service
@@ -178,7 +178,7 @@ async def ingest_link_to_pinecone(link_list: List[str]):

# save link contents and doc_ids one by one
for link in link_list:
content = parse_html([link])[0][0]
content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
if logflag:
logger.info(f"[ ingest link ] link: {link} content: {content}")
encoded_link = encode_filename(link)
@@ -239,7 +239,7 @@ async def ingest_documents(
link_list = json.loads(link_list) # Parse JSON string to list
if not isinstance(link_list, list):
raise HTTPException(status_code=400, detail="link_list should be a list.")
await ingest_link_to_pinecone(link_list)
await ingest_link_to_pinecone(link_list, chunk_size, chunk_overlap)
result = {"status": 200, "message": "Data preparation succeeded"}
if logflag:
logger.info(f"Successfully saved link list {link_list}")
1 change: 1 addition & 0 deletions comps/dataprep/pinecone/langchain/requirements.txt
@@ -4,6 +4,7 @@ docarray[full]
docx2txt
easyocr
fastapi
html2text
huggingface_hub
langchain
langchain-community
4 changes: 2 additions & 2 deletions comps/dataprep/qdrant/langchain/prepare_doc_qdrant.py
@@ -19,7 +19,7 @@
encode_filename,
get_separators,
get_tables_result,
parse_html,
parse_html_new,
save_content_to_local_disk,
)

@@ -149,7 +149,7 @@ async def ingest_documents(
for link in link_list:
encoded_link = encode_filename(link)
save_path = upload_folder + encoded_link + ".txt"
content = parse_html([link])[0][0]
content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
try:
await save_content_to_local_disk(save_path, content)
ingest_data_to_qdrant(
1 change: 1 addition & 0 deletions comps/dataprep/qdrant/langchain/requirements.txt
@@ -4,6 +4,7 @@ docarray[full]
docx2txt
easyocr
fastapi
html2text
huggingface_hub
langchain
langchain-community
4 changes: 2 additions & 2 deletions comps/dataprep/redis/langchain/prepare_doc_redis.py
@@ -26,7 +26,7 @@
format_search_results,
get_separators,
get_tables_result,
parse_html,
parse_html_new,
remove_folder_with_ignore,
save_content_to_local_disk,
)
@@ -320,7 +320,7 @@ async def ingest_documents(
)

save_path = upload_folder + encoded_link + ".txt"
content = parse_html([link])[0][0]
content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
await save_content_to_local_disk(save_path, content)
ingest_data_to_redis(
DocPath(
1 change: 1 addition & 0 deletions comps/dataprep/redis/langchain/requirements.txt
@@ -4,6 +4,7 @@ docarray[full]
docx2txt
easyocr
fastapi
html2text
huggingface_hub
langchain
langchain-community
@@ -48,7 +48,7 @@
encode_filename,
get_file_structure,
get_separators,
parse_html,
parse_html_new,
remove_folder_with_ignore,
save_content_to_local_disk,
timeout,
@@ -255,7 +255,7 @@ def ingest_link_to_redis(link_list: List[str], enable_ray=False, num_cpus=20):
link_list = [str(f) for f in link_list]

def _parse_html(link):
data = parse_html([link])
data = parse_html_new([link], chunk_size=1500, chunk_overlap=100)
return data[0][0]

if enable_ray:
1 change: 1 addition & 0 deletions comps/dataprep/redis/langchain_ray/requirements.txt
@@ -4,6 +4,7 @@ docarray[full]
docx2txt
easyocr
fastapi
html2text
huggingface_hub
langchain
langchain-community
22 changes: 22 additions & 0 deletions comps/dataprep/utils.py
@@ -620,6 +620,28 @@ def parse_html(input):
return chucks


def load_html_content(links, chunk_size=1500, chunk_overlap=50):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_community.document_loaders import AsyncHtmlLoader
    from langchain_community.document_transformers import Html2TextTransformer

    loader = AsyncHtmlLoader(links, ignore_load_errors=True, trust_env=True)
    docs = loader.load()
    html2text = Html2TextTransformer()
    docs = list(html2text.transform_documents(docs))
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(docs)
    return docs


def parse_html_new(input, chunk_size, chunk_overlap):
    docs = load_html_content(input, chunk_size, chunk_overlap)
    html_content = ""
    for doc in docs:
        html_content += doc.page_content + "\n"
    return html_content


def get_tables_result(pdf_path, table_strategy):
"""Extract tables information from pdf file."""
if table_strategy == "fast":
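The load_html_content and parse_html_new helpers added to utils.py above are what the dataprep backends now call instead of parse_html: load_html_content fetches the pages with AsyncHtmlLoader, converts them to plain text with Html2TextTransformer (hence the new html2text requirement), and splits them into chunks, while parse_html_new joins those chunks back into a single string for saving to disk. A minimal usage sketch, assuming the comps package is importable and the PR's langchain/html2text dependencies are installed (the URL is only an example):

from comps.dataprep.utils import load_html_content, parse_html_new

links = ["https://example.com/blog-post"]  # example URL

# Chunked langchain Documents, useful if the caller wants to embed each chunk directly.
docs = load_html_content(links, chunk_size=1500, chunk_overlap=100)
print(len(docs), docs[0].page_content[:80])

# One concatenated string, which is what the dataprep services write to disk before ingesting.
content = parse_html_new(links, chunk_size=1500, chunk_overlap=100)
print(content[:200])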
4 changes: 2 additions & 2 deletions comps/dataprep/vdms/langchain/prepare_doc_vdms.py
@@ -19,7 +19,7 @@
encode_filename,
get_separators,
get_tables_result,
parse_html,
parse_html_new,
save_content_to_local_disk,
)

@@ -143,7 +143,7 @@ async def ingest_documents(
# check whether the link file already exists

save_path = upload_folder + encoded_link + ".txt"
content = parse_html([link])[0][0]
content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
await save_content_to_local_disk(save_path, content)
ingest_data_to_vdms(
DocPath(
1 change: 1 addition & 0 deletions comps/dataprep/vdms/langchain/requirements.txt
@@ -6,6 +6,7 @@ docx2txt
easyocr
einops
fastapi
html2text
huggingface_hub
langchain
langchain-community