11 changes: 9 additions & 2 deletions .github/workflows/_comps-workflow.yml
@@ -74,10 +74,17 @@ jobs:
mode: ${{ inputs.mode }}
run: |
build_list=$(bash ${{ github.workspace }}/.github/workflows/scripts/get_cicd_list.sh "${mode}" ${docker_compose_path})
echo "build_list=${build_list}" >> $GITHUB_OUTPUT
echo "${build_list}"
if [ -z "${build_list}" ]; then
echo "empty=true" >> $GITHUB_OUTPUT
echo "${{ inputs.service }} have no ${mode} part."
else
echo "empty=false" >> $GITHUB_OUTPUT
echo "build_list=${build_list}" >> $GITHUB_OUTPUT
fi

- name: Build Image
if: ${{ fromJSON(inputs.build) && steps.get-yaml-path.outputs.file_exists == 'true' }}
if: ${{ fromJSON(inputs.build) && steps.get-yaml-path.outputs.file_exists == 'true' && steps.get-build-list.outputs.empty == 'false' }}
uses: opea-project/validation/actions/image-build@main
with:
work_dir: ${{ github.workspace }}
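In the hunk above, the get-build-list step now writes an "empty" flag to $GITHUB_OUTPUT, and the Build Image step is additionally gated on steps.get-build-list.outputs.empty == 'false', so the build is skipped when get_cicd_list.sh returns nothing for the selected mode. For reference, a step publishes outputs by appending key=value lines to the file named by $GITHUB_OUTPUT; below is a minimal Python sketch of that same pattern (build_list is a placeholder value, not taken from the workflow):

import os

build_list = ""  # placeholder; the real workflow gets this from get_cicd_list.sh

# Fall back to a local file so the sketch also runs outside a GitHub runner.
output_file = os.environ.get("GITHUB_OUTPUT", "github_output.txt")
with open(output_file, "a") as fh:
    if not build_list:
        fh.write("empty=true\n")
    else:
        fh.write("empty=false\n")
        fh.write(f"build_list={build_list}\n")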
5 changes: 2 additions & 3 deletions .github/workflows/_run-docker-compose.yml
@@ -54,9 +54,8 @@ jobs:
cd ${{ github.workspace }}/tests
test_cases=$(find . -type f -name "test_${service_l}*.sh")
for script in $test_cases; do
echo $script
if echo "$script" | grep -q "on"; then
hardware=$(echo $script | cut -d'/' -f3 | cut -d'.' -f1 | awk -F'on_' '{print $2}')
if echo "$script" | grep -q "_on"; then
hardware=$(echo $script | cut -d'/' -f3 | cut -d'.' -f1 | awk -F'_on_' '{print $2}')
else
hardware="intel_cpu"
fi
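The test-discovery fix above tightens the filename match from "on" to "_on" and the awk separator from 'on_' to '_on_', presumably so that service names which merely contain "on" (for example "pinecone") no longer trip the hardware-suffix parsing. The same logic, sketched in Python with hypothetical script paths:

def hardware_for(script_path: str) -> str:
    # Mirrors: cut -d'/' -f3 | cut -d'.' -f1 | awk -F'_on_' '{print $2}'
    name = script_path.split("/")[2].split(".")[0]
    if "_on_" in name:  # the workflow greps for "_on"; checking "_on_" here is a simplification
        return name.split("_on_")[1]
    return "intel_cpu"

print(hardware_for("./dataprep/test_dataprep_redis_on_intel_hpu.sh"))  # -> intel_hpu
print(hardware_for("./dataprep/test_dataprep_pinecone.sh"))            # -> intel_cpu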
15 changes: 7 additions & 8 deletions comps/dataprep/milvus/langchain/prepare_doc_milvus.py
@@ -30,7 +30,7 @@
encode_filename,
get_separators,
get_tables_result,
parse_html,
parse_html_new,
remove_folder_with_ignore,
save_content_to_local_disk,
)
@@ -39,17 +39,16 @@
logflag = os.getenv("LOGFLAG", False)

# workaround notes: cp comps/dataprep/utils.py ./milvus/utils.py
# from utils import document_loader, get_tables_result, parse_html
index_params = {"index_type": "FLAT", "metric_type": "IP", "params": {}}
partition_field_name = "filename"
upload_folder = "./uploaded_files/"
milvus_uri = f"http://{MILVUS_HOST}:{MILVUS_PORT}"


class MosecEmbeddings(OpenAIEmbeddings):
def _get_len_safe_embeddings(
self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
) -> List[List[float]]:
_chunk_size = chunk_size or self.chunk_size
batched_embeddings: List[List[float]] = []
response = self.client.create(input=texts, **self._invocation_params)
if not isinstance(response, dict):
@@ -93,7 +92,7 @@ def ingest_chunks_to_milvus(file_name: str, chunks: List):
batch_docs,
embeddings,
collection_name=COLLECTION_NAME,
connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
connection_args={"uri": milvus_uri},
partition_key_field=partition_field_name,
)
except Exception as e:
@@ -211,7 +210,7 @@ async def ingest_documents(
my_milvus = Milvus(
embedding_function=embeddings,
collection_name=COLLECTION_NAME,
connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
connection_args={"uri": milvus_uri},
index_params=index_params,
auto_id=True,
)
@@ -318,7 +317,7 @@ async def ingest_documents(
)

save_path = upload_folder + encoded_link + ".txt"
content = parse_html([link])[0][0]
content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
await save_content_to_local_disk(save_path, content)
ingest_data_to_milvus(
DocPath(
@@ -347,7 +346,7 @@ async def rag_get_file_structure():
my_milvus = Milvus(
embedding_function=embeddings,
collection_name=COLLECTION_NAME,
connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
connection_args={"uri": milvus_uri},
index_params=index_params,
auto_id=True,
)
@@ -405,7 +404,7 @@ async def delete_single_file(file_path: str = Body(..., embed=True)):
my_milvus = Milvus(
embedding_function=embeddings,
collection_name=COLLECTION_NAME,
connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
connection_args={"uri": milvus_uri},
index_params=index_params,
auto_id=True,
)
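The prepare_doc_milvus.py hunks above all make the same connection change: the separate "host"/"port" entries in connection_args are replaced by a single pre-built URI (milvus_uri). A minimal sketch of the resulting pattern, assuming MILVUS_HOST/MILVUS_PORT come from the environment and langchain_community is installed; the embedding model and collection name are placeholders, not taken from this PR:

import os

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Milvus

MILVUS_HOST = os.getenv("MILVUS_HOST", "localhost")
MILVUS_PORT = int(os.getenv("MILVUS_PORT", 19530))
milvus_uri = f"http://{MILVUS_HOST}:{MILVUS_PORT}"  # one URI instead of host/port pairs

index_params = {"index_type": "FLAT", "metric_type": "IP", "params": {}}
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")  # placeholder model

store = Milvus(
    embedding_function=embeddings,
    collection_name="rag_milvus",  # placeholder collection
    connection_args={"uri": milvus_uri},
    index_params=index_params,
    auto_id=True,
)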
1 change: 1 addition & 0 deletions comps/dataprep/milvus/langchain/requirements.txt
@@ -4,6 +4,7 @@ docarray[full]
docx2txt
easyocr
fastapi
html2text
huggingface_hub
langchain
langchain-community
4 changes: 2 additions & 2 deletions comps/dataprep/neo4j/llama_index/extract_graph_neo4j.py
@@ -48,7 +48,7 @@
encode_filename,
get_separators,
get_tables_result,
parse_html,
parse_html_new,
save_content_to_local_disk,
)

@@ -654,7 +654,7 @@ async def ingest_documents(
for link in link_list:
encoded_link = encode_filename(link)
save_path = upload_folder + encoded_link + ".txt"
content = parse_html([link])[0][0]
content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
try:
await save_content_to_local_disk(save_path, content)
index = ingest_data_to_neo4j(
1 change: 1 addition & 0 deletions comps/dataprep/neo4j/llama_index/requirements.txt
@@ -6,6 +6,7 @@ easyocr
fastapi
future
graspologic
html2text
huggingface_hub
ipython
langchain
4 changes: 2 additions & 2 deletions comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
@@ -21,7 +21,7 @@
encode_filename,
get_file_structure,
get_separators,
parse_html,
parse_html_new,
remove_folder_with_ignore,
save_content_to_local_disk,
)
@@ -158,7 +158,7 @@ async def ingest_link_to_pgvector(link_list: List[str]):

for link in link_list:
texts = []
content = parse_html([link])[0][0]
content = parse_html_new([link], chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
if logflag:
logger.info(f"[ ingest link ] link: {link} content: {content}")
encoded_link = encode_filename(link)
1 change: 1 addition & 0 deletions comps/dataprep/pgvector/langchain/requirements.txt
@@ -4,6 +4,7 @@ docarray[full]
docx2txt
easyocr
fastapi
html2text
huggingface_hub
langchain
langchain-community
8 changes: 4 additions & 4 deletions comps/dataprep/pinecone/langchain/prepare_doc_pinecone.py
@@ -24,7 +24,7 @@
get_file_structure,
get_separators,
get_tables_result,
parse_html,
parse_html_new,
remove_folder_with_ignore,
save_content_to_local_disk,
)
@@ -158,7 +158,7 @@ def ingest_data_to_pinecone(doc_path: DocPath):
pc = Pinecone(api_key=PINECONE_API_KEY)


async def ingest_link_to_pinecone(link_list: List[str]):
async def ingest_link_to_pinecone(link_list: List[str], chunk_size, chunk_overlap):
# Create embedding obj
if tei_embedding_endpoint:
# create embeddings using TEI endpoint service
@@ -178,7 +178,7 @@ async def ingest_link_to_pinecone(link_list: List[str]):

# save link contents and doc_ids one by one
for link in link_list:
content = parse_html([link])[0][0]
content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
if logflag:
logger.info(f"[ ingest link ] link: {link} content: {content}")
encoded_link = encode_filename(link)
@@ -239,7 +239,7 @@ async def ingest_documents(
link_list = json.loads(link_list) # Parse JSON string to list
if not isinstance(link_list, list):
raise HTTPException(status_code=400, detail="link_list should be a list.")
await ingest_link_to_pinecone(link_list)
await ingest_link_to_pinecone(link_list, chunk_size, chunk_overlap)
result = {"status": 200, "message": "Data preparation succeeded"}
if logflag:
logger.info(f"Successfully saved link list {link_list}")
1 change: 1 addition & 0 deletions comps/dataprep/pinecone/langchain/requirements.txt
@@ -4,6 +4,7 @@ docarray[full]
docx2txt
easyocr
fastapi
html2text
huggingface_hub
langchain
langchain-community
4 changes: 2 additions & 2 deletions comps/dataprep/qdrant/langchain/prepare_doc_qdrant.py
@@ -19,7 +19,7 @@
encode_filename,
get_separators,
get_tables_result,
parse_html,
parse_html_new,
save_content_to_local_disk,
)

@@ -149,7 +149,7 @@ async def ingest_documents(
for link in link_list:
encoded_link = encode_filename(link)
save_path = upload_folder + encoded_link + ".txt"
content = parse_html([link])[0][0]
content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
try:
await save_content_to_local_disk(save_path, content)
ingest_data_to_qdrant(
1 change: 1 addition & 0 deletions comps/dataprep/qdrant/langchain/requirements.txt
@@ -4,6 +4,7 @@ docarray[full]
docx2txt
easyocr
fastapi
html2text
huggingface_hub
langchain
langchain-community
4 changes: 2 additions & 2 deletions comps/dataprep/redis/langchain/prepare_doc_redis.py
@@ -26,7 +26,7 @@
format_search_results,
get_separators,
get_tables_result,
parse_html,
parse_html_new,
remove_folder_with_ignore,
save_content_to_local_disk,
)
@@ -320,7 +320,7 @@ async def ingest_documents(
)

save_path = upload_folder + encoded_link + ".txt"
content = parse_html([link])[0][0]
content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
await save_content_to_local_disk(save_path, content)
ingest_data_to_redis(
DocPath(
1 change: 1 addition & 0 deletions comps/dataprep/redis/langchain/requirements.txt
@@ -4,6 +4,7 @@ docarray[full]
docx2txt
easyocr
fastapi
html2text
huggingface_hub
langchain
langchain-community
@@ -48,7 +48,7 @@
encode_filename,
get_file_structure,
get_separators,
parse_html,
parse_html_new,
remove_folder_with_ignore,
save_content_to_local_disk,
timeout,
@@ -255,7 +255,7 @@ def ingest_link_to_redis(link_list: List[str], enable_ray=False, num_cpus=20):
link_list = [str(f) for f in link_list]

def _parse_html(link):
data = parse_html([link])
data = parse_html_new([link], chunk_size=1500, chunk_overlap=100)
return data[0][0]

if enable_ray:
1 change: 1 addition & 0 deletions comps/dataprep/redis/langchain_ray/requirements.txt
@@ -4,6 +4,7 @@ docarray[full]
docx2txt
easyocr
fastapi
html2text
huggingface_hub
langchain
langchain-community
22 changes: 22 additions & 0 deletions comps/dataprep/utils.py
@@ -620,6 +620,28 @@ def parse_html(input):
return chucks


def load_html_content(links, chunk_size=1500, chunk_overlap=50):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_community.document_loaders import AsyncHtmlLoader
    from langchain_community.document_transformers import Html2TextTransformer

    loader = AsyncHtmlLoader(links, ignore_load_errors=True, trust_env=True)
    docs = loader.load()
    html2text = Html2TextTransformer()
    docs = list(html2text.transform_documents(docs))
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(docs)
    return docs


def parse_html_new(input, chunk_size, chunk_overlap):
    docs = load_html_content(input, chunk_size, chunk_overlap)
    html_content = ""
    for doc in docs:
        html_content += doc.page_content + "\n"
    return html_content


def get_tables_result(pdf_path, table_strategy):
"""Extract tables information from pdf file."""
if table_strategy == "fast":
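The load_html_content and parse_html_new helpers added to utils.py above are what the dataprep backends now call instead of parse_html: load_html_content fetches the pages with AsyncHtmlLoader, converts them to plain text with Html2TextTransformer (hence the new html2text requirement), and splits them into chunks, while parse_html_new joins those chunks back into a single string for saving to disk. A minimal usage sketch, assuming the comps package is importable and the PR's langchain/html2text dependencies are installed (the URL is only an example):

from comps.dataprep.utils import load_html_content, parse_html_new

links = ["https://example.com/blog-post"]  # example URL

# Chunked langchain Documents, useful if the caller wants to embed each chunk directly.
docs = load_html_content(links, chunk_size=1500, chunk_overlap=100)
print(len(docs), docs[0].page_content[:80])

# One concatenated string, which is what the dataprep services write to disk before ingesting.
content = parse_html_new(links, chunk_size=1500, chunk_overlap=100)
print(content[:200])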
4 changes: 2 additions & 2 deletions comps/dataprep/vdms/langchain/prepare_doc_vdms.py
@@ -19,7 +19,7 @@
encode_filename,
get_separators,
get_tables_result,
parse_html,
parse_html_new,
save_content_to_local_disk,
)

@@ -143,7 +143,7 @@ async def ingest_documents(
# check whether the link file already exists

save_path = upload_folder + encoded_link + ".txt"
content = parse_html([link])[0][0]
content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
await save_content_to_local_disk(save_path, content)
ingest_data_to_vdms(
DocPath(
1 change: 1 addition & 0 deletions comps/dataprep/vdms/langchain/requirements.txt
@@ -6,6 +6,7 @@ docx2txt
easyocr
einops
fastapi
html2text
huggingface_hub
langchain
langchain-community