Encapsulate LlamaParse to optimize the processing of chunks and metadata information.

alexhu-capix · alexhu-capix · commit 5e5618b8094d · 2024-07-15T21:03:43.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -7,6 +7,11 @@ __pycache__/
 
 .DS_Store
 
+chroma_dir
+diskcache_dir
+sqlite_dir
+web/download_dir
+
 # C extensions
 *.so
 
diff --git a/README.md b/README.md
@@ -91,13 +91,17 @@ USE_RERANKING=1
 USE_DEBUG=0
 USE_LLAMA_PARSE=0
 LLAMA_CLOUD_API_KEY="xxxx"
+USE_GPT4O=0
 ```
 
 - Don't modify **`LLM_NAME`**
 - Modify the **`OPENAI_API_KEY`** with your own key. Please log in to the [OpenAI website](https://platform.openai.com/api-keys) to view your API Key.
 - Update the **`GPT_MODEL_NAME`** setting, replacing `gpt-3.5-turbo` with `gpt-4-turbo` or `gpt-4o` if you want to use GPT-4.
 - Change **`BOT_TOPIC`** to reflect your Bot's name. This is very important, as it will be used in `Prompt Construction`. Please try to use a concise and clear word, such as `OpenIM`, `LangChain`.
 - Adjust **`URL_PREFIX`** to match your website's domain. This is mainly for generating accessible URL links for uploaded local files. Such as `http://127.0.0.1:7000/web/download_dir/2024_05_20/d3a01d6a-90cd-4c2a-b926-9cda12466caf/openssl-cookbook.pdf`.
+- Set **`USE_LLAMA_PARSE`** to 1 if you want to use `LlamaParse`.
+- Modify the **`LLAMA_CLOUD_API_KEY`** with your own key. Please log in to the [LlamaCloud website](https://cloud.llamaindex.ai/api-key) to view your API Key.
+- Set **`USE_GPT4O`** to 1 if you want to use `GPT-4o` mode.
 - For more information about the meanings and usages of constants, you can check under the `server/constant` directory.
 
 #### Using ZhipuAI as the LLM base
@@ -130,6 +134,8 @@ LLAMA_CLOUD_API_KEY="xxxx"
 - Update the **`GLM_MODEL_NAME`** setting, the model list is `['glm-3-turbo', 'glm-4', 'glm-4-0520', 'glm-4-air', 'glm-4-airx', 'glm-4-flash']`.
 - Change **`BOT_TOPIC`** to reflect your Bot's name. This is very important, as it will be used in `Prompt Construction`. Please try to use a concise and clear word, such as `OpenIM`, `LangChain`.
 - Adjust **`URL_PREFIX`** to match your website's domain. This is mainly for generating accessible URL links for uploaded local files. Such as `http://127.0.0.1:7000/web/download_dir/2024_05_20/d3a01d6a-90cd-4c2a-b926-9cda12466caf/openssl-cookbook.pdf`.
+- Set **`USE_LLAMA_PARSE`** to 1 if you want to use `LlamaParse`.
+- Modify the **`LLAMA_CLOUD_API_KEY`** with your own key. Please log in to the [LlamaCloud website](https://cloud.llamaindex.ai/api-key) to view your API Key.
 - For more information about the meanings and usages of constants, you can check under the `server/constant` directory.
 
 #### Using DeepSeek as the LLM base
@@ -167,6 +173,8 @@ LLAMA_CLOUD_API_KEY="xxxx"
 - Update the **`DEEPSEEK_MODEL_NAME`** setting if you want to use other models of DeepSeek.
 - Change **`BOT_TOPIC`** to reflect your Bot's name. This is very important, as it will be used in `Prompt Construction`. Please try to use a concise and clear word, such as `OpenIM`, `LangChain`.
 - Adjust **`URL_PREFIX`** to match your website's domain. This is mainly for generating accessible URL links for uploaded local files. Such as `http://127.0.0.1:7000/web/download_dir/2024_05_20/d3a01d6a-90cd-4c2a-b926-9cda12466caf/openssl-cookbook.pdf`.
+- Set **`USE_LLAMA_PARSE`** to 1 if you want to use `LlamaParse`.
+- Modify the **`LLAMA_CLOUD_API_KEY`** with your own key. Please log in to the [LlamaCloud website](https://cloud.llamaindex.ai/api-key) to view your API Key.
 - For more information about the meanings and usages of constants, you can check under the `server/constant` directory.
 
 
@@ -205,6 +213,8 @@ LLAMA_CLOUD_API_KEY="xxxx"
 - Update the **`MOONSHOT_MODEL_NAME`** setting if you want to use other models of Moonshot.
 - Change **`BOT_TOPIC`** to reflect your Bot's name. This is very important, as it will be used in `Prompt Construction`. Please try to use a concise and clear word, such as `OpenIM`, `LangChain`.
 - Adjust **`URL_PREFIX`** to match your website's domain. This is mainly for generating accessible URL links for uploaded local files. Such as `http://127.0.0.1:7000/web/download_dir/2024_05_20/d3a01d6a-90cd-4c2a-b926-9cda12466caf/openssl-cookbook.pdf`.
+- Set **`USE_LLAMA_PARSE`** to 1 if you want to use `LlamaParse`.
+- Modify the **`LLAMA_CLOUD_API_KEY`** with your own key. Please log in to the [LlamaCloud website](https://cloud.llamaindex.ai/api-key) to view your API Key.
 - For more information about the meanings and usages of constants, you can check under the `server/constant` directory.
 
 
@@ -242,6 +252,8 @@ LLAMA_CLOUD_API_KEY="xxxx"
 - If you have changed the default `IP:PORT` when starting `Ollama`, please update **`OLLAMA_BASE_URL`**. Please pay special attention, only enter the IP (domain) and PORT here, without appending a URI.
 - Change **`BOT_TOPIC`** to reflect your Bot's name. This is very important, as it will be used in `Prompt Construction`. Please try to use a concise and clear word, such as `OpenIM`, `LangChain`.
 - Adjust **`URL_PREFIX`** to match your website's domain. This is mainly for generating accessible URL links for uploaded local files. Such as `http://127.0.0.1:7000/web/download_dir/2024_05_20/d3a01d6a-90cd-4c2a-b926-9cda12466caf/openssl-cookbook.pdf`.
+- Set **`USE_LLAMA_PARSE`** to 1 if you want to use `LlamaParse`.
+- Modify the **`LLAMA_CLOUD_API_KEY`** with your own key. Please log in to the [LlamaCloud website](https://cloud.llamaindex.ai/api-key) to view your API Key.
 - For more information about the meanings and usages of constants, you can check under the `server/constant` directory.
 
 
diff --git a/env_of_openai b/env_of_openai
@@ -9,3 +9,4 @@ USE_RERANKING=1
 USE_DEBUG=0
 USE_LLAMA_PARSE=0
 LLAMA_CLOUD_API_KEY="xxxx"
+USE_GPT4O=0
diff --git a/requirements.txt b/requirements.txt
@@ -29,5 +29,4 @@ onnxruntime==1.16.3
 numpy==1.26.4
 et-xmlfile==1.1.0
 openpyxl==3.1.2
-llama-index==0.10.43
-llama-parse==0.4.4
+llama-parse==0.4.6
diff --git a/server/app/account.py b/server/app/account.py
@@ -1,5 +1,5 @@
 import time
-from flask import Blueprint, Flask, request
+from flask import Blueprint, request
 from werkzeug.security import generate_password_hash, check_password_hash
 from server.app.utils.decorators import token_required
 from server.app.utils.diskcache_lock import diskcache_lock
diff --git a/server/app/auth.py b/server/app/auth.py
@@ -1,6 +1,6 @@
 from flask import Blueprint, request
-from server.logger.logger_config import my_logger as logger
 from server.app.utils.token_helper import TokenHelper
+from server.logger.logger_config import my_logger as logger
 
 auth_bp = Blueprint('auth', __name__, url_prefix='/open_kf_api/auth')
 
diff --git a/server/app/bot_config.py b/server/app/bot_config.py
@@ -1,6 +1,6 @@
 import json
 import time
-from flask import Blueprint, Flask, request
+from flask import Blueprint, request
 from server.app.utils.decorators import token_required
 from server.app.utils.sqlite_client import get_db_connection
 from server.app.utils.diskcache_client import diskcache_client
diff --git a/server/app/common.py b/server/app/common.py
@@ -1,7 +1,7 @@
 from datetime import datetime
 import os
 import uuid
-from flask import Blueprint, Flask, request
+from flask import Blueprint, request
 from werkzeug.utils import secure_filename
 from server.constant.constants import STATIC_DIR, MEDIA_DIR
 from server.app.utils.decorators import token_required
diff --git a/server/app/files.py b/server/app/files.py
@@ -6,11 +6,13 @@
 from threading import Thread
 import time
 from typing import Dict, List, Any
-from urllib.parse import urlparse
 import uuid
-from flask import Blueprint, Flask, request
-from werkzeug.utils import secure_filename
-from server.constant.constants import MAX_LOCAL_FILE_BATCH_LENGTH, MAX_FILE_SIZE, LOCAL_FILE_DOWNLOAD_DIR, STATIC_DIR, FILE_LOADER_EXTENSIONS, MAX_CONCURRENT_WRITES, LOCAL_FILE_PROCESS_FAILED
+from flask import Blueprint, request
+from server.constant.constants import (MAX_LOCAL_FILE_BATCH_LENGTH,
+                                       MAX_FILE_SIZE, LOCAL_FILE_DOWNLOAD_DIR,
+                                       STATIC_DIR, FILE_LOADER_EXTENSIONS,
+                                       MAX_CONCURRENT_WRITES,
+                                       LOCAL_FILE_PROCESS_FAILED)
 from server.app.utils.decorators import token_required
 from server.app.utils.sqlite_client import get_db_connection
 from server.app.utils.diskcache_lock import diskcache_lock
diff --git a/server/app/intervention.py b/server/app/intervention.py
@@ -1,6 +1,6 @@
 import json
 import time
-from flask import Blueprint, Flask, request
+from flask import Blueprint, request
 from server.app.utils.decorators import token_required
 from server.app.utils.sqlite_client import get_db_connection
 from server.app.utils.diskcache_client import diskcache_client
diff --git a/server/app/queries.py b/server/app/queries.py
@@ -6,9 +6,11 @@
 import time
 from typing import List, Dict, Any, Tuple
 from urllib.parse import urlparse
-from flask import Blueprint, Flask, request, Response
+from flask import Blueprint, request, Response
 from langchain.schema.document import Document
-from server.constant.constants import RECALL_TOP_K, RERANK_RECALL_TOP_K, MAX_QUERY_LENGTH, SESSION_EXPIRE_TIME, MAX_HISTORY_SESSION_LENGTH
+from server.constant.constants import (RECALL_TOP_K, RERANK_RECALL_TOP_K,
+                                       MAX_QUERY_LENGTH, SESSION_EXPIRE_TIME,
+                                       MAX_HISTORY_SESSION_LENGTH)
 from server.app.utils.decorators import token_required
 from server.app.utils.sqlite_client import get_db_connection
 from server.app.utils.diskcache_client import diskcache_client
diff --git a/server/app/sitemaps.py b/server/app/sitemaps.py
@@ -3,14 +3,17 @@
 import json
 from threading import Thread
 import time
-from typing import Callable, Dict, Any, List, Union, Set
+from typing import Callable, Dict, Any, List
 from urllib.parse import urlparse
-from flask import Blueprint, Flask, request
+from flask import Blueprint, request
 from server.app.utils.decorators import token_required
 from server.app.utils.sqlite_client import get_db_connection
 from server.app.utils.diskcache_lock import diskcache_lock
 from server.app.utils.url_helper import is_valid_url
-from server.constant.constants import ADD_SITEMAP_CONTENT, DELETE_SITEMAP_CONTENT, UPDATE_SITEMAP_CONTENT, DOMAIN_PROCESSING, FROM_SITEMAP_URL
+from server.constant.constants import (ADD_SITEMAP_CONTENT,
+                                       DELETE_SITEMAP_CONTENT,
+                                       UPDATE_SITEMAP_CONTENT,
+                                       DOMAIN_PROCESSING, FROM_SITEMAP_URL)
 from server.logger.logger_config import my_logger as logger
 from server.rag.index.parser.html_parser.web_link_crawler import AsyncCrawlerSiteLink
 from server.rag.index.parser.html_parser.web_content_crawler import AsyncCrawlerSiteContent
diff --git a/server/app/urls.py b/server/app/urls.py
@@ -4,8 +4,11 @@
 import time
 from typing import Dict, Any
 from urllib.parse import urlparse
-from flask import Blueprint, Flask, request
-from server.constant.constants import MAX_ISOLATED_URL_BATCH_LENGTH, FROM_ISOLATED_URL, ADD_ISOLATED_URL_CONTENT, DELETE_ISOLATED_URL_CONTENT
+from flask import Blueprint, request
+from server.constant.constants import (MAX_ISOLATED_URL_BATCH_LENGTH,
+                                       FROM_ISOLATED_URL,
+                                       ADD_ISOLATED_URL_CONTENT,
+                                       DELETE_ISOLATED_URL_CONTENT)
 from server.app.utils.decorators import token_required
 from server.app.utils.sqlite_client import get_db_connection
 from server.app.utils.diskcache_lock import diskcache_lock
diff --git a/server/app/utils/diskcache_client.py b/server/app/utils/diskcache_client.py
@@ -1,4 +1,3 @@
-from contextlib import contextmanager
 from typing import Any, Optional, List
 from diskcache import Cache
 from server.constant.constants import DISKCACHE_DIR
diff --git a/server/app/utils/diskcache_lock.py b/server/app/utils/diskcache_lock.py
@@ -1,8 +1,9 @@
 from contextlib import contextmanager
-from typing import Generator, Any
+from typing import Generator
 from diskcache import Cache, Lock
 from server.app.utils.diskcache_client import diskcache_client
-from server.constant.constants import DISTRIBUTED_LOCK_ID, DISTRIBUTED_LOCK_EXPIRE_TIME
+from server.constant.constants import (DISTRIBUTED_LOCK_ID,
+                                       DISTRIBUTED_LOCK_EXPIRE_TIME)
 
 
 class DiskcacheLock:
diff --git a/server/rag/index/embedder/document_embedder.py b/server/rag/index/embedder/document_embedder.py
@@ -6,7 +6,10 @@
 from langchain_openai import OpenAIEmbeddings
 from langchain_community.embeddings import OllamaEmbeddings
 from langchain.schema.document import Document
-from server.constant.constants import OPENAI_EMBEDDING_MODEL_NAME, ZHIPUAI_EMBEDDING_MODEL_NAME, OPENAI_EMBEDDING_MODEL_NAME, CHROMA_DB_DIR, CHROMA_COLLECTION_NAME, OLLAMA_EMBEDDING_MODEL_NAME
+from server.constant.constants import (OPENAI_EMBEDDING_MODEL_NAME,
+                                       ZHIPUAI_EMBEDDING_MODEL_NAME,
+                                       CHROMA_DB_DIR, CHROMA_COLLECTION_NAME,
+                                       OLLAMA_EMBEDDING_MODEL_NAME)
 from server.logger.logger_config import my_logger as logger
 from server.rag.index.embedder.zhipuai_embedder import ZhipuAIEmbeddings
 
diff --git a/server/rag/index/parser/file_parser/llamaparse/__init__.py b/server/rag/index/parser/file_parser/llamaparse/__init__.py
diff --git a/server/rag/index/parser/file_parser/llamaparse/file_handler.py b/server/rag/index/parser/file_parser/llamaparse/file_handler.py
@@ -0,0 +1,27 @@
+import shutil
+from abc import ABC, abstractmethod
+
+
+class FileHandler(ABC):
+    """Abstract interface for moving files and folders between locations.
+
+    Concrete subclasses decide what a "location" is; the only
+    implementation in this module copies within the local filesystem.
+    """
+
+    @abstractmethod
+    def download_file(self, file_path: str, destination_path: str) -> None:
+        """Fetch `file_path` and store it at `destination_path`."""
+        pass
+
+    @abstractmethod
+    def upload_file(self, file_path: str, destination_path: str) -> None:
+        """Send `file_path` to `destination_path`."""
+        pass
+
+    @abstractmethod
+    def sync_foler(self, source: str, destination: str) -> None:
+        """Copy the contents of folder `source` into folder `destination`.
+
+        NOTE(review): the name looks like a typo for "sync_folder"; it is
+        kept because callers invoke the method under this spelling.
+        """
+        pass
+
+
+class LocalHandler(FileHandler):
+    """`FileHandler` that copies files and folders on the local filesystem."""
+
+    def download_file(self, file_path: str, destination_path: str) -> None:
+        """Copy `file_path` to `destination_path` with `shutil.copy`."""
+        shutil.copy(file_path, destination_path)
+
+    def upload_file(self, file_path: str, destination_path: str) -> None:
+        """Copy `file_path` to `destination_path` with `shutil.copy`."""
+        shutil.copy(file_path, destination_path)
+
+    def sync_foler(self, source: str, destination: str) -> None:
+        """Recursively copy `source` into `destination`.
+
+        `dirs_exist_ok=True` lets the copy proceed when `destination`
+        already exists instead of raising.
+        """
+        shutil.copytree(source, destination, dirs_exist_ok=True)
diff --git a/server/rag/index/parser/file_parser/llamaparse/llamaparse_parser.py b/server/rag/index/parser/file_parser/llamaparse/llamaparse_parser.py
diff --git a/server/rag/index/parser/file_parser/markdown_parser.py b/server/rag/index/parser/file_parser/markdown_parser.py
diff --git a/server/rag/index/parser/html_parser/web_content_crawler.py b/server/rag/index/parser/html_parser/web_content_crawler.py
diff --git a/server/rag/index/parser/html_parser/web_link_crawler.py b/server/rag/index/parser/html_parser/web_link_crawler.py

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,3 @@`
`1`		`-from contextlib import contextmanager`
`2`	`1`	`from typing import Any, Optional, List`
`3`	`2`	`from diskcache import Cache`
`4`	`3`	`from server.constant.constants import DISKCACHE_DIR`
-Original file line number
+Diff line change
 +import json
 +import os
 +import requests
 +import tempfile
 +import time
 +from pathlib import Path
 +from typing import Any, List, Optional
 +from llama_parse import LlamaParse
 +from server.rag.index.parser.file_parser.llamaparse.file_handler import FileHandler
 +from server.logger.logger_config import my_logger as logger
++
 +# Name of the JSON file that receives the parsed page elements.
 +all_elements_output_file = "all_elements.json"
 +# Name of the JSON file that receives the generated chunks.
 +chunks_output_file = "chunks.json"
++
++
 +class DocParser:
 +    def __init__(self,
 +                 file_handler: FileHandler,
 +                 language: str = "en",
 +                 is_download_image: bool = True) -> None:
 +        self.file_handler = file_handler
 +        self.is_download_image = is_download_image
 +        USE_GPT4O = int(os.getenv('USE_GPT4O'))
 +        if USE_GPT4O:
 +            self.llamaparse = LlamaParse(
 +                api_key=os.getenv('LLAMA_CLOUD_API_KEY'),
 +                gpt4o_mode=True,
 +                gpt4o_api_key=os.getenv('OPENAI_API_KEY'),
 +                result_type="json",
 +                language=language,
 +                verbose=True)
 +        else:
 +            self.llamaparse = LlamaParse(
 +                api_key=os.getenv('LLAMA_CLOUD_API_KEY'),
 +                result_type="json",
 +                language=language,
 +                verbose=True)
 +        logger.info(
 +            f"Init DocParser of llamaparse, language: '{language}', is_download_image: {is_download_image}, USE_GPT4O: {USE_GPT4O}"
 +        )
++
 +    def parse_file(
 +            self,
 +            filepath: Path,
 +            destination_folder: Path,
 +            include_chunking: bool = True) -> tuple[list[Any], list[Any]]:
 +        with tempfile.TemporaryDirectory() as temp_dir:
 +            temp_file = Path(temp_dir) / filepath.name
 +            self.file_handler.download_file(filepath.as_posix(),
 +                                            temp_file.as_posix())
++
 +            elements_file = f"{temp_dir}/{all_elements_output_file}"
++
 +            elements, chunks = self.partition_doc_to_folder(
 +                temp_file,
 +                Path(temp_dir),
 +                include_chunking=include_chunking,
 +                all_elements_output_file=elements_file)
++
 +            self.file_handler.sync_foler(temp_dir,
 +                                         destination_folder.as_posix())
++
 +            return elements, chunks
++
 +    def partition_doc(
 +        self,
 +        input_file: Path,
 +        output_dir: Path,
 +        include_chunking: bool = True,
 +    ) -> tuple[list[Any], list[Any]]:
 +        elements = []
 +        chunks = []
 +        try:
 +            import nest_asyncio
 +            nest_asyncio.apply()
++
 +            json_objs = self.llamaparse.get_json_result(str(input_file))
 +            job_id = json_objs[0]["job_id"]
 +            elements = json_objs[0]["pages"]
 +            job_metadata = json_objs[0]["job_metadata"]
 +            logger.info(
 +                f"For inpput_file: '{input_file}', job_id is'{job_id}', job_metatdata is {job_metadata}"
 +            )
++
 +            if self.is_download_image:
 +                """
 +                TODO:
 +                To enhance the efficiency of image downloading, the following optimizations could be considered:
 +                1. Handle image downloads through asynchronous tasks to improve response times.
 +                2. Implement concurrent downloads to make effective use of resources and accelerate the download process.
 +                """
 +                for page_item in elements:
 +                    images = page_item["images"]
 +                    for image_item in images:
 +                        image_name = image_item["name"]
 +                        logger.info(
 +                            f"For inpput_file: '{input_file}', downloading image: '{image_name}'"
 +                        )
 +                        download_image(job_id, image_name,
 +                                       output_dir.as_posix())
++
 +            if include_chunking:
 +                """
 +                TODO:
 +                The current chunking strategy treats each page as a separate chunk. Future optimizations might include:
 +                1. Evaluating whether adjacent pages can be merged into a single chunk.
 +                2. Considering whether it's necessary to split a single page into multiple chunks.
 +                """
 +                filename = input_file.name
 +                file_extension = input_file.suffix
 +                for page_item in elements:
 +                    page_number = page_item["page"]
 +                    chunk_item = {
 +                        "chunk_text": page_item["md"],
 +                        "metadata": {
 +                            "filename": filename,
 +                            "filetype": f"application/{file_extension[1:]}",
 +                            "last_modified_timestamp": int(time.time()),
 +                            "beginning_page": page_number,
 +                            "ending_page": page_number
 +                        }
 +                    }
 +                    chunks.append(chunk_item)
 +        except Exception as e:
 +            logger.error(
 +                f"Parsing file: '{input_file}' is failed, exception: {e}")
++
 +        return elements, chunks
++
 +    def partition_doc_to_folder(
 +        self,
 +        input_file: Path,
 +        output_dir: Path,
 +        all_elements_output_file: str,
 +        include_chunking: bool = True,
 +    ) -> tuple[list[Any], list[Any]]:
 +        elements, chunks = self.partition_doc(input_file, output_dir,
 +                                              include_chunking)
++
 +        elements_output_file = output_dir / all_elements_output_file
 +        elements_to_json(elements, elements_output_file.as_posix())
 +        elements_to_json(chunks, (output_dir / chunks_output_file).as_posix())
++
 +        return elements, chunks
++
++
 +def elements_to_json(
 +    elements: List[Any],
 +    filename: Optional[str] = None,
 +    indent: int = 4,
 +    encoding: str = "utf-8",
 +) -> Optional[str]:
 +    """
 +    Saves a list of elements to a JSON file if filename is specified.
 +    Otherwise, return the list of elements as a string.
 +
 +    Args:
 +        elements: JSON-serializable list to dump.
 +        filename: Path of the file to write; when None nothing is written.
 +        indent: Indentation width passed to `json.dumps`.
 +        encoding: Text encoding used when writing the file.
 +
 +    Returns:
 +        None when the JSON was written to `filename`, otherwise the JSON
 +        string.
 +    """
 +    # -- serialize `elements` as a JSON array (str) --
 +    json_str = json.dumps(elements, indent=indent, sort_keys=False)
 +    if filename is not None:
 +        with open(filename, "w", encoding=encoding) as f:
 +            f.write(json_str)
 +        return None
 +    return json_str
++
++
 +def download_image(job_id: str, image_name: str, output_dir: str) -> None:
 +    url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/image/{image_name}"
 +    headers = {
 +        'Authorization': f'Bearer {os.getenv("LLAMA_CLOUD_API_KEY")}',
 +        'Accept': 'application/json',
 +        'Content-Type': 'multipart/form-data'
 +    }
 +    try:
 +        response = requests.get(url, headers=headers)
 +        if response.status_code == 200:
 +            with open(f'{output_dir}/{image_name}', 'wb') as f:
 +                f.write(response.content)
 +        else:
 +            logger.error(
 +                f"Failed to retrieve '{image_name}', status_code: {response.status_code}, text: {response.text}"
 +            )
 +    except Exception as e:
 +        logger.error(f"Download '{image_name}' failed, error: {e}")
-Original file line number
+Diff line change
 from typing import List
 from server.app.utils.diskcache_lock import diskcache_lock
 from server.logger.logger_config import my_logger as logger
 -from server.constant.constants import SQLITE_DB_DIR, SQLITE_DB_NAME, MAX_CHUNK_LENGTH, CHUNK_OVERLAP, FROM_LOCAL_FILE, LOCAL_FILE_PARSING, LOCAL_FILE_PARSING_COMPLETED, LOCAL_FILE_EMBEDDED, LOCAL_FILE_PROCESS_FAILED
 +from server.constant.constants import (SQLITE_DB_DIR, SQLITE_DB_NAME,
 +                                       MAX_CHUNK_LENGTH, CHUNK_OVERLAP,
 +                                       FROM_LOCAL_FILE, LOCAL_FILE_PARSING,
 +                                       LOCAL_FILE_PARSING_COMPLETED,
 +                                       LOCAL_FILE_EMBEDDED,
 +                                       LOCAL_FILE_PROCESS_FAILED)
 from server.rag.index.chunk.markdown_splitter import MarkdownTextSplitter
 from server.rag.index.embedder.document_embedder import document_embedder
-Original file line number
+Diff line change
 from bs4 import BeautifulSoup
 from server.app.utils.url_helper import is_same_domain, normalize_url
 from server.app.utils.diskcache_lock import diskcache_lock
 -from server.constant.constants import SQLITE_DB_DIR, SQLITE_DB_NAME, MAX_CRAWL_PARALLEL_REQUEST, SITEMAP_URL_RECORDED, SITEMAP_URL_EXPIRED, DOMAIN_STATISTICS_GATHERING_COLLECTED
 +from server.constant.constants import (SQLITE_DB_DIR, SQLITE_DB_NAME,
 +                                       MAX_CRAWL_PARALLEL_REQUEST,
 +                                       SITEMAP_URL_RECORDED,
 +                                       SITEMAP_URL_EXPIRED,
 +                                       DOMAIN_STATISTICS_GATHERING_COLLECTED)
 from server.logger.logger_config import my_logger as logger