From 78d6e4929670edd844e7d1ec35899307d565b546 Mon Sep 17 00:00:00 2001 From: Sanjay Nadhavajhala Date: Mon, 12 Feb 2024 18:15:00 -0800 Subject: [PATCH] Added check for text encoding in RAG --- services/backend/task_executor/app/tasks.py | 14 ++++++++++---- services/backend/task_executor/requirements.txt | 1 + 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/services/backend/task_executor/app/tasks.py b/services/backend/task_executor/app/tasks.py index a807306..5f56e5d 100644 --- a/services/backend/task_executor/app/tasks.py +++ b/services/backend/task_executor/app/tasks.py @@ -460,6 +460,7 @@ def execute_asst_file_create(file_id: str, assistant_id: str): # Third Party from langchain.text_splitter import RecursiveCharacterTextSplitter + from chardet import detect # Local from app.vector_db.milvus.main import add_texts @@ -489,10 +490,15 @@ def execute_asst_file_create(file_id: str, assistant_id: str): parsed_text = res else: ## try to read plain text try: - parsed_text = file_object["content"].decode() - - except Exception as e: - print(f"Load Error: {e}") + parsed_text = file_object["content"].decode('utf-8') + except UnicodeDecodeError: + try: + # Attempt to detect encoding and decode + encoding = detect(file_object["content"])['encoding'] + parsed_text = file_object["content"].decode(encoding) + except Exception as e: + logging.error(f"Decoding error with detected encoding: {e}") + parsed_text = "" if parsed_text != "": # Split docs and add to milvus vector DB diff --git a/services/backend/task_executor/requirements.txt b/services/backend/task_executor/requirements.txt index 24d4970..00a3ea2 100644 --- a/services/backend/task_executor/requirements.txt +++ b/services/backend/task_executor/requirements.txt @@ -18,3 +18,4 @@ spacy==3.7.2 markdownify==0.11.6 playwright==1.39.0 tiktoken==0.5.2 +chardet==5.2.0 \ No newline at end of file