Skip to content

Commit

Permalink
Added check for text encoding in RAG
Browse files (browse the repository at this point in the history)
  • Loading branch information
sanjay920 committed Feb 13, 2024
1 parent 8362cb9 commit 78d6e49
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 4 deletions.
14 changes: 10 additions & 4 deletions services/backend/task_executor/app/tasks.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -460,6 +460,7 @@ def execute_asst_file_create(file_id: str, assistant_id: str):

# Third Party
from langchain.text_splitter import RecursiveCharacterTextSplitter
from chardet import detect

# Local
from app.vector_db.milvus.main import add_texts
Expand Down Expand Up @@ -489,10 +490,15 @@ def execute_asst_file_create(file_id: str, assistant_id: str):
parsed_text = res
else: ## try to read plain text
try:
parsed_text = file_object["content"].decode()

except Exception as e:
print(f"Load Error: {e}")
parsed_text = file_object["content"].decode('utf-8')
except UnicodeDecodeError:
try:
# Attempt to detect encoding and decode
encoding = detect(file_object["content"])['encoding']
parsed_text = file_object["content"].decode(encoding)
except Exception as e:
logging.error(f"Decoding error with detected encoding: {e}")
parsed_text = ""

if parsed_text != "":
# Split docs and add to milvus vector DB
Expand Down
1 change: 1 addition & 0 deletions services/backend/task_executor/requirements.txt
Original file line number | Diff line number | Diff line change
Expand Up @@ -18,3 +18,4 @@ spacy==3.7.2
markdownify==0.11.6
playwright==1.39.0
tiktoken==0.5.2
chardet==5.2.0

0 comments on commit 78d6e49

Please sign in to comment.