Update PDF parsing to use UTF-8 characters instead of ASCII

UIUC-Chatbot · Oct 10, 2023 · 97a300d · 97a300d
1 parent 6fbd1bb
commit 97a300d
Showing 1 changed file with 1 addition and 1 deletion.
diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py
@@ -496,7 +496,7 @@ def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
                 self.s3_client.upload_fileobj(f, os.getenv('S3_BUCKET_NAME'), s3_upload_path)
 
           # Extract text
-          text = page.get_text().encode("utf8").decode('ascii', errors='ignore')  # get plain text (is in UTF-8)
+          text = page.get_text().encode("utf8").decode("utf8", errors='ignore')  # get plain text (is in UTF-8)
           pdf_pages_OCRed.append(dict(text=text, page_number=i, readable_filename=Path(s3_path).name))
 
         if kwargs['kwargs'] == {}: