-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocument_processing.py
More file actions
132 lines (97 loc) · 3.35 KB
/
document_processing.py
File metadata and controls
132 lines (97 loc) · 3.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""Document processing module for PDF and Word document handling."""
import logging
from typing import List
import docx
import fitz
from langchain_text_splitters import RecursiveCharacterTextSplitter
from constants import CHUNK_OVERLAP, CHUNK_SIZE, MAX_FILE_SIZE_BYTES, MAX_FILE_SIZE_MB
from vector_db import add_to_vector_db
logger = logging.getLogger(__name__)
# Module-wide splitter shared by chunk_text(). Chunk size and overlap come
# from the project constants and are measured with len(). The separator
# candidates are: blank line, newline, single space, and the empty string.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=len,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)
def check_file_size(file) -> None:
    """Reject a file whose size is above the configured upload limit.

    The size is measured by seeking to the end of the stream and reading the
    position. The stream is then rewound to the start, so a caller can read
    the whole file afterwards.

    Args:
        file: Seekable file-like object. Its ``name`` attribute is used in
            the error message.

    Raises:
        ValueError: If the size in bytes is greater than MAX_FILE_SIZE_BYTES.
    """
    file.seek(0, 2)  # whence=2: offset is relative to the end of the stream
    size_in_bytes = file.tell()
    file.seek(0)

    if size_in_bytes <= MAX_FILE_SIZE_BYTES:
        return

    message = f"File '{file.name}' exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB"
    raise ValueError(message)
def process_pdf(file) -> str:
    """Extract the text of every page of a PDF.

    Args:
        file: Seekable binary file-like object holding the PDF. It is passed
            through check_file_size() first.

    Returns:
        The text of all pages concatenated in page order.

    Raises:
        ValueError: Propagated from check_file_size() when the file is too
            large.
    """
    # check_file_size() rewinds the stream to offset 0, so read() below
    # returns the complete file content.
    check_file_size(file)

    # Open as a context manager so the document is closed even when text
    # extraction raises part-way through.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        # Capture the page count while the document is still open.
        page_count = len(doc)
        text = "".join(page.get_text() for page in doc)

    logger.info("Processed PDF with %d pages", page_count)
    return text
def process_docx(file) -> str:
    """Pull the paragraph text out of a Word document.

    Args:
        file: File-like object holding the Word document. It is passed
            through check_file_size() first.

    Returns:
        The text of each paragraph, each followed by a newline, in document
        order.
    """
    check_file_size(file)

    document = docx.Document(file)
    paragraphs = document.paragraphs
    text = "".join(f"{paragraph.text}\n" for paragraph in paragraphs)

    logger.info("Processed Word document with %d paragraphs", len(paragraphs))
    return text
def chunk_text(text: str) -> List[str]:
    """Break text into chunks using the module-level text splitter.

    Args:
        text: The text to split.

    Returns:
        The chunks produced by the splitter, or an empty list when the text
        is empty or contains only whitespace.
    """
    has_content = bool(text and text.strip())
    if not has_content:
        return []

    pieces = text_splitter.split_text(text)
    logger.info("Split text into %d chunks", len(pieces))
    return pieces
def process_and_store_documents(uploaded_files: List) -> None:
    """Extract, chunk, and index each uploaded document.

    Files are dispatched on their ``type`` attribute: PDF files go through
    process_pdf() and Word (.docx) files through process_docx(). A file of
    any other type is skipped with a warning; it does not raise. For each
    supported file the extracted text is chunked, every chunk is added to
    the vector database with ``auto_save=False``, and the index is saved
    once after that file's chunks have been added.

    Args:
        uploaded_files: Uploaded file objects exposing ``type`` and ``name``.
            ``None`` or an empty list is a no-op.

    Raises:
        ValueError: Logged and re-raised, e.g. when a file exceeds the size
            limit enforced by check_file_size().
        Exception: Any other processing error is logged with its traceback
            and re-raised. Processing stops at the first failing file.
    """
    if not uploaded_files:
        return

    # Kept function-local as in the original, but resolved once per call
    # instead of once per file.
    from vector_db import save_index

    # MIME type -> text extractor for the supported formats.
    extractors = {
        "application/pdf": process_pdf,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": process_docx,
    }

    for file in uploaded_files:
        try:
            extract = extractors.get(file.type)
            if extract is None:
                logger.warning("Unsupported file type: %s", file.type)
                continue

            for chunk in chunk_text(extract(file)):
                add_to_vector_db(chunk, auto_save=False)

            # Persist once per file rather than once per chunk.
            save_index()
            logger.info("Successfully processed and stored file: %s", file.name)
        except ValueError as e:
            logger.error("Validation error processing file %s: %s", file.name, e)
            raise
        except Exception as e:
            # logger.exception records the traceback for unexpected failures.
            logger.exception("Error processing file %s: %s", file.name, e)
            raise