DeckSmith/document_processing.py at main · jack-jackhui/DeckSmith · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""Document processing module for PDF and Word document handling."""

import logging
from typing import List

import docx
import fitz
from langchain_text_splitters import RecursiveCharacterTextSplitter

from constants import CHUNK_OVERLAP, CHUNK_SIZE, MAX_FILE_SIZE_BYTES, MAX_FILE_SIZE_MB
from vector_db import add_to_vector_db

# Module-wide logger, named after this module.
logger = logging.getLogger(__name__)

# Shared splitter instance used by chunk_text(). Chunk size and overlap are
# measured with len(), i.e. in characters. Separators are listed from the
# coarsest (blank line) down to the finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=len,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)


def check_file_size(file) -> None:
    """Validate that an uploaded file does not exceed the size limit.

    The size is measured by seeking to the end of the stream; afterwards the
    stream is rewound to offset 0 so it can be read from the start.

    Args:
        file: The uploaded file object. It must support ``seek``/``tell`` and
            expose a ``name`` attribute (used in the error message).

    Raises:
        ValueError: If file exceeds the maximum allowed size.
    """
    # Whence=2 means "relative to end of stream"; the offset reached there
    # is the total size in bytes.
    file.seek(0, 2)
    size_in_bytes = file.tell()

    # Always rewind, whether or not the size check passes.
    file.seek(0)

    if size_in_bytes <= MAX_FILE_SIZE_BYTES:
        return

    raise ValueError(
        f"File '{file.name}' exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB"
    )


def process_pdf(file) -> str:
    """Extract text content from a PDF file.

    Args:
        file: The PDF file object (a seekable binary stream with a ``name``
            attribute, as required by ``check_file_size``).

    Returns:
        The extracted text of all pages, concatenated in page order.

    Raises:
        ValueError: If the file exceeds the maximum allowed size.
    """
    check_file_size(file)

    # Open the document as a context manager so it is closed (and its
    # in-memory buffer released) even if text extraction raises.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        # Join once instead of growing a string with += per page.
        text = "".join(page.get_text() for page in doc)
        # Capture the count while the document is still open.
        page_count = len(doc)

    logger.info("Processed PDF with %d pages", page_count)
    return text


def process_docx(file) -> str:
    """Extract text content from a Word document.

    Only body paragraphs are read; each paragraph's text is followed by a
    newline in the result.

    Args:
        file: The Word document file object (a seekable binary stream with a
            ``name`` attribute, as required by ``check_file_size``).

    Returns:
        The extracted text content.

    Raises:
        ValueError: If the file exceeds the maximum allowed size.
    """
    check_file_size(file)

    doc = docx.Document(file)
    # Read the paragraph list once and reuse it for both the text and the
    # count, instead of evaluating ``doc.paragraphs`` twice.
    paragraphs = doc.paragraphs
    # Join once instead of growing a string with += per paragraph.
    text = "".join(f"{para.text}\n" for para in paragraphs)

    logger.info("Processed Word document with %d paragraphs", len(paragraphs))
    return text


def chunk_text(text: str) -> List[str]:
    """Break a text into chunks using the module-level ``text_splitter``.

    Args:
        text: The text to split. Empty, falsy, or whitespace-only input
            produces no chunks.

    Returns:
        A list of text chunks; an empty list when there is nothing to split.
    """
    if text and text.strip():
        pieces = text_splitter.split_text(text)
        logger.info("Split text into %d chunks", len(pieces))
        return pieces

    # Nothing meaningful to split.
    return []


def process_and_store_documents(uploaded_files: List) -> None:
    """Process uploaded documents and store them in the vector database.

    Each supported file is converted to text, split into chunks, and every
    chunk is added to the vector database with ``auto_save=False``; the index
    is then saved once per file. Files whose MIME type is neither PDF nor
    DOCX are skipped with a warning rather than raising.

    Args:
        uploaded_files: List of uploaded file objects exposing ``type`` (MIME
            type) and ``name`` attributes. ``None`` or an empty list is a
            no-op.

    Raises:
        ValueError: Re-raised after logging, e.g. when a file exceeds the
            size limit.
        Exception: Any other error raised while handling a file is logged
            with its traceback and re-raised; later files are not processed.
    """
    # Nothing uploaded (including None): nothing to do.
    if not uploaded_files:
        return

    # Imported once per call rather than on every loop iteration.
    from vector_db import save_index

    for file in uploaded_files:
        try:
            if file.type == "application/pdf":
                text = process_pdf(file)
            elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                text = process_docx(file)
            else:
                logger.warning("Unsupported file type: %s", file.type)
                continue

            # Add all chunks without saving, then persist once per file.
            for chunk in chunk_text(text):
                add_to_vector_db(chunk, auto_save=False)
            save_index()

            logger.info("Successfully processed and stored file: %s", file.name)

        except ValueError as e:
            # Validation failures are expected; the message is enough.
            logger.error("Validation error processing file %s: %s", file.name, e)
            raise
        except Exception as e:
            # Unexpected failure: keep the traceback in the log.
            logger.exception("Error processing file %s: %s", file.name, e)
            raise