-
Notifications
You must be signed in to change notification settings - Fork 4
/
process_document.py
37 lines (29 loc) · 1.12 KB
/
process_document.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import os
import tempfile
import shutil
import logging
import chromadb
import chromadb.errors as chromadb_errors
from transformers import TRANSFORMERS_CACHE
from embed_and_retrieve import get_logger
# Module-wide logger, obtained from the project's embed_and_retrieve helper.
logger = get_logger()
# Directory that holds uploaded files. It is a fixed path relative to the
# current working directory (not a system temp dir) and is created at import
# time if it does not already exist.
UPLOAD_DIR = "data"
os.makedirs(UPLOAD_DIR, exist_ok=True)
def upload_file(uploaded_file):
    """Persist an uploaded file under ``UPLOAD_DIR`` and return its path.

    The file is written with a unique, randomly generated name that keeps
    the original extension (if any). It is created with ``delete=False``,
    so it stays on disk until ``cleanup()`` removes the directory.

    Args:
        uploaded_file: Object exposing a ``name`` attribute (the original
            file name) and a ``getvalue()`` method returning the content
            as bytes. NOTE(review): presumably a Streamlit ``UploadedFile``
            -- confirm against the caller.

    Returns:
        The path of the newly written file inside ``UPLOAD_DIR``.
    """
    # cleanup() deletes UPLOAD_DIR, and the module only creates it at import
    # time, so recreate it on demand to keep uploads working after a cleanup.
    os.makedirs(UPLOAD_DIR, exist_ok=True)

    # splitext() yields "" for names without an extension (instead of turning
    # the whole name into a bogus suffix); basename() drops any directory part
    # so the suffix can never contain a path separator.
    extension = os.path.splitext(os.path.basename(uploaded_file.name))[1]

    with tempfile.NamedTemporaryFile(
        delete=False, suffix=extension, dir=UPLOAD_DIR
    ) as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
    # Returned after the ``with`` so the data is flushed and the handle closed.
    return tmp_file.name
def cleanup():
    """Best-effort removal of the artifacts this module leaves behind.

    Removes, in order:
      1. the upload directory ``UPLOAD_DIR`` and everything in it,
      2. the Hugging Face Transformers cache (any downloaded models),
      3. the ChromaDB collection named ``document_collection``.

    Filesystem errors are ignored, and a failed collection delete is only
    logged as a warning rather than raised.
    """
    # ignore_errors=True already tolerates a missing path, so no
    # os.path.exists() pre-check is needed for either directory.
    for stale_path in (UPLOAD_DIR, TRANSFORMERS_CACHE):
        shutil.rmtree(stale_path, ignore_errors=True)

    # Uses the module-level ``chromadb`` import; no local re-import required.
    chroma_client = chromadb.Client()
    try:
        chroma_client.delete_collection("document_collection")
    except (ValueError, chromadb_errors.ChromaError) as e:
        # The collection may simply not exist; treat that as already clean.
        logger.warning("Failed to delete collection: %s. Ignoring exception.", e)