@@ -27,6 +27,9 @@ def __init__(self) -> None:
27
27
aws_secret_access_key = os .getenv ('AWS_SECRET_ACCESS_KEY' ),
28
28
)
29
29
30
+ # TODO: Get the acceptable_filetypes list from vector_database.py
31
+ self .acceptable_filetypes = ['.html' , '.py' , '.vtt' , '.pdf' , '.txt' , '.srt' , '.docx' , '.ppt' , '.pptx' ]
32
+
30
33
# Create a Supabase client
31
34
self .supabase_client = supabase .create_client ( # type: ignore
32
35
supabase_url = os .environ ['SUPABASE_URL' ],
@@ -47,7 +50,7 @@ def __init__(self) -> None:
47
50
def get_file_extension (self , filename ):
48
51
match = re .search (r'\.([a-zA-Z0-9]+)$' , filename )
49
52
valid_filetypes = list (mimetypes .types_map .keys ())
50
- valid_filetypes = valid_filetypes + [ '.html' , '.py' , '.vtt' , '.pdf' , '.txt' , '.srt' , '.docx' , '.ppt' , '.pptx' ]
53
+ valid_filetypes = valid_filetypes + self . acceptable_filetypes
51
54
if match :
52
55
filetype = "." + match .group (1 )
53
56
if filetype in valid_filetypes :
@@ -79,15 +82,15 @@ def valid_url(self, url):
79
82
if "<!doctype html" not in str (response .text ).lower ():
80
83
print ("⛔️⛔️ Filetype not supported:" , response .url , "⛔️⛔️" )
81
84
return (False , False , False )
82
- elif filetype in [ '.py' , '.vtt' , '.pdf' , '.txt' , '.srt' , '.docx' , '.ppt' , '.pptx' ]:
85
+ elif filetype in self . acceptable_filetypes [ 1 : ]:
83
86
if "<!doctype html" in str (response .text ).lower ():
84
87
content = BeautifulSoup (response .text , "html.parser" )
85
88
filetype = '.html'
86
89
else :
87
90
content = response .content
88
91
else :
89
92
return (False , False , False )
90
- if filetype not in [ '.html' , '.py' , '.vtt' , '.pdf' , '.txt' , '.srt' , '.docx' , '.ppt' , '.pptx' ] :
93
+ if filetype not in self . acceptable_filetypes :
91
94
print ("⛔️⛔️ Filetype not supported:" , filetype , "⛔️⛔️" )
92
95
return (False , False , False )
93
96
return (response .url , content , filetype )
0 commit comments