Skip to content

Commit 95247b9

Browse files
committed
made filetypes a variable
1 parent 36f7f90 commit 95247b9

File tree

1 file changed

+6
-3
lines changed

1 file changed

+6
-3
lines changed

ai_ta_backend/web_scrape.py

+6-3
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,9 @@ def __init__(self) -> None:
2727
aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
2828
)
2929

30+
# TODO: Get the list of acceptable filetypes from vector_database.py instead of hard-coding it here
31+
self.acceptable_filetypes = ['.html', '.py', '.vtt', '.pdf', '.txt', '.srt', '.docx', '.ppt', '.pptx']
32+
3033
# Create a Supabase client
3134
self.supabase_client = supabase.create_client( # type: ignore
3235
supabase_url=os.environ['SUPABASE_URL'],
@@ -47,7 +50,7 @@ def __init__(self) -> None:
4750
def get_file_extension(self, filename):
4851
match = re.search(r'\.([a-zA-Z0-9]+)$', filename)
4952
valid_filetypes = list(mimetypes.types_map.keys())
50-
valid_filetypes = valid_filetypes + ['.html', '.py', '.vtt', '.pdf', '.txt', '.srt', '.docx', '.ppt', '.pptx']
53+
valid_filetypes = valid_filetypes + self.acceptable_filetypes
5154
if match:
5255
filetype = "." + match.group(1)
5356
if filetype in valid_filetypes:
@@ -79,15 +82,15 @@ def valid_url(self, url):
7982
if "<!doctype html" not in str(response.text).lower():
8083
print("⛔️⛔️ Filetype not supported:", response.url, "⛔️⛔️")
8184
return (False, False, False)
82-
elif filetype in ['.py', '.vtt', '.pdf', '.txt', '.srt', '.docx', '.ppt', '.pptx']:
85+
elif filetype in self.acceptable_filetypes[1:]:
8386
if "<!doctype html" in str(response.text).lower():
8487
content = BeautifulSoup(response.text, "html.parser")
8588
filetype = '.html'
8689
else:
8790
content = response.content
8891
else:
8992
return (False, False, False)
90-
if filetype not in ['.html', '.py', '.vtt', '.pdf', '.txt', '.srt', '.docx', '.ppt', '.pptx']:
93+
if filetype not in self.acceptable_filetypes:
9194
print("⛔️⛔️ Filetype not supported:", filetype, "⛔️⛔️")
9295
return (False, False, False)
9396
return (response.url, content, filetype)

0 commit comments

Comments
 (0)