Commit
fixed errors with false values showing up in lists and added a hard stop function based on number of URLs looked at, not just scraped
jkmin3 committed Sep 25, 2023
1 parent 9b56819 commit f5ec2c6
Showing 1 changed file with 65 additions and 65 deletions.
130 changes: 65 additions & 65 deletions ai_ta_backend/web_scrape.py
@@ -86,6 +86,7 @@ def valid_url(self, url):
return (False, False, False)
if filetype not in ['.html', '.py', '.vtt', '.pdf', '.txt', '.srt', '.docx', '.ppt', '.pptx']:
print("⛔️⛔️ Filetype not supported:", filetype, "⛔️⛔️")
return (False, False, False)
return (response.url, content, filetype)
else:
print("🚫🚫 URL is invalid:", response.url, "Return code:", response.status_code, "🚫🚫")
@@ -134,6 +135,7 @@ def find_urls(self, soup:BeautifulSoup, site:str, urls:set):
href = site+href
else:
href = site+'/'+href
print("HREFS:", href)
urls.add(href)

except Exception as e:
@@ -226,19 +228,25 @@ def check_file_not_exists(self, file):
return False
else:
return True

def count_hard_stop(self, average:int=4):
# Counts the number of repeated urls and if it is too high, it will exit the web scraper
if self.existing_urls == None:
self.existing_urls = []
if self.invalid_urls == None:
self.invalid_urls = []
self.existing_urls.extend(self.invalid_urls)
counted_urls = Counter(self.existing_urls)
if len(counted_urls) != 0:
print(counted_urls[False], "False stuff")
print("📈📈 Counted URLs", sum(counted_urls.values())/len(counted_urls), "📈📈")
if sum(counted_urls.values())/len(counted_urls) > average:

# # Counts the average number of repeated urls and if it is too high, it will exit the web scraper
# def count_hard_stop_avg(self, average:int=4):
# # Counts the number of repeated urls and if it is too high, it will exit the web scraper
# all_urls = self.existing_urls + self.invalid_urls
# counted_urls = Counter(all_urls)
# if len(counted_urls) != 0:
# print(counted_urls[False], "False stuff")
# print("📈📈 Counted URLs", sum(counted_urls.values())/len(counted_urls), "📈📈")
# if sum(counted_urls.values())/len(counted_urls) > average:
# print("Too many repeated urls, exiting web scraper")
# return True
# else:
# return False

def count_hard_stop_len(self):
all_urls = self.existing_urls + self.invalid_urls
if all_urls != []:
if len(all_urls) > self.max_urls:
print("Too many repeated urls, exiting web scraper")
return True
else:
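The averaging check count_hard_stop is retired here (kept only as the commented-out count_hard_stop_avg) in favour of count_hard_stop_len, which stops the scraper once the combined number of URLs already looked at, valid and invalid, exceeds max_urls. The hunk is cut off at this boundary, so the closing return values are an assumption; a minimal sketch, restated as a free function over the same lists:

# Sketch of the length-based hard stop; the final 'return False' branches are
# assumed, since the diff hunk ends before them.
def count_hard_stop_len(existing_urls, invalid_urls, max_urls):
    all_urls = existing_urls + invalid_urls
    if all_urls and len(all_urls) > max_urls:
        print("Too many repeated urls, exiting web scraper")
        return True
    return False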
Expand All @@ -253,65 +261,61 @@ def remove_falses(self):

def check_and_ingest(self, url:str, course_name:str, timeout:int, base_url_on:str):
if url not in self.invalid_urls and url not in self.existing_urls:
url, content, filetype = self.valid_url(url)
second_url, content, filetype = self.valid_url(url)
else:
raise ValueError("This URL is invalid or already existing in the database")
print("This URL is invalid or already existing in the database")
self.existing_urls.append((url))
return '', '', ''

if url:
if second_url:
time.sleep(timeout)
url_content = (url, content, filetype)
url_content = (second_url, content, filetype)
if self.check_file_not_exists(url_content):
path_name = self.title_path_name(url_content)
self.url_contents.append(url_content)
self.existing_urls.append(url_content)
# url_contents = remove_duplicates(url_contents, _existing_urls)
self.ingest_file(url_content, course_name, path_name, base_url_on)
print("✅✅ Scraped:", url, "✅✅")
print("✅✅ Scraped:", second_url, "✅✅")
self.max_urls -= 1
else:
raise ValueError("This URL is already existing in the database")
print("This URL is already existing in the database")
self.existing_urls.append((second_url, content, filetype))
else:
self.invalid_urls.append(url)
raise ValueError("This URL is invalid")
print("This URL is invalid")

return url, content, filetype

def scrape_user_provided_page(self, url:str, course_name:str, timeout:int, base_url_on:str, base:str):

urls= set()
try:
url, content, filetype = self.check_and_ingest(url, course_name, timeout, base_url_on)
except ValueError as e:
raise e

if filetype == '.html':
try:
body = content.find("body")
header = content.find("head")
except Exception as e:
print("Error:", e)
body = ""
header = ""
else:
body = ""
header = ""
url, content, filetype = self.check_and_ingest(url, course_name, timeout, base_url_on)

# Check for 403 Forbidden urls
try:
if content.title.string.lower() == "403 forbidden" or content.title.string.lower() == 'page not found': # type: ignore
print("403 Forbidden")
self.invalid_urls.append(url)
if url:
if filetype == '.html':
try:
body = content.find("body")
header = content.find("head")
except Exception as e:
print("Error:", e)
body = ""
header = ""
# Check for 403 Forbidden urls
try:
if content.title.string.lower() == "403 forbidden" or content.title.string.lower() == 'page not found': # type: ignore
print("403 Forbidden")
self.invalid_urls.append(url)
else:
pass
except Exception as e:
print("Error:", e)
pass
if body != "" and header != "":
urls = self.find_urls(body, base, urls) # type: ignore
urls = self.find_urls(header, base, urls)# type: ignore
else:
pass
except Exception as e:
print("Error:", e)
pass
if body != "" and header != "":
urls = self.find_urls(body, base, urls)
urls = self.find_urls(header, base, urls)
else:
urls = self.find_urls(content, base, urls)

urls = self.find_urls(content, base, urls)# type: ignore

return urls

def non_user_provided_page_urls(self, url:str, base:str, soup, filetype:str):
@@ -355,7 +359,7 @@ def crawler(self, url:str, course_name:str, max_depth:int=3, timeout:int=1, base
if base == "":
raise ValueError("This URL is invalid")

if self.count_hard_stop(4):
if self.count_hard_stop_len():
raise ValueError("Too many repeated urls, exiting web scraper")

try:
@@ -374,18 +378,16 @@ def crawler(self, url:str, course_name:str, max_depth:int=3, timeout:int=1, base
if base_url_on:
if url.startswith(base):
url, content, filetype = self.check_and_ingest(url, course_name, timeout, base_url_on)
temp_urls.append((url, content, filetype))
if self.count_hard_stop(average):
if url:
temp_urls.append((url, content, filetype))
if self.count_hard_stop_len():
raise ValueError("Too many repeated urls, exiting web scraper")
else:
print("This URL is already existing in the database")
else:
url, content, filetype = self.check_and_ingest(url, course_name, timeout, base_url_on)
temp_urls.append((url, content, filetype))
if self.count_hard_stop(average):
raise ValueError("Too many repeated urls, exiting web scraper")
else:
print("This URL is already existing in the database")
if url:
temp_urls.append((url, content, filetype))
if self.count_hard_stop_len():
raise ValueError("Too many repeated urls, exiting web scraper")
else:
print("Max URLs reached")
raise ValueError("Max URLs reached")
@@ -398,10 +400,8 @@ def crawler(self, url:str, course_name:str, max_depth:int=3, timeout:int=1, base
if self.max_urls > 0:
if _depth < max_depth:
self.crawler(url[0], course_name, max_depth, timeout, base_url_on, _depth+1, url[1], url[2], average)
# url_contents = remove_duplicates(url_contents, _existing_urls)
# print("Technically don't have to remove here, but here is what it is:", diff)
print(self.max_urls, "urls left")
if self.count_hard_stop(average):
if self.count_hard_stop_len():
raise ValueError("Too many repeated urls, exiting web scraper")
else:
print("Depth exceeded:", _depth+1, "out of", max_depth)
