Commit
removed some prints and make the count work with existing items in supabase
jkmin3 committed Sep 25, 2023
1 parent 14ab055 commit cb79daf
Showing 1 changed file with 5 additions and 2 deletions.
ai_ta_backend/web_scrape.py: 7 changes (5 additions & 2 deletions)
@@ -39,6 +39,7 @@ def __init__(self) -> None:
     self.existing_urls = []
     self.max_urls = 0
     self.original_amount = 0
+    self.supa_urls = 0
 
     return None
 
@@ -136,7 +137,6 @@ def find_urls(self, soup:BeautifulSoup, site:str, urls:set):
           href = site+href
         else:
           href = site+'/'+href
-        print("HREFS:", href)
         urls.add(href)
 
     except Exception as e:
@@ -246,8 +246,10 @@ def check_file_not_exists(self, file):
 
   def count_hard_stop_len(self):
     all_urls = self.existing_urls + self.invalid_urls
+    count = len(all_urls) - self.supa_urls
     if all_urls != []:
-      if len(all_urls) > self.max_urls:
+      print("📈📈 Counted URLs", count, "out of", self.original_amount, "📈📈" )
+      if len(all_urls) > self.original_amount:
         print("Too many repeated urls, exiting web scraper")
         return True
       else:
@@ -469,6 +471,7 @@ def main_crawler(self, url:str, course_name:str, max_urls:int=100, max_depth:int
     self.existing_urls = []
     try:
       print("Begin Ingesting Web page")
+      self.supa_urls = len(self.existing_urls)
       self.crawler(url=url, course_name=course_name, max_depth=max_depth, timeout=timeout, base_url_on=base_url_str)
     except ValueError as e:
       print("Error:", e)
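Net effect of the commit: before the crawl starts, main_crawler snapshots how many URLs are already known (presumably the items seeded from Supabase) into self.supa_urls, and count_hard_stop_len subtracts that snapshot so the progress print reports only URLs discovered during this run, while the hard stop still compares the raw total against original_amount. Below is a minimal, self-contained sketch of that counting behavior; the HardStopCounter class and its start_crawl method are illustrative names, not code from the repository.

class HardStopCounter:
    """Illustrative stand-in for the scraper's URL-counting state (not repository code)."""

    def __init__(self, original_amount):
        self.existing_urls = []    # URLs already ingested, e.g. seeded from Supabase
        self.invalid_urls = []     # URLs that failed or were rejected
        self.original_amount = original_amount
        self.supa_urls = 0         # how many URLs were already present before the crawl

    def start_crawl(self):
        # Mirrors the new line in main_crawler: snapshot the pre-existing URL
        # count so those items are not reported as newly scraped work.
        self.supa_urls = len(self.existing_urls)

    def count_hard_stop_len(self):
        # Mirrors count_hard_stop_len after this commit.
        all_urls = self.existing_urls + self.invalid_urls
        count = len(all_urls) - self.supa_urls   # exclude URLs already in Supabase
        if all_urls != []:
            print("Counted URLs", count, "out of", self.original_amount)
            if len(all_urls) > self.original_amount:
                print("Too many repeated urls, exiting web scraper")
                return True
        return False


# Example: two URLs were already in Supabase, three more arrive during the crawl.
counter = HardStopCounter(original_amount=4)
counter.existing_urls = ["https://a.example", "https://b.example"]
counter.start_crawl()   # supa_urls == 2
counter.existing_urls += ["https://c.example", "https://d.example", "https://e.example"]
counter.count_hard_stop_len()   # reports "Counted URLs 3 out of 4", then hard-stops (returns True)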

0 comments on commit cb79daf
