From cb79daf71a4833471d1b77ff85a790b9786e98ce Mon Sep 17 00:00:00 2001
From: jkmin3
Date: Mon, 25 Sep 2023 16:06:50 -0500
Subject: [PATCH] removed some prints and make the count work with existing
 items in supabase

---
 ai_ta_backend/web_scrape.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/ai_ta_backend/web_scrape.py b/ai_ta_backend/web_scrape.py
index badaf71f..218cad5d 100644
--- a/ai_ta_backend/web_scrape.py
+++ b/ai_ta_backend/web_scrape.py
@@ -39,6 +39,7 @@ def __init__(self) -> None:
     self.existing_urls = []
     self.max_urls = 0
     self.original_amount = 0
+    self.supa_urls = 0
 
     return None
 
@@ -136,7 +137,6 @@ def find_urls(self, soup:BeautifulSoup, site:str, urls:set):
             href = site+href
           else:
             href = site+'/'+href
-          print("HREFS:", href)
           urls.add(href)
 
       except Exception as e:
@@ -246,8 +246,10 @@ def check_file_not_exists(self, file):
 
   def count_hard_stop_len(self):
     all_urls = self.existing_urls + self.invalid_urls
+    count = len(all_urls) - self.supa_urls
     if all_urls != []:
-      if len(all_urls) > self.max_urls:
+      print("📈📈 Counted URLs", count, "out of", self.original_amount, "📈📈" )
+      if len(all_urls) > self.original_amount:
         print("Too many repeated urls, exiting web scraper")
         return True
       else:
@@ -469,6 +471,7 @@ def main_crawler(self, url:str, course_name:str, max_urls:int=100, max_depth:int
     self.existing_urls = []
     try:
       print("Begin Ingesting Web page")
+      self.supa_urls = len(self.existing_urls)
      self.crawler(url=url, course_name=course_name, max_depth=max_depth, timeout=timeout, base_url_on=base_url_str)
     except ValueError as e:
       print("Error:", e)
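
Note (not part of the patch): the sketch below is a minimal, standalone illustration of the hard-stop counting this patch changes. The class name HardStopDemo and the example values are hypothetical; only the attribute names (existing_urls, invalid_urls, original_amount, supa_urls) and the overall shape of count_hard_stop_len are taken from the diff, and the else branch that the diff cuts off is assumed to return False.

# Minimal sketch, assuming the attributes shown in the diff above.
class HardStopDemo:

  def __init__(self, original_amount, supa_urls):
    self.existing_urls = []                   # URLs seen so far (pre-loaded from Supabase, then scraped)
    self.invalid_urls = []                    # URLs that failed to ingest
    self.original_amount = original_amount    # the max_urls the caller originally asked for
    self.supa_urls = supa_urls                # how many URLs were already in Supabase when the crawl began

  def count_hard_stop_len(self):
    # As in the patch, `count` excludes rows that were already in Supabase,
    # but it is only used for the progress print; the stop condition itself
    # still compares the full list against original_amount.
    all_urls = self.existing_urls + self.invalid_urls
    count = len(all_urls) - self.supa_urls
    if all_urls != []:
      print("Counted URLs", count, "out of", self.original_amount)
      if len(all_urls) > self.original_amount:
        print("Too many repeated urls, exiting web scraper")
        return True
      else:
        return False
    return False


# Hypothetical usage: 3 URLs were already in Supabase, the caller asked for at most 5.
demo = HardStopDemo(original_amount=5, supa_urls=3)
demo.existing_urls = ["https://example.com/a", "https://example.com/b", "https://example.com/c"]
demo.invalid_urls = ["https://example.com/broken"]
print(demo.count_hard_stop_len())  # False: 4 URLs seen in total, limit of 5 not yet exceeded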