Commit
removed some prints and make the count work with existing items in supabase
jkmin3 committed Sep 25, 2023
1 parent 14ab055 commit cb79daf
Showing 1 changed file with 5 additions and 2 deletions.
ai_ta_backend/web_scrape.py: 7 changes (5 additions & 2 deletions)
@@ -39,6 +39,7 @@ def __init__(self) -> None:
     self.existing_urls = []
     self.max_urls = 0
     self.original_amount = 0
+    self.supa_urls = 0
 
     return None
 
@@ -136,7 +137,6 @@ def find_urls(self, soup:BeautifulSoup, site:str, urls:set):
           href = site+href
         else:
           href = site+'/'+href
-        print("HREFS:", href)
         urls.add(href)
 
     except Exception as e:
@@ -246,8 +246,10 @@ def check_file_not_exists(self, file):
 
   def count_hard_stop_len(self):
     all_urls = self.existing_urls + self.invalid_urls
+    count = len(all_urls) - self.supa_urls
     if all_urls != []:
-      if len(all_urls) > self.max_urls:
+      print("📈📈 Counted URLs", count, "out of", self.original_amount, "📈📈" )
+      if len(all_urls) > self.original_amount:
         print("Too many repeated urls, exiting web scraper")
         return True
       else:
@@ -469,6 +471,7 @@ def main_crawler(self, url:str, course_name:str, max_urls:int=100, max_depth:int
     self.existing_urls = []
     try:
       print("Begin Ingesting Web page")
+      self.supa_urls = len(self.existing_urls)
       self.crawler(url=url, course_name=course_name, max_depth=max_depth, timeout=timeout, base_url_on=base_url_str)
     except ValueError as e:
       print("Error:", e)
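Net effect of the commit: before the crawl starts, main_crawler snapshots how many URLs are already known (presumably the items seeded from Supabase) into self.supa_urls, and count_hard_stop_len subtracts that snapshot so the progress print reports only URLs discovered during this run, while the hard stop still compares the raw total against original_amount. Below is a minimal, self-contained sketch of that counting behavior; the HardStopCounter class and its start_crawl method are illustrative names, not code from the repository.

class HardStopCounter:
    """Illustrative stand-in for the scraper's URL-counting state (not repository code)."""

    def __init__(self, original_amount):
        self.existing_urls = []    # URLs already ingested, e.g. seeded from Supabase
        self.invalid_urls = []     # URLs that failed or were rejected
        self.original_amount = original_amount
        self.supa_urls = 0         # how many URLs were already present before the crawl

    def start_crawl(self):
        # Mirrors the new line in main_crawler: snapshot the pre-existing URL
        # count so those items are not reported as newly scraped work.
        self.supa_urls = len(self.existing_urls)

    def count_hard_stop_len(self):
        # Mirrors count_hard_stop_len after this commit.
        all_urls = self.existing_urls + self.invalid_urls
        count = len(all_urls) - self.supa_urls   # exclude URLs already in Supabase
        if all_urls != []:
            print("Counted URLs", count, "out of", self.original_amount)
            if len(all_urls) > self.original_amount:
                print("Too many repeated urls, exiting web scraper")
                return True
        return False


# Example: two URLs were already in Supabase, three more arrive during the crawl.
counter = HardStopCounter(original_amount=4)
counter.existing_urls = ["https://a.example", "https://b.example"]
counter.start_crawl()   # supa_urls == 2
counter.existing_urls += ["https://c.example", "https://d.example", "https://e.example"]
counter.count_hard_stop_len()   # reports "Counted URLs 3 out of 4", then hard-stops (returns True)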

0 comments on commit cb79daf
