From cb79daf71a4833471d1b77ff85a790b9786e98ce Mon Sep 17 00:00:00 2001
From: jkmin3
Date: Mon, 25 Sep 2023 16:06:50 -0500
Subject: [PATCH] removed some prints and make the count work with existing
 items in supabase

---
 ai_ta_backend/web_scrape.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/ai_ta_backend/web_scrape.py b/ai_ta_backend/web_scrape.py
index badaf71f..218cad5d 100644
--- a/ai_ta_backend/web_scrape.py
+++ b/ai_ta_backend/web_scrape.py
@@ -39,6 +39,7 @@ def __init__(self) -> None:
     self.existing_urls = []
     self.max_urls = 0
     self.original_amount = 0
+    self.supa_urls = 0
 
     return None
 
@@ -136,7 +137,6 @@ def find_urls(self, soup:BeautifulSoup, site:str, urls:set):
             href = site+href
           else:
             href = site+'/'+href
-          print("HREFS:", href)
           urls.add(href)
 
       except Exception as e:
@@ -246,8 +246,10 @@ def check_file_not_exists(self, file):
 
   def count_hard_stop_len(self):
     all_urls = self.existing_urls + self.invalid_urls
+    count = len(all_urls) - self.supa_urls
     if all_urls != []:
-      if len(all_urls) > self.max_urls:
+      print("📈📈 Counted URLs", count, "out of", self.original_amount, "📈📈" )
+      if len(all_urls) > self.original_amount:
         print("Too many repeated urls, exiting web scraper")
         return True
       else:
@@ -469,6 +471,7 @@ def main_crawler(self, url:str, course_name:str, max_urls:int=100, max_depth:int
     self.existing_urls = []
     try:
       print("Begin Ingesting Web page")
+      self.supa_urls = len(self.existing_urls)
      self.crawler(url=url, course_name=course_name, max_depth=max_depth, timeout=timeout, base_url_on=base_url_str)
     except ValueError as e:
       print("Error:", e)
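
Note (not part of the patch): the sketch below is a minimal, standalone illustration of the hard-stop counting this patch changes. The class name HardStopDemo and the example values are hypothetical; only the attribute names (existing_urls, invalid_urls, original_amount, supa_urls) and the overall shape of count_hard_stop_len are taken from the diff, and the else branch that the diff cuts off is assumed to return False.

# Minimal sketch, assuming the attributes shown in the diff above.
class HardStopDemo:

  def __init__(self, original_amount, supa_urls):
    self.existing_urls = []                   # URLs seen so far (pre-loaded from Supabase, then scraped)
    self.invalid_urls = []                    # URLs that failed to ingest
    self.original_amount = original_amount    # the max_urls the caller originally asked for
    self.supa_urls = supa_urls                # how many URLs were already in Supabase when the crawl began

  def count_hard_stop_len(self):
    # As in the patch, `count` excludes rows that were already in Supabase,
    # but it is only used for the progress print; the stop condition itself
    # still compares the full list against original_amount.
    all_urls = self.existing_urls + self.invalid_urls
    count = len(all_urls) - self.supa_urls
    if all_urls != []:
      print("Counted URLs", count, "out of", self.original_amount)
      if len(all_urls) > self.original_amount:
        print("Too many repeated urls, exiting web scraper")
        return True
      else:
        return False
    return False


# Hypothetical usage: 3 URLs were already in Supabase, the caller asked for at most 5.
demo = HardStopDemo(original_amount=5, supa_urls=3)
demo.existing_urls = ["https://example.com/a", "https://example.com/b", "https://example.com/c"]
demo.invalid_urls = ["https://example.com/broken"]
print(demo.count_hard_stop_len())  # False: 4 URLs seen in total, limit of 5 not yet exceeded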