refactoring finished and added better print statements to web scraping
jkmin3 committed Sep 9, 2023
1 parent 6efe080 commit 557b77a
Showing 2 changed files with 53 additions and 105 deletions.
113 changes: 44 additions & 69 deletions ai_ta_backend/vector_database.py
@@ -218,79 +218,54 @@ def get_context_stuffed_prompt(self, user_question: str, course_name: str, top_n


def bulk_ingest(self, s3_paths: Union[List[str], str], course_name: str, **kwargs) -> Dict[str, List[str]]:
# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/microsoft_word.html
success_status = {"success_ingest": [], "failure_ingest": []}

def ingest(file_ext_mapping, s3_path, *args, **kwargs):
handler = file_ext_mapping.get(Path(s3_path).suffix)
if handler:
ret = handler(s3_path, *args, **kwargs)
if ret != "Success":
success_status['failure_ingest'].append(s3_path)
else:
success_status['success_ingest'].append(s3_path)

file_ext_mapping = {
'.html': self._ingest_html,
'.py': self._ingest_single_py,
'.vtt': self._ingest_single_vtt,
'.pdf': self._ingest_single_pdf,
'.txt': self._ingest_single_txt,
'.md': self._ingest_single_txt,
'.srt': self._ingest_single_srt,
'.docx': self._ingest_single_docx,
'.ppt': self._ingest_single_ppt,
'.pptx': self._ingest_single_ppt,
}

try:
if isinstance(s3_paths, str):
s3_paths = [s3_paths]

for s3_path in s3_paths:
ext = Path(s3_path).suffix # check mimetype of file
# TODO: no need to download, just guess_type against the s3_path...
with NamedTemporaryFile(suffix=ext) as tmpfile:
self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile)
mime_type = mimetypes.guess_type(tmpfile.name)[0]
category, subcategory = mime_type.split('/')

if s3_path.endswith('.html'):
ret = self._ingest_html(s3_path, course_name, kwargs=kwargs)
if ret != "Success":
success_status['failure_ingest'].append(s3_path)
else:
success_status['success_ingest'].append(s3_path)
elif s3_path.endswith('.py'):
ret = self._ingest_single_py(s3_path, course_name)
if ret != "Success":
success_status['failure_ingest'].append(s3_path)
else:
success_status['success_ingest'].append(s3_path)
elif s3_path.endswith('.vtt'):
ret = self._ingest_single_vtt(s3_path, course_name)
if ret != "Success":
success_status['failure_ingest'].append(s3_path)
else:
success_status['success_ingest'].append(s3_path)
elif s3_path.endswith('.pdf'):
ret = self._ingest_single_pdf(s3_path, course_name, kwargs=kwargs)
if ret != "Success":
success_status['failure_ingest'].append(s3_path)
else:
success_status['success_ingest'].append(s3_path)
elif s3_path.endswith('.txt') or s3_path.endswith('.md'):
ret = self._ingest_single_txt(s3_path, course_name)
if ret != "Success":
success_status['failure_ingest'].append(s3_path)
else:
success_status['success_ingest'].append(s3_path)
elif s3_path.endswith('.srt'):
ret = self._ingest_single_srt(s3_path, course_name)
if ret != "Success":
success_status['failure_ingest'].append(s3_path)
else:
success_status['success_ingest'].append(s3_path)
elif s3_path.endswith('.docx'):
ret = self._ingest_single_docx(s3_path, course_name)
if ret != "Success":
success_status['failure_ingest'].append(s3_path)
else:
success_status['success_ingest'].append(s3_path)
elif s3_path.endswith('.ppt') or s3_path.endswith('.pptx'):
ret = self._ingest_single_ppt(s3_path, course_name)
if ret != "Success":
success_status['failure_ingest'].append(s3_path)
else:
success_status['success_ingest'].append(s3_path)
elif category == 'video' or category == 'audio':
ret = self._ingest_single_video(s3_path, course_name)
if ret != "Success":
success_status['failure_ingest'].append(s3_path)
else:
success_status['success_ingest'].append(s3_path)
return success_status
if isinstance(s3_paths, str):
s3_paths = [s3_paths]

for s3_path in s3_paths:
with NamedTemporaryFile(suffix=Path(s3_path).suffix) as tmpfile:
self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile)
mime_type = mimetypes.guess_type(tmpfile.name)[0]
category, _ = mime_type.split('/')

if category in ['video', 'audio']:
ret = self._ingest_single_video(s3_path, course_name)
if ret != "Success":
success_status['failure_ingest'].append(s3_path)
else:
success_status['success_ingest'].append(s3_path)
else:
ingest(file_ext_mapping, s3_path, course_name, kwargs=kwargs)

return success_status
except Exception as e:
success_status['failure_ingest'].append("MAJOR ERROR IN /bulk_ingest: Error: " + str(e))
return success_status
success_status['failure_ingest'].append(f"MAJOR ERROR IN /bulk_ingest: Error: {str(e)}")
return success_status


def _ingest_single_py(self, s3_path: str, course_name: str):
try:
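The substance of the vector_database.py change is replacing the long per-extension if/elif chain with a dispatch table keyed on the file suffix, plus a small helper that records success or failure. A minimal, standalone sketch of that pattern, assuming placeholder handlers rather than the repository's real _ingest_* methods:

from pathlib import Path
from typing import Callable, Dict, List

def ingest_txt(path: str, course: str) -> str:
    # Placeholder handler; the real code calls methods like self._ingest_single_txt.
    return "Success"

def ingest_pdf(path: str, course: str) -> str:
    # Placeholder handler standing in for self._ingest_single_pdf.
    return "Success"

# Suffix -> handler mapping, mirroring file_ext_mapping in the diff.
HANDLERS: Dict[str, Callable[[str, str], str]] = {
    ".txt": ingest_txt,
    ".md": ingest_txt,  # .md reuses the .txt handler, as in the diff
    ".pdf": ingest_pdf,
}

def bulk_ingest_sketch(paths: List[str], course: str) -> Dict[str, List[str]]:
    status: Dict[str, List[str]] = {"success_ingest": [], "failure_ingest": []}
    for p in paths:
        handler = HANDLERS.get(Path(p).suffix)
        if handler is None:
            # Unknown suffix; the real code falls back to a MIME-type check for audio/video.
            status["failure_ingest"].append(p)
            continue
        ret = handler(p, course)
        if ret == "Success":
            status["success_ingest"].append(p)
        else:
            status["failure_ingest"].append(p)
    return status

print(bulk_ingest_sketch(["notes.md", "slides.pdf", "clip.mp4"], "example-course"))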
45 changes: 9 additions & 36 deletions ai_ta_backend/web_scrape.py
@@ -151,7 +151,7 @@ def remove_duplicates(urls:list, supabase_urls:list=None):
print("deleted", og_len-len(not_repeated_files), "duplicate files")
return urls

def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url_on:str=None, _depth:int=0, _soup:BeautifulSoup=None, _filetype:str=None, _invalid_urls:list=[], _existing_urls:list=None):
def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url_on:str=None, _depth:int=0, _soup:BeautifulSoup=None, _filetype:str=None, _invalid_urls:list=[], _existing_urls:list=[]):
'''Function gets titles of urls and the urls themselves'''
# Prints the depth of the current search
print("depth: ", _depth)
@@ -181,7 +181,7 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url
url, s, filetype = valid_url(url)
time.sleep(timeout)
url_contents.append((url,s, filetype))
print("Scraped:", url)
print("Scraped:", url, "✅")
if url:
if filetype == '.html':
try:
@@ -227,7 +227,7 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url
if url.startswith(site):
url, s, filetype = valid_url(url)
if url:
print("Scraped:", url)
print("Scraped:", url, "✅")
url_contents.append((url, s, filetype))
else:
_invalid_urls.append(url)
@@ -236,7 +236,7 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url
else:
url, s, filetype = valid_url(url)
if url:
print("Scraped:", url)
print("Scraped:", url, "✅")
url_contents.append((url, s, filetype))
else:
_invalid_urls.append(url)
@@ -305,8 +305,8 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti
timeout = int(timeout)
stay_on_baseurl = bool(stay_on_baseurl)
if stay_on_baseurl:
stay_on_baseurl = base_url(url)
print(stay_on_baseurl)
baseurl = base_url(url)
print(baseurl)

ingester = Ingest()
s3_client = boto3.client(
@@ -328,7 +328,7 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti
supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore
urls = supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).select('course_name, url, contexts').eq('course_name', course_name).execute()
if urls.data == []:
existing_urls = None
existing_urls = []
else:
existing_urls = []
for thing in urls.data:
@@ -337,8 +337,9 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti
whole += t['text']
existing_urls.append((thing['url'], whole))
print("Finished gathering existing urls from Supabase")
print("Length of existing urls:", len(existing_urls))
print("Begin Ingesting Web page")
data = crawler(url=url, max_urls=max_urls, max_depth=max_depth, timeout=timeout, base_url_on=stay_on_baseurl, _existing_urls=existing_urls)
data = crawler(url=url, max_urls=max_urls, max_depth=max_depth, timeout=timeout, base_url_on=baseurl, _existing_urls=existing_urls)

# Clean some keys for a proper file name
# todo: have a default title
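For context, the existing-URL gathering above now starts from an empty list instead of None and prints how many URLs were found; each entry pairs a URL with its concatenated context text. A small sketch of that flattening step, using made-up rows in place of a real Supabase response:

from typing import List, Tuple

# Hypothetical rows shaped like the Supabase query result used in the diff.
rows = [
    {"url": "https://example.edu/a", "contexts": [{"text": "intro "}, {"text": "body"}]},
    {"url": "https://example.edu/b", "contexts": [{"text": "only section"}]},
]

existing_urls: List[Tuple[str, str]] = []
for row in rows:
    whole = ""
    for context in row["contexts"]:
        whole += context["text"]
    existing_urls.append((row["url"], whole))

print("Length of existing urls:", len(existing_urls))  # 2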
@@ -403,34 +404,6 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti
counter += 1
else:
print("No", key[2] ,"to upload", key[1])
# if ".pdf" in key[0]:
# with NamedTemporaryFile(suffix=".pdf") as temp_pdf:
# if key[1] != "" or key[1] != None:
# temp_pdf.write(key[1])
# temp_pdf.seek(0)
# s3_upload_path = "courses/"+ course_name + "/" + path_name[i] + ".pdf"
# paths.append(s3_upload_path)
# with open(temp_pdf.name, 'rb') as f:
# print("Uploading PDF to S3")
# s3_client.upload_fileobj(f, os.getenv('S3_BUCKET_NAME'), s3_upload_path)
# ingester.bulk_ingest(s3_upload_path, course_name=course_name, url=key[0], base_url=url)
# counter += 1
# else:
# print("No PDF to upload", key[1])
# else:
# with NamedTemporaryFile(suffix=".html") as temp_html:
# if key[1] != "" or key[1] != None:
# temp_html.write(key[1].encode('utf-8'))
# temp_html.seek(0)
# s3_upload_path = "courses/"+ course_name + "/" + path_name[i] + ".html"
# paths.append(s3_upload_path)
# with open(temp_html.name, 'rb') as f:
# print("Uploading html to S3")
# s3_client.upload_fileobj(f, os.getenv('S3_BUCKET_NAME'), s3_upload_path)
# ingester.bulk_ingest(s3_upload_path, course_name=course_name, url=key[0], base_url=url)
# counter += 1
# else:
# print("No html to upload", key[1])
except Exception as e:
print("Error in upload:", e)

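The main_crawler change above keeps the derived base URL in its own variable (baseurl) instead of overwriting the stay_on_baseurl flag, and passes it to crawler as base_url_on. A rough sketch of how such a stay-on-base-URL check can work; base_url here is a hypothetical stand-in built on urllib.parse, not the module's own helper:

from urllib.parse import urlparse

def base_url(url: str) -> str:
    # Hypothetical helper: reduce a URL to scheme + host.
    parts = urlparse(url)
    return f"{parts.scheme}://{parts.netloc}"

def should_follow(link: str, site: str) -> bool:
    # Mirrors the diff's `if url.startswith(site):` check that keeps the crawl
    # on the starting site when stay_on_baseurl is enabled.
    return link.startswith(site)

site = base_url("https://example.edu/course/index.html")
print(site)                                                          # https://example.edu
print(should_follow("https://example.edu/course/page2.html", site))  # True
print(should_follow("https://other.org/page.html", site))            # False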
