Adding a sudo bash script to install selenium and chromedriver #139

Closed
wants to merge 5 commits into from
158 changes: 137 additions & 21 deletions ai_ta_backend/web_scrape.py
@@ -4,8 +4,10 @@
import shutil
import time
from collections import Counter
import tempfile
from tempfile import NamedTemporaryFile
from zipfile import ZipFile

import boto3 # type: ignore
import requests
@@ -16,6 +18,37 @@
from ai_ta_backend.vector_database import Ingest


#### added setup code for selenium ####
# from selenium import webdriver
from seleniumwire import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# set the default Download directory
options = webdriver.ChromeOptions()
# set the download path
download_dir = tempfile.mkdtemp()
print(download_dir)
time.sleep(3)
# download_dir = os.path.abspath("pdf_files")
options.add_experimental_option("prefs", {
"download.default_directory": download_dir,
"download.prompt_for_download": False,
"download.directory_upgrade": True,
"plugins.always_open_pdf_externally": True,
"safebrowsing.enabled": False,
"safebrowsing.disable_download_protection": True})
# options2 = {'ca_key': r'C:\Users\chopr\Desktop\Part-Time\AgGPT\raw.githubusercontent.com_wkeeling_selenium-wire_master_seleniumwire_ca.crt'}
driver = webdriver.Chrome(options=options)

import logging
logging.basicConfig(level=logging.DEBUG) # Main app runs at DEBUG level
logger = logging.getLogger('seleniumwire')
logger.setLevel(logging.ERROR) # Run selenium wire at ERROR level

#### setup added for selenium ###

class WebScrape():

def __init__(self) -> None:
@@ -60,51 +93,134 @@ def get_file_extension(self, filename):
def valid_url(self, url):
'''Returns the URL and its content if it's good, otherwise returns False. Prints the status code.'''
try:
response = requests.get(url, allow_redirects=True, timeout=20)

driver.set_page_load_timeout(100) # adjust this as needed
# get the URL
driver.get(url)
response_code = 404
print("THE URL WE ARE SEARCHING FOR IS - >>>>" ,url)
for request in driver.requests:
if request.url==url:
response_code = request.response.status_code
print("=======================RESPONSE CODE IS GIVEN HERE================", response_code)

# handle the redirect case
redirect_loop_counter = 0
while response.status_code == 301:
# Check for permanent redirect
while response_code == 301:
# check for permanent redirect
if redirect_loop_counter > 3:
print("❌ Redirect loop (on 301 error) exceeded redirect limit of:", redirect_loop_counter, "❌")
return (False, False, False)
redirect_url = response.headers['Location']
response = requests.head(redirect_url)
redirect_loop_counter += 1
if response.status_code == 200:
filetype = self.get_file_extension(response.url)
# check for the redirected url
new_url = driver.current_url
driver.get(new_url)
# check the status code for the new url
for request in driver.requests:
if request.url==new_url:
response_code = request.response.status_code
redirect_loop_counter+=1

# handle the 200 case
if response_code == 200:
filetype = self.get_file_extension(url)
print("file extension:", filetype)

# handle the case when the file is html
if filetype == '.html':
content = BeautifulSoup(response.content, "html.parser")
if "<!doctype html" not in str(response.text).lower():
print("⛔️⛔️ Filetype not supported:", response.url, "⛔️⛔️")
return (False, False, False)
print("THE FILETYPE IS HEREAAAAAAAAAAAAAAAAAAAAAAA - ", filetype)
# this takes the page source of the html
# and loads it into BeautifulSoup
page_source = driver.page_source
content = BeautifulSoup(page_source, "html.parser")
# check: why was this needed in the first place?
# if "<!doctype html" not in str(content).lower():
# print("⛔️⛔️ Filetype not supported:", url, "⛔️⛔️")
# return (False, False, False)
return (url, content, filetype)

# handle the other cases
elif filetype in ['.py', '.vtt', '.pdf', '.txt', '.srt', '.docx', '.ppt', '.pptx']:
if "<!doctype html" in str(response.text).lower():
content = BeautifulSoup(response.text, "html.parser")
page_source = driver.page_source
downloaded_file_path = os.path.join(download_dir, os.listdir(download_dir)[0])
print(downloaded_file_path)
# Read the content of the downloaded file

if "<!doctype html" in str(page_source).lower():
content = BeautifulSoup(page_source, "html.parser")
filetype = '.html'
else:
content = response.content
with open(downloaded_file_path, 'rb') as file:
content = file.read()
time.sleep(6)
os.remove(downloaded_file_path)
# delete all files
for filename in os.listdir(download_dir):
file_path = os.path.join(download_dir, filename)
if os.path.isfile(file_path) or os.path.islink(file_path):
os.unlink(file_path)
elif os.path.isdir(file_path):
shutil.rmtree(file_path)

else:
return (False, False, False)
if filetype not in ['.html', '.py', '.vtt', '.pdf', '.txt', '.srt', '.docx', '.ppt', '.pptx']:
print("⛔️⛔️ Filetype not supported:", filetype, "⛔️⛔️")
return (False, False, False)
return (response.url, content, filetype)
print("The loaded content is ->", content)
return (url, content, filetype)
else:
print("🚫🚫 URL is invalid:", response.url, "Return code:", response.status_code, "🚫🚫")
print("🚫🚫 URL is invalid:", url, "Return code:", response_code, "🚫🚫")
return (False, False, False)
except requests.RequestException as e:
print("🚫🚫 URL is invalid:", url, "Error:", e, "🚫🚫")
return (False, False, False)
return (False, False, False)


# '''Returns the URL and it's content if it's good, otherwise returns false. Prints the status code.'''
# try:
# response = requests.get(url, allow_redirects=True, timeout=20)

# redirect_loop_counter = 0
# while response.status_code == 301:
# # Check for permanent redirect
# if redirect_loop_counter > 3:
# print("❌ Redirect loop (on 301 error) exceeded redirect limit of:", redirect_loop_counter, "❌")
# return (False, False, False)
# redirect_url = response.headers['Location']
# response = requests.head(redirect_url)
# redirect_loop_counter += 1
# if response.status_code == 200:
# filetype = self.get_file_extension(response.url)
# print("file extension:", filetype)
# if filetype == '.html':
# content = BeautifulSoup(response.content, "html.parser")
# if "<!doctype html" not in str(response.text).lower():
# print("⛔️⛔️ Filetype not supported:", response.url, "⛔️⛔️")
# return (False, False, False)
# elif filetype in ['.py', '.vtt', '.pdf', '.txt', '.srt', '.docx', '.ppt', '.pptx']:
# if "<!doctype html" in str(response.text).lower():
# content = BeautifulSoup(response.text, "html.parser")
# filetype = '.html'
# else:
# content = response.content
# else:
# return (False, False, False)
# if filetype not in ['.html', '.py', '.vtt', '.pdf', '.txt', '.srt', '.docx', '.ppt', '.pptx']:
# print("⛔️⛔️ Filetype not supported:", filetype, "⛔️⛔️")
# return (False, False, False)
# return (response.url, content, filetype)
# else:
# print("🚫🚫 URL is invalid:", response.url, "Return code:", response.status_code, "🚫🚫")
# return (False, False, False)
# except requests.RequestException as e:
# print("🚫🚫 URL is invalid:", url, "Error:", e, "🚫🚫")
# return (False, False, False)

# Ensures url is in the correct format
def base_url(self, url:str):
try:
# Get rid of double slashes in url
# Create a base site for incomplete hrefs
if url.startswith("https:"):
site= re.match(pattern=r'https:\/\/[a-zA-Z0-9.]*[a-z]', string=url).group(0) # type: ignore
site= re.match(pattern=r"https:\/\/[a-zA-Z0-9.']*[a-z]", string=url).group(0) # type: ignore
url = re.sub(pattern=r"https:\/\/", repl="", string=url)
url = re.sub(pattern=r"[\/\/]{2,}", repl="", string=url)
url = "https://"+url
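Note on the reworked valid_url flow: the function now drives a real Chrome session through selenium-wire instead of calling requests.get. selenium-wire records every network request the browser makes, which is how the code above reads back the HTTP status code (plain Selenium does not expose it), while Chrome's download preferences drop non-HTML files into the temporary download_dir for reading. A minimal sketch of the status-code pattern, using the same selenium-wire API as the diff; the example URL and headless flag are illustrative assumptions, not part of this PR:

from seleniumwire import webdriver  # selenium-wire wraps Selenium and captures network traffic

options = webdriver.ChromeOptions()
options.add_argument("--headless=new")  # assumption: headless Chrome on a server

driver = webdriver.Chrome(options=options)
url = "https://example.com/"  # illustrative URL
driver.get(url)

# driver.requests holds every captured request; find the one for the page itself
status = next(
    (r.response.status_code for r in driver.requests
     if r.url == url and r.response is not None),
    None,
)
print("status code:", status)
driver.quit()

On success the function returns (url, content, filetype); on any failure it returns (False, False, False), matching the old requests-based behaviour.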
30 changes: 30 additions & 0 deletions ca.crt
@@ -0,0 +1,30 @@
-----BEGIN CERTIFICATE-----
MIIFFzCCAv+gAwIBAgIUIUc6dnnqhYX3ZYXQzpZyJ1gtUwcwDQYJKoZIhvcNAQEL
BQAwGzEZMBcGA1UEAwwQU2VsZW5pdW0gV2lyZSBDQTAeFw0xODA3MjAxMDQxMDNa
Fw0yODA3MTcxMDQxMDNaMBsxGTAXBgNVBAMMEFNlbGVuaXVtIFdpcmUgQ0EwggIi
MA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQDKKpm14AHiJb4onGES4Echs2qB
XsfeMAbsA7x4blJkMGyHGx9B8OpXqlRtcNnWD2JGnjc0/k92uuZaV2prDnZwH5Jl
nJSZuGEzUUAnrwhTHTqMhM9pfT8RpltE0lyplQni8rjH5oshBrzzAHILm/iAm1WI
HCFUClQaJ7sVVzAikaPfg4WUXLHP7/AjxIejp/SVI8Ycn1BPIlDwp1pIq4WawJoZ
TZ75GwvsT1ohH4YSRM+BxwBuBUqjusaYJiWwpnR801XV290i3/bBOkS2fEa4+ciS
LEGEi4SaaC6Nhap3sd80npJUQff4ltVGaxX0jCG/zswf2XGEDtsw2FF848KePj4X
Ilgm4xcuhhBvcsgob/bwEvDTrXPk38YQEJEKH8uGf37AOv2TQmqj45WZt7jSZ2YH
ZGn4RunJAO/J7toqJ7upjx66Pq8WkXQ6faSeTNENmXclYPRQFujVbFkECRcOtS6W
fUkHM+tgXHKqSMcfVVp46o/4HfHzoTyvrUDryHJB3h/IrqWK1433rYp3bJzkpjM9
JT71vh6sDo/Ys+4HK5rwrwkeP7b+6dUx1nHOgPX88njVI6cuxnjex6AfSld5d4BH
YZdviXRqCxpiudmnN+cMKAdJgRZFmVNH/djQqtq3y/gmjwKnyW95y3uJu4Xz5+R4
9jhAZGJFiHK/vE+XwwIDAQABo1MwUTAdBgNVHQ4EFgQUPvrTydSlYhMQJy8lvBvh
nLeQsvQwHwYDVR0jBBgwFoAUPvrTydSlYhMQJy8lvBvhnLeQsvQwDwYDVR0TAQH/
BAUwAwEB/zANBgkqhkiG9w0BAQsFAAOCAgEAmIvadNtFca9vuMuSewSXHlOd9p7d
9xYkp8Yj5RvFUGL32zYUatH9YsRh5K9Wz5jifjwBLMRDZIm48xhxYjqVvTZoQpL6
Qyzbu2EsRCbmQ+861U4SfcP2uetJuFM6Ug0/CKviyNpUaX/8YWupFXsEiCRJM9pk
sh2b+dqljy9kvrOosfehz8CRbxUfgPsL2IVZa0mHsuOZDa/XHAAW9ns5TdBlFHwo
W/2KDvvPGL/3t7Zah2jwu8D8w397looMXxqyT/DAjH6+bd5Kg/7mELaqbg/pM3EJ
mENd5ButBkhpVbyAKLn7TvpZYSEF/VMNPcZHOKoKrx1utZwLFuVIb07WDMRov0GO
hg/rrIBWvA1ySi/4yrnRDc7GBHSUh0Krx6LLZ/ZtE3j7/4rwj51MwqqNhQrCxGhz
ksqn8V6XY7UUKnlTlAWRyuBLiA+yvf9GdgNJxUblZYMNpPbeLwe2Be/utROuMqwr
G4RA1sfPuEdyfdXB/7c8ViOPxKYFH0POXuwB+Z1JlXDtR8rbjyVPUwqQarAuNIbw
NC8P+GWSzviG544BQyW1xKqLgQcEMSU73icDOOb9COcl1h7URSO9WB6CZXykpQSk
hceDiwojCDsyM84uXyyXKXCRPtseCIRsA1zZwrXU7NDDBXrIC7moVbxkDu2G4V1g
b5JFYe4FNI0yw/o=
-----END CERTIFICATE-----
22 changes: 18 additions & 4 deletions railway.json
@@ -5,11 +5,25 @@
"nixpacksVersion": "1.15.0",
"nixpacksPlan": {
"phases": {
"myPhase": {
"name": "ffmpeg (removed: libcrypt1, started to break)",
"install": {
"cmds": [
"sudo bash ./setup.sh",
"python -m venv --copies /opt/venv && . /opt/venv/bin/activate",
"pip install pip==23.3.1",
"pip install -r requirements.txt"
],
"aptPkgs": [
"ffmpeg",
"libcrypt1"
"ffmpeg"
]
},
"setup": {
"aptPkgs": [
"libcap-dev",
"libgl1"
],
"nixPkgs": [
"python310",
"gcc"
]
}
}
3 changes: 3 additions & 0 deletions requirements.txt
@@ -45,5 +45,8 @@ xlrd # for excel ingest
pdfminer # for image OCR
pytesseract # for image OCR

selenium-wire # for New Web Scraper
selenium

# No arize for now, huge build size with these additions.
# arize[AutoEmbeddings, LLM_Evaluation]
17 changes: 17 additions & 0 deletions setup.sh
@@ -0,0 +1,17 @@
# Install Google Chrome
apt-get update && apt-get install -y \
wget \
unzip \
libglib2.0-0 \
libnss3 \
libgconf-2-4 \
libfontconfig1 \
&& wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
&& dpkg -i google-chrome-stable_current_amd64.deb; apt-get -fy install

# Install ChromeDriver
wget https://chromedriver.storage.googleapis.com/2.41/chromedriver_linux64.zip \
&& unzip chromedriver_linux64.zip \
&& mv chromedriver /usr/bin/chromedriver \
&& chown root:root /usr/bin/chromedriver \
&& chmod +x /usr/bin/chromedriver
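One caveat worth flagging on setup.sh: it pins ChromeDriver 2.41 (which targets Chrome releases from roughly version 67–69) while installing the current stable Chrome, and ChromeDriver generally has to share a major version with the installed browser or Selenium fails with a "session not created" error. A hedged sanity check, using Selenium 4 capability keys that are assumptions on my part rather than anything in this PR:

from seleniumwire import webdriver

# sketch: confirm the installed Chrome and ChromeDriver major versions line up;
# if they are far apart, webdriver.Chrome() may refuse to start at all
driver = webdriver.Chrome()
caps = driver.capabilities
browser_major = caps["browserVersion"].split(".")[0]
driver_major = caps["chrome"]["chromedriverVersion"].split(".")[0]
print("Chrome:", caps["browserVersion"], "| ChromeDriver:", caps["chrome"]["chromedriverVersion"])
assert browser_major == driver_major, "Chrome and ChromeDriver major versions differ"
driver.quit()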