Adding a sudo bash script to install selenium and chromedriver #139

Closed
wants to merge 5 commits into from
158 changes: 137 additions & 21 deletions ai_ta_backend/web_scrape.py
@@ -4,8 +4,10 @@
import shutil
import time
from collections import Counter
import tempfile
from tempfile import NamedTemporaryFile
from zipfile import ZipFile

import boto3 # type: ignore
import requests
@@ -16,6 +18,37 @@
from ai_ta_backend.vector_database import Ingest


#### added setup code for selenium ####
# from selenium import webdriver
from seleniumwire import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# set the default Download directory
options = webdriver.ChromeOptions()
# set the download path
download_dir = tempfile.mkdtemp()
print(download_dir)
time.sleep(3)
# download_dir = os.path.abspath("pdf_files")
options.add_experimental_option("prefs", {
"download.default_directory": download_dir,
"download.prompt_for_download": False,
"download.directory_upgrade": True,
"plugins.always_open_pdf_externally": True,
"safebrowsing.enabled": False,
"safebrowsing.disable_download_protection": True})
# options2 = {'ca_key': r'C:\Users\chopr\Desktop\Part-Time\AgGPT\raw.githubusercontent.com_wkeeling_selenium-wire_master_seleniumwire_ca.crt'}
driver = webdriver.Chrome(options=options)

import logging
logging.basicConfig(level=logging.DEBUG) # Main app runs at DEBUG level
logger = logging.getLogger('seleniumwire')
logger.setLevel(logging.ERROR) # Run selenium wire at ERROR level

#### setup added for selenium ###

class WebScrape():

def __init__(self) -> None:
@@ -60,51 +93,134 @@ def get_file_extension(self, filename):
def valid_url(self, url):
'''Returns the URL and its content if it's good, otherwise returns False. Prints the status code.'''
try:
response = requests.get(url, allow_redirects=True, timeout=20)

driver.set_page_load_timeout(100) # adjust this as needed
# get the URL
driver.get(url)
response_code = 404
print("THE URL WE ARE SEARCHING FOR IS - >>>>" ,url)
for request in driver.requests:
if request.url==url:
response_code = request.response.status_code
print("=======================RESPONSE CODE IS GIVEN HERE================", response_code)

# handle the redirect case
redirect_loop_counter = 0
while response.status_code == 301:
# Check for permanent redirect
while response_code == 301:
# check for permanent redirect
if redirect_loop_counter > 3:
print("❌ Redirect loop (on 301 error) exceeded redirect limit of:", redirect_loop_counter, "❌")
return (False, False, False)
redirect_url = response.headers['Location']
response = requests.head(redirect_url)
redirect_loop_counter += 1
if response.status_code == 200:
filetype = self.get_file_extension(response.url)
# check for the redirected url
new_url = driver.current_url
driver.get(new_url)
# check the status code for the new url
for request in driver.requests:
if request.url==new_url:
response_code = request.response.status_code
redirect_loop_counter+=1

# handle the 200 case
if response_code == 200:
filetype = self.get_file_extension(url)
print("file extension:", filetype)

# handle the case when the file is html
if filetype == '.html':
content = BeautifulSoup(response.content, "html.parser")
if "<!doctype html" not in str(response.text).lower():
print("⛔️⛔️ Filetype not supported:", response.url, "⛔️⛔️")
return (False, False, False)
print("THE FILETYPE IS HEREAAAAAAAAAAAAAAAAAAAAAAA - ", filetype)
# this takes the page source of the html
# and loads it into BeautifulSoup
page_source = driver.page_source
content = BeautifulSoup(page_source, "html.parser")
# check: why was this needed in the first place?
# if "<!doctype html" not in str(content).lower():
# print("⛔️⛔️ Filetype not supported:", url, "⛔️⛔️")
# return (False, False, False)
return (url, content, filetype)

# handle the other cases
elif filetype in ['.py', '.vtt', '.pdf', '.txt', '.srt', '.docx', '.ppt', '.pptx']:
if "<!doctype html" in str(response.text).lower():
content = BeautifulSoup(response.text, "html.parser")
page_source = driver.page_source
downloaded_file_path = os.path.join(download_dir, os.listdir(download_dir)[0])
print(downloaded_file_path)
# Read the content of the downloaded file

if "<!doctype html" in str(page_source).lower():
content = BeautifulSoup(page_source, "html.parser")
filetype = '.html'
else:
content = response.content
with open(downloaded_file_path, 'rb') as file:
content = file.read()
time.sleep(6)
os.remove(downloaded_file_path)
# delete all files
for filename in os.listdir(download_dir):
file_path = os.path.join(download_dir, filename)
if os.path.isfile(file_path) or os.path.islink(file_path):
os.unlink(file_path)
elif os.path.isdir(file_path):
shutil.rmtree(file_path)

else:
return (False, False, False)
if filetype not in ['.html', '.py', '.vtt', '.pdf', '.txt', '.srt', '.docx', '.ppt', '.pptx']:
print("⛔️⛔️ Filetype not supported:", filetype, "⛔️⛔️")
return (False, False, False)
return (response.url, content, filetype)
print("The loaded content is ->", content)
return (url, content, filetype)
else:
print("🚫🚫 URL is invalid:", response.url, "Return code:", response.status_code, "🚫🚫")
print("🚫🚫 URL is invalid:", url, "Return code:", response_code, "🚫🚫")
return (False, False, False)
except requests.RequestException as e:
print("🚫🚫 URL is invalid:", url, "Error:", e, "🚫🚫")
return (False, False, False)
return (False, False, False)


# '''Returns the URL and it's content if it's good, otherwise returns false. Prints the status code.'''
# try:
# response = requests.get(url, allow_redirects=True, timeout=20)

# redirect_loop_counter = 0
# while response.status_code == 301:
# # Check for permanent redirect
# if redirect_loop_counter > 3:
# print("❌ Redirect loop (on 301 error) exceeded redirect limit of:", redirect_loop_counter, "❌")
# return (False, False, False)
# redirect_url = response.headers['Location']
# response = requests.head(redirect_url)
# redirect_loop_counter += 1
# if response.status_code == 200:
# filetype = self.get_file_extension(response.url)
# print("file extension:", filetype)
# if filetype == '.html':
# content = BeautifulSoup(response.content, "html.parser")
# if "<!doctype html" not in str(response.text).lower():
# print("⛔️⛔️ Filetype not supported:", response.url, "⛔️⛔️")
# return (False, False, False)
# elif filetype in ['.py', '.vtt', '.pdf', '.txt', '.srt', '.docx', '.ppt', '.pptx']:
# if "<!doctype html" in str(response.text).lower():
# content = BeautifulSoup(response.text, "html.parser")
# filetype = '.html'
# else:
# content = response.content
# else:
# return (False, False, False)
# if filetype not in ['.html', '.py', '.vtt', '.pdf', '.txt', '.srt', '.docx', '.ppt', '.pptx']:
# print("⛔️⛔️ Filetype not supported:", filetype, "⛔️⛔️")
# return (False, False, False)
# return (response.url, content, filetype)
# else:
# print("🚫🚫 URL is invalid:", response.url, "Return code:", response.status_code, "🚫🚫")
# return (False, False, False)
# except requests.RequestException as e:
# print("🚫🚫 URL is invalid:", url, "Error:", e, "🚫🚫")
# return (False, False, False)

# Ensures url is in the correct format
def base_url(self, url:str):
try:
# Get rid of double slashes in url
# Create a base site for incomplete hrefs
if url.startswith("https:"):
site= re.match(pattern=r'https:\/\/[a-zA-Z0-9.]*[a-z]', string=url).group(0) # type: ignore
site= re.match(pattern=r"https:\/\/[a-zA-Z0-9.']*[a-z]", string=url).group(0) # type: ignore
url = re.sub(pattern=r"https:\/\/", repl="", string=url)
url = re.sub(pattern=r"[\/\/]{2,}", repl="", string=url)
url = "https://"+url
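Note on the reworked valid_url flow: the function now drives a real Chrome session through selenium-wire instead of calling requests.get. selenium-wire records every network request the browser makes, which is how the code above reads back the HTTP status code (plain Selenium does not expose it), while Chrome's download preferences drop non-HTML files into the temporary download_dir for reading. A minimal sketch of the status-code pattern, using the same selenium-wire API as the diff; the example URL and headless flag are illustrative assumptions, not part of this PR:

from seleniumwire import webdriver  # selenium-wire wraps Selenium and captures network traffic

options = webdriver.ChromeOptions()
options.add_argument("--headless=new")  # assumption: headless Chrome on a server

driver = webdriver.Chrome(options=options)
url = "https://example.com/"  # illustrative URL
driver.get(url)

# driver.requests holds every captured request; find the one for the page itself
status = next(
    (r.response.status_code for r in driver.requests
     if r.url == url and r.response is not None),
    None,
)
print("status code:", status)
driver.quit()

On success the function returns (url, content, filetype); on any failure it returns (False, False, False), matching the old requests-based behaviour.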
30 changes: 30 additions & 0 deletions ca.crt
@@ -0,0 +1,30 @@
-----BEGIN CERTIFICATE-----
MIIFFzCCAv+gAwIBAgIUIUc6dnnqhYX3ZYXQzpZyJ1gtUwcwDQYJKoZIhvcNAQEL
BQAwGzEZMBcGA1UEAwwQU2VsZW5pdW0gV2lyZSBDQTAeFw0xODA3MjAxMDQxMDNa
Fw0yODA3MTcxMDQxMDNaMBsxGTAXBgNVBAMMEFNlbGVuaXVtIFdpcmUgQ0EwggIi
MA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQDKKpm14AHiJb4onGES4Echs2qB
XsfeMAbsA7x4blJkMGyHGx9B8OpXqlRtcNnWD2JGnjc0/k92uuZaV2prDnZwH5Jl
nJSZuGEzUUAnrwhTHTqMhM9pfT8RpltE0lyplQni8rjH5oshBrzzAHILm/iAm1WI
HCFUClQaJ7sVVzAikaPfg4WUXLHP7/AjxIejp/SVI8Ycn1BPIlDwp1pIq4WawJoZ
TZ75GwvsT1ohH4YSRM+BxwBuBUqjusaYJiWwpnR801XV290i3/bBOkS2fEa4+ciS
LEGEi4SaaC6Nhap3sd80npJUQff4ltVGaxX0jCG/zswf2XGEDtsw2FF848KePj4X
Ilgm4xcuhhBvcsgob/bwEvDTrXPk38YQEJEKH8uGf37AOv2TQmqj45WZt7jSZ2YH
ZGn4RunJAO/J7toqJ7upjx66Pq8WkXQ6faSeTNENmXclYPRQFujVbFkECRcOtS6W
fUkHM+tgXHKqSMcfVVp46o/4HfHzoTyvrUDryHJB3h/IrqWK1433rYp3bJzkpjM9
JT71vh6sDo/Ys+4HK5rwrwkeP7b+6dUx1nHOgPX88njVI6cuxnjex6AfSld5d4BH
YZdviXRqCxpiudmnN+cMKAdJgRZFmVNH/djQqtq3y/gmjwKnyW95y3uJu4Xz5+R4
9jhAZGJFiHK/vE+XwwIDAQABo1MwUTAdBgNVHQ4EFgQUPvrTydSlYhMQJy8lvBvh
nLeQsvQwHwYDVR0jBBgwFoAUPvrTydSlYhMQJy8lvBvhnLeQsvQwDwYDVR0TAQH/
BAUwAwEB/zANBgkqhkiG9w0BAQsFAAOCAgEAmIvadNtFca9vuMuSewSXHlOd9p7d
9xYkp8Yj5RvFUGL32zYUatH9YsRh5K9Wz5jifjwBLMRDZIm48xhxYjqVvTZoQpL6
Qyzbu2EsRCbmQ+861U4SfcP2uetJuFM6Ug0/CKviyNpUaX/8YWupFXsEiCRJM9pk
sh2b+dqljy9kvrOosfehz8CRbxUfgPsL2IVZa0mHsuOZDa/XHAAW9ns5TdBlFHwo
W/2KDvvPGL/3t7Zah2jwu8D8w397looMXxqyT/DAjH6+bd5Kg/7mELaqbg/pM3EJ
mENd5ButBkhpVbyAKLn7TvpZYSEF/VMNPcZHOKoKrx1utZwLFuVIb07WDMRov0GO
hg/rrIBWvA1ySi/4yrnRDc7GBHSUh0Krx6LLZ/ZtE3j7/4rwj51MwqqNhQrCxGhz
ksqn8V6XY7UUKnlTlAWRyuBLiA+yvf9GdgNJxUblZYMNpPbeLwe2Be/utROuMqwr
G4RA1sfPuEdyfdXB/7c8ViOPxKYFH0POXuwB+Z1JlXDtR8rbjyVPUwqQarAuNIbw
NC8P+GWSzviG544BQyW1xKqLgQcEMSU73icDOOb9COcl1h7URSO9WB6CZXykpQSk
hceDiwojCDsyM84uXyyXKXCRPtseCIRsA1zZwrXU7NDDBXrIC7moVbxkDu2G4V1g
b5JFYe4FNI0yw/o=
-----END CERTIFICATE-----
22 changes: 18 additions & 4 deletions railway.json
@@ -5,11 +5,25 @@
"nixpacksVersion": "1.15.0",
"nixpacksPlan": {
"phases": {
"myPhase": {
"name": "ffmpeg (removed: libcrypt1, started to break)",
"install": {
"cmds": [
"sudo bash ./setup.sh",
"python -m venv --copies /opt/venv && . /opt/venv/bin/activate",
"pip install pip==23.3.1",
"pip install -r requirements.txt"
],
"aptPkgs": [
"ffmpeg",
"libcrypt1"
"ffmpeg"
]
},
"setup": {
"aptPkgs": [
"libcap-dev",
"libgl1"
],
"nixPkgs": [
"python310",
"gcc"
]
}
}
3 changes: 3 additions & 0 deletions requirements.txt
@@ -45,5 +45,8 @@ xlrd # for excel ingest
pdfminer # for image OCR
pytesseract # for image OCR

selenium-wire # for New Web Scraper
selenium

# No arize for now, huge build size with these additions.
# arize[AutoEmbeddings, LLM_Evaluation]
17 changes: 17 additions & 0 deletions setup.sh
@@ -0,0 +1,17 @@
# Install Google Chrome
apt-get update && apt-get install -y \
wget \
unzip \
libglib2.0-0 \
libnss3 \
libgconf-2-4 \
libfontconfig1 \
&& wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
&& dpkg -i google-chrome-stable_current_amd64.deb; apt-get -fy install

# Install ChromeDriver
wget https://chromedriver.storage.googleapis.com/2.41/chromedriver_linux64.zip \
&& unzip chromedriver_linux64.zip \
&& mv chromedriver /usr/bin/chromedriver \
&& chown root:root /usr/bin/chromedriver \
&& chmod +x /usr/bin/chromedriver
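One caveat worth flagging on setup.sh: it pins ChromeDriver 2.41 (which targets Chrome releases from roughly version 67–69) while installing the current stable Chrome, and ChromeDriver generally has to share a major version with the installed browser or Selenium fails with a "session not created" error. A hedged sanity check, using Selenium 4 capability keys that are assumptions on my part rather than anything in this PR:

from seleniumwire import webdriver

# sketch: confirm the installed Chrome and ChromeDriver major versions line up;
# if they are far apart, webdriver.Chrome() may refuse to start at all
driver = webdriver.Chrome()
caps = driver.capabilities
browser_major = caps["browserVersion"].split(".")[0]
driver_major = caps["chrome"]["chromedriverVersion"].split(".")[0]
print("Chrome:", caps["browserVersion"], "| ChromeDriver:", caps["chrome"]["chromedriverVersion"])
assert browser_major == driver_major, "Chrome and ChromeDriver major versions differ"
driver.quit()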