-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f90606c
commit 4f5a55c
Showing
4 changed files
with
97 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# chrome is not available for ARM
FROM --platform=linux/amd64 python:3.12-slim

WORKDIR /srv

# Install Google Chrome from the official .deb package.
RUN apt-get update && apt-get install -y wget curl unzip
RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
RUN apt-get install -y ./google-chrome-stable_current_amd64.deb
# Print the installed Chrome version in the build log.
RUN google-chrome --version

# alternative way to install chrome and chromedriver
# RUN apt-get update && apt-get install -y curl gnupg unzip
# RUN curl -sS -o - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -
# RUN echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list
# RUN apt-get -y update && apt-get -y install google-chrome-stable

# https://googlechromelabs.github.io/chrome-for-testing/#stable
# NOTE(review): this chromedriver version is pinned while the Chrome package
# above is "stable_current"; confirm the two major versions still match.
RUN curl -o chromedriver_linux64.zip https://storage.googleapis.com/chrome-for-testing-public/128.0.6613.86/linux64/chromedriver-linux64.zip
# The archive unpacks into a chromedriver-linux64/ directory; the executable
# is the "chromedriver" file inside it, so operate on that file rather than
# on the directory itself.
RUN unzip chromedriver_linux64.zip
RUN chmod +x chromedriver-linux64/chromedriver
RUN mv -f chromedriver-linux64/chromedriver /usr/local/bin/chromedriver

# Install Python dependencies before copying the app so this layer is reused
# when only the application code changes.
COPY requirements.txt /srv/
RUN pip install -r requirements.txt --no-cache-dir

COPY . /srv

ENTRYPOINT ["streamlit", "run", "app.py", \
            "--server.port=80", \
            "--server.headless=true", \
            "--server.address=0.0.0.0", \
            "--browser.gatherUsageStats=false", \
            "--server.enableStaticServing=true", \
            "--server.fileWatcherType=none", \
            # hide the Streamlit menu
            "--client.toolbarMode=viewer"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Web scraping with Selenium | ||
|
||
A web scraping app using Streamlit and Selenium | ||
|
||
## Run locally | ||
|
||
```sh | ||
# build docker image | ||
docker build -t selenium-demo . | ||
|
||
# run | ||
docker run -p 8080:80 selenium-demo | ||
``` | ||
|
||
The app will be available at: http://localhost:8080 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import streamlit as st | ||
from bs4 import BeautifulSoup | ||
from selenium.webdriver.chrome.options import Options | ||
from selenium import webdriver | ||
from selenium.webdriver.common.by import By | ||
from selenium.webdriver.support.ui import WebDriverWait | ||
from selenium.webdriver.support import expected_conditions as EC | ||
|
||
|
||
def get_table():
    """Fetch the Mercury Prize table from Wikipedia and return it as HTML.

    Starts a headless Chrome session, loads the article, waits up to
    10 seconds for the first element matching the ``wikitable`` XPath to
    become visible, and replaces every link in it with its plain text.

    Returns:
        str: The table's outer HTML with all ``<a>`` tags replaced by
        their text content.

    Raises:
        selenium.common.exceptions.TimeoutException: If no matching table
            becomes visible within the wait period.
    """
    url = "https://en.wikipedia.org/wiki/Mercury_Prize"
    xpath = "//table[contains(@class, 'wikitable')]"

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # we need this since we'll run the container as root
    chrome_options.add_argument("--no-sandbox")
    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(url)

        # wait for the table to load
        element = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.XPATH, xpath))
        )

        # Copy the markup out while the session is still alive; the element
        # handle is unusable once the driver has quit.
        element_html = element.get_attribute("outerHTML")
    finally:
        # Always shut the browser down, even on timeout or navigation
        # errors; otherwise every call leaves a Chrome/chromedriver
        # process running.
        driver.quit()

    # remove links
    soup = BeautifulSoup(element_html, "html.parser")

    for a in soup.find_all("a"):
        a.replace_with(a.text)

    # return the cleaned html
    return str(soup)
|
||
|
||
# Page heading.
st.title("Mercury Prize Winners")

# Scrape only on demand: the browser session is started when the user
# presses the button, not on every page render.
load_requested = st.button("Load from Wikipedia")
if load_requested:
    st.html(get_table())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
streamlit | ||
selenium | ||
beautifulsoup4 |