adds web scraping example

ploomber · Aug 29, 2024 · 4f5a55c · 4f5a55c
1 parent f90606c
commit 4f5a55c
Show file tree

Hide file tree

Showing 4 changed files with 97 additions and 0 deletions.
diff --git a/examples/streamlit/web-scraping-selenium/Dockerfile b/examples/streamlit/web-scraping-selenium/Dockerfile
@@ -0,0 +1,36 @@
+# chrome is not available for ARM, so force an amd64 base image
+FROM --platform=linux/amd64 python:3.12-slim
+
+WORKDIR /srv
+
+# wget downloads the Chrome package below; curl and unzip are used for chromedriver
+RUN apt-get update && apt-get install -y wget curl unzip
+# install the current stable Chrome from Google's .deb package
+RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
+RUN apt-get install -y ./google-chrome-stable_current_amd64.deb
+# print the installed version in the build log; the build fails here if Chrome cannot run
+RUN google-chrome --version
+
+# alternative way to install chrome and chromedriver
+# RUN apt-get update && apt-get install -y curl gnupg unzip
+# RUN curl -sS -o - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -
+# RUN echo "deb [arch=amd64]  http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list
+# RUN apt-get -y update && apt-get -y install google-chrome-stable
+
+# https://googlechromelabs.github.io/chrome-for-testing/#stable
+# NOTE: the chromedriver version is pinned while Chrome is "stable_current"; keep the two in sync
+RUN curl -o chromedriver_linux64.zip https://storage.googleapis.com/chrome-for-testing-public/128.0.6613.86/linux64/chromedriver-linux64.zip
+# the archive extracts to a chromedriver-linux64/ directory that contains the binary
+RUN unzip chromedriver_linux64.zip
+RUN chmod +x chromedriver-linux64/chromedriver
+# move the binary itself (not the directory) so that `chromedriver` is on PATH
+RUN mv -f chromedriver-linux64/chromedriver /usr/local/bin/chromedriver
+
+# copy only the requirements first so the dependency layer is cached
+# independently of changes to the application source
+COPY requirements.txt /srv/
+RUN pip install -r requirements.txt --no-cache-dir
+
+COPY . /srv
+
+# serve the Streamlit app on port 80, listening on all interfaces
+ENTRYPOINT ["streamlit", "run", "app.py", \
+            "--server.port=80", \
+            "--server.headless=true", \
+            "--server.address=0.0.0.0", \
+            "--browser.gatherUsageStats=false", \
+            "--server.enableStaticServing=true", \
+            "--server.fileWatcherType=none", \
+            # hide the Streamlit menu
+            "--client.toolbarMode=viewer"]
diff --git a/examples/streamlit/web-scraping-selenium/README.md b/examples/streamlit/web-scraping-selenium/README.md
@@ -0,0 +1,15 @@
+# Web scraping with Selenium
+
+A web scraping app using Streamlit and Selenium
+
+## Run locally
+
+```sh
+# build docker image
+docker build -t selenium-demo .
+
+# run
+docker run -p 8080:80 selenium-demo
+```
+
+The app will be available at http://localhost:8080
diff --git a/examples/streamlit/web-scraping-selenium/app.py b/examples/streamlit/web-scraping-selenium/app.py
@@ -0,0 +1,43 @@
+import streamlit as st
+from bs4 import BeautifulSoup
+from selenium.webdriver.chrome.options import Options
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+
+
+def get_table():
+    url = "https://en.wikipedia.org/wiki/Mercury_Prize"
+    xpath = "//table[contains(@class, 'wikitable')]"
+
+    chrome_options = Options()
+    chrome_options.add_argument("--headless")
+    # we need this since we'll run the container as root
+    chrome_options.add_argument("--no-sandbox")
+    driver = webdriver.Chrome(options=chrome_options)
+    driver.get(url)
+
+    # wait for the table to load
+    element = WebDriverWait(driver, 10).until(
+        EC.visibility_of_element_located((By.XPATH, xpath))
+    )
+
+    element_html = element.get_attribute("outerHTML")
+
+    # remove links
+    soup = BeautifulSoup(element_html, "html.parser")
+
+    for a in soup.find_all("a"):
+        a.replace_with(a.text)
+
+    # return the cleaned html
+    return str(soup)
+
+
+st.title("Mercury Prize Winners")
+
+
+if st.button("Load from Wikipedia"):
+    content = get_table()
+    st.html(content)
diff --git a/examples/streamlit/web-scraping-selenium/requirements.txt b/examples/streamlit/web-scraping-selenium/requirements.txt
@@ -0,0 +1,3 @@
+streamlit
+selenium
+beautifulsoup4