diff --git a/.github/scripts/validate_listings.py b/.github/scripts/validate_listings.py new file mode 100644 index 00000000000..9e284f45a8c --- /dev/null +++ b/.github/scripts/validate_listings.py @@ -0,0 +1,75 @@ +import json +import os +import re +import time +import requests +from bs4 import BeautifulSoup +from selenium import webdriver + +# Set up headless Selenium driver +options = webdriver.ChromeOptions() +options.add_argument("--headless=new") +driver = webdriver.Chrome(options=options) + +DEAD_REGEX = [ + r"no longer accepting", + r"position .* filled", + r"doesn'?t exist", + r"job .* closed", + r"no longer available" +] + +def shortenTitle(title: str): + return title.split("–")[0].split(" - ")[0] + +def is_listing_active(listing): + link = listing["url"] + try: + response = requests.get(link, timeout=10) + + # If page returns 404 or redirects immediately + if response.status_code == 404 or "job_closed" in response.url: + return False + + driver.get(link) + time.sleep(3) + + page = driver.page_source + soup = BeautifulSoup(page, 'html.parser') + page_text = soup.get_text().strip().lower() + + if any(re.search(pattern, page_text) for pattern in DEAD_REGEX): + return False + + return True + except requests.RequestException: + return False + +def update_listings_activity(): + """Reads listings.json and marks closed jobs as inactive.""" + json_file = "listings.json" + + with open(json_file, "r", encoding="utf-8") as f: + listings = json.load(f) + + # closed_listings = [] + updated = False + for listing in listings: + listing["title"] = shortenTitle(listing["title"]) + if listing["source"] == "Simplify" or listing["active"] == False: + continue + + if not is_listing_active(listing): + listing["active"] = False + print("Found inactive listing: " + listing["company_name"] + " | " + listing["title"]) + # closed_listings.append(listing) + updated = True + # updates listings.json directly + if updated: + with open(json_file, "w", encoding="utf-8") as f: + json.dump(listings, f, indent=4) + + +if __name__ == "__main__": + update_listings_activity() + driver.quit() \ No newline at end of file diff --git a/.github/workflows/validate_listings.yml b/.github/workflows/validate_listings.yml new file mode 100644 index 00000000000..5b55c3d9983 --- /dev/null +++ b/.github/workflows/validate_listings.yml @@ -0,0 +1,36 @@ +name: Validate Internship Listings + +on: + schedule: + - cron: "0 2 * * *" # Runs daily at 2 AM UTC + workflow_dispatch: # Allows manual triggering + +jobs: + validate-listings: + runs-on: ubuntu-latest + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.9" + + - name: Install Dependencies + run: pip install requests beautifulsoup4 selenium + + - name: Prepare Selenium + uses: nanasess/setup-chromedriver@v1.0.5 + + - name: Run Listing Validation + run: python .github/scripts/validate_listings.py + + - name: Commit Changes if Listings Were Updated + run: | + git config --local user.name "github-actions[bot]" + git config --local user.email "github-actions@github.com" + git add .github/scripts/listings.json + git diff --quiet && git diff --staged --quiet || git commit -m "Marked closed internships as inactive" + git push