added multi-threading for parallel webscrape
star-nox committed Feb 22, 2024
1 parent 10b618c commit b4b4b9c
Showing 1 changed file with 36 additions and 18 deletions: ai_ta_backend/crawlee_ext_scrape.py
@@ -1,6 +1,5 @@
import requests
import os
import json
import concurrent.futures
import time

SCRAPE_LOG = {}
@@ -10,15 +9,17 @@ def crawlee_scrape(course_name: str):
    This function takes in a pre-defined set of URLs and scrapes the content from each URL.
    """
    urls = [
        'https://extension.arizona.edu/',
        'https://www.uaex.uada.edu/',
        'https://ucanr.edu/Focus_Areas/'
    ]

    payload = {
        "params": {
            "url": "",
            "scrapeStrategy": "equal-and-below",
            "match": "",
-           "maxPagesToCrawl": 20000,
+           "maxPagesToCrawl": 5000,
            "maxTokens": 2000000,
            "courseName": course_name
        }
@@ -27,21 +28,38 @@ def crawlee_scrape(course_name: str):
    # create a POST request to the crawlee API
    api_endpoint = 'https://crawlee-production.up.railway.app/crawl'

-   # loop through the URLs and scrape the content
-   for url in urls:
-       payload["params"]["url"] = url
-       payload["params"]["match"] = "http?(s)://" + url.split("//")[1] + "/**"
-
-       print("Scraping URL:", url)
-       start_time = time.monotonic()
-       response = requests.post(api_endpoint, json=payload)
-
-       no_of_urls_scraped = response.json()
-       SCRAPE_LOG[url] = no_of_urls_scraped
-       print(f"⏰ Scraping runtime: {(time.monotonic() - start_time):.2f} seconds")
-       time.sleep(10)

+   with concurrent.futures.ThreadPoolExecutor() as executor:
+       futures = []
+       for url in urls:
+           future = executor.submit(scrape_url, url, payload, api_endpoint)
+           futures.append(future)
+
+       # Wait for all tasks to complete and gather results
+       for future in concurrent.futures.as_completed(futures):
+           result = future.result()
+           print(result)

    print(SCRAPE_LOG)

    return "Scraping complete."


+def scrape_url(url, payload, api_endpoint):
+   """
+   Scrapes a single URL and logs results.
+   """
+
+   payload["params"]["url"] = url
+   payload["params"]["match"] = "http?(s)://" + url.split("//")[1] + "/**"
+
+   print("Scraping URL:", url)
+   start_time = time.monotonic()
+   response = requests.post(api_endpoint, json=payload)
+
+   no_of_urls_scraped = response.json()
+   SCRAPE_LOG[url] = no_of_urls_scraped
+
+   print(f"⏰ Scraping runtime: {(time.monotonic() - start_time):.2f} seconds")
+
+   return f"Scraped {url} with {no_of_urls_scraped} URLs scraped."
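
A note on the new parallel path: all worker threads share the same payload dict, and scrape_url mutates payload["params"]["url"] and payload["params"]["match"] in place before posting, so two threads can overwrite each other's values between the assignment and the requests.post call. Below is a minimal thread-safe variant, shown only as a sketch; the copy.deepcopy call and the per-call params dict are assumptions, not part of this commit.

import copy

def scrape_url(url, payload, api_endpoint):
    """
    Scrapes a single URL using its own copy of the payload template,
    so parallel workers cannot clobber each other's "url"/"match" values.
    """
    params = copy.deepcopy(payload)  # per-thread copy of the shared template
    params["params"]["url"] = url
    params["params"]["match"] = "http?(s)://" + url.split("//")[1] + "/**"

    start_time = time.monotonic()
    response = requests.post(api_endpoint, json=params)

    no_of_urls_scraped = response.json()
    SCRAPE_LOG[url] = no_of_urls_scraped  # single dict-key assignment; atomic in CPython

    print(f"⏰ Scraping runtime: {(time.monotonic() - start_time):.2f} seconds")
    return f"Scraped {url} with {no_of_urls_scraped} URLs scraped."

The executor-based crawlee_scrape above would not need to change for this variant; only the body of scrape_url differs.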
