added multi-threading for parallel webscrape
star-nox committed Feb 22, 2024
1 parent 10b618c commit b4b4b9c
Showing 1 changed file with 36 additions and 18 deletions: ai_ta_backend/crawlee_ext_scrape.py
@@ -1,6 +1,5 @@
import requests
import os
import json
import concurrent.futures
import time

SCRAPE_LOG = {}
@@ -10,15 +9,17 @@ def crawlee_scrape(course_name: str):
    This function takes in a pre-defined set of URLs and scrapes the content from each URL.
    """
    urls = [
        'https://extension.arizona.edu/',
        'https://www.uaex.uada.edu/',
        'https://ucanr.edu/Focus_Areas/'
    ]

    payload = {
        "params": {
            "url": "",
            "scrapeStrategy": "equal-and-below",
            "match": "",
-           "maxPagesToCrawl": 20000,
+           "maxPagesToCrawl": 5000,
            "maxTokens": 2000000,
            "courseName": course_name
        }
@@ -27,21 +28,38 @@ def crawlee_scrape(course_name: str):
    # create a POST request to the crawlee API
    api_endpoint = 'https://crawlee-production.up.railway.app/crawl'

-   # loop through the URLs and scrape the content
-   for url in urls:
-       payload["params"]["url"] = url
-       payload["params"]["match"] = "http?(s)://" + url.split("//")[1] + "/**"
-
-       print("Scraping URL:", url)
-       start_time = time.monotonic()
-       response = requests.post(api_endpoint, json=payload)
-
-       no_of_urls_scraped = response.json()
-       SCRAPE_LOG[url] = no_of_urls_scraped
-       print(f"⏰ Scraping runtime: {(time.monotonic() - start_time):.2f} seconds")
-       time.sleep(10)

+   with concurrent.futures.ThreadPoolExecutor() as executor:
+       futures = []
+       for url in urls:
+           future = executor.submit(scrape_url, url, payload, api_endpoint)
+           futures.append(future)
+
+       # Wait for all tasks to complete and gather results
+       for future in concurrent.futures.as_completed(futures):
+           result = future.result()
+           print(result)

    print(SCRAPE_LOG)

    return "Scraping complete."


+def scrape_url(url, payload, api_endpoint):
+   """
+   Scrapes a single URL and logs results.
+   """
+
+   payload["params"]["url"] = url
+   payload["params"]["match"] = "http?(s)://" + url.split("//")[1] + "/**"
+
+   print("Scraping URL:", url)
+   start_time = time.monotonic()
+   response = requests.post(api_endpoint, json=payload)
+
+   no_of_urls_scraped = response.json()
+   SCRAPE_LOG[url] = no_of_urls_scraped
+
+   print(f"⏰ Scraping runtime: {(time.monotonic() - start_time):.2f} seconds")
+
+   return f"Scraped {url} with {no_of_urls_scraped} URLs scraped."
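
A note on the new parallel path: all worker threads share the same payload dict, and scrape_url mutates payload["params"]["url"] and payload["params"]["match"] in place before posting, so two threads can overwrite each other's values between the assignment and the requests.post call. Below is a minimal thread-safe variant, shown only as a sketch; the copy.deepcopy call and the per-call params dict are assumptions, not part of this commit.

import copy

def scrape_url(url, payload, api_endpoint):
    """
    Scrapes a single URL using its own copy of the payload template,
    so parallel workers cannot clobber each other's "url"/"match" values.
    """
    params = copy.deepcopy(payload)  # per-thread copy of the shared template
    params["params"]["url"] = url
    params["params"]["match"] = "http?(s)://" + url.split("//")[1] + "/**"

    start_time = time.monotonic()
    response = requests.post(api_endpoint, json=params)

    no_of_urls_scraped = response.json()
    SCRAPE_LOG[url] = no_of_urls_scraped  # single dict-key assignment; atomic in CPython

    print(f"⏰ Scraping runtime: {(time.monotonic() - start_time):.2f} seconds")
    return f"Scraped {url} with {no_of_urls_scraped} URLs scraped."

The executor-based crawlee_scrape above would not need to change for this variant; only the body of scrape_url differs.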
