Expose match_url in the extension_scrape API and pass it through to crawlee_scrape

star-nox · star-nox · commit d856603be4c8 · 2024-03-01T11:14:17.000-06:00
diff --git a/ai_ta_backend/crawlee_ext_scrape.py b/ai_ta_backend/crawlee_ext_scrape.py
@@ -4,7 +4,7 @@
 
 SCRAPE_LOG = {}
 
-def crawlee_scrape(course_name: str, urls: list, exclude_urls: list):
+def crawlee_scrape(course_name: str, urls: list, exclude_urls: list, match_url: str):
     """
     This function takes in a pre-defined set of URLs and scrapes the content from each URL.
     """
@@ -30,8 +30,12 @@ def crawlee_scrape(course_name: str, urls: list, exclude_urls: list):
     # loop through the URLs and scrape the content
     for url in urls:
         payload["params"]["url"] = url
-        payload["params"]["match"] = "http?(s)://" + url.split("//")[1] + "/**"
 
+        if match_url:
+            payload["params"]["match"] = match_url
+        else:
+            payload["params"]["match"] = "http?(s)://" + url.split("//")[1] + "/**"
+        
         print("Scraping URL:", url)
         start_time = time.monotonic()
         response = requests.post(api_endpoint, json=payload)
diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py
@@ -702,11 +702,13 @@ def extension_scrape() -> Response:
   urls: List[str] = data.get('urls', [])
   course_name: str = data.get('course_name', '')
   exclude_urls: List[str] = data.get('exclude_urls', [])
+  match_url: str = data.get('match_url', '')
+
   if course_name == '' or urls == []:
     # proper web error "400 Bad request"
     abort(400, description=f"Missing required parameter: 'course_name' and 'urls' must be provided.")
   
-  result = crawlee_scrape(course_name, urls, exclude_urls)
+  result = crawlee_scrape(course_name, urls, exclude_urls, match_url)
   response = jsonify(result)
   response.headers.add('Access-Control-Allow-Origin', '*')
   return response