Skip to content

Commit d856603

Browse files
committed
expose match url to API
1 parent 311b80d commit d856603

File tree

2 files changed: +9 -3 lines changed

2 files changed: +9 -3 lines changed

ai_ta_backend/crawlee_ext_scrape.py

+6 -2
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44

55
SCRAPE_LOG = {}
66

7-
def crawlee_scrape(course_name: str, urls: list, exclude_urls: list):
7+
def crawlee_scrape(course_name: str, urls: list, exclude_urls: list, match_url: str):
88
"""
99
This function takes in a pre-defined set of URLs and scrapes the content from each URL.
1010
"""
@@ -30,8 +30,12 @@ def crawlee_scrape(course_name: str, urls: list, exclude_urls: list):
3030
# loop through the URLs and scrape the content
3131
for url in urls:
3232
payload["params"]["url"] = url
33-
payload["params"]["match"] = "http?(s)://" + url.split("//")[1] + "/**"
3433

34+
if match_url:
35+
payload["params"]["match"] = match_url
36+
else:
37+
payload["params"]["match"] = "http?(s)://" + url.split("//")[1] + "/**"
38+
3539
print("Scraping URL:", url)
3640
start_time = time.monotonic()
3741
response = requests.post(api_endpoint, json=payload)

ai_ta_backend/main.py

+3 -1
Original file line number | Diff line number | Diff line change
@@ -702,11 +702,13 @@ def extension_scrape() -> Response:
702702
urls: List[str] = data.get('urls', [])
703703
course_name: str = data.get('course_name', '')
704704
exclude_urls: List[str] = data.get('exclude_urls', [])
705+
match_url: str = data.get('match_url', '')
706+
705707
if course_name == '' or urls == []:
706708
# proper web error "400 Bad request"
707709
abort(400, description=f"Missing required parameter: 'course_name' and 'urls' must be provided.")
708710

709-
result = crawlee_scrape(course_name, urls, exclude_urls)
711+
result = crawlee_scrape(course_name, urls, exclude_urls, match_url)
710712
response = jsonify(result)
711713
response.headers.add('Access-Control-Allow-Origin', '*')
712714
return response

0 commit comments

Comments (0)