diff --git a/ai_ta_backend/crawlee_ext_scrape.py b/ai_ta_backend/crawlee_ext_scrape.py index 0528332e..8c118db8 100644 --- a/ai_ta_backend/crawlee_ext_scrape.py +++ b/ai_ta_backend/crawlee_ext_scrape.py @@ -4,7 +4,7 @@ SCRAPE_LOG = {} -def crawlee_scrape(course_name: str, urls: list): +def crawlee_scrape(course_name: str, urls: list, exclude_urls: list): """ This function takes in a pre-defined set of URLs and scrapes the content from each URL. """ @@ -15,12 +15,12 @@ def crawlee_scrape(course_name: str, urls: list): "url": "", "scrapeStrategy": "equal-and-below", "match": "", - "maxPagesToCrawl": 10000, + "maxPagesToCrawl": 15000, "maxTokens": 2000000, "maxConcurrency": 20, "maxRequestsPerMinute": 120, "courseName": course_name, - "exclude": "" + "exclude": exclude_urls } } @@ -31,8 +31,7 @@ def crawlee_scrape(course_name: str, urls: list): for url in urls: payload["params"]["url"] = url payload["params"]["match"] = "http?(s)://" + url.split("//")[1] + "/**" - - + print("Scraping URL:", url) start_time = time.monotonic() response = requests.post(api_endpoint, json=payload) diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py index 31576561..8027b294 100644 --- a/ai_ta_backend/main.py +++ b/ai_ta_backend/main.py @@ -32,6 +32,7 @@ from ai_ta_backend.web_scrape import WebScrape, mit_course_download from ai_ta_backend.crawlee_ext_scrape import crawlee_scrape + # Sentry.io error logging sentry_sdk.init( dsn=os.getenv("SENTRY_DSN"), @@ -692,7 +693,6 @@ def getTopContextsWithMQR() -> Response: response.headers.add('Access-Control-Allow-Origin', '*') return response - @app.route('/extension-scrape', methods=['POST']) def extension_scrape() -> Response: """ @@ -701,12 +701,12 @@ def extension_scrape() -> Response: data = request.get_json() urls: List[str] = data.get('urls', []) course_name: str = data.get('course_name', '') - + exclude_urls: List[str] = data.get('exclude_urls', []) if course_name == '' or urls == []: # proper web error "400 Bad request" abort(400, description=f"Missing required parameter: 'course_name' and 'urls' must be provided.") - result = crawlee_scrape(course_name, urls) + result = crawlee_scrape(course_name, urls, exclude_urls) response = jsonify(result) response.headers.add('Access-Control-Allow-Origin', '*') return response