Skip to content

Commit

Permalink
minor changes
Browse files (browse the repository at this point in the history)
  • Loading branch information
star-nox committed Feb 29, 2024
1 parent 1ea180c commit 311b80d
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 8 deletions.
9 changes: 4 additions & 5 deletions ai_ta_backend/crawlee_ext_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

SCRAPE_LOG = {}

def crawlee_scrape(course_name: str, urls: list):
def crawlee_scrape(course_name: str, urls: list, exclude_urls: list):
"""
This function takes in a pre-defined set of URLs and scrapes the content from each URL.
"""
Expand All @@ -15,12 +15,12 @@ def crawlee_scrape(course_name: str, urls: list):
"url": "",
"scrapeStrategy": "equal-and-below",
"match": "",
"maxPagesToCrawl": 10000,
"maxPagesToCrawl": 15000,
"maxTokens": 2000000,
"maxConcurrency": 20,
"maxRequestsPerMinute": 120,
"courseName": course_name,
"exclude": ""
"exclude": exclude_urls
}
}

Expand All @@ -31,8 +31,7 @@ def crawlee_scrape(course_name: str, urls: list):
for url in urls:
payload["params"]["url"] = url
payload["params"]["match"] = "http?(s)://" + url.split("//")[1] + "/**"



print("Scraping URL:", url)
start_time = time.monotonic()
response = requests.post(api_endpoint, json=payload)
Expand Down
6 changes: 3 additions & 3 deletions ai_ta_backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from ai_ta_backend.web_scrape import WebScrape, mit_course_download
from ai_ta_backend.crawlee_ext_scrape import crawlee_scrape


# Sentry.io error logging
sentry_sdk.init(
dsn=os.getenv("SENTRY_DSN"),
Expand Down Expand Up @@ -692,7 +693,6 @@ def getTopContextsWithMQR() -> Response:
response.headers.add('Access-Control-Allow-Origin', '*')
return response


@app.route('/extension-scrape', methods=['POST'])
def extension_scrape() -> Response:
"""
Expand All @@ -701,12 +701,12 @@ def extension_scrape() -> Response:
data = request.get_json()
urls: List[str] = data.get('urls', [])
course_name: str = data.get('course_name', '')

exclude_urls: List[str] = data.get('exclude_urls', [])
if course_name == '' or urls == []:
# proper web error "400 Bad request"
abort(400, description=f"Missing required parameter: 'course_name' and 'urls' must be provided.")

result = crawlee_scrape(course_name, urls)
result = crawlee_scrape(course_name, urls, exclude_urls)
response = jsonify(result)
response.headers.add('Access-Control-Allow-Origin', '*')
return response
Expand Down

0 comments on commit 311b80d

Please sign in to comment.