Commit
added separate endpoint for scraping cropwizard docs
star-nox committed Feb 22, 2024
1 parent ef4c016 commit 10b618c
Showing 2 changed files with 65 additions and 0 deletions.
47 changes: 47 additions & 0 deletions ai_ta_backend/crawlee_ext_scrape.py
@@ -0,0 +1,47 @@
import time

import requests

SCRAPE_LOG = {}


def crawlee_scrape(course_name: str):
    """
    Scrapes content from a pre-defined set of URLs via the crawlee service.
    """
    urls = [
        'https://extension.arizona.edu/',
    ]

    payload = {
        "params": {
            "url": "",
            "scrapeStrategy": "equal-and-below",
            "match": "",
            "maxPagesToCrawl": 20000,
            "maxTokens": 2000000,
            "courseName": course_name
        }
    }

    # POST endpoint of the crawlee scraping service
    api_endpoint = 'https://crawlee-production.up.railway.app/crawl'

    # loop through the seed URLs and scrape each one
    for url in urls:
        payload["params"]["url"] = url
        # glob that restricts the crawl to pages under the seed URL's host
        payload["params"]["match"] = "http?(s)://" + url.split("//")[1] + "/**"

        print("Scraping URL:", url)
        start_time = time.monotonic()
        response = requests.post(api_endpoint, json=payload)

        # the service's JSON response is expected to report how many URLs were scraped
        no_of_urls_scraped = response.json()
        SCRAPE_LOG[url] = no_of_urls_scraped
        print(f"⏰ Scraping runtime: {(time.monotonic() - start_time):.2f} seconds")
        time.sleep(10)

    print(SCRAPE_LOG)

    return "Scraping complete."

18 changes: 18 additions & 0 deletions ai_ta_backend/main.py
@@ -30,6 +30,7 @@
from ai_ta_backend.nomic_logging import get_nomic_map, log_convo_to_nomic, create_document_map
from ai_ta_backend.vector_database import Ingest
from ai_ta_backend.web_scrape import WebScrape, mit_course_download
from ai_ta_backend.crawlee_ext_scrape import crawlee_scrape

# Sentry.io error logging
sentry_sdk.init(
@@ -696,6 +697,23 @@ def getTopContextsWithMQR() -> Response:
    return response


@app.route('/extension-scrape', methods=['GET'])
def extension_scrape() -> Response:
    """
    Scrapes a pre-defined set of extension websites.
    """
    course_name: str = request.args.get('course_name', default='', type=str)

    if course_name == '':
        # proper web error "400 Bad request"
        abort(400, description=f"Missing required parameter: 'course_name' must be provided. Course name: `{course_name}`")

    result = crawlee_scrape(course_name)
    response = jsonify(result)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response


@app.route('/resource-report', methods=['GET'])
def resource_report() -> Response:
"""
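
For reference, a minimal sketch of exercising the new endpoint (the base URL and course name are hypothetical placeholders, not taken from the commit):

import requests

# Hypothetical host/port; substitute the deployed ai_ta_backend address.
BASE_URL = 'http://localhost:5000'

resp = requests.get(f'{BASE_URL}/extension-scrape', params={'course_name': 'cropwizard'})
print(resp.status_code, resp.json())  # expects 200 and "Scraping complete."

A missing or empty course_name returns "400 Bad Request", per the abort() call in the handler.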
