From c249676a5282dd1a8188ac13f0873b556528a1d4 Mon Sep 17 00:00:00 2001
From: star-nox
Date: Tue, 10 Dec 2024 14:49:37 -0600
Subject: [PATCH 1/3] added endpoint for re-scrape

---
 ai_ta_backend/main.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py
index fe4586cd..c2941728 100644
--- a/ai_ta_backend/main.py
+++ b/ai_ta_backend/main.py
@@ -603,6 +603,19 @@ def get_project_stats(service: RetrievalService) -> Response:
   response.headers.add('Access-Control-Allow-Origin', '*')
   return response
 
+@app.route('/updateProjectDocuments', methods=['GET'])
+def updateProjectDocuments() -> Response:
+  project_name = request.args.get('project_name', default='', type=str)
+
+  if project_name == '':
+    abort(400, description="Missing required parameter: 'project_name' must be provided.")
+
+  result = webscrape_documents(project_name)
+
+  response = jsonify(result)
+  response.headers.add('Access-Control-Allow-Origin', '*')
+  return response
+
 
 def configure(binder: Binder) -> None:
   binder.bind(ThreadPoolExecutorInterface, to=ThreadPoolExecutorAdapter(max_workers=10), scope=SingletonScope)

From 73778859eff81bb19d1c80ac3b70eddb21d610bf Mon Sep 17 00:00:00 2001
From: star-nox
Date: Tue, 10 Dec 2024 15:40:31 -0600
Subject: [PATCH 2/3] added function for re-scraping

---
 ai_ta_backend/main.py                 |  1 +
 ai_ta_backend/utils/update_project.py | 55 +++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)
 create mode 100644 ai_ta_backend/utils/update_project.py

diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py
index c2941728..98519ea6 100644
--- a/ai_ta_backend/main.py
+++ b/ai_ta_backend/main.py
@@ -43,6 +43,7 @@
 from ai_ta_backend.service.workflow_service import WorkflowService
 
 from ai_ta_backend.utils.pubmed_extraction import extractPubmedData
+from ai_ta_backend.utils.update_project import webscrape_documents
 
 app = Flask(__name__)
 
diff --git a/ai_ta_backend/utils/update_project.py b/ai_ta_backend/utils/update_project.py
new file mode 100644
index 00000000..56987eb1
--- /dev/null
+++ b/ai_ta_backend/utils/update_project.py
@@ -0,0 +1,55 @@
+import os
+from supabase import create_client
+from dotenv import load_dotenv
+from urllib.parse import urlparse
+import requests
+
+load_dotenv()
+
+
+
+def webscrape_documents(project_name: str):
+  print(f"Scraping documents for project: {project_name}")
+
+  # create Supabase client
+  supabase_url = os.getenv("SUPABASE_URL")
+  supabase_key = os.getenv("SUPABASE_API_KEY")
+  supabase_client = create_client(supabase_url, supabase_key)
+
+  # use RPC to get unique base_urls
+  # response = supabase_client.rpc("get_distinct_base_urls", {"p_course_name": project_name}).execute()  # this only returns base_urls
+  response = supabase_client.rpc("get_base_url_with_doc_groups", {"p_course_name": project_name}).execute()  # this returns base_urls with document groups
+  print(f"Base urls: {response}")
+  base_urls = response.data
+  print(f"Total base_urls: {len(base_urls)}")
+
+  #webcrawl_url = "https://crawlee.kastan.ai/crawl"
+  #webcrawl_url = "https://crawlee-production.up.railway.app/crawl"
+  webcrawl_url = "https://crawlee-pr-7.up.railway.app/crawl"
+  payload = {
+      "params": {
+          "url": "",
+          "scrapeStrategy": "equal-and-below",
+          "match": "",
+          "maxPagesToCrawl": 5,
+          "maxTokens": 2000000,
+          "courseName": project_name
+      }
+  }
+
+  for base_url in base_urls:
+    print(f"Base URL: {base_url}")
+    print(f"Document Group: {base_urls[base_url]}")
+    payload["params"]["url"] = base_url
+    domain = urlparse(base_url).netloc
print(f"Domains: {domain}") + payload["params"]["match"] = "http?(s)://" + domain + "/**" + payload["params"]["documentGroups"] = [base_urls[base_url]] + print("Payload: ", payload) + + # response = requests.post(webcrawl_url, json=payload) + # print("Response from crawl: ", response.json()) + + + + return "Webscrape done." \ No newline at end of file From 719b58591af845b66cef02491e2973efa9cc7508 Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 11 Dec 2024 10:36:43 -0600 Subject: [PATCH 3/3] changed the webscrape URL to prod --- ai_ta_backend/utils/update_project.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ai_ta_backend/utils/update_project.py b/ai_ta_backend/utils/update_project.py index 56987eb1..b41932ae 100644 --- a/ai_ta_backend/utils/update_project.py +++ b/ai_ta_backend/utils/update_project.py @@ -24,8 +24,8 @@ def webscrape_documents(project_name: str): print(f"Total base_urls: {len(base_urls)}") #webcrawl_url = "https://crawlee.kastan.ai/crawl" - #webcrawl_url = "https://crawlee-production.up.railway.app/crawl" - webcrawl_url = "https://crawlee-pr-7.up.railway.app/crawl" + webcrawl_url = "https://crawlee-production.up.railway.app/crawl" + payload = { "params": { "url": "",