created API for downloading arxiv papers

UIUC-Chatbot · Nov 30, 2023 · 7fec78a · 7fec78a
1 parent 94acbc1
commit 7fec78a
Show file tree

Hide file tree

Showing 3 changed files with 45 additions and 0 deletions.
diff --git a/ai_ta_backend/data_import.py b/ai_ta_backend/data_import.py
@@ -0,0 +1,24 @@
+import os
+import arxiv
+import pandas as pd
+
+def get_arxiv_data(query: str) -> str:
+    """
+    Search arXiv for `query` (up to 10 results, sorted by submission date) and
+    save each result's PDF into ./arxiv_papers under the working directory.
+    Eventually needs to be ingested into database.
+
+    Returns "success" once all downloads finish; errors propagate to the caller.
+    """
+    search = arxiv.Search(query=query,
+                          max_results=10,
+                          sort_by=arxiv.SortCriterion.SubmittedDate)
+
+    directory = os.path.join(os.getcwd(), 'arxiv_papers')
+    # exist_ok avoids the check-then-create race between concurrent requests
+    os.makedirs(directory, exist_ok=True)
+
+    for result in arxiv.Client().results(search):
+        print("Downloading paper: ", result.title)
+        result.download_pdf(dirpath=directory)
+    return "success"
diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py
@@ -15,6 +15,7 @@
 from ai_ta_backend.vector_database import Ingest
 from ai_ta_backend.web_scrape import WebScrape, mit_course_download
 from ai_ta_backend.export_data import export_convo_history_csv
+from ai_ta_backend.data_import import get_arxiv_data
 
 app = Flask(__name__)
 CORS(app)
@@ -495,6 +496,25 @@ def export_convo_history():
   os.remove(export_status[0])
   return response
 
+@app.route('/get-arxiv-fulltext', methods=['GET'])
+def get_arxiv_fulltext():
+  """
+  Download arXiv papers matching the `search_query` URL parameter; respond
+  with the JSON status from get_arxiv_data, or 400 if the query is empty.
+  """
+  search_query: str = request.args.get('search_query', default='', type=str)
+  print("In /get-arxiv-fulltext: ", search_query)
+
+  if not search_query:
+    # proper web error "400 Bad request"; name only the parameter actually read
+    abort(400,
+          description="Missing required parameter: 'search_query' must be provided.")
+
+  status = get_arxiv_data(search_query)
+  response = jsonify(status)
+  response.headers.add('Access-Control-Allow-Origin', '*')
+  return response
+
 
 if __name__ == '__main__':
   app.run(debug=True, port=int(os.getenv("PORT", default=8000)))
diff --git a/requirements.txt b/requirements.txt
@@ -51,3 +51,4 @@ unstructured==0.10.29 # causes huge ~5.3 GB of installs. Probably from onnx: ht
 
 # Not currently supporting coursera ingest
 # cs-dlp @ git+https://github.com/raffaem/[email protected] # previously called coursera-dl
+arxiv==2.0.0
Original file line number	Diff line number	Diff line change
Expand Up		@@ -51,3 +51,4 @@ unstructured==0.10.29 # causes huge ~5.3 GB of installs. Probably from onnx: ht

		# Not currently supporting coursera ingest
		# cs-dlp @ git+https://github.com/raffaem/[email protected] # previously called coursera-dl
		arxiv==2.0.0