Skip to content

Commit

Permalink
created API for downloading arxiv papers
Browse files Browse the repository at this point in the history
  • Loading branch information
star-nox committed Nov 30, 2023
1 parent 94acbc1 commit 7fec78a
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 0 deletions.
24 changes: 24 additions & 0 deletions ai_ta_backend/data_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import os
import arxiv
import pandas as pd

def get_arxiv_data(query: str, max_results: int = 10) -> str:
    """
    Search arXiv and download the PDF of every matching paper to local disk.

    PDFs are written to an ``arxiv_papers`` directory under the current
    working directory; the directory is created when it does not exist yet.
    Eventually the downloaded papers need to be ingested into the database.

    Args:
        query: Search string, forwarded unchanged as the ``query`` argument of
            ``arxiv.Search``. No separate lookup by paper ID is done here.
        max_results: Upper bound on the number of papers requested from
            arXiv. Defaults to 10, the value that used to be hard-coded.

    Returns:
        The literal string ``"success"`` once the loop over results finishes.

    Note:
        Nothing is caught here: any error raised by the arxiv client or while
        downloading a PDF propagates to the caller.
    """
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )

    directory = os.path.join(os.getcwd(), 'arxiv_papers')
    # exist_ok avoids the check-then-create race when two requests hit the
    # endpoint at the same time and both find the directory missing.
    os.makedirs(directory, exist_ok=True)

    for result in arxiv.Client().results(search):
        print("Downloading paper: ", result.title)
        result.download_pdf(dirpath=directory)

    return "success"
20 changes: 20 additions & 0 deletions ai_ta_backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from ai_ta_backend.vector_database import Ingest
from ai_ta_backend.web_scrape import WebScrape, mit_course_download
from ai_ta_backend.export_data import export_convo_history_csv
from ai_ta_backend.data_import import get_arxiv_data

# Module-level Flask application that the @app.route handlers in this file
# are registered on. CORS(app) wraps it for cross-origin requests
# (presumably flask_cors.CORS -- the import is outside this excerpt; confirm).
app = Flask(__name__)
CORS(app)
Expand Down Expand Up @@ -495,6 +496,25 @@ def export_convo_history():
os.remove(export_status[0])
return response

@app.route('/get-arxiv-fulltext', methods=['GET'])
def get_arxiv_fulltext():
    """
    Download the arXiv papers matching a search query onto the server.

    Query parameters:
        search_query (str, required): passed unchanged to ``get_arxiv_data``.

    Returns:
        A JSON response wrapping whatever ``get_arxiv_data`` returns (currently
        a status string, not the text of the papers), with a permissive
        ``Access-Control-Allow-Origin`` header added.

    Aborts with HTTP 400 when ``search_query`` is missing or empty.
    """
    search_query: str = request.args.get('search_query', default='', type=str)
    print("In /get-arxiv-fulltext: ", search_query)

    if search_query == '':
        # proper web error "400 Bad request"
        # Only 'search_query' is read by this endpoint, so the message names
        # that parameter alone instead of advertising an unsupported one.
        abort(
            400,
            description="Missing required parameter: 'search_query' must be provided.",
        )

    # get_arxiv_data downloads the PDFs as a side effect and returns a status string.
    status = get_arxiv_data(search_query)

    response = jsonify(status)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response


if __name__ == '__main__':
    # Script entry point: serve the app with Flask's debug mode switched on.
    # The port is read from the PORT environment variable, falling back to 8000.
    listen_port = int(os.getenv("PORT", default=8000))
    app.run(debug=True, port=listen_port)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,4 @@ unstructured==0.10.29 # causes huge ~5.3 GB of installs. Probably from onnx: ht

# Not currently supporting coursera ingest
# cs-dlp @ git+https://github.com/raffaem/[email protected] # previously called coursera-dl
arxiv==2.0.0

0 comments on commit 7fec78a

Please sign in to comment.