Skip to content

Commit

Permalink
modified id-based and scopus download in elsevier
Browse files Browse the repository at this point in the history
  • Loading branch information
star-nox committed Feb 27, 2024
1 parent fe6eacf commit e09e20a
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 41 deletions.
98 changes: 63 additions & 35 deletions ai_ta_backend/journal_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
import supabase
import tarfile

import time

# Below functions hit API endpoints from sites like arXiv, Elsevier, and Springer Nature to retrieve journal articles
SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY')
ELSEVIER_API_KEY = os.environ.get('ELSEVIER_API_KEY')
Expand Down Expand Up @@ -145,7 +147,7 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None,
else:
return "No query parameters provided"

main_url = api_url + query_str + "&api_key=" + str(SPRINGER_API_KEY)
main_url = api_url + query_str + "&api_key=" + str(SPRINGER_API_KEY) + "&s=301"
print("Full URL: ", main_url)


Expand All @@ -165,6 +167,10 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None,
url = urls[0]['value'] + "?api_key=" + str(SPRINGER_API_KEY)
print("DX URL: ", url)
url_response = requests.get(url, headers=headers)
# check for headers here!

print("Headers: ", url_response.headers['content-type'])

dx_doi_data = url_response.json()
links = dx_doi_data['link']
pdf_link = None
Expand All @@ -191,6 +197,7 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None,
data = response.json()
# print("Total records: ", len(data['records']))

# last set of records after exiting while loop
for record in data['records']:
urls = record['url']
filename = record['doi'].replace("/", "_")
Expand Down Expand Up @@ -228,7 +235,7 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None,
uppload_path = "springer_papers/" + file
try:
with open(filepath, "rb") as f:
res = SUPABASE_CLIENT.storage.from_("publications/springer_journals/nature_reviews_immunology").upload(file=f, path=uppload_path, file_options={"content-type": "application/pdf"})
res = SUPABASE_CLIENT.storage.from_("publications/springer_journals/nature_immunology").upload(file=f, path=uppload_path, file_options={"content-type": "application/pdf"})
print("Upload response: ", res)
except Exception as e:
print("Error: ", e)
Expand All @@ -250,25 +257,39 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None,

##------------------------ ELSEVIER API FUNCTIONS ------------------------##

def downloadElsevierFulltextFromDoi(doi: str, course_name: str):
def downloadElsevierFulltextFromDoi(id: str, id_type: str, course_name: str):
"""
    This function downloads an article's full text from Elsevier for a given identifier.
    Supported id_type values: doi, eid, pii, pubmed_id.
"""


directory = os.path.join(os.getcwd(), 'elsevier_papers')
if not os.path.exists(directory):
os.makedirs(directory)

headers = {'X-ELS-APIKey': ELSEVIER_API_KEY, 'Accept':'application/pdf'}
url = 'https://api.elsevier.com/content/article/'

if id_type == "doi":
url += "doi/" + id
elif id_type == "eid":
url += "eid/" + id
elif id_type == "pii":
url += "pii/" + id
elif id_type == "pubmed_id":
url += "pubmed_id/" + id
else:
return "No query parameters provided"

url = 'https://api.elsevier.com/content/article/doi/' + doi
response = requests.get(url, headers=headers)
print("Status: ", response.status_code)
data = response.text
filename = doi.replace("/", "_")
filename = id.replace("/", "_")
with open(directory + "/" + filename + ".pdf", "wb") as f: # Open a file in binary write mode ("wb")
for chunk in response.iter_content(chunk_size=1024): # Download in chunks
f.write(chunk)

print("Downloaded: ", filename)
# # upload to s3
# s3_paths = upload_data_files_to_s3(course_name, directory)

Expand Down Expand Up @@ -314,9 +335,9 @@ def searchScienceDirectArticles(course: str, query: str, title: str, pub: str):

response = requests.put(url, headers=headers, json=data)
print("Status: ", response.status_code)
data = response.json()
results = data['results']
total_results = data['resultsFound']
response_data = response.json()
results = response_data['results']
total_results = response_data['resultsFound']
print("Total results: ", total_results)
current_results = len(results)

Expand All @@ -325,7 +346,7 @@ def searchScienceDirectArticles(course: str, query: str, title: str, pub: str):
doi = result['doi']
#pii = result['pii']
if doi:
downloadElsevierFulltextFromDoi(doi=doi, course_name=course)
downloadElsevierFulltextFromDoi(id=doi, id_type='doi', course_name=course)
# elif pii:
# # download with pii
# pass
Expand All @@ -336,8 +357,8 @@ def searchScienceDirectArticles(course: str, query: str, title: str, pub: str):
data["display"]["offset"] += current_results
response = requests.put(url, headers=headers, json=data)
print("Status: ", response.status_code)
data = response.json()
results = data['results']
response_data = response.json()
results = response_data['results']
current_results += len(results)
print("Current results: ", current_results)

Expand All @@ -346,7 +367,7 @@ def searchScienceDirectArticles(course: str, query: str, title: str, pub: str):
doi = result['doi']
#pii = result['pii']
if doi:
downloadElsevierFulltextFromDoi(doi=doi, course_name=course)
downloadElsevierFulltextFromDoi(id=doi, id_type='doi', course_name=course)
# elif pii:
# # download with pii
# pass
Expand All @@ -358,6 +379,9 @@ def searchScopusArticles(course: str, query: str, title: str, pub: str, subject:
This function uses the Scopus Search API to retrieve metadata for journal articles
and then downloads the fulltext using downloadElsevierFulltextFromDoi().
"""
# log start time
start_time = time.monotonic()

# uses GET request
base_url = "https://api.elsevier.com/content/search/scopus?"
query = "query="
Expand All @@ -373,34 +397,21 @@ def searchScopusArticles(course: str, query: str, title: str, pub: str, subject:
query += "SUBJAREA(" + subject + ")"

final_url = base_url + query + "OPENACCESS(1)" + "&apiKey=" + str(ELSEVIER_API_KEY)

print("Final URL: ", final_url)

encoded_url = urllib.parse.quote(final_url, safe=':/?&=')
print("Encoded URL: ", encoded_url)

response = requests.get(encoded_url)
print("Status: ", response.status_code)
data = response.json()

# iterate through results and extract full-text links
results = data['search-results']['entry']
for result in results:
for link in result['link']:
if link['@ref'] == 'full-text':
print("Full-text link: ", link['@href'])

# download full-text with link
headers = {'X-ELS-APIKey': ELSEVIER_API_KEY, 'Accept':'application/pdf'}
full_text_response = requests.get(link['@href'], headers=headers)
print("Status: ", full_text_response.status_code)
#print("Full-text response: ", full_text_response.text)

with open("scopus_paper.pdf", "wb") as f: # Open a file in binary write mode ("wb")
for chunk in response.iter_content(chunk_size=1024): # Download in chunks
f.write(chunk)

exit()
# results contain pii - so we can call downloadElsevierFulltextFromDoi() here
print("PII: ", result['pii'])
pii = result['pii']
download_status = downloadElsevierFulltextFromDoi(id=pii, id_type='pii', course_name=course)
print("Download status: ", download_status)


# response is JSON and has next page link
Expand All @@ -411,11 +422,28 @@ def searchScopusArticles(course: str, query: str, title: str, pub: str, subject:
next_page_url = link['@href']
break
print("Next page: ", next_page_url)
while next_page_url:
response = requests.get(next_page_url)
data = response.json()
results = data['search-results']['entry']
for result in results:
# results contain pii - so we can call downloadElsevierFulltextFromDoi() here
pii = result['pii']
download_status = downloadElsevierFulltextFromDoi(id=pii, id_type='pii', course_name=course)
print("Download status: ", download_status)

# response is JSON and has next page link
links = data['search-results']['link']
next_page_url = None
for link in links:
if link['@ref'] == 'next':
next_page_url = link['@href']
break
print("Next page: ", next_page_url)

# log end time
print(f"⏰ Runtime: {(time.monotonic() - start_time):.2f} seconds")





return "success"


Expand Down
11 changes: 6 additions & 5 deletions ai_ta_backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -746,20 +746,21 @@ def get_springer_data():

@app.route('/get-elsevier-fulltext', methods=['GET'])
def get_elsevier_data():
doi = request.args.get('doi', default='', type=str)
id = request.args.get('id', default='', type=str)
id_type = request.args.get('id_type', default='doi', type=str)
course_name = request.args.get('course_name', default='', type=str)

print("In /get-elsevier-fulltext")

if doi == '' or course_name == '':
if id == '' or id_type == '' or course_name == '':
# proper web error "400 Bad request"
abort(
400,
description=
f"Missing required parameters: 'doi' and 'course_name' must be provided."
f"Missing required parameters: 'id', 'id_type' [doi, eid, pii, pubmed_id] and 'course_name' must be provided."
)

fulltext = downloadElsevierFulltextFromDoi(doi, course_name)
fulltext = downloadElsevierFulltextFromDoi(id, id_type, course_name)

response = jsonify(fulltext)
response.headers.add('Access-Control-Allow-Origin', '*')
Expand Down Expand Up @@ -848,7 +849,7 @@ def getScopusArticle() -> Response:

print("In /getScopusArticles")

if (title == '' and journal == '' and search_query == '') or course_name == '':
if (title == '' and journal == '' and search_query == '' and issn == '') or course_name == '':
# proper web error "400 Bad request"
abort(
400,
Expand Down
2 changes: 1 addition & 1 deletion ai_ta_backend/vector_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
from ai_ta_backend.filtering_contexts import filter_top_contexts
from ai_ta_backend.nomic_logging import log_to_document_map, delete_from_document_map

MULTI_QUERY_PROMPT = hub.pull("langchain-ai/rag-fusion-query-generation")
#MULTI_QUERY_PROMPT = hub.pull("langchain-ai/rag-fusion-query-generation")
OPENAI_API_TYPE = "openai" # "openai" or "azure"


Expand Down
Binary file removed elsevier_papers/10.1016_0039-128x(94)90089-2.pdf
Binary file not shown.

0 comments on commit e09e20a

Please sign in to comment.