updated pubmed eutils function
star-nox committed Mar 8, 2024
1 parent 79001e4 commit 064d4d0
Showing 1 changed file with 29 additions and 31 deletions.
60 changes: 29 additions & 31 deletions ai_ta_backend/journal_ingest.py
@@ -631,7 +631,7 @@ def searchPubmedArticlesWithEutils(course: str, search: str, title: str, journal
This function is used for a text-based search in PubMed using the E-Utilities API.
Args:
course: course name
query: search query
search: search query
title: article title
journal: journal title
"""
@@ -640,7 +640,7 @@ def searchPubmedArticlesWithEutils(course: str, search: str, title: str, journal
os.makedirs(directory)

base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
database = "db=pmc"
database = "db=pubmed"
final_query = "term="

title_query = journal_query = search_query = ""
@@ -661,45 +661,43 @@ def searchPubmedArticlesWithEutils(course: str, search: str, title: str, journal
final_url = base_url + database + "&" + final_query + "&retmode=json&retmax=100"
print("Final URL: ", final_url)
response = requests.get(final_url)
data = response.json()

print(data)
if response.status_code != 200:
return "Error: " + str(response.status_code) + " - " + response.text

data = response.json()

total_records = int(data['esearchresult']['count'])
current_records = len(data['esearchresult']['idlist'])
id_list = data['esearchresult']['idlist']
current_records = 0

print("Total Records: ", total_records)
print("Current Records: ", current_records)
print("ID List: ", id_list)

while current_records < total_records:
# extract ID and convert them to PMC ID
id_list = data['esearchresult']['idlist']
print("Number of records in current page: ", len(id_list))
id_str = ",".join(id_list)
current_pmc_ids = pubmed_id_converter(id_str)
print("Number of PMC IDs: ", len(current_pmc_ids))

# call pubmed download here - parallel processing
with concurrent.futures.ProcessPoolExecutor() as executor:
results = [executor.submit(downloadPubmedArticles, id, course) for id in current_pmc_ids]

# update current records count
current_records += len(id_list)

# if next page exists, update next page url and call the API again
retstart = current_records
final_url = base_url + database + "&" + final_query + "&retmode=json&retmax=100&retstart=" + str(retstart)
print("Final URL: ", final_url)
response = requests.get(final_url)
next_page_url = base_url + database + "&" + final_query + "&retmode=json&retmax=100&retstart=" + str(retstart)
print("Next page URL: ", next_page_url)
response = requests.get(next_page_url)
if response.status_code != 200:
return "Error in next page: " + str(response.status_code) + " - " + response.text
data = response.json()

current_ids = data['esearchresult']['idlist']
id_list += current_ids
current_records += len(current_ids)
print("Current Records: ", current_records)

id_str = ",".join(id_list)
current_pmc_ids = pubmed_id_converter(id_str)

# call pubmed download here
for pmc_id in current_pmc_ids:
downloadPubmedArticles(id=pmc_id, course_name=course)

id_str = ",".join(id_list)
current_pmc_ids = pubmed_id_converter(id_str)
print("Current PMC IDs: ", current_pmc_ids)
# upload to supabase bucket

# call pubmed download here
for pmc_id in current_pmc_ids:
downloadPubmedArticles(id=pmc_id, course_name=course)

return "success"


@@ -714,7 +712,7 @@ def pubmed_id_converter(id: str):
url = base_url + app_details + "&ids=" + id

response = requests.get(url)
print("Response: ", response.text)
#print("Response: ", response.text)
root = ET.fromstring(response.text)
records = root.findall(".//record")
for record in records:
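The rewritten loop pages through ESearch results with retstart, accumulating IDs until the reported count is reached, and only then converts and downloads them. A minimal standalone sketch of that paging pattern, assuming the same esearch.fcgi endpoint and parameters that appear in the diff (the example query is a placeholder, not from the repository):

import requests

ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

def fetch_all_pubmed_ids(term, page_size=100):
    """Collect every PubMed ID matching a query, paging with retstart."""
    ids = []
    retstart = 0
    while True:
        params = {
            "db": "pubmed",
            "term": term,
            "retmode": "json",
            "retmax": page_size,
            "retstart": retstart,
        }
        resp = requests.get(ESEARCH_URL, params=params)
        resp.raise_for_status()  # same idea as the status-code checks the commit adds
        result = resp.json()["esearchresult"]
        ids.extend(result["idlist"])
        retstart += len(result["idlist"])
        if not result["idlist"] or retstart >= int(result["count"]):
            break
    return ids

# e.g. fetch_all_pubmed_ids("CRISPR[Title]") -> list of PubMed ID strings (hypothetical query)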

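pubmed_id_converter builds a request with an ids= parameter and parses <record> elements out of an XML reply, which matches the NCBI ID Converter service. A rough sketch of that conversion under that assumption; the tool and email values are placeholders, and the pmcid attribute is how the converter labels a mapped ID:

import requests
import xml.etree.ElementTree as ET

IDCONV_URL = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"

def pmids_to_pmcids(pmids):
    """Map PubMed IDs to PMC IDs; records without a PMC ID are skipped."""
    params = {
        "tool": "example-tool",       # placeholder app details
        "email": "user@example.org",  # placeholder contact address
        "ids": ",".join(pmids),
    }
    resp = requests.get(IDCONV_URL, params=params)
    resp.raise_for_status()
    root = ET.fromstring(resp.text)
    return [rec.get("pmcid") for rec in root.findall(".//record") if rec.get("pmcid")]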