Added Elsevier re-route in PubMed
star-nox committed Mar 15, 2024
1 parent c439b61 commit 15b5822
Showing 3 changed files with 77 additions and 21 deletions.
ai_ta_backend/journal_ingest.py: 88 changes (72 additions, 16 deletions)
@@ -9,8 +9,8 @@
from urllib.parse import urlparse
import urllib.parse

from ai_ta_backend.aws import upload_data_files_to_s3
from ai_ta_backend.vector_database import Ingest
# from ai_ta_backend.aws import upload_data_files_to_s3
# from ai_ta_backend.vector_database import Ingest

import supabase
import tarfile
@@ -408,7 +408,7 @@ def searchScopusArticles(course: str, search_str: str, title: str, pub: str, sub


# after all records are downloaded, upload to supabase bucket
supabase_bucket_path = "publications/elsevier_journals/journal_of_allergy_and_clinical_immunology"
supabase_bucket_path = "publications/elsevier_journals/mbio"
try:
for root, directories, files in os.walk(directory):
for file in files:
@@ -575,7 +575,7 @@ def downloadPubmedArticles(id, course_name, **kwargs):
if format != None and format in ['tgz', 'pdf']:
main_url += "&format=" + format

print("Full URL: ", main_url)
#print("Full URL: ", main_url)

xml_response = requests.get(main_url)
root = ET.fromstring(xml_response.text)
@@ -584,22 +584,22 @@
while resumption is not None: # download current articles and query
# parse xml response and extract pdf links and other metadata
records = extract_record_data(xml_response.text)
print("Total records: ", len(records))
#print("Total records: ", len(records))
if len(records) > 0:
# download articles
download_status = downloadFromFTP(records, directory, ftp_address="ftp.ncbi.nlm.nih.gov")

# query for next set of articles
resumption_url = resumption.find(".//link").get("href")
print("Resumption URL: ", resumption_url)
#print("Resumption URL: ", resumption_url)

xml_response = requests.get(resumption_url)
root = ET.fromstring(xml_response.text)
resumption = root.find(".//resumption")

# download current articles if resumption is None
records = extract_record_data(xml_response.text)
print("Current total records: ", len(records))
#print("Current total records: ", len(records))
if len(records) > 0:
# download articles
download_status = downloadFromFTP(records, directory, ftp_address="ftp.ncbi.nlm.nih.gov")
@@ -638,6 +638,8 @@ def searchPubmedArticlesWithEutils(course: str, search: str, title: str, journal
title: article title
journal: journal title
"""
start_time = time.monotonic()

directory = os.path.join(os.getcwd(), 'pubmed_papers')
if not os.path.exists(directory):
os.makedirs(directory)
@@ -661,7 +663,7 @@
search_query = search.replace(" ", "+")
final_query += search_query

final_url = base_url + database + "&" + final_query + "&retmode=json&retmax=100"
final_url = base_url + database + "&" + final_query + "&retmode=json&retmax=100&retstart=3000"
print("Final URL: ", final_url)
response = requests.get(final_url)

@@ -675,6 +677,7 @@
current_records = 0

print("Total Records: ", total_records)
pmid_list = []

while current_records < total_records:
# extract ID and convert them to PMC ID
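The PMID-to-PMCID conversion referenced by the comment above happens in code this hunk does not show. One way such a lookup can be done is via NCBI's ID Converter API; the following is only a sketch under that assumption, not necessarily what the repository actually uses:

import requests

def pmids_to_pmcids_sketch(id_list):
    # Hypothetical helper: map PMIDs to PMCIDs with NCBI's ID Converter API.
    # IDs that come back without a "pmcid" field have no PMC full text,
    # which is the set this commit re-routes to Elsevier.
    url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
    params = {"ids": ",".join(id_list), "format": "json"}
    records = requests.get(url, params=params).json().get("records", [])
    return {r["pmid"]: r.get("pmcid") for r in records if "pmid" in r}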
@@ -685,8 +688,11 @@
print("Number of PMC IDs: ", len(current_pmc_ids))

# extract the PMIDs which do not have a PMCID
pmid_list = list(set(id_list) - set(current_pmc_ids))
print("PMIDs without PMC IDs: ", pmid_list)
diff_ids = list(set(id_list) - set(current_pmc_ids))
print("Current PM IDs not present as PMC Ids: ", len(diff_ids))
print("Diff IDs: ", diff_ids)
pmid_list += diff_ids


# call pubmed download here - parallel processing
with concurrent.futures.ProcessPoolExecutor() as executor:
@@ -705,9 +711,39 @@
data = response.json()

# check if IDs from pmid_list are present in elsevier
print("PMIDs without PMC IDs: ", len(pmid_list))
if len(pmid_list) > 0:
print("Downloading from Elsevier...")
with concurrent.futures.ProcessPoolExecutor() as executor:
batch_size = 10
batches = [pmid_list[i:i+batch_size] for i in range(0, len(pmid_list), batch_size)]
results = []
for batch in batches:
batch_results = [executor.submit(downloadElsevierFulltextFromId, pmid, 'pubmed_id', course) for pmid in batch]
results.extend(batch_results)

print(f"⏰ Total Download Runtime: {(time.monotonic() - start_time):.2f} seconds")


# upload to supabase bucket
count = 0
try:
for root, directories, files in os.walk(directory):
for file in files:
filepath = os.path.join(root, file)
upload_path = file
try:
with open(filepath, "rb") as f:
res = SUPABASE_CLIENT.storage.from_("publications/pubmed_journals/virus_evolution").upload(file=f, path=upload_path, file_options={"content-type": "application/pdf"})
count += 1
except Exception as e:
print("Error: ", e)
print("Uploaded: ", count)

except Exception as e:
print("Error: ", e)

# log end time
print(f"⏰ Total Runtime with Supabase upload: {(time.monotonic() - start_time):.2f} seconds")

return "success"

@@ -746,11 +782,21 @@ def extract_record_data(xml_string):
root = ET.fromstring(xml_string)
records = root.findall(".//record")
extracted_data = []
href = None

for record in records:
record_id = record.get("id")
license = record.get("license")
href = record.find(".//link").get("href")
links = record.findall(".//link")
for link in links:
# check for PDF links first
if link.get("format") == "pdf":
href = link.get("href")
break
# if PDF link not found, use the available tgz link
if not href:
href = links[0].get("href")

extracted_data.append({
"record_id": record_id,
"license": license,
@@ -777,21 +823,31 @@ def downloadFromFTP(paths, local_dir, ftp_address):
for path in paths:
ftp_url = urlparse(path['href'])
ftp_path = ftp_url.path[1:]
print("Downloading from FTP path: ", ftp_path)
#print("Downloading from FTP path: ", ftp_path)

filename = ftp_path.split('/')[-1]
local_file = os.path.join(local_dir, filename)
with open(local_file, 'wb') as f:
ftp.retrbinary("RETR " + ftp_path, f.write)
print("Downloaded: ", filename)
#print("Downloaded: ", filename)

# if filename ends in tar.gz, extract the pdf and delete the tar.gz
if filename.endswith(".tar.gz"):
extracted_pdf = extract_pdf(local_file)
#print("Extracted PDF: ", extracted_pdf)

filename = os.path.basename(filename)
new_pdf_name = filename.replace('.tar.gz', '.pdf')

new_pdf_path = os.path.join(local_dir, new_pdf_name)
old_pdf_path = os.path.join(local_dir, extracted_pdf)
os.rename(old_pdf_path, new_pdf_path)

# delete the tar.gz file
os.remove(local_file)
os.remove(old_pdf_path)

ftp.quit()


return "success"


ai_ta_backend/main.py: 10 changes (5 additions, 5 deletions)
@@ -42,12 +42,12 @@
import ray
import sentry_sdk

from ai_ta_backend.canvas import CanvasAPI
# from ai_ta_backend.canvas import CanvasAPI

from ai_ta_backend.export_data import export_convo_history_json, export_documents_json, check_s3_path_and_download
from ai_ta_backend.nomic_logging import get_nomic_map, log_convo_to_nomic, create_document_map
from ai_ta_backend.vector_database import Ingest
from ai_ta_backend.web_scrape import WebScrape, mit_course_download
# from ai_ta_backend.export_data import export_convo_history_json, export_documents_json, check_s3_path_and_download
# from ai_ta_backend.nomic_logging import get_nomic_map, log_convo_to_nomic, create_document_map
# from ai_ta_backend.vector_database import Ingest
# from ai_ta_backend.web_scrape import WebScrape, mit_course_download
from ai_ta_backend.journal_ingest import (get_arxiv_fulltext, downloadSpringerFulltext,
downloadElsevierFulltextFromId, getFromDoi,
downloadPubmedArticles, searchPubmedArticlesWithEutils,
ai_ta_backend/vector_database.py: empty file removed
