From aefeac19b39e8917851e0f4a8644aa4d40a5e75d Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 1 Apr 2024 19:44:34 -0500 Subject: [PATCH 01/28] added functions for metadata extraction --- ai_ta_backend/main.py | 12 ++ ai_ta_backend/utils/pubmed_extraction.py | 172 +++++++++++++++++++++++ 2 files changed, 184 insertions(+) create mode 100644 ai_ta_backend/utils/pubmed_extraction.py diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py index 452792ac..b53206d9 100644 --- a/ai_ta_backend/main.py +++ b/ai_ta_backend/main.py @@ -39,6 +39,7 @@ from ai_ta_backend.service.sentry_service import SentryService from ai_ta_backend.beam.nomic_logging import create_document_map +from ai_ta_backend.utils.pubmed_extraction import extractPubmedData app = Flask(__name__) CORS(app) @@ -343,6 +344,17 @@ def getTopContextsWithMQR(service: RetrievalService, posthog_service: PosthogSer response.headers.add('Access-Control-Allow-Origin', '*') return response +@app.route('/pubmedExtraction', methods=['GET']) +def pubmedExtraction(): + """ + Extracts metadata and download papers from PubMed. + """ + result = extractPubmedData() + + response = jsonify(result) + response.headers.add('Access-Control-Allow-Origin', '*') + return response + def configure(binder: Binder) -> None: binder.bind(RetrievalService, to=RetrievalService, scope=RequestScope) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py new file mode 100644 index 00000000..4be2b4a8 --- /dev/null +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -0,0 +1,172 @@ +import os +import requests +import shutil +import json +import xml.etree.ElementTree as ET +import ftplib +import supabase +import gzip +import time +import concurrent.futures +import urllib.request + + +SUPBASE_CLIENT = supabase.create_client( # type: ignore + supabase_url=os.getenv('SUPABASE_URL'), # type: ignore + supabase_key=os.getenv('SUPABASE_API_KEY') # type: ignore +) + +def extractPubmedData(): + """ + Extracts metadata from the files listed in FTP folder and stores it in SQL DB. + """ + ftp_address = "ftp.ncbi.nlm.nih.gov" + ftp_path = "pubmed/baseline" + file_list = getFileList(ftp_address, ftp_path, ".gz") + + for file in file_list: + # download the .gz file + gz_filepath = downloadFromFTP(ftp_address, ftp_path, file, "pubmed") + print("Downloaded: ", gz_filepath) + + # extract the XML file + xml_filepath = extractXMLFile(gz_filepath) + print("XML Extracted: ", xml_filepath) + + # extract metadata from the XML file + metadata = extractMetadataFromXML(xml_filepath) + + # find PMC ID and DOI for all articles + for article in metadata: + pmid = article['pmid'] + article_ids = getArticleIDs(pmid) + + + # delete XML and .gz files + + + + return "success" + +def downloadFromFTP(ftp_address: str, ftp_path: str, file: str, local_dir: str): + """ + Downloads all .gz files from the FTP folder and stores it in the local directory. + """ + # create local directory if it doesn't exist + os.makedirs(local_dir, exist_ok=True) + + # connect to the FTP server + ftp = ftplib.FTP(ftp_address) + ftp.login() + ftp.cwd(ftp_path) + + local_filepath = os.path.join(local_dir, file) + + with open(local_filepath, 'wb') as f: + ftp.retrbinary('RETR ' + file, f.write) + + print(f"Downloaded {file} to {local_filepath}") + + ftp.quit() + + return "success" + +def getFileList(ftp_address: str, ftp_path: str, extension: str = ".gz"): + """ + Returns a list of files in the FTP folder. 
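+    Illustrative call (baseline archives are assumed to follow this naming scheme):
+        getFileList("ftp.ncbi.nlm.nih.gov", "pubmed/baseline")
+        -> ['pubmed24n1219.xml.gz', 'pubmed24n1218.xml.gz', ...]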
+ """ + # connect to the FTP server + ftp = ftplib.FTP(ftp_address) + ftp.login() + + # Change directory to the specified path + ftp.cwd(ftp_path) + + # Get list of file entries + file_listing = ftp.nlst() + + ftp.quit() + + # Filter for files with the specified extension + gz_files = [entry for entry in file_listing if entry.endswith(extension)] + gz_files.sort(reverse=True) + print(f"Found {len(gz_files)} files on {ftp_address}/{ftp_path}") + + return gz_files + +def extractXMLFile(gz_filepath: str): + """ + Extracts the XML file from the .gz file. + Args: + gz_filepath: Path to the .gz file. + Returns: + xml_filepath: Path to the extracted XML file. + """ + print("gz file path: ", gz_filepath) + xml_filepath = gz_filepath.replace(".gz", "") + with gzip.open(gz_filepath, 'rb') as f_in: + with open(xml_filepath, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + + return xml_filepath + +def extractMetadataFromXML(xml_filepath: str): + """ + Extracts metadata from the XML file and stores it in a dictionary. + Args: + xml_filepath: Path to the XML file. + Returns: + metadata: List of dictionaries containing metadata for each article. + """ + tree = ET.parse(xml_filepath) + root = tree.getroot() + metadata = [] + # Extract metadata from the XML file + for item in root.iter('PubmedArticle'): + article_data = {} + + publication_status = item.find('PubmedData/PublicationStatus').text + # ppublish articles are not present in PMC database + if publication_status == "epublish": + article_data['full_text'] = True + else: + article_data['full_text'] = False + + medline_citation = item.find('MedlineCitation') + article = medline_citation.find('Article') + journal = article.find('Journal') + issue = journal.find('JournalIssue') + + article_data['pmid'] = medline_citation.find('PMID').text + article_data['issn'] = journal.find('ISSN').text + article_data['journal_title'] = journal.find('Title').text + + article_title = article.find('ArticleTitle').text + article_data['article_title'] = article_title.replace('[', '').replace(']', '') + + article_data['last_revised'] = f"{medline_citation.find('DateRevised/Year').text}-{medline_citation.find('DateRevised/Month').text}-{medline_citation.find('DateRevised/Day').text}" + article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}-{issue.find('PubDate/Day').text}" + #article_data['date_completed'] = f"{medline_citation.find('DateCompleted/Year').text}-{medline_citation.find('DateCompleted/Month').text}-{medline_citation.find('DateCompleted/Day').text}" + + # extract and store abstract in a text file + + + + metadata.append(article_data) + + return metadata + +def getArticleIDs(pmid: str): + """ + Retrieves the PMC ID and DOI for an article. 
+ """ + base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" + app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com" + url = base_url + app_details + "&ids=" + id + + response = requests.get(url) + + + + + From b833524a68fc7f249a42a6c11c42d4907f4177aa Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 3 Apr 2024 15:37:15 -0500 Subject: [PATCH 02/28] completed all download functions --- ai_ta_backend/utils/pubmed_extraction.py | 321 +++++++++++++++++++---- 1 file changed, 275 insertions(+), 46 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 4be2b4a8..8daf4e45 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -1,14 +1,17 @@ import os import requests import shutil -import json import xml.etree.ElementTree as ET import ftplib import supabase import gzip -import time import concurrent.futures -import urllib.request +from urllib.parse import urlparse +import tarfile +import os +import shutil +from minio import Minio + SUPBASE_CLIENT = supabase.create_client( # type: ignore @@ -20,26 +23,45 @@ def extractPubmedData(): """ Extracts metadata from the files listed in FTP folder and stores it in SQL DB. """ - ftp_address = "ftp.ncbi.nlm.nih.gov" - ftp_path = "pubmed/baseline" - file_list = getFileList(ftp_address, ftp_path, ".gz") + xml_filepath = "pubmed/pubmed24n1219.xml" + metadata = extractMetadataFromXML(xml_filepath) + + # find PMC ID and DOI for all articles + metadata_with_ids = getArticleIDs(metadata) + + # download the articles + complete_metadata = downloadArticles(metadata_with_ids) + + print("Complete metadata: ", complete_metadata) + + # upload articles to bucket + article_upload = uploadToStorage("pubmed_abstracts") + + # upload metadata to SQL DB - for file in file_list: - # download the .gz file - gz_filepath = downloadFromFTP(ftp_address, ftp_path, file, "pubmed") - print("Downloaded: ", gz_filepath) + + + # ftp_address = "ftp.ncbi.nlm.nih.gov" + # ftp_path = "pubmed/baseline" + # file_list = getFileList(ftp_address, ftp_path, ".gz") - # extract the XML file - xml_filepath = extractXMLFile(gz_filepath) - print("XML Extracted: ", xml_filepath) + # for file in file_list: + # # download the .gz file + # gz_filepath = downloadFromFTP(ftp_address, ftp_path, file, "pubmed") + # print("Downloaded: ", gz_filepath) - # extract metadata from the XML file - metadata = extractMetadataFromXML(xml_filepath) + # # extract the XML file + # xml_filepath = extractXMLFile(gz_filepath) + # print("XML Extracted: ", xml_filepath) - # find PMC ID and DOI for all articles - for article in metadata: - pmid = article['pmid'] - article_ids = getArticleIDs(pmid) + # # extract metadata from the XML file + # xml_filepath = "pubmed/pubmed24n1219.xml" + # metadata = extractMetadataFromXML(xml_filepath) + + # # find PMC ID and DOI for all articles + # for article in metadata: + # pmid = article['pmid'] + # article_ids = getArticleIDs(pmid) # delete XML and .gz files @@ -50,7 +72,7 @@ def extractPubmedData(): def downloadFromFTP(ftp_address: str, ftp_path: str, file: str, local_dir: str): """ - Downloads all .gz files from the FTP folder and stores it in the local directory. + Downloads a .gz file from the FTP folder and stores it in the local directory. 
""" # create local directory if it doesn't exist os.makedirs(local_dir, exist_ok=True) @@ -61,19 +83,17 @@ def downloadFromFTP(ftp_address: str, ftp_path: str, file: str, local_dir: str): ftp.cwd(ftp_path) local_filepath = os.path.join(local_dir, file) - with open(local_filepath, 'wb') as f: ftp.retrbinary('RETR ' + file, f.write) print(f"Downloaded {file} to {local_filepath}") ftp.quit() - - return "success" + return local_filepath def getFileList(ftp_address: str, ftp_path: str, extension: str = ".gz"): """ - Returns a list of files in the FTP folder. + Returns a list of .gz files in the FTP folder. """ # connect to the FTP server ftp = ftplib.FTP(ftp_address) @@ -118,53 +138,262 @@ def extractMetadataFromXML(xml_filepath: str): Returns: metadata: List of dictionaries containing metadata for each article. """ + # create a directory to store abstracts + os.makedirs("pubmed_abstracts", exist_ok=True) + tree = ET.parse(xml_filepath) root = tree.getroot() metadata = [] + # Extract metadata from the XML file for item in root.iter('PubmedArticle'): article_data = {} - publication_status = item.find('PubmedData/PublicationStatus').text - # ppublish articles are not present in PMC database - if publication_status == "epublish": - article_data['full_text'] = True - else: - article_data['full_text'] = False - medline_citation = item.find('MedlineCitation') article = medline_citation.find('Article') journal = article.find('Journal') issue = journal.find('JournalIssue') - article_data['pmid'] = medline_citation.find('PMID').text - article_data['issn'] = journal.find('ISSN').text - article_data['journal_title'] = journal.find('Title').text + if medline_citation.find('PMID') is not None: + article_data['pmid'] = medline_citation.find('PMID').text + article_data['pmcid'] = None + article_data['doi'] = None + else: + continue - article_title = article.find('ArticleTitle').text - article_data['article_title'] = article_title.replace('[', '').replace(']', '') + if journal.find('ISSN') is not None: + article_data['issn'] = journal.find('ISSN').text + else: + article_data['issn'] = None - article_data['last_revised'] = f"{medline_citation.find('DateRevised/Year').text}-{medline_citation.find('DateRevised/Month').text}-{medline_citation.find('DateRevised/Day').text}" - article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}-{issue.find('PubDate/Day').text}" - #article_data['date_completed'] = f"{medline_citation.find('DateCompleted/Year').text}-{medline_citation.find('DateCompleted/Month').text}-{medline_citation.find('DateCompleted/Day').text}" + if journal.find('Title') is not None: + article_data['journal_title'] = journal.find('Title').text + else: + article_data['journal_title'] = None + + # some articles don't have an article title + article_title = article.find('ArticleTitle') + if article_title is not None and article_title.text is not None: + article_data['article_title'] = article_title.text.replace('[', '').replace(']', '') + else: + article_data['article_title'] = None + article_data['last_revised'] = f"{medline_citation.find('DateRevised/Year').text}-{medline_citation.find('DateRevised/Month').text}-{medline_citation.find('DateRevised/Day').text}" + + # some articles don't have all fields present for publication date + if issue.find('PubDate/Year') is not None and issue.find('PubDate/Month') is not None and issue.find('PubDate/Day') is not None: + article_data['published'] = 
f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}-{issue.find('PubDate/Day').text}" + elif issue.find('PubDate/Year') is not None and issue.find('PubDate/Month') is not None: + article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}" + elif issue.find('PubDate/Year') is not None: + article_data['published'] = f"{issue.find('PubDate/Year').text}" + else: + article_data['published'] = None + + # extract and store abstract in a text file + abstract = article.find('Abstract') + if abstract is not None: + abstract_text = "" + for abstract_text_element in abstract.iter('AbstractText'): + # if labels (objective, methods, etc.) are present, add them to the text (e.g. "OBJECTIVE: ") + if abstract_text_element.attrib.get('Label') is not None: + abstract_text += abstract_text_element.attrib.get('Label') + ": " + if abstract_text_element.text is not None: + abstract_text += abstract_text_element.text + "\n" + + # save abstract to a text file + abstract_filename = f"pubmed_abstracts/{article_data['pmid']}.txt" + with open(abstract_filename, 'w') as f: + if article_data['article_title']: + f.write("Article title: " + article_data['article_title'] + "\n") + if article_data['journal_title']: + f.write("Journal title: " + article_data['journal_title'] + "\n") + f.write("Abstract: " + abstract_text) - + # some articles are listed, but not released online yet. Adding fields for such articles to maintain uniformity. + article_data['live'] = True + article_data['release_date'] = None + article_data['license'] = None + article_data['pubmed_ftp_link'] = None + article_data['filepath'] = abstract_filename metadata.append(article_data) - + if len(metadata) == 300: + return metadata return metadata -def getArticleIDs(pmid: str): +def getArticleIDs(metadata: list): """ - Retrieves the PMC ID and DOI for an article. + Retrieves the PMC ID and DOI for given articles and updates the metadata. """ base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" - app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com" - url = base_url + app_details + "&ids=" + id + app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json" + + batch_size = 200 # maximum number of articles API can process in one request + for i in range(0, len(metadata), batch_size): + batch = metadata[i:i+batch_size] + ids = ",".join([article['pmid'] for article in batch]) + response = requests.get(base_url + app_details + "&ids=" + ids) + data = response.json() + records = data['records'] + + for record in records: + if 'errmsg' in record: + article['live'] = False + continue + else: + # find article with matching pmid and update pmcid, doi, live, and release date fields + for article in batch: + if article['pmid'] == record['pmid']: + article['pmcid'] = record['pmcid'] + article['doi'] = record['doi'] + article['live'] = False if 'live' in record and record['live'] == "false" else True + article['release_date'] = record.get('release-date', article['release_date']) + print("Updated metadata in ID converter: ", article) + break + return metadata + +def downloadArticles(metadata: list): + """ + Downloads articles from PMC and stores them in bucket. + Updates metadata with license information. + """ + + base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?" 
+ print("Downloading articles...") + + # connect to FTP server anonymously + ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov") + ftp.login() + + for article in metadata: + + if article['live'] is False or article['pmcid'] is None: + continue + + # else proceed with download + if article['pmcid']: + # download the article + final_url = base_url + "id=" + article['pmcid'] + print("Downloading: ", final_url) + + xml_response = requests.get(final_url) + extracted_data = extractArticleData(xml_response.text) + + print("\nExtracted data: ", extracted_data) + + # if no data extracted (reason: article not released/open-access), skip to next article + if not extracted_data: + article['live'] = False + continue + + # update metadata with license and ftp link information + article['license'] = extracted_data[0]['license'] + article['pubmed_ftp_link'] = extracted_data[0]['href'] if 'href' in extracted_data[0] else None + + # download the article + ftp_url = urlparse(extracted_data[0]['href']) + ftp_path = ftp_url.path[1:] + print("FTP path: ", ftp_path) + + filename = ftp_path.split("/")[-1] + local_file = os.path.join("pubmed_abstracts", filename) + with open(local_file, 'wb') as f: + ftp.retrbinary('RETR ' + ftp_path, f.write) + print("Downloaded: ", local_file) + article['filepath'] = local_file + + # if file is .tar.gz, extract the PDF and delete the tar.gz file + if filename.endswith(".tar.gz"): + extracted_pdf_paths = extractPDF(local_file) + print("Extracted PDF: ", extracted_pdf_paths) + article['filepath'] = ",".join(extracted_pdf_paths) + os.remove(local_file) + + print("\nUpdated metadata after download: ", article) + ftp.login() + return metadata + +def extractPDF(tar_gz_filepath: str): + """ + Extracts the PDF file from the .tar.gz file. + """ + print("Extracting PDF from: ", tar_gz_filepath) + extracted_paths = [] + with tarfile.open(tar_gz_filepath, "r:gz") as tar: + for member in tar: + if member.isreg() and member.name.endswith(".pdf"): + tar.extract(member, path="pubmed_abstracts") + print("Extracted: ", member.name) + extracted_paths.append(os.path.join("pubmed_abstracts", member.name)) + + return extracted_paths + +def extractArticleData(xml_string: str): + """ + Extracts license information and article download link from the XML response. + """ + root = ET.fromstring(xml_string) + + if root.find(".//error") is not None: + return [] + + records = root.findall(".//record") + extracted_data = [] + href = None + print("In extractArticleData") + for record in records: + record_id = record.get("id") + license = record.get("license") + links = record.findall(".//link") + + for link in links: + if link.get("format") == "pdf": + href = link.get("href") + break + # if PDF link not found, use the available tgz link + if not href: + href = links[0].get("href") + + extracted_data.append({ + "record_id": record_id, + "license": license, + "href": href + }) + + return extracted_data + +def uploadToStorage(filepath: str): + """ + Uploads all files present in given folder to Minio bucket. 
+ """ + minio_client = Minio( + endpoint=os.getenv('MINIO_ENDPOINT'), + access_key=os.getenv('MINIO_ACCESS_KEY'), + secret_key=os.getenv('MINIO_SECRET_KEY'), + secure=False + ) + + bucket_name = "pubmed" + for root, dirs, files in os.walk(filepath): + for file in files: + file_path = os.path.join(root, file) + print("Uploading: ", file_path) + + + return "success" + + + + + + + + + + - response = requests.get(url) From 8f14cf9b5137aeae326c7e3cff24150d562be64b Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 3 Apr 2024 17:23:05 -0500 Subject: [PATCH 03/28] added supabase upsert --- ai_ta_backend/utils/pubmed_extraction.py | 82 ++++++++++++------------ 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 8daf4e45..d3923eaf 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -23,6 +23,17 @@ def extractPubmedData(): """ Extracts metadata from the files listed in FTP folder and stores it in SQL DB. """ + ftp_address = "ftp.ncbi.nlm.nih.gov" + ftp_path = "pubmed/baseline" + file_list = getFileList(ftp_address, ftp_path, ".gz") + + gz_filepath = downloadXML(ftp_address, ftp_path, file_list[0], "pubmed") + print("GZ Downloaded: ", gz_filepath) + + # extract the XML file + xml_filepath = extractXMLFile(gz_filepath) + print("XML Extracted: ", xml_filepath) + xml_filepath = "pubmed/pubmed24n1219.xml" metadata = extractMetadataFromXML(xml_filepath) @@ -31,48 +42,22 @@ def extractPubmedData(): # download the articles complete_metadata = downloadArticles(metadata_with_ids) - print("Complete metadata: ", complete_metadata) # upload articles to bucket article_upload = uploadToStorage("pubmed_abstracts") + print("Uploaded articles: ", article_upload) # upload metadata to SQL DB + response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() + print("Supabase response: ", response) + exit() - - - # ftp_address = "ftp.ncbi.nlm.nih.gov" - # ftp_path = "pubmed/baseline" - # file_list = getFileList(ftp_address, ftp_path, ".gz") - - # for file in file_list: - # # download the .gz file - # gz_filepath = downloadFromFTP(ftp_address, ftp_path, file, "pubmed") - # print("Downloaded: ", gz_filepath) - - # # extract the XML file - # xml_filepath = extractXMLFile(gz_filepath) - # print("XML Extracted: ", xml_filepath) - - # # extract metadata from the XML file - # xml_filepath = "pubmed/pubmed24n1219.xml" - # metadata = extractMetadataFromXML(xml_filepath) - - # # find PMC ID and DOI for all articles - # for article in metadata: - # pmid = article['pmid'] - # article_ids = getArticleIDs(pmid) - - - # delete XML and .gz files - - - return "success" -def downloadFromFTP(ftp_address: str, ftp_path: str, file: str, local_dir: str): +def downloadXML(ftp_address: str, ftp_path: str, file: str, local_dir: str): """ - Downloads a .gz file from the FTP folder and stores it in the local directory. + Downloads a .gz XML file from the FTP baseline folder and stores it in the local directory. 
""" # create local directory if it doesn't exist os.makedirs(local_dir, exist_ok=True) @@ -190,7 +175,6 @@ def extractMetadataFromXML(xml_filepath: str): else: article_data['published'] = None - # extract and store abstract in a text file abstract = article.find('Abstract') if abstract is not None: @@ -219,7 +203,7 @@ def extractMetadataFromXML(xml_filepath: str): article_data['filepath'] = abstract_filename metadata.append(article_data) - if len(metadata) == 300: + if len(metadata) == 20: return metadata return metadata @@ -240,7 +224,11 @@ def getArticleIDs(metadata: list): for record in records: if 'errmsg' in record: - article['live'] = False + print("Error: ", record['errmsg']) + for article in batch: + if article['pmid'] == record['pmid']: + article['live'] = False + break continue else: # find article with matching pmid and update pmcid, doi, live, and release date fields @@ -368,20 +356,30 @@ def uploadToStorage(filepath: str): """ Uploads all files present in given folder to Minio bucket. """ - minio_client = Minio( - endpoint=os.getenv('MINIO_ENDPOINT'), - access_key=os.getenv('MINIO_ACCESS_KEY'), - secret_key=os.getenv('MINIO_SECRET_KEY'), + print("in uploadToStorage()") + + minio_client = Minio(os.environ['MINIO_URL'], + access_key=os.environ['MINIO_ACCESS_KEY'], + secret_key=os.environ['MINIO_SECRET_KEY'], secure=False ) bucket_name = "pubmed" + found = minio_client.bucket_exists(bucket_name) + if not found: + minio_client.make_bucket(bucket_name) + print("Created bucket", bucket_name) + else: + print("Bucket", bucket_name, "already exists") + for root, dirs, files in os.walk(filepath): + # can parallelize this upload for file in files: file_path = os.path.join(root, file) - print("Uploading: ", file_path) - - + object_name = file_path.split("/")[-1] + # insert local file into remote bucket + minio_client.fput_object(bucket_name, object_name, file_path) + print("Uploaded: ", object_name) return "success" From 6daa4df762021379bf75770ff5076edde470e736 Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 3 Apr 2024 18:30:36 -0500 Subject: [PATCH 04/28] updated comments --- ai_ta_backend/utils/pubmed_extraction.py | 98 +++++++++++++++++------- 1 file changed, 69 insertions(+), 29 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index d3923eaf..07adf0b4 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -19,14 +19,20 @@ supabase_key=os.getenv('SUPABASE_API_KEY') # type: ignore ) +MINIO_CLIENT = Minio(os.environ['MINIO_URL'], + access_key=os.environ['MINIO_ACCESS_KEY'], + secret_key=os.environ['MINIO_SECRET_KEY'], + secure=False +) + def extractPubmedData(): """ - Extracts metadata from the files listed in FTP folder and stores it in SQL DB. + Main function to extract metadata and articles from the PubMed baseline folder. """ ftp_address = "ftp.ncbi.nlm.nih.gov" ftp_path = "pubmed/baseline" file_list = getFileList(ftp_address, ftp_path, ".gz") - + gz_filepath = downloadXML(ftp_address, ftp_path, file_list[0], "pubmed") print("GZ Downloaded: ", gz_filepath) @@ -58,6 +64,13 @@ def extractPubmedData(): def downloadXML(ftp_address: str, ftp_path: str, file: str, local_dir: str): """ Downloads a .gz XML file from the FTP baseline folder and stores it in the local directory. + Args: + ftp_address: FTP server address. + ftp_path: Path to the FTP folder. + file: File to download. + local_dir: Local directory to store the downloaded file. 
+ Returns: + local_filepath: Path to the downloaded file. """ # create local directory if it doesn't exist os.makedirs(local_dir, exist_ok=True) @@ -78,7 +91,13 @@ def downloadXML(ftp_address: str, ftp_path: str, file: str, local_dir: str): def getFileList(ftp_address: str, ftp_path: str, extension: str = ".gz"): """ - Returns a list of .gz files in the FTP folder. + Returns a list of .gz files in the FTP baseline folder. + Args: + ftp_address: FTP server address. + ftp_path: Path to the FTP folder. + extension: File extension to filter for. + Returns: + gz_files: List of .gz files in the FTP folder. """ # connect to the FTP server ftp = ftplib.FTP(ftp_address) @@ -107,7 +126,7 @@ def extractXMLFile(gz_filepath: str): Returns: xml_filepath: Path to the extracted XML file. """ - print("gz file path: ", gz_filepath) + print("Downloaded .gz file path: ", gz_filepath) xml_filepath = gz_filepath.replace(".gz", "") with gzip.open(gz_filepath, 'rb') as f_in: with open(xml_filepath, 'wb') as f_out: @@ -117,7 +136,9 @@ def extractXMLFile(gz_filepath: str): def extractMetadataFromXML(xml_filepath: str): """ - Extracts metadata from the XML file and stores it in a dictionary. + Extracts article details from the XML file and stores it in a dictionary. + Details extracted: PMID, PMCID, DOI, ISSN, journal title, article title, + last revised date, published date, abstract. Args: xml_filepath: Path to the XML file. Returns: @@ -130,6 +151,7 @@ def extractMetadataFromXML(xml_filepath: str): root = tree.getroot() metadata = [] + # PARALLELIZE THE BELOW FOR LOOP AND EXTRACT METADATA FOR ALL ARTICLES AT ONCE - IN 1000s # Extract metadata from the XML file for item in root.iter('PubmedArticle'): article_data = {} @@ -209,7 +231,13 @@ def extractMetadataFromXML(xml_filepath: str): def getArticleIDs(metadata: list): """ - Retrieves the PMC ID and DOI for given articles and updates the metadata. + Uses the PubMed ID converter API to get PMCID and DOI for each article. + Queries the API in batches of 200 articles at a time. + Also updates the metadata with the release date and live status - some articles are yet to be released. + Args: + metadata: List of dictionaries containing metadata for each article. + Returns: + metadata: Updated metadata with PMCID, DOI, release date, and live status information. """ base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json" @@ -221,7 +249,8 @@ def getArticleIDs(metadata: list): response = requests.get(base_url + app_details + "&ids=" + ids) data = response.json() records = data['records'] - + + # PARALLELIZE THIS FOR LOOP - UPDATES ADDITIONAL FIELDS FOR ALL ARTICLES AT ONCE for record in records: if 'errmsg' in record: print("Error: ", record['errmsg']) @@ -244,8 +273,11 @@ def getArticleIDs(metadata: list): def downloadArticles(metadata: list): """ - Downloads articles from PMC and stores them in bucket. - Updates metadata with license information. + Downloads articles from PMC and stores them in local directory. + Args: + metadata: List of dictionaries containing metadata for each article. + Returns: + metadata: Updated metadata with license, FTP link, and downloaded filepath information. """ base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?" 
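    # A successful response record is assumed to have the shape that
    # extractArticleData() parses below, roughly:
    #   <record id="PMC1234567" license="CC BY">
    #     <link format="tgz" href="ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/..."/>
    #   </record>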
@@ -255,6 +287,7 @@ def downloadArticles(metadata: list): ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov") ftp.login() + # PARALLELIZE THIS FOR LOOP - DOWNLOAD + METADATA UPDATE for article in metadata: if article['live'] is False or article['pmcid'] is None: @@ -262,14 +295,15 @@ def downloadArticles(metadata: list): # else proceed with download if article['pmcid']: - # download the article + # query URL for article download final_url = base_url + "id=" + article['pmcid'] - print("Downloading: ", final_url) + print("Download URL: ", final_url) xml_response = requests.get(final_url) + # get license and FTP link extracted_data = extractArticleData(xml_response.text) - print("\nExtracted data: ", extracted_data) + print("\nExtracted license and link data: ", extracted_data) # if no data extracted (reason: article not released/open-access), skip to next article if not extracted_data: @@ -289,23 +323,28 @@ def downloadArticles(metadata: list): local_file = os.path.join("pubmed_abstracts", filename) with open(local_file, 'wb') as f: ftp.retrbinary('RETR ' + ftp_path, f.write) - print("Downloaded: ", local_file) + print("Downloaded PDF file: ", local_file) article['filepath'] = local_file # if file is .tar.gz, extract the PDF and delete the tar.gz file if filename.endswith(".tar.gz"): extracted_pdf_paths = extractPDF(local_file) - print("Extracted PDF: ", extracted_pdf_paths) + print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) article['filepath'] = ",".join(extracted_pdf_paths) os.remove(local_file) print("\nUpdated metadata after download: ", article) - ftp.login() + ftp.quit() return metadata def extractPDF(tar_gz_filepath: str): """ - Extracts the PDF file from the .tar.gz file. + Extracts PDF files from the downloaded .tar.gz file. The zipped folder contains other supplementary + materials like images, etc. which are not extracted. + Args: + tar_gz_filepath: Path to the .tar.gz file. + Returns: + extracted_paths: List of paths to the extracted PDF files. """ print("Extracting PDF from: ", tar_gz_filepath) extracted_paths = [] @@ -321,18 +360,25 @@ def extractPDF(tar_gz_filepath: str): def extractArticleData(xml_string: str): """ Extracts license information and article download link from the XML response. + This function process XML response for single article. + Args: + xml_string: XML response from PMC download API. + Returns: + extracted_data: List of dictionaries containing license and download link for the article. """ - root = ET.fromstring(xml_string) + print("In extractArticleData") + root = ET.fromstring(xml_string) + # if there is an errors (article not open-access), return empty list (skip article) if root.find(".//error") is not None: return [] records = root.findall(".//record") extracted_data = [] href = None - print("In extractArticleData") + for record in records: - record_id = record.get("id") + record_id = record.get("id") # pmcid license = record.get("license") links = record.findall(".//link") @@ -354,20 +400,14 @@ def extractArticleData(xml_string: str): def uploadToStorage(filepath: str): """ - Uploads all files present in given folder to Minio bucket. + Uploads all files present under given filepath to Minio bucket. 
""" print("in uploadToStorage()") - minio_client = Minio(os.environ['MINIO_URL'], - access_key=os.environ['MINIO_ACCESS_KEY'], - secret_key=os.environ['MINIO_SECRET_KEY'], - secure=False - ) - bucket_name = "pubmed" - found = minio_client.bucket_exists(bucket_name) + found = MINIO_CLIENT.bucket_exists(bucket_name) if not found: - minio_client.make_bucket(bucket_name) + MINIO_CLIENT.make_bucket(bucket_name) print("Created bucket", bucket_name) else: print("Bucket", bucket_name, "already exists") @@ -378,7 +418,7 @@ def uploadToStorage(filepath: str): file_path = os.path.join(root, file) object_name = file_path.split("/")[-1] # insert local file into remote bucket - minio_client.fput_object(bucket_name, object_name, file_path) + MINIO_CLIENT.fput_object(bucket_name, object_name, file_path) print("Uploaded: ", object_name) return "success" From c8c90560aae8e4192d923548f3aa4d731a1dd9c5 Mon Sep 17 00:00:00 2001 From: star-nox Date: Thu, 4 Apr 2024 15:54:34 -0500 Subject: [PATCH 05/28] added processpool to extractMetadataFromXML() --- ai_ta_backend/utils/pubmed_extraction.py | 153 ++++++++++++++++++++--- 1 file changed, 133 insertions(+), 20 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 07adf0b4..0ab4ade3 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -11,6 +11,7 @@ import os import shutil from minio import Minio +import time @@ -29,19 +30,27 @@ def extractPubmedData(): """ Main function to extract metadata and articles from the PubMed baseline folder. """ + start_time = time.time() + ftp_address = "ftp.ncbi.nlm.nih.gov" ftp_path = "pubmed/baseline" file_list = getFileList(ftp_address, ftp_path, ".gz") gz_filepath = downloadXML(ftp_address, ftp_path, file_list[0], "pubmed") print("GZ Downloaded: ", gz_filepath) + print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") # extract the XML file xml_filepath = extractXMLFile(gz_filepath) print("XML Extracted: ", xml_filepath) + print("Time taken to extract XML file: ", round(time.time() - start_time, 2), "seconds") xml_filepath = "pubmed/pubmed24n1219.xml" metadata = extractMetadataFromXML(xml_filepath) + print("Number of articles found in this file: ", len(metadata)) + print("\nSample metadata: ", metadata) + print("\n\nTime taken to extract metadata: ", round(time.time() - start_time, 2), "seconds") + exit() # find PMC ID and DOI for all articles metadata_with_ids = getArticleIDs(metadata) @@ -55,10 +64,9 @@ def extractPubmedData(): print("Uploaded articles: ", article_upload) # upload metadata to SQL DB - response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() + response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore print("Supabase response: ", response) - exit() - + return "success" def downloadXML(ftp_address: str, ftp_path: str, file: str, local_dir: str): @@ -144,16 +152,121 @@ def extractMetadataFromXML(xml_filepath: str): Returns: metadata: List of dictionaries containing metadata for each article. 
""" + print("inside extractMetadataFromXML()") + # create a directory to store abstracts os.makedirs("pubmed_abstracts", exist_ok=True) tree = ET.parse(xml_filepath) root = tree.getroot() metadata = [] + + + with concurrent.futures.ProcessPoolExecutor() as executor: + futures = [] + article_items = list(item for item in root.iter('PubmedArticle')) # Convert generator to list + total_items = len(article_items) # Use len() since article_items is now a list + article_items_100 = (article_items[i:i+50] for i in range(0, total_items, 50)) + for chunk in article_items_100: + for item in chunk: + future = executor.submit(processArticleItem, item) + futures.append(future) + + for future in concurrent.futures.as_completed(futures): + article_data = future.result() + metadata.append(article_data) + + print("Extracted metadata for 20 articles: ", metadata[:20]) + print("Total articles extracted: ", len(metadata)) + return metadata - # PARALLELIZE THE BELOW FOR LOOP AND EXTRACT METADATA FOR ALL ARTICLES AT ONCE - IN 1000s - # Extract metadata from the XML file - for item in root.iter('PubmedArticle'): + # # PARALLELIZE THE BELOW FOR LOOP AND EXTRACT METADATA FOR ALL ARTICLES AT ONCE - IN 1000s + # # Extract metadata from the XML file + # for item in root.iter('PubmedArticle'): + # article_data = {} + + # medline_citation = item.find('MedlineCitation') + # article = medline_citation.find('Article') + # journal = article.find('Journal') + # issue = journal.find('JournalIssue') + + # if medline_citation.find('PMID') is not None: + # article_data['pmid'] = medline_citation.find('PMID').text + # article_data['pmcid'] = None + # article_data['doi'] = None + # else: + # continue + + # if journal.find('ISSN') is not None: + # article_data['issn'] = journal.find('ISSN').text + # else: + # article_data['issn'] = None + + # if journal.find('Title') is not None: + # article_data['journal_title'] = journal.find('Title').text + # else: + # article_data['journal_title'] = None + + # # some articles don't have an article title + # article_title = article.find('ArticleTitle') + # if article_title is not None and article_title.text is not None: + # article_data['article_title'] = article_title.text.replace('[', '').replace(']', '') + # else: + # article_data['article_title'] = None + + # article_data['last_revised'] = f"{medline_citation.find('DateRevised/Year').text}-{medline_citation.find('DateRevised/Month').text}-{medline_citation.find('DateRevised/Day').text}" + + # # some articles don't have all fields present for publication date + # if issue.find('PubDate/Year') is not None and issue.find('PubDate/Month') is not None and issue.find('PubDate/Day') is not None: + # article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}-{issue.find('PubDate/Day').text}" + # elif issue.find('PubDate/Year') is not None and issue.find('PubDate/Month') is not None: + # article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}" + # elif issue.find('PubDate/Year') is not None: + # article_data['published'] = f"{issue.find('PubDate/Year').text}" + # else: + # article_data['published'] = None + + # # extract and store abstract in a text file + # abstract = article.find('Abstract') + # if abstract is not None: + # abstract_text = "" + # for abstract_text_element in abstract.iter('AbstractText'): + # # if labels (objective, methods, etc.) are present, add them to the text (e.g. 
"OBJECTIVE: ") + # if abstract_text_element.attrib.get('Label') is not None: + # abstract_text += abstract_text_element.attrib.get('Label') + ": " + # if abstract_text_element.text is not None: + # abstract_text += abstract_text_element.text + "\n" + + # # save abstract to a text file + # abstract_filename = f"pubmed_abstracts/{article_data['pmid']}.txt" + # with open(abstract_filename, 'w') as f: + # if article_data['article_title']: + # f.write("Article title: " + article_data['article_title'] + "\n") + # if article_data['journal_title']: + # f.write("Journal title: " + article_data['journal_title'] + "\n") + # f.write("Abstract: " + abstract_text) + + # # some articles are listed, but not released online yet. Adding fields for such articles to maintain uniformity. + # article_data['live'] = True + # article_data['release_date'] = None + # article_data['license'] = None + # article_data['pubmed_ftp_link'] = None + # article_data['filepath'] = abstract_filename + + # metadata.append(article_data) + # if len(metadata) == 20: + # return metadata + # return metadata + +def processArticleItem(item: ET.Element): + """ + Extracts article details from a single PubmedArticle XML element. This is used in the process pool executor. + Args: + item: PubmedArticle XML element. + Returns: + article_data: Dictionary containing metadata for the article. + """ + try: article_data = {} medline_citation = item.find('MedlineCitation') @@ -166,7 +279,7 @@ def extractMetadataFromXML(xml_filepath: str): article_data['pmcid'] = None article_data['doi'] = None else: - continue + return article_data if journal.find('ISSN') is not None: article_data['issn'] = journal.find('ISSN').text @@ -177,7 +290,7 @@ def extractMetadataFromXML(xml_filepath: str): article_data['journal_title'] = journal.find('Title').text else: article_data['journal_title'] = None - + # some articles don't have an article title article_title = article.find('ArticleTitle') if article_title is not None and article_title.text is not None: @@ -186,7 +299,7 @@ def extractMetadataFromXML(xml_filepath: str): article_data['article_title'] = None article_data['last_revised'] = f"{medline_citation.find('DateRevised/Year').text}-{medline_citation.find('DateRevised/Month').text}-{medline_citation.find('DateRevised/Day').text}" - + # some articles don't have all fields present for publication date if issue.find('PubDate/Year') is not None and issue.find('PubDate/Month') is not None and issue.find('PubDate/Day') is not None: article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}-{issue.find('PubDate/Day').text}" @@ -196,9 +309,10 @@ def extractMetadataFromXML(xml_filepath: str): article_data['published'] = f"{issue.find('PubDate/Year').text}" else: article_data['published'] = None - + # extract and store abstract in a text file abstract = article.find('Abstract') + abstract_filename = None if abstract is not None: abstract_text = "" for abstract_text_element in abstract.iter('AbstractText'): @@ -207,27 +321,26 @@ def extractMetadataFromXML(xml_filepath: str): abstract_text += abstract_text_element.attrib.get('Label') + ": " if abstract_text_element.text is not None: abstract_text += abstract_text_element.text + "\n" - + # save abstract to a text file abstract_filename = f"pubmed_abstracts/{article_data['pmid']}.txt" with open(abstract_filename, 'w') as f: - if article_data['article_title']: - f.write("Article title: " + article_data['article_title'] + "\n") if article_data['journal_title']: - f.write("Journal title: 
" + article_data['journal_title'] + "\n") + f.write("Journal title: " + article_data['journal_title'] + "\n\n") + if article_data['article_title']: + f.write("Article title: " + article_data['article_title'] + "\n\n") f.write("Abstract: " + abstract_text) - - # some articles are listed, but not released online yet. Adding fields for such articles to maintain uniformity. + + # some articles are listed, but not released yet. Adding fields for such articles to maintain uniformity. article_data['live'] = True article_data['release_date'] = None article_data['license'] = None article_data['pubmed_ftp_link'] = None article_data['filepath'] = abstract_filename - metadata.append(article_data) - if len(metadata) == 20: - return metadata - return metadata + return article_data + except Exception as e: + return {'error': str(e)} def getArticleIDs(metadata: list): """ From f8a4d23da927fd1ef670f71150308d8ad3d3761d Mon Sep 17 00:00:00 2001 From: star-nox Date: Sat, 6 Apr 2024 11:58:13 -0500 Subject: [PATCH 06/28] minor changes --- ai_ta_backend/utils/pubmed_extraction.py | 124 +++++++++++++++++------ 1 file changed, 91 insertions(+), 33 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 0ab4ade3..6e5ebe89 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -12,6 +12,8 @@ import shutil from minio import Minio import time +from multiprocessing import Manager + @@ -46,26 +48,28 @@ def extractPubmedData(): print("Time taken to extract XML file: ", round(time.time() - start_time, 2), "seconds") xml_filepath = "pubmed/pubmed24n1219.xml" - metadata = extractMetadataFromXML(xml_filepath) - print("Number of articles found in this file: ", len(metadata)) - print("\nSample metadata: ", metadata) - print("\n\nTime taken to extract metadata: ", round(time.time() - start_time, 2), "seconds") - exit() - - # find PMC ID and DOI for all articles - metadata_with_ids = getArticleIDs(metadata) - - # download the articles - complete_metadata = downloadArticles(metadata_with_ids) - print("Complete metadata: ", complete_metadata) - # upload articles to bucket - article_upload = uploadToStorage("pubmed_abstracts") - print("Uploaded articles: ", article_upload) - - # upload metadata to SQL DB - response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore - print("Supabase response: ", response) + for metadata in extractMetadataFromXML(xml_filepath): + print("Total articles retrieved: ", len(metadata)) + print("Time taken to extract metadata for 2000 articles: ", round(time.time() - start_time, 2), "seconds") + + # find PMC ID and DOI for all articles + metadata_with_ids = getArticleIDs(metadata) + print("Time taken to get PMC ID and DOI for 2000 articles: ", round(time.time() - start_time, 2), "seconds") + + # download the articles + complete_metadata = downloadArticles(metadata_with_ids) + print("Time taken to download articles for 2000 articles: ", round(time.time() - start_time, 2), "seconds") + print("Complete metadata: ", complete_metadata[:20]) + + # upload articles to bucket + # article_upload = uploadToStorage("pubmed_abstracts") + # print("Uploaded articles: ", article_upload) + + # upload metadata to SQL DB + response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore + print("Supabase response: ", response) + exit() return "success" @@ -165,20 +169,23 @@ def extractMetadataFromXML(xml_filepath: str): with 
concurrent.futures.ProcessPoolExecutor() as executor: futures = [] article_items = list(item for item in root.iter('PubmedArticle')) # Convert generator to list - total_items = len(article_items) # Use len() since article_items is now a list - article_items_100 = (article_items[i:i+50] for i in range(0, total_items, 50)) - for chunk in article_items_100: - for item in chunk: - future = executor.submit(processArticleItem, item) - futures.append(future) - - for future in concurrent.futures.as_completed(futures): - article_data = future.result() - metadata.append(article_data) + + for item in article_items: + future = executor.submit(processArticleItem, item) + article_data = future.result() + + metadata.append(article_data) + + if len(metadata) == 500: + print("collected 500 articles") + return metadata + metadata = [] # reset metadata for next batch + + if metadata: + yield metadata + + print("Metadata extraction complete.") - print("Extracted metadata for 20 articles: ", metadata[:20]) - print("Total articles extracted: ", len(metadata)) - return metadata # # PARALLELIZE THE BELOW FOR LOOP AND EXTRACT METADATA FOR ALL ARTICLES AT ONCE - IN 1000s # # Extract metadata from the XML file @@ -352,17 +359,45 @@ def getArticleIDs(metadata: list): Returns: metadata: Updated metadata with PMCID, DOI, release date, and live status information. """ + print("In getArticleIDs()") base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json" batch_size = 200 # maximum number of articles API can process in one request + + # # Create a shared list using multiprocessing.Manager + # manager = Manager() + # shared_metadata = manager.list(metadata) # Copy initial metadata into the shared list + + # for i in range(0, len(metadata), batch_size): + # batch = metadata[i:i+batch_size] + # ids = ",".join([article['pmid'] for article in batch]) + # response = requests.get(base_url + app_details + "&ids=" + ids) + # data = response.json() + # records = data['records'] + + # with concurrent.futures.ProcessPoolExecutor() as executor: + # futures = [] + # for record in records: + # future = executor.submit(updateArticleMetadata, shared_metadata, record) + # futures.append(future) + + # # process results from parallel tasks + # for future in futures: + # try: + # future.result() + # except Exception as e: + # print(f"Error updating metadata for article: {e}") + + # print("Updated metadata in ID converter: ", len(shared_metadata)) + # return shared_metadata + for i in range(0, len(metadata), batch_size): batch = metadata[i:i+batch_size] ids = ",".join([article['pmid'] for article in batch]) response = requests.get(base_url + app_details + "&ids=" + ids) data = response.json() records = data['records'] - # PARALLELIZE THIS FOR LOOP - UPDATES ADDITIONAL FIELDS FOR ALL ARTICLES AT ONCE for record in records: if 'errmsg' in record: @@ -384,6 +419,29 @@ def getArticleIDs(metadata: list): break return metadata +def updateArticleMetadata(shared_metadata: list, record: dict): + """ + Updates metadata with PMCID, DOI, release date, and live status information for given article. 
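+    Args:
+        shared_metadata: Manager-backed list shared across worker processes.
+        record: A single record from the ID converter API response.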
+ """ + if 'errmsg' in record: + print("Error: ", record['errmsg']) + for article in shared_metadata: + if article['pmid'] == record['pmid']: + article['live'] = False + break + + else: + # find article with matching pmid and update pmcid, doi, live, and release date fields + for article in shared_metadata: + if article['pmid'] == record['pmid']: + article['pmcid'] = record['pmcid'] + article['doi'] = record['doi'] + article['live'] = False if 'live' in record and record['live'] == "false" else True + article['release_date'] = record.get('release-date', article['release_date']) + print("Updated metadata in ID converter: ", article) + break + + def downloadArticles(metadata: list): """ Downloads articles from PMC and stores them in local directory. From fa83ddfe76fd6ddbb8bd9ae192f6bcba87354382 Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 8 Apr 2024 11:55:11 -0500 Subject: [PATCH 07/28] yielded metadata after collecting 100 articles --- ai_ta_backend/utils/pubmed_extraction.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 6e5ebe89..4c36ad73 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -51,11 +51,11 @@ def extractPubmedData(): for metadata in extractMetadataFromXML(xml_filepath): print("Total articles retrieved: ", len(metadata)) - print("Time taken to extract metadata for 2000 articles: ", round(time.time() - start_time, 2), "seconds") + print("Time taken to extract metadata for 100 articles: ", round(time.time() - start_time, 2), "seconds") # find PMC ID and DOI for all articles metadata_with_ids = getArticleIDs(metadata) - print("Time taken to get PMC ID and DOI for 2000 articles: ", round(time.time() - start_time, 2), "seconds") + print("Time taken to get PMC ID and DOI for 100 articles: ", round(time.time() - start_time, 2), "seconds") # download the articles complete_metadata = downloadArticles(metadata_with_ids) @@ -63,8 +63,8 @@ def extractPubmedData(): print("Complete metadata: ", complete_metadata[:20]) # upload articles to bucket - # article_upload = uploadToStorage("pubmed_abstracts") - # print("Uploaded articles: ", article_upload) + article_upload = uploadToStorage("pubmed_abstracts") + print("/n/nUploaded articles: ", article_upload) # upload metadata to SQL DB response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore @@ -176,9 +176,9 @@ def extractMetadataFromXML(xml_filepath: str): metadata.append(article_data) - if len(metadata) == 500: - print("collected 500 articles") - return metadata + if len(metadata) == 100: + print("collected 100 articles") + yield metadata metadata = [] # reset metadata for next batch if metadata: @@ -311,9 +311,9 @@ def processArticleItem(item: ET.Element): if issue.find('PubDate/Year') is not None and issue.find('PubDate/Month') is not None and issue.find('PubDate/Day') is not None: article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}-{issue.find('PubDate/Day').text}" elif issue.find('PubDate/Year') is not None and issue.find('PubDate/Month') is not None: - article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}" + article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}-01" elif issue.find('PubDate/Year') is not None: - article_data['published'] = f"{issue.find('PubDate/Year').text}" + 
article_data['published'] = f"{issue.find('PubDate/Year').text}-01-01" else: article_data['published'] = None From 64a4142833edcc82dbdc855b6cbd799a86e41360 Mon Sep 17 00:00:00 2001 From: star-nox Date: Thu, 11 Apr 2024 11:27:52 -0500 Subject: [PATCH 08/28] storing metadata into csv and upserting per XML file --- ai_ta_backend/utils/pubmed_extraction.py | 611 +++++++++++------------ 1 file changed, 291 insertions(+), 320 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 4c36ad73..a20057f0 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -13,9 +13,9 @@ from minio import Minio import time from multiprocessing import Manager - - - +import pandas as pd +import threading +import json SUPBASE_CLIENT = supabase.create_client( # type: ignore supabase_url=os.getenv('SUPABASE_URL'), # type: ignore @@ -25,7 +25,7 @@ MINIO_CLIENT = Minio(os.environ['MINIO_URL'], access_key=os.environ['MINIO_ACCESS_KEY'], secret_key=os.environ['MINIO_SECRET_KEY'], - secure=False + secure=True ) def extractPubmedData(): @@ -38,20 +38,23 @@ def extractPubmedData(): ftp_path = "pubmed/baseline" file_list = getFileList(ftp_address, ftp_path, ".gz") - gz_filepath = downloadXML(ftp_address, ftp_path, file_list[0], "pubmed") + gz_filepath = downloadXML(ftp_address, ftp_path, file_list[1], "pubmed") print("GZ Downloaded: ", gz_filepath) - print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") + gz_file_download_time = round(time.time() - start_time, 2) + print("Time taken to download .gz file: ", gz_file_download_time, "seconds") # extract the XML file + if not gz_filepath: + return "failure" xml_filepath = extractXMLFile(gz_filepath) print("XML Extracted: ", xml_filepath) - print("Time taken to extract XML file: ", round(time.time() - start_time, 2), "seconds") - - xml_filepath = "pubmed/pubmed24n1219.xml" + xml_extract_time = round(time.time() - gz_file_download_time, 2) + print("Time taken to extract XML file: ", xml_extract_time, "seconds") + + #xml_filepath = "pubmed/pubmed24n1219.xml" for metadata in extractMetadataFromXML(xml_filepath): - print("Total articles retrieved: ", len(metadata)) - print("Time taken to extract metadata for 100 articles: ", round(time.time() - start_time, 2), "seconds") + metadata_extract_start_time = time.time() # find PMC ID and DOI for all articles metadata_with_ids = getArticleIDs(metadata) @@ -59,17 +62,43 @@ def extractPubmedData(): # download the articles complete_metadata = downloadArticles(metadata_with_ids) - print("Time taken to download articles for 2000 articles: ", round(time.time() - start_time, 2), "seconds") - print("Complete metadata: ", complete_metadata[:20]) + + # store metadata in csv file + print("\n") + print("Total articles retrieved: ", len(complete_metadata)) + df = pd.DataFrame(complete_metadata) + csv_filepath = "metadata.csv" + + if os.path.isfile(csv_filepath): + df.to_csv(csv_filepath, mode='a', header=False, index=False) + else: + df.to_csv(csv_filepath, index=False) + + print("Time taken to extract metadata for 100 articles: ", round(time.time() - metadata_extract_start_time, 2), "seconds") + + + print("Time taken to download articles: ", round(time.time() - start_time, 2), "seconds") + print("Total metadata extracted: ", len(complete_metadata)) + + # upload articles to bucket + print("Uploading articles to storage...") + article_upload = uploadToStorage("pubmed_abstracts") + print("Uploaded articles: ", 
article_upload) - # upload articles to bucket - article_upload = uploadToStorage("pubmed_abstracts") - print("/n/nUploaded articles: ", article_upload) - - # upload metadata to SQL DB - response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore - print("Supabase response: ", response) - exit() + # upload metadata to SQL DB + df = pd.read_csv(csv_filepath) + + complete_metadata = df.to_dict('records') + for item in complete_metadata: + for key, value in item.items(): + if pd.isna(value): # Or: math.isnan(value) + item[key] = None + + print("Metadata loaded into dataframe: ", len(complete_metadata)) + # continue with the rest of the code + response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore + print("Supabase response: ", response) + return "success" @@ -84,22 +113,26 @@ def downloadXML(ftp_address: str, ftp_path: str, file: str, local_dir: str): Returns: local_filepath: Path to the downloaded file. """ - # create local directory if it doesn't exist - os.makedirs(local_dir, exist_ok=True) + try: + # create local directory if it doesn't exist + os.makedirs(local_dir, exist_ok=True) - # connect to the FTP server - ftp = ftplib.FTP(ftp_address) - ftp.login() - ftp.cwd(ftp_path) + # connect to the FTP server + ftp = ftplib.FTP(ftp_address) + ftp.login() + ftp.cwd(ftp_path) - local_filepath = os.path.join(local_dir, file) - with open(local_filepath, 'wb') as f: - ftp.retrbinary('RETR ' + file, f.write) - - print(f"Downloaded {file} to {local_filepath}") + local_filepath = os.path.join(local_dir, file) + with open(local_filepath, 'wb') as f: + ftp.retrbinary('RETR ' + file, f.write) + + print(f"Downloaded {file} to {local_filepath}") - ftp.quit() - return local_filepath + ftp.quit() + return local_filepath + except Exception as e: + print("Error downloading file: ", e) + return None def getFileList(ftp_address: str, ftp_path: str, extension: str = ".gz"): """ @@ -111,24 +144,28 @@ def getFileList(ftp_address: str, ftp_path: str, extension: str = ".gz"): Returns: gz_files: List of .gz files in the FTP folder. """ - # connect to the FTP server - ftp = ftplib.FTP(ftp_address) - ftp.login() + try: + # connect to the FTP server + ftp = ftplib.FTP(ftp_address) + ftp.login() - # Change directory to the specified path - ftp.cwd(ftp_path) + # Change directory to the specified path + ftp.cwd(ftp_path) - # Get list of file entries - file_listing = ftp.nlst() + # Get list of file entries + file_listing = ftp.nlst() - ftp.quit() + ftp.quit() - # Filter for files with the specified extension - gz_files = [entry for entry in file_listing if entry.endswith(extension)] - gz_files.sort(reverse=True) - print(f"Found {len(gz_files)} files on {ftp_address}/{ftp_path}") + # Filter for files with the specified extension + gz_files = [entry for entry in file_listing if entry.endswith(extension)] + gz_files.sort(reverse=True) + print(f"Found {len(gz_files)} files on {ftp_address}/{ftp_path}") - return gz_files + return gz_files + except Exception as e: + print("Error getting file list: ", e) + return [] def extractXMLFile(gz_filepath: str): """ @@ -138,13 +175,17 @@ def extractXMLFile(gz_filepath: str): Returns: xml_filepath: Path to the extracted XML file. 
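        (Returns None if extraction fails; the body below is wrapped in try/except.)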
""" - print("Downloaded .gz file path: ", gz_filepath) - xml_filepath = gz_filepath.replace(".gz", "") - with gzip.open(gz_filepath, 'rb') as f_in: - with open(xml_filepath, 'wb') as f_out: - shutil.copyfileobj(f_in, f_out) + try: + print("Downloaded .gz file path: ", gz_filepath) + xml_filepath = gz_filepath.replace(".gz", "") + with gzip.open(gz_filepath, 'rb') as f_in: + with open(xml_filepath, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) - return xml_filepath + return xml_filepath + except Exception as e: + print("Error extracting XML file: ", e) + return None def extractMetadataFromXML(xml_filepath: str): """ @@ -157,113 +198,38 @@ def extractMetadataFromXML(xml_filepath: str): metadata: List of dictionaries containing metadata for each article. """ print("inside extractMetadataFromXML()") + try: + # create a directory to store abstracts + os.makedirs("pubmed_abstracts", exist_ok=True) - # create a directory to store abstracts - os.makedirs("pubmed_abstracts", exist_ok=True) - - tree = ET.parse(xml_filepath) - root = tree.getroot() - metadata = [] - + tree = ET.parse(xml_filepath) + root = tree.getroot() + metadata = [] - with concurrent.futures.ProcessPoolExecutor() as executor: - futures = [] - article_items = list(item for item in root.iter('PubmedArticle')) # Convert generator to list - - for item in article_items: - future = executor.submit(processArticleItem, item) - article_data = future.result() - metadata.append(article_data) + with concurrent.futures.ProcessPoolExecutor() as executor: + futures = [] + article_items = list(item for item in root.iter('PubmedArticle')) # Convert generator to list + + for item in article_items: + future = executor.submit(processArticleItem, item) + article_data = future.result() - if len(metadata) == 100: - print("collected 100 articles") - yield metadata - metadata = [] # reset metadata for next batch + metadata.append(article_data) - if metadata: - yield metadata - - print("Metadata extraction complete.") + if len(metadata) == 100: + print("collected 100 articles") + yield metadata + metadata = [] # reset metadata for next batch - - # # PARALLELIZE THE BELOW FOR LOOP AND EXTRACT METADATA FOR ALL ARTICLES AT ONCE - IN 1000s - # # Extract metadata from the XML file - # for item in root.iter('PubmedArticle'): - # article_data = {} - - # medline_citation = item.find('MedlineCitation') - # article = medline_citation.find('Article') - # journal = article.find('Journal') - # issue = journal.find('JournalIssue') - - # if medline_citation.find('PMID') is not None: - # article_data['pmid'] = medline_citation.find('PMID').text - # article_data['pmcid'] = None - # article_data['doi'] = None - # else: - # continue - - # if journal.find('ISSN') is not None: - # article_data['issn'] = journal.find('ISSN').text - # else: - # article_data['issn'] = None - - # if journal.find('Title') is not None: - # article_data['journal_title'] = journal.find('Title').text - # else: - # article_data['journal_title'] = None + if metadata: + yield metadata - # # some articles don't have an article title - # article_title = article.find('ArticleTitle') - # if article_title is not None and article_title.text is not None: - # article_data['article_title'] = article_title.text.replace('[', '').replace(']', '') - # else: - # article_data['article_title'] = None - - # article_data['last_revised'] = f"{medline_citation.find('DateRevised/Year').text}-{medline_citation.find('DateRevised/Month').text}-{medline_citation.find('DateRevised/Day').text}" - - # # some articles don't have 
all fields present for publication date - # if issue.find('PubDate/Year') is not None and issue.find('PubDate/Month') is not None and issue.find('PubDate/Day') is not None: - # article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}-{issue.find('PubDate/Day').text}" - # elif issue.find('PubDate/Year') is not None and issue.find('PubDate/Month') is not None: - # article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}" - # elif issue.find('PubDate/Year') is not None: - # article_data['published'] = f"{issue.find('PubDate/Year').text}" - # else: - # article_data['published'] = None - - # # extract and store abstract in a text file - # abstract = article.find('Abstract') - # if abstract is not None: - # abstract_text = "" - # for abstract_text_element in abstract.iter('AbstractText'): - # # if labels (objective, methods, etc.) are present, add them to the text (e.g. "OBJECTIVE: ") - # if abstract_text_element.attrib.get('Label') is not None: - # abstract_text += abstract_text_element.attrib.get('Label') + ": " - # if abstract_text_element.text is not None: - # abstract_text += abstract_text_element.text + "\n" - - # # save abstract to a text file - # abstract_filename = f"pubmed_abstracts/{article_data['pmid']}.txt" - # with open(abstract_filename, 'w') as f: - # if article_data['article_title']: - # f.write("Article title: " + article_data['article_title'] + "\n") - # if article_data['journal_title']: - # f.write("Journal title: " + article_data['journal_title'] + "\n") - # f.write("Abstract: " + abstract_text) - - # # some articles are listed, but not released online yet. Adding fields for such articles to maintain uniformity. - # article_data['live'] = True - # article_data['release_date'] = None - # article_data['license'] = None - # article_data['pubmed_ftp_link'] = None - # article_data['filepath'] = abstract_filename - - # metadata.append(article_data) - # if len(metadata) == 20: - # return metadata - # return metadata + print("Metadata extraction complete.") + except Exception as e: + print("Error extracting metadata: ", e) + return [] + def processArticleItem(item: ET.Element): """ @@ -360,64 +326,46 @@ def getArticleIDs(metadata: list): metadata: Updated metadata with PMCID, DOI, release date, and live status information. 
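
    Example (an aside, not part of this commit; the two PMIDs are made up):
    this is the request the function issues, and the fields read from each
    record mirror the parsing code that follows.

        import requests

        url = ("https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
               "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json"
               "&ids=31452104,31452105")
        records = requests.get(url).json()["records"]
        for r in records:
            # records for released PMC articles carry 'pmcid' and 'doi';
            # unreleased ones carry 'live': "false" and a 'release-date';
            # unknown IDs carry an 'errmsg'
            print(r["pmid"], r.get("pmcid"), r.get("doi"),
                  r.get("live"), r.get("release-date"), r.get("errmsg"))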
""" print("In getArticleIDs()") - base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" - app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json" + try: + base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" + app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json" - batch_size = 200 # maximum number of articles API can process in one request - - # # Create a shared list using multiprocessing.Manager - # manager = Manager() - # shared_metadata = manager.list(metadata) # Copy initial metadata into the shared list - - # for i in range(0, len(metadata), batch_size): - # batch = metadata[i:i+batch_size] - # ids = ",".join([article['pmid'] for article in batch]) - # response = requests.get(base_url + app_details + "&ids=" + ids) - # data = response.json() - # records = data['records'] - - # with concurrent.futures.ProcessPoolExecutor() as executor: - # futures = [] - # for record in records: - # future = executor.submit(updateArticleMetadata, shared_metadata, record) - # futures.append(future) - - # # process results from parallel tasks - # for future in futures: - # try: - # future.result() - # except Exception as e: - # print(f"Error updating metadata for article: {e}") - - # print("Updated metadata in ID converter: ", len(shared_metadata)) - # return shared_metadata - - for i in range(0, len(metadata), batch_size): - batch = metadata[i:i+batch_size] - ids = ",".join([article['pmid'] for article in batch]) - response = requests.get(base_url + app_details + "&ids=" + ids) - data = response.json() - records = data['records'] - # PARALLELIZE THIS FOR LOOP - UPDATES ADDITIONAL FIELDS FOR ALL ARTICLES AT ONCE - for record in records: - if 'errmsg' in record: - print("Error: ", record['errmsg']) - for article in batch: - if article['pmid'] == record['pmid']: - article['live'] = False - break - continue - else: - # find article with matching pmid and update pmcid, doi, live, and release date fields - for article in batch: - if article['pmid'] == record['pmid']: - article['pmcid'] = record['pmcid'] - article['doi'] = record['doi'] - article['live'] = False if 'live' in record and record['live'] == "false" else True - article['release_date'] = record.get('release-date', article['release_date']) - print("Updated metadata in ID converter: ", article) - break - return metadata + batch_size = 200 # maximum number of articles API can process in one request + + for i in range(0, len(metadata), batch_size): + batch = metadata[i:i+batch_size] + ids = ",".join([article['pmid'] for article in batch]) + response = requests.get(base_url + app_details + "&ids=" + ids) + data = response.json() + records = data['records'] + # PARALLELIZE THIS FOR LOOP - UPDATES ADDITIONAL FIELDS FOR ALL ARTICLES AT ONCE + with concurrent.futures.ThreadPoolExecutor() as executor: + executor.map(updateArticleMetadata, batch, records) + + # for record in records: + # if 'errmsg' in record: + # print("Error: ", record['errmsg']) + # for article in batch: + # if article['pmid'] == record['pmid']: + # article['live'] = False + # break + # continue + # else: + # # find article with matching pmid and update pmcid, doi, live, and release date fields + # for article in batch: + # if article['pmid'] == record['pmid']: + # article['pmcid'] = record['pmcid'] + # article['doi'] = record['doi'] + # article['live'] = False if 'live' in record and record['live'] == "false" else True + # article['release_date'] = record.get('release-date', article['release_date']) + # print("Updated metadata in ID 
converter: ", article) + # break + + + return metadata + except Exception as e: + print("Error getting article IDs: ", e) + return metadata def updateArticleMetadata(shared_metadata: list, record: dict): """ @@ -450,63 +398,73 @@ def downloadArticles(metadata: list): Returns: metadata: Updated metadata with license, FTP link, and downloaded filepath information. """ + try: + base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?" + print("Downloading articles...") - base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?" - print("Downloading articles...") - - # connect to FTP server anonymously - ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov") - ftp.login() - - # PARALLELIZE THIS FOR LOOP - DOWNLOAD + METADATA UPDATE - for article in metadata: + # connect to FTP server anonymously + ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov") + ftp.login() - if article['live'] is False or article['pmcid'] is None: - continue - - # else proceed with download - if article['pmcid']: - # query URL for article download - final_url = base_url + "id=" + article['pmcid'] - print("Download URL: ", final_url) - - xml_response = requests.get(final_url) - # get license and FTP link - extracted_data = extractArticleData(xml_response.text) - - print("\nExtracted license and link data: ", extracted_data) + # PARALLELIZE THIS FOR LOOP - DOWNLOAD + METADATA UPDATE + for article in metadata: - # if no data extracted (reason: article not released/open-access), skip to next article - if not extracted_data: - article['live'] = False + if article['live'] is False or article['pmcid'] is None: continue - - # update metadata with license and ftp link information - article['license'] = extracted_data[0]['license'] - article['pubmed_ftp_link'] = extracted_data[0]['href'] if 'href' in extracted_data[0] else None - - # download the article - ftp_url = urlparse(extracted_data[0]['href']) - ftp_path = ftp_url.path[1:] - print("FTP path: ", ftp_path) - - filename = ftp_path.split("/")[-1] - local_file = os.path.join("pubmed_abstracts", filename) - with open(local_file, 'wb') as f: - ftp.retrbinary('RETR ' + ftp_path, f.write) - print("Downloaded PDF file: ", local_file) - article['filepath'] = local_file - - # if file is .tar.gz, extract the PDF and delete the tar.gz file - if filename.endswith(".tar.gz"): - extracted_pdf_paths = extractPDF(local_file) - print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) - article['filepath'] = ",".join(extracted_pdf_paths) - os.remove(local_file) - print("\nUpdated metadata after download: ", article) - ftp.quit() - return metadata + # else proceed with download + if article['pmcid']: + # query URL for article download + final_url = base_url + "id=" + article['pmcid'] + print("Download URL: ", final_url) + + xml_response = requests.get(final_url) + # get license and FTP link + extracted_data = extractArticleData(xml_response.text) + + print("\nExtracted license and link data: ", extracted_data) + + # if no data extracted (reason: article not released/open-access), skip to next article + if not extracted_data: + article['live'] = False + continue + + # update metadata with license and ftp link information + article['license'] = extracted_data[0]['license'] + article['pubmed_ftp_link'] = extracted_data[0]['href'] if 'href' in extracted_data[0] else None + + # download the article + ftp_url = urlparse(extracted_data[0]['href']) + ftp_path = ftp_url.path[1:] + print("FTP path: ", ftp_path) + + # Set a timeout of 15 minutes - some files take > 1 hour to download and everything hangs + 
timeout = threading.Timer(15 * 60, lambda: print("Download timed out!")) + timeout.start() + + filename = ftp_path.split("/")[-1] + local_file = os.path.join("pubmed_abstracts", filename) + try: + with open(local_file, 'wb') as f: + ftp.retrbinary('RETR ' + ftp_path, f.write) + print("Downloaded PDF file: ", local_file) + article['filepath'] = local_file + + # if file is .tar.gz, extract the PDF and delete the tar.gz file + if filename.endswith(".tar.gz"): + extracted_pdf_paths = extractPDF(local_file) + print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) + article['filepath'] = ",".join(extracted_pdf_paths) + os.remove(local_file) + finally: + timeout.cancel() # cancel the timer if download finishes before timeout + + print("\nUpdated metadata after download: ", article) + ftp.quit() + return metadata + except Exception as e: + print("Error downloading articles: ", e) + return metadata def extractPDF(tar_gz_filepath: str): """ @@ -517,17 +475,21 @@ def extractPDF(tar_gz_filepath: str): Returns: extracted_paths: List of paths to the extracted PDF files. """ - print("Extracting PDF from: ", tar_gz_filepath) - extracted_paths = [] - with tarfile.open(tar_gz_filepath, "r:gz") as tar: - for member in tar: - if member.isreg() and member.name.endswith(".pdf"): - tar.extract(member, path="pubmed_abstracts") - print("Extracted: ", member.name) - extracted_paths.append(os.path.join("pubmed_abstracts", member.name)) - - return extracted_paths - + try: + print("Extracting PDF from: ", tar_gz_filepath) + extracted_paths = [] + with tarfile.open(tar_gz_filepath, "r:gz") as tar: + for member in tar: + if member.isreg() and member.name.endswith(".pdf"): + tar.extract(member, path="pubmed_abstracts") + print("Extracted: ", member.name) + extracted_paths.append(os.path.join("pubmed_abstracts", member.name)) + + return extracted_paths + except Exception as e: + print("Error extracting PDF: ", e) + return [] + def extractArticleData(xml_string: str): """ Extracts license information and article download link from the XML response. @@ -538,60 +500,69 @@ def extractArticleData(xml_string: str): extracted_data: List of dictionaries containing license and download link for the article. 
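
    Example (an illustrative response, an aside reconstructed from the parsing
    logic below; identifiers, license, and paths are made up):

        <OA>
          <records>
            <record id="PMC1234567" license="CC BY">
              <link format="tgz" href="ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/ab/cd/PMC1234567.tar.gz"/>
              <link format="pdf" href="ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ab/cd/example.pdf"/>
            </record>
          </records>
        </OA>

    A request for a non-open-access ID yields an <error> element instead,
    which the code below treats as "skip this article".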
""" print("In extractArticleData") - - root = ET.fromstring(xml_string) - # if there is an errors (article not open-access), return empty list (skip article) - if root.find(".//error") is not None: - return [] - - records = root.findall(".//record") - extracted_data = [] - href = None - - for record in records: - record_id = record.get("id") # pmcid - license = record.get("license") - links = record.findall(".//link") - - for link in links: - if link.get("format") == "pdf": - href = link.get("href") - break - # if PDF link not found, use the available tgz link - if not href: - href = links[0].get("href") + try: + root = ET.fromstring(xml_string) + # if there is an errors (article not open-access), return empty list (skip article) + if root.find(".//error") is not None: + return [] + + records = root.findall(".//record") + extracted_data = [] + href = None + + for record in records: + record_id = record.get("id") # pmcid + license = record.get("license") + links = record.findall(".//link") + + for link in links: + if link.get("format") == "pdf": + href = link.get("href") + break + # if PDF link not found, use the available tgz link + if not href: + href = links[0].get("href") + + extracted_data.append({ + "record_id": record_id, + "license": license, + "href": href + }) - extracted_data.append({ - "record_id": record_id, - "license": license, - "href": href - }) + return extracted_data + except Exception as e: + print("Error extracting article data: ", e) + return [] - return extracted_data - def uploadToStorage(filepath: str): """ Uploads all files present under given filepath to Minio bucket. """ print("in uploadToStorage()") - - bucket_name = "pubmed" - found = MINIO_CLIENT.bucket_exists(bucket_name) - if not found: - MINIO_CLIENT.make_bucket(bucket_name) - print("Created bucket", bucket_name) - else: - print("Bucket", bucket_name, "already exists") - - for root, dirs, files in os.walk(filepath): - # can parallelize this upload - for file in files: - file_path = os.path.join(root, file) - object_name = file_path.split("/")[-1] - # insert local file into remote bucket - MINIO_CLIENT.fput_object(bucket_name, object_name, file_path) - print("Uploaded: ", object_name) - return "success" + try: + bucket_name = "pubmed" + print(os.environ['MINIO_URL']) + print(os.environ['MINIO_SECRET_KEY']) + print(os.environ['MINIO_ACCESS_KEY']) + found = MINIO_CLIENT.bucket_exists(bucket_name) + if not found: + MINIO_CLIENT.make_bucket(bucket_name) + print("Created bucket", bucket_name) + else: + print("Bucket", bucket_name, "already exists") + + for root, dirs, files in os.walk(filepath): + # can parallelize this upload + for file in files: + file_path = os.path.join(root, file) + object_name = file_path.split("/")[-1] + # insert local file into remote bucket + MINIO_CLIENT.fput_object(bucket_name, object_name, file_path) + print("Uploaded: ", object_name) + return "success" + except Exception as e: + print("Error uploading to storage: ", e) + return "failure" From 24425d455c6bd9ec32463171adf94be4b44310fd Mon Sep 17 00:00:00 2001 From: star-nox Date: Fri, 12 Apr 2024 09:37:45 -0500 Subject: [PATCH 09/28] parallelized metadata update --- ai_ta_backend/utils/pubmed_extraction.py | 286 ++++++++++++++--------- 1 file changed, 182 insertions(+), 104 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index a20057f0..16730f9b 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -40,18 +40,17 @@ def 
extractPubmedData(): gz_filepath = downloadXML(ftp_address, ftp_path, file_list[1], "pubmed") print("GZ Downloaded: ", gz_filepath) - gz_file_download_time = round(time.time() - start_time, 2) - print("Time taken to download .gz file: ", gz_file_download_time, "seconds") + print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") + gz_file_download_time = time.time() # extract the XML file if not gz_filepath: return "failure" xml_filepath = extractXMLFile(gz_filepath) print("XML Extracted: ", xml_filepath) - xml_extract_time = round(time.time() - gz_file_download_time, 2) - print("Time taken to extract XML file: ", xml_extract_time, "seconds") + print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") + - #xml_filepath = "pubmed/pubmed24n1219.xml" for metadata in extractMetadataFromXML(xml_filepath): metadata_extract_start_time = time.time() @@ -59,10 +58,12 @@ def extractPubmedData(): # find PMC ID and DOI for all articles metadata_with_ids = getArticleIDs(metadata) print("Time taken to get PMC ID and DOI for 100 articles: ", round(time.time() - start_time, 2), "seconds") - + #print("Metadata with IDs: ", metadata_with_ids) + # download the articles complete_metadata = downloadArticles(metadata_with_ids) - + print(complete_metadata) + # store metadata in csv file print("\n") print("Total articles retrieved: ", len(complete_metadata)) @@ -315,79 +316,153 @@ def processArticleItem(item: ET.Element): except Exception as e: return {'error': str(e)} -def getArticleIDs(metadata: list): - """ - Uses the PubMed ID converter API to get PMCID and DOI for each article. - Queries the API in batches of 200 articles at a time. - Also updates the metadata with the release date and live status - some articles are yet to be released. - Args: - metadata: List of dictionaries containing metadata for each article. - Returns: - metadata: Updated metadata with PMCID, DOI, release date, and live status information. - """ - print("In getArticleIDs()") - try: - base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" - app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json" +# def getArticleIDs(metadata: list): +# """ +# Uses the PubMed ID converter API to get PMCID and DOI for each article. +# Queries the API in batches of 200 articles at a time. +# Also updates the metadata with the release date and live status - some articles are yet to be released. +# Args: +# metadata: List of dictionaries containing metadata for each article. +# Returns: +# metadata: Updated metadata with PMCID, DOI, release date, and live status information. 
+# """ +# print("In getArticleIDs()") + +# base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" +# app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json" - batch_size = 200 # maximum number of articles API can process in one request +# batch_size = 200 # maximum number of articles API can process in one request - for i in range(0, len(metadata), batch_size): - batch = metadata[i:i+batch_size] - ids = ",".join([article['pmid'] for article in batch]) - response = requests.get(base_url + app_details + "&ids=" + ids) - data = response.json() - records = data['records'] - # PARALLELIZE THIS FOR LOOP - UPDATES ADDITIONAL FIELDS FOR ALL ARTICLES AT ONCE - with concurrent.futures.ThreadPoolExecutor() as executor: - executor.map(updateArticleMetadata, batch, records) - - # for record in records: - # if 'errmsg' in record: - # print("Error: ", record['errmsg']) - # for article in batch: - # if article['pmid'] == record['pmid']: - # article['live'] = False - # break - # continue - # else: - # # find article with matching pmid and update pmcid, doi, live, and release date fields - # for article in batch: - # if article['pmid'] == record['pmid']: - # article['pmcid'] = record['pmcid'] - # article['doi'] = record['doi'] - # article['live'] = False if 'live' in record and record['live'] == "false" else True - # article['release_date'] = record.get('release-date', article['release_date']) - # print("Updated metadata in ID converter: ", article) - # break - +# for i in range(0, len(metadata), batch_size): +# batch = metadata[i:i+batch_size] +# ids = ",".join([article['pmid'] for article in batch]) +# response = requests.get(base_url + app_details + "&ids=" + ids) +# data = response.json() +# records = data['records'] +# # PARALLELIZE THIS FOR LOOP - UPDATES ADDITIONAL FIELDS FOR ALL ARTICLES AT ONCE +# with Manager() as manager: +# shared_metadata = manager.list(batch) +# with concurrent.futures.ProcessPoolExecutor() as executor: +# futures = {executor.submit(updateArticleMetadata, shared_metadata, record): record for record in records} +# concurrent.futures.wait(futures) +# for future in concurrent.futures.as_completed(futures): +# record = futures[future] +# try: +# future.result() +# except Exception as exc: +# print('%r generated an exception: %s' % (record, exc)) - return metadata - except Exception as e: - print("Error getting article IDs: ", e) - return metadata +# print("Updated metadata: ", list(shared_metadata)) + +# print("Length of metadata after ID conversion: ", len(metadata)) + +# return metadata + + +# def updateArticleMetadata(shared_metadata, record): +# """ +# Updates metadata with PMCID, DOI, release date, and live status information for given article. +# Used withing getArticleIDs() function. 
+# """ + +# if 'errmsg' in record: +# print("Error: ", record['errmsg']) +# for article in shared_metadata: +# if article['pmid'] == record['pmid']: +# article['live'] = False +# break +# else: +# # find article with matching pmid and update pmcid, doi, live, and release date fields +# print("record: ", record) +# for article in shared_metadata: +# if article['pmid'] == record['pmid']: +# article['pmcid'] = record['pmcid'] +# article['doi'] = record['doi'] +# article['live'] = False if 'live' in record and record['live'] == "false" else True +# article['release_date'] = record.get('release-date', article['release_date']) +# print("Updated metadata in ID converter: ", article) +# break + +def getArticleIDs(metadata: list): + """ + Uses the PubMed ID converter API to get PMCID and DOI for each article. + Queries the API in batches of 200 articles at a time. + Also updates the metadata with the release date and live status - some articles are yet to be released. + Args: + metadata: List of dictionaries containing metadata for each article. + Returns: + metadata: Updated metadata with PMCID, DOI, release date, and live status information. + """ + print("In getArticleIDs()") + + base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" + app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json" + + batch_size = 200 # maximum number of articles API can process in one request + + for i in range(0, len(metadata), batch_size): + batch = metadata[i:i + batch_size] + ids = ",".join([article['pmid'] for article in batch]) + response = requests.get(base_url + app_details + "&ids=" + ids) + data = response.json() + records = data['records'] + + # PARALLELIZE THIS FOR LOOP - UPDATES ADDITIONAL FIELDS FOR ALL ARTICLES AT ONCE + with Manager() as manager: + shared_metadata = manager.dict() # Use a shared dictionary + with concurrent.futures.ProcessPoolExecutor() as executor: + futures = { + executor.submit(updateArticleMetadata, shared_metadata, record): record + for record in records + } + concurrent.futures.wait(futures) + for future in concurrent.futures.as_completed(futures): + record = futures[future] + try: + future.result() + except Exception as exc: + print('%r generated an exception: %s' % (record, exc)) + + # Update original metadata after loop + for article in metadata: + if article['pmid'] in shared_metadata: + # print("Shared metadata: ", shared_metadata[article['pmid']]) + if 'errmsg' in shared_metadata[article['pmid']]: + article['live'] = False + else: + article['pmcid'] = shared_metadata[article['pmid']]['pmcid'] + article['doi'] = shared_metadata[article['pmid']]['doi'] + article['live'] = shared_metadata[article['pmid']]['live'] + article['release_date'] = shared_metadata[article['pmid']]['release_date'] + #print("Updated metadata: ", article) + + #print("Length of metadata after ID conversion: ", len(metadata)) + return metadata + + +def updateArticleMetadata(shared_metadata, record): + """ + Updates metadata with PMCID, DOI, release date, and live status information for given article. + Used within getArticleIDs() function. 
+ """ + + if 'errmsg' in record: + print("Error: ", record['errmsg']) + shared_metadata[record['pmid']] = { + **record, # Create a copy with record data + 'live': False + } + else: + # Update shared dictionary with pmid as key and updated article data as value + shared_metadata[record['pmid']] = { + **record, # Create a copy with record data + 'pmcid': record['pmcid'], + 'doi': record['doi'], + 'live': False if 'live' in record and record['live'] == "false" else True, + 'release_date': record['release-date'] if 'release-date' in record else None, + } + #print("Updated metadata in ID converter: ", shared_metadata[record['pmid']]) -def updateArticleMetadata(shared_metadata: list, record: dict): - """ - Updates metadata with PMCID, DOI, release date, and live status information for given article. - """ - if 'errmsg' in record: - print("Error: ", record['errmsg']) - for article in shared_metadata: - if article['pmid'] == record['pmid']: - article['live'] = False - break - - else: - # find article with matching pmid and update pmcid, doi, live, and release date fields - for article in shared_metadata: - if article['pmid'] == record['pmid']: - article['pmcid'] = record['pmcid'] - article['doi'] = record['doi'] - article['live'] = False if 'live' in record and record['live'] == "false" else True - article['release_date'] = record.get('release-date', article['release_date']) - print("Updated metadata in ID converter: ", article) - break def downloadArticles(metadata: list): @@ -398,6 +473,7 @@ def downloadArticles(metadata: list): Returns: metadata: Updated metadata with license, FTP link, and downloaded filepath information. """ + print("In downloadArticles()") try: base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?" print("Downloading articles...") @@ -405,13 +481,11 @@ def downloadArticles(metadata: list): # connect to FTP server anonymously ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov") ftp.login() - - # PARALLELIZE THIS FOR LOOP - DOWNLOAD + METADATA UPDATE + for article in metadata: - if article['live'] is False or article['pmcid'] is None: continue - + # else proceed with download if article['pmcid']: # query URL for article download @@ -420,51 +494,55 @@ def downloadArticles(metadata: list): xml_response = requests.get(final_url) # get license and FTP link - extracted_data = extractArticleData(xml_response.text) - + extracted_data = extractArticleData(xml_response.text) print("\nExtracted license and link data: ", extracted_data) # if no data extracted (reason: article not released/open-access), skip to next article if not extracted_data: article['live'] = False continue - + # update metadata with license and ftp link information article['license'] = extracted_data[0]['license'] article['pubmed_ftp_link'] = extracted_data[0]['href'] if 'href' in extracted_data[0] else None - + # download the article ftp_url = urlparse(extracted_data[0]['href']) ftp_path = ftp_url.path[1:] print("FTP path: ", ftp_path) - + # Set a timeout of 15 minutes - some files take > 1 hour to download and everything hangs - timeout = threading.Timer(15 * 60, lambda: print("Download timed out!")) - timeout.start() - + # timeout = threading.Timer(15 * 60, lambda: print("Download timeout reached.")) + # timeout.start() + filename = ftp_path.split("/")[-1] local_file = os.path.join("pubmed_abstracts", filename) + try: - with open(local_file, 'wb') as f: - ftp.retrbinary('RETR ' + ftp_path, f.write) - print("Downloaded PDF file: ", local_file) - article['filepath'] = local_file - - # if file is .tar.gz, extract the 
PDF and delete the tar.gz file - if filename.endswith(".tar.gz"): - extracted_pdf_paths = extractPDF(local_file) - print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) - article['filepath'] = ",".join(extracted_pdf_paths) - os.remove(local_file) - finally: - timeout.cancel() # cancel the timer if download finishes before timeout - - print("\nUpdated metadata after download: ", article) + with concurrent.futures.ThreadPoolExecutor() as executor: + future = executor.submit(ftp.retrbinary, 'RETR ' + ftp_path, open(local_file, 'wb').write) + future.result(timeout=15*60) # Set a timeout of 15 minutes + print("Downloaded PDF file: ", local_file) + article['filepath'] = local_file + + # if file is .tar.gz, extract the PDF and delete the tar.gz file + if filename.endswith(".tar.gz"): + extracted_pdf_paths = extractPDF(local_file) + print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) + article['filepath'] = ",".join(extracted_pdf_paths) + os.remove(local_file) + except concurrent.futures.TimeoutError: + print("Download timeout reached.") + continue # Skip the download and continue with the rest of the code + + print("\nUpdated metadata after download: ", article) + ftp.quit() return metadata except Exception as e: print("Error downloading articles: ", e) - return metadata + return metadata + def extractPDF(tar_gz_filepath: str): """ From 3ddec3bd0a1cfd1297d392bd35ad9980a526d2de Mon Sep 17 00:00:00 2001 From: star-nox Date: Fri, 12 Apr 2024 09:54:11 -0500 Subject: [PATCH 10/28] added minio to requirements.txt --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 848c10d0..9855a94f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -39,6 +39,8 @@ supabase==2.0.2 posthog==3.1.0 sentry-sdk==1.39.1 +minio + # Not currently supporting coursera ingest # cs-dlp @ git+https://github.com/raffaem/cs-dlp.git@0.12.0b0 # previously called coursera-dl From e03fbf15e5daf71c7147768521b4c1044bb6ee68 Mon Sep 17 00:00:00 2001 From: star-nox Date: Fri, 12 Apr 2024 16:41:53 -0500 Subject: [PATCH 11/28] parallelized download --- ai_ta_backend/utils/pubmed_extraction.py | 245 ++++++++++++++++------- 1 file changed, 175 insertions(+), 70 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 16730f9b..16312b4a 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -16,6 +16,9 @@ import pandas as pd import threading import json +from functools import partial + + SUPBASE_CLIENT = supabase.create_client( # type: ignore supabase_url=os.getenv('SUPABASE_URL'), # type: ignore @@ -38,7 +41,7 @@ def extractPubmedData(): ftp_path = "pubmed/baseline" file_list = getFileList(ftp_address, ftp_path, ".gz") - gz_filepath = downloadXML(ftp_address, ftp_path, file_list[1], "pubmed") + gz_filepath = downloadXML(ftp_address, ftp_path, file_list[2], "pubmed") print("GZ Downloaded: ", gz_filepath) print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") gz_file_download_time = time.time() @@ -50,19 +53,20 @@ def extractPubmedData(): print("XML Extracted: ", xml_filepath) print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") - - for metadata in extractMetadataFromXML(xml_filepath): metadata_extract_start_time = time.time() # find PMC ID and DOI for all articles metadata_with_ids = getArticleIDs(metadata) - print("Time taken to get PMC ID and DOI for 100 articles: ", 
round(time.time() - start_time, 2), "seconds") + metadata_update_time = time.time() + print("Time taken to get PMC ID and DOI for 100 articles: ", round(metadata_update_time - metadata_extract_start_time, 2), "seconds") #print("Metadata with IDs: ", metadata_with_ids) # download the articles complete_metadata = downloadArticles(metadata_with_ids) print(complete_metadata) + print("Time taken to download articles for 100 articles: ", round(time.time() - metadata_update_time, 2), "seconds") + # store metadata in csv file print("\n") @@ -83,7 +87,7 @@ def extractPubmedData(): # upload articles to bucket print("Uploading articles to storage...") - article_upload = uploadToStorage("pubmed_abstracts") + article_upload = uploadToStorage("pubmed_abstracts") # need to parallelize upload print("Uploaded articles: ", article_upload) # upload metadata to SQL DB @@ -100,7 +104,6 @@ def extractPubmedData(): response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore print("Supabase response: ", response) - return "success" def downloadXML(ftp_address: str, ftp_path: str, file: str, local_dir: str): @@ -465,84 +468,188 @@ def updateArticleMetadata(shared_metadata, record): +# def downloadArticles(metadata: list): +# """ +# Downloads articles from PMC and stores them in local directory. +# Args: +# metadata: List of dictionaries containing metadata for each article. +# Returns: +# metadata: Updated metadata with license, FTP link, and downloaded filepath information. +# """ +# print("In downloadArticles()") +# try: +# base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?" +# print("Downloading articles...") + +# # connect to FTP server anonymously +# ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov") +# ftp.login() + +# for article in metadata: +# if article['live'] is False or article['pmcid'] is None: +# continue + +# # else proceed with download +# if article['pmcid']: +# # query URL for article download +# final_url = base_url + "id=" + article['pmcid'] +# print("Download URL: ", final_url) + +# xml_response = requests.get(final_url) +# # get license and FTP link +# extracted_data = extractArticleData(xml_response.text) +# print("\nExtracted license and link data: ", extracted_data) + +# # if no data extracted (reason: article not released/open-access), skip to next article +# if not extracted_data: +# article['live'] = False +# continue + +# # update metadata with license and ftp link information +# article['license'] = extracted_data[0]['license'] +# article['pubmed_ftp_link'] = extracted_data[0]['href'] if 'href' in extracted_data[0] else None + +# # download the article +# ftp_url = urlparse(extracted_data[0]['href']) +# ftp_path = ftp_url.path[1:] +# print("FTP path: ", ftp_path) + +# # Set a timeout of 15 minutes - some files take > 1 hour to download and everything hangs +# # timeout = threading.Timer(15 * 60, lambda: print("Download timeout reached.")) +# # timeout.start() + +# filename = ftp_path.split("/")[-1] +# local_file = os.path.join("pubmed_abstracts", filename) + +# try: +# with concurrent.futures.ThreadPoolExecutor() as executor: +# future = executor.submit(ftp.retrbinary, 'RETR ' + ftp_path, open(local_file, 'wb').write) +# future.result(timeout=15*60) # Set a timeout of 15 minutes +# print("Downloaded PDF file: ", local_file) +# article['filepath'] = local_file + +# # if file is .tar.gz, extract the PDF and delete the tar.gz file +# if filename.endswith(".tar.gz"): +# extracted_pdf_paths = extractPDF(local_file) +# print("Extracted PDFs from 
.tar.gz file: ", extracted_pdf_paths) +# article['filepath'] = ",".join(extracted_pdf_paths) +# os.remove(local_file) +# except concurrent.futures.TimeoutError: +# print("Download timeout reached.") +# continue # Skip the download and continue with the rest of the code + +# print("\nUpdated metadata after download: ", article) + +# ftp.quit() +# return metadata +# except Exception as e: +# print("Error downloading articles: ", e) +# return metadata + def downloadArticles(metadata: list): """ Downloads articles from PMC and stores them in local directory. Args: metadata: List of dictionaries containing metadata for each article. Returns: - metadata: Updated metadata with license, FTP link, and downloaded filepath information. + metadata: Updated metadata with license, FTP link, and downloaded filepath information. """ print("In downloadArticles()") try: base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?" - print("Downloading articles...") - # connect to FTP server anonymously - ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov") - ftp.login() + updated_articles = {} + # Use ThreadPoolExecutor to run download_article for each article in parallel + download_article_partial = partial(download_article, api_url=base_url) + with concurrent.futures.ProcessPoolExecutor() as executor: + futures = [executor.submit(download_article_partial, article) for article in metadata] + for future in concurrent.futures.as_completed(futures): + try: + updated_article = future.result(timeout=15*60) # Check result without blocking + if updated_article: + updated_articles[updated_article['pmid']] = updated_article + print("Updated article: ", updated_article) + except Exception as e: + print("Error downloading article:", e) + + # Update original metadata with updated articles for article in metadata: - if article['live'] is False or article['pmcid'] is None: - continue - - # else proceed with download - if article['pmcid']: - # query URL for article download - final_url = base_url + "id=" + article['pmcid'] - print("Download URL: ", final_url) - - xml_response = requests.get(final_url) - # get license and FTP link - extracted_data = extractArticleData(xml_response.text) - print("\nExtracted license and link data: ", extracted_data) - - # if no data extracted (reason: article not released/open-access), skip to next article - if not extracted_data: - article['live'] = False - continue + if article['pmid'] in updated_articles: + article.update(updated_articles[article['pmid']]) - # update metadata with license and ftp link information - article['license'] = extracted_data[0]['license'] - article['pubmed_ftp_link'] = extracted_data[0]['href'] if 'href' in extracted_data[0] else None - - # download the article - ftp_url = urlparse(extracted_data[0]['href']) - ftp_path = ftp_url.path[1:] - print("FTP path: ", ftp_path) - - # Set a timeout of 15 minutes - some files take > 1 hour to download and everything hangs - # timeout = threading.Timer(15 * 60, lambda: print("Download timeout reached.")) - # timeout.start() - - filename = ftp_path.split("/")[-1] - local_file = os.path.join("pubmed_abstracts", filename) - - try: - with concurrent.futures.ThreadPoolExecutor() as executor: - future = executor.submit(ftp.retrbinary, 'RETR ' + ftp_path, open(local_file, 'wb').write) - future.result(timeout=15*60) # Set a timeout of 15 minutes - print("Downloaded PDF file: ", local_file) - article['filepath'] = local_file - - # if file is .tar.gz, extract the PDF and delete the tar.gz file - if filename.endswith(".tar.gz"): - 
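      # Aside (annotation, not part of this commit): OA packages are .tar.gz
      # archives bundling the article PDF with images and source XML;
      # extractPDF() keeps only the .pdf members, then the archive is removed.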
extracted_pdf_paths = extractPDF(local_file) - print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) - article['filepath'] = ",".join(extracted_pdf_paths) - os.remove(local_file) - except concurrent.futures.TimeoutError: - print("Download timeout reached.") - continue # Skip the download and continue with the rest of the code - - print("\nUpdated metadata after download: ", article) + print("Updated metadata after download: ", metadata) - ftp.quit() - return metadata + return metadata + except Exception as e: print("Error downloading articles: ", e) - return metadata - + return metadata + +def download_article(article, api_url): + """ + Downloads the article from given FTP link and updates metadata with license, FTP link, and downloaded filepath information. + This function is used within downloadArticles() function. + Args: + article: Dictionary containing metadata for the article. + api_url: URL for the article download API. + ftp: FTP connection object. + Returns: + article: Updated metadata for the article. + """ + + print("Downloading articles...") + if not article['live'] or article['pmcid'] is None: + return + + # Proceed with download + # Connect to FTP server anonymously + ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov") + ftp.login() + + if article['pmcid']: + final_url = api_url + "id=" + article['pmcid'] + print("\nDownload URL: ", final_url) + + xml_response = requests.get(final_url) + extracted_data = extractArticleData(xml_response.text) + print("Extracted license and link data: ", extracted_data) + + if not extracted_data: + article['live'] = False + return + + article['license'] = extracted_data[0]['license'] + article['pubmed_ftp_link'] = extracted_data[0]['href'] if 'href' in extracted_data[0] else None + + ftp_url = urlparse(extracted_data[0]['href']) + ftp_path = ftp_url.path[1:] + print("FTP path: ", ftp_path) + + filename = ftp_path.split("/")[-1] + local_file = os.path.join("pubmed_abstracts", filename) + + try: + with open(local_file, 'wb') as f: + ftp.retrbinary('RETR ' + ftp_path, f.write) # Download directly to file + + print("Downloaded FTP file: ", local_file) + article['filepath'] = local_file + + if filename.endswith(".tar.gz"): + extracted_pdf_paths = extractPDF(local_file) + print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) + article['filepath'] = ",".join(extracted_pdf_paths) + os.remove(local_file) + + except concurrent.futures.TimeoutError: + print("Download timeout reached.") + + ftp.quit() + + print("\nUpdated metadata after download: ", article) + return article + def extractPDF(tar_gz_filepath: str): """ @@ -619,9 +726,7 @@ def uploadToStorage(filepath: str): print("in uploadToStorage()") try: bucket_name = "pubmed" - print(os.environ['MINIO_URL']) - print(os.environ['MINIO_SECRET_KEY']) - print(os.environ['MINIO_ACCESS_KEY']) + found = MINIO_CLIENT.bucket_exists(bucket_name) if not found: MINIO_CLIENT.make_bucket(bucket_name) From 8e5a1a0d84ea8430c3f83ad06b6b2d7504d58801 Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 15 Apr 2024 16:58:20 -0500 Subject: [PATCH 12/28] parallelized upload --- ai_ta_backend/utils/pubmed_extraction.py | 73 +++++++++++++++++++----- 1 file changed, 60 insertions(+), 13 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 16312b4a..9919ff5b 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -19,7 +19,6 @@ from functools import partial - SUPBASE_CLIENT = supabase.create_client( # type: 
ignore supabase_url=os.getenv('SUPABASE_URL'), # type: ignore supabase_key=os.getenv('SUPABASE_API_KEY') # type: ignore @@ -41,7 +40,7 @@ def extractPubmedData(): ftp_path = "pubmed/baseline" file_list = getFileList(ftp_address, ftp_path, ".gz") - gz_filepath = downloadXML(ftp_address, ftp_path, file_list[2], "pubmed") + gz_filepath = downloadXML(ftp_address, ftp_path, file_list[3], "pubmed") print("GZ Downloaded: ", gz_filepath) print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") gz_file_download_time = time.time() @@ -53,6 +52,7 @@ def extractPubmedData(): print("XML Extracted: ", xml_filepath) print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") + #xml_filepath = "pubmed/pubmed24n1217.xml" for metadata in extractMetadataFromXML(xml_filepath): metadata_extract_start_time = time.time() @@ -91,6 +91,7 @@ def extractPubmedData(): print("Uploaded articles: ", article_upload) # upload metadata to SQL DB + csv_filepath = "metadata.csv" df = pd.read_csv(csv_filepath) complete_metadata = df.to_dict('records') @@ -604,7 +605,7 @@ def download_article(article, api_url): # Proceed with download # Connect to FTP server anonymously - ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov") + ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov", timeout=15*60) ftp.login() if article['pmcid']: @@ -628,7 +629,7 @@ def download_article(article, api_url): filename = ftp_path.split("/")[-1] local_file = os.path.join("pubmed_abstracts", filename) - + try: with open(local_file, 'wb') as f: ftp.retrbinary('RETR ' + ftp_path, f.write) # Download directly to file @@ -718,15 +719,25 @@ def extractArticleData(xml_string: str): except Exception as e: print("Error extracting article data: ", e) return [] - + +def upload_file(client, bucket_name, file_path, object_name): + """ + Uploads a single file to the Minio bucket. + """ + try: + client.fput_object(bucket_name, object_name, file_path) + print(f"Uploaded: {object_name}") + except Exception as e: + print(f"Error uploading {object_name}: {e}") + def uploadToStorage(filepath: str): """ - Uploads all files present under given filepath to Minio bucket. + Uploads all files present under given filepath to Minio bucket in parallel. 
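
    Example (an aside, not part of this commit; bucket and object names are
    the ones used below, the local path is made up): the three Minio calls
    this function relies on are:

        import os
        from minio import Minio

        client = Minio(os.environ["MINIO_URL"],
                       access_key=os.environ["MINIO_ACCESS_KEY"],
                       secret_key=os.environ["MINIO_SECRET_KEY"],
                       secure=True)
        if not client.bucket_exists("pubmed"):
            client.make_bucket("pubmed")
        client.fput_object("pubmed", "12345.txt", "pubmed_abstracts/12345.txt")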
""" print("in uploadToStorage()") try: bucket_name = "pubmed" - + found = MINIO_CLIENT.bucket_exists(bucket_name) if not found: MINIO_CLIENT.make_bucket(bucket_name) @@ -734,18 +745,54 @@ def uploadToStorage(filepath: str): else: print("Bucket", bucket_name, "already exists") - for root, dirs, files in os.walk(filepath): - # can parallelize this upload - for file in files: + # Get all files to upload + files = [] + for root, _, files_ in os.walk(filepath): + for file in files_: file_path = os.path.join(root, file) object_name = file_path.split("/")[-1] - # insert local file into remote bucket - MINIO_CLIENT.fput_object(bucket_name, object_name, file_path) - print("Uploaded: ", object_name) + files.append((MINIO_CLIENT, bucket_name, file_path, object_name)) + + # Use concurrent.futures ThreadPoolExecutor for parallel upload + with concurrent.futures.ThreadPoolExecutor() as executor: + # Submit all upload tasks to the executor + futures = [executor.submit(upload_file, *args) for args in files] + # Wait for all tasks to complete + for future in futures: + future.result() # This will raise any exceptions from upload_file + return "success" except Exception as e: print("Error uploading to storage: ", e) return "failure" + +# def uploadToStorage(filepath: str): +# """ +# Uploads all files present under given filepath to Minio bucket. +# """ +# print("in uploadToStorage()") +# try: +# bucket_name = "pubmed" + +# found = MINIO_CLIENT.bucket_exists(bucket_name) +# if not found: +# MINIO_CLIENT.make_bucket(bucket_name) +# print("Created bucket", bucket_name) +# else: +# print("Bucket", bucket_name, "already exists") + +# for root, dirs, files in os.walk(filepath): +# # can parallelize this upload +# for file in files: +# file_path = os.path.join(root, file) +# object_name = file_path.split("/")[-1] +# # insert local file into remote bucket +# MINIO_CLIENT.fput_object(bucket_name, object_name, file_path) +# print("Uploaded: ", object_name) +# return "success" +# except Exception as e: +# print("Error uploading to storage: ", e) +# return "failure" From caac0bfb38f44cb50b40ae3364298464bd1bfe3b Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 17 Apr 2024 10:00:26 -0500 Subject: [PATCH 13/28] minor changes --- ai_ta_backend/utils/pubmed_extraction.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 9919ff5b..a025e025 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -40,7 +40,7 @@ def extractPubmedData(): ftp_path = "pubmed/baseline" file_list = getFileList(ftp_address, ftp_path, ".gz") - gz_filepath = downloadXML(ftp_address, ftp_path, file_list[3], "pubmed") + gz_filepath = downloadXML(ftp_address, ftp_path, file_list[4], "pubmed") print("GZ Downloaded: ", gz_filepath) print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") gz_file_download_time = time.time() @@ -567,6 +567,7 @@ def downloadArticles(metadata: list): futures = [executor.submit(download_article_partial, article) for article in metadata] for future in concurrent.futures.as_completed(futures): try: + print("Starting new download...") updated_article = future.result(timeout=15*60) # Check result without blocking if updated_article: updated_articles[updated_article['pmid']] = updated_article From fa605089169abb2663e2a0fe475e1142a5665de3 Mon Sep 17 00:00:00 2001 From: star-nox Date: Fri, 19 Apr 2024 10:38:04 -0500 Subject: [PATCH 14/28] 
restricted upload parallelization to 10 --- ai_ta_backend/utils/pubmed_extraction.py | 146 ++++++++++++----------- 1 file changed, 79 insertions(+), 67 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index a025e025..779b2cc2 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -39,72 +39,83 @@ def extractPubmedData(): ftp_address = "ftp.ncbi.nlm.nih.gov" ftp_path = "pubmed/baseline" file_list = getFileList(ftp_address, ftp_path, ".gz") - - gz_filepath = downloadXML(ftp_address, ftp_path, file_list[4], "pubmed") - print("GZ Downloaded: ", gz_filepath) - print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") - gz_file_download_time = time.time() - - # extract the XML file - if not gz_filepath: - return "failure" - xml_filepath = extractXMLFile(gz_filepath) - print("XML Extracted: ", xml_filepath) - print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") - - #xml_filepath = "pubmed/pubmed24n1217.xml" - for metadata in extractMetadataFromXML(xml_filepath): - metadata_extract_start_time = time.time() - - # find PMC ID and DOI for all articles - metadata_with_ids = getArticleIDs(metadata) - metadata_update_time = time.time() - print("Time taken to get PMC ID and DOI for 100 articles: ", round(metadata_update_time - metadata_extract_start_time, 2), "seconds") - #print("Metadata with IDs: ", metadata_with_ids) - - # download the articles - complete_metadata = downloadArticles(metadata_with_ids) - print(complete_metadata) - print("Time taken to download articles for 100 articles: ", round(time.time() - metadata_update_time, 2), "seconds") + for file in file_list[5:]: # already processed first 5 files + try: + print("Processing file: ", file) - # store metadata in csv file - print("\n") - print("Total articles retrieved: ", len(complete_metadata)) - df = pd.DataFrame(complete_metadata) - csv_filepath = "metadata.csv" - - if os.path.isfile(csv_filepath): - df.to_csv(csv_filepath, mode='a', header=False, index=False) - else: - df.to_csv(csv_filepath, index=False) - - print("Time taken to extract metadata for 100 articles: ", round(time.time() - metadata_extract_start_time, 2), "seconds") + gz_filepath = downloadXML(ftp_address, ftp_path, file, "pubmed") + print("GZ Downloaded: ", gz_filepath) + print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") + gz_file_download_time = time.time() + + # extract the XML file + if not gz_filepath: + return "failure" + xml_filepath = extractXMLFile(gz_filepath) + print("XML Extracted: ", xml_filepath) + print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") + + #xml_filepath = "pubmed/pubmed24n1217.xml" + for metadata in extractMetadataFromXML(xml_filepath): + metadata_extract_start_time = time.time() + + # find PMC ID and DOI for all articles + metadata_with_ids = getArticleIDs(metadata) + metadata_update_time = time.time() + print("Time taken to get PMC ID and DOI for 100 articles: ", round(metadata_update_time - metadata_extract_start_time, 2), "seconds") + #print("Metadata with IDs: ", metadata_with_ids) + + # download the articles + complete_metadata = downloadArticles(metadata_with_ids) + print(complete_metadata) + print("Time taken to download articles for 100 articles: ", round(time.time() - metadata_update_time, 2), "seconds") + + # store metadata in csv file + print("\n") + print("Total 
articles retrieved: ", len(complete_metadata)) + df = pd.DataFrame(complete_metadata) + csv_filepath = "metadata.csv" + + if os.path.isfile(csv_filepath): + df.to_csv(csv_filepath, mode='a', header=False, index=False) + else: + df.to_csv(csv_filepath, index=False) + + print("Time taken to extract metadata for 100 articles: ", round(time.time() - metadata_extract_start_time, 2), "seconds") - print("Time taken to download articles: ", round(time.time() - start_time, 2), "seconds") - print("Total metadata extracted: ", len(complete_metadata)) + print("Time taken to download articles: ", round(time.time() - start_time, 2), "seconds") + print("Total metadata extracted: ", len(complete_metadata)) - # upload articles to bucket - print("Uploading articles to storage...") - article_upload = uploadToStorage("pubmed_abstracts") # need to parallelize upload - print("Uploaded articles: ", article_upload) - - # upload metadata to SQL DB - csv_filepath = "metadata.csv" - df = pd.read_csv(csv_filepath) - - complete_metadata = df.to_dict('records') - for item in complete_metadata: - for key, value in item.items(): - if pd.isna(value): # Or: math.isnan(value) - item[key] = None - - print("Metadata loaded into dataframe: ", len(complete_metadata)) - # continue with the rest of the code - response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore - print("Supabase response: ", response) + # upload articles to bucket + print("Uploading articles to storage...") + article_upload = uploadToStorage("pubmed_abstracts") # need to parallelize upload + print("Uploaded articles: ", article_upload) + + # upload metadata to SQL DB + csv_filepath = "metadata.csv" + df = pd.read_csv(csv_filepath) + + complete_metadata = df.to_dict('records') + for item in complete_metadata: + for key, value in item.items(): + if pd.isna(value): # Or: math.isnan(value) + item[key] = None + print("Metadata loaded into dataframe: ", len(complete_metadata)) + # continue with the rest of the code + response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore + print("Uploaded metadata to SQL DB.") + + # delete files + os.remove(csv_filepath) + os.remove("pubmed_abstracts") + + except Exception as e: + print("Error processing file: ", e) + continue + return "success" def downloadXML(ftp_address: str, ftp_path: str, file: str, local_dir: str): @@ -754,13 +765,14 @@ def uploadToStorage(filepath: str): object_name = file_path.split("/")[-1] files.append((MINIO_CLIENT, bucket_name, file_path, object_name)) - # Use concurrent.futures ThreadPoolExecutor for parallel upload - with concurrent.futures.ThreadPoolExecutor() as executor: - # Submit all upload tasks to the executor - futures = [executor.submit(upload_file, *args) for args in files] - # Wait for all tasks to complete - for future in futures: - future.result() # This will raise any exceptions from upload_file + # Use concurrent.futures ThreadPoolExecutor with limited pool size + with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: + # Submit files in batches of 10 + for i in range(0, len(files), 10): + batch_files = files[i:i+10] + futures = [executor.submit(upload_file, *args) for args in batch_files] + for future in futures: + future.result() # This will raise any exceptions from upload_file return "success" except Exception as e: From a61255c4be724f272f334de0fab40a54551359d2 Mon Sep 17 00:00:00 2001 From: star-nox Date: Sun, 21 Apr 2024 09:01:58 -0500 Subject: [PATCH 15/28] changed starting XML 
file
---
 ai_ta_backend/utils/pubmed_extraction.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py
index 779b2cc2..03d19824 100644
--- a/ai_ta_backend/utils/pubmed_extraction.py
+++ b/ai_ta_backend/utils/pubmed_extraction.py
@@ -39,8 +39,9 @@ def extractPubmedData():
   ftp_address = "ftp.ncbi.nlm.nih.gov"
   ftp_path = "pubmed/baseline"
   file_list = getFileList(ftp_address, ftp_path, ".gz")
+
 
-  for file in file_list[5:]: # already processed first 5 files
+  for file in file_list[7:]: # already processed first 7 files
     try:
       print("Processing file: ", file)
 

From 4d86b85ed0857ffd9ac5fd1d91c31c0a3ce6fa81 Mon Sep 17 00:00:00 2001
From: star-nox
Date: Mon, 22 Apr 2024 12:11:18 -0500
Subject: [PATCH 16/28] changed starting XML file

---
 ai_ta_backend/utils/pubmed_extraction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py
index 03d19824..3d660e3f 100644
--- a/ai_ta_backend/utils/pubmed_extraction.py
+++ b/ai_ta_backend/utils/pubmed_extraction.py
@@ -41,7 +41,7 @@ def extractPubmedData():
   file_list = getFileList(ftp_address, ftp_path, ".gz")
 
 
-  for file in file_list[7:]: # already processed first 7 files
+  for file in file_list[8:]: # already processed first 8 files
     try:
       print("Processing file: ", file)
 

From 63a6cb60bb09ef5ffd99d2bedff6394149e131be Mon Sep 17 00:00:00 2001
From: star-nox
Date: Mon, 29 Apr 2024 15:53:14 -0500
Subject: [PATCH 17/28] changed start file

---
 ai_ta_backend/utils/pubmed_extraction.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py
index 3d660e3f..c5f50dd9 100644
--- a/ai_ta_backend/utils/pubmed_extraction.py
+++ b/ai_ta_backend/utils/pubmed_extraction.py
@@ -41,7 +41,7 @@ def extractPubmedData():
   file_list = getFileList(ftp_address, ftp_path, ".gz")
 
 
-  for file in file_list[8:]: # already processed first 8 files
+  for file in file_list[10:]: # already processed first 5 files
     try:
       print("Processing file: ", file)
 
@@ -89,7 +89,7 @@ def extractPubmedData():
       print("Time taken to download articles: ", round(time.time() - start_time, 2), "seconds")
       print("Total metadata extracted: ", len(complete_metadata))
 
-      # upload articles to bucket
+      upload articles to bucket
       print("Uploading articles to storage...")
       article_upload = uploadToStorage("pubmed_abstracts") # need to parallelize upload
       print("Uploaded articles: ", article_upload)
 
@@ -108,7 +108,7 @@ def extractPubmedData():
       # continue with the rest of the code
       response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore
       print("Uploaded metadata to SQL DB.")
-
+
       # delete files
       os.remove(csv_filepath)
       os.remove("pubmed_abstracts")
 
@@ -116,6 +116,7 @@ def extractPubmedData():
     except Exception as e:
       print("Error processing file: ", e)
       continue
+  exit()
 
   return "success"

From b17baeaa618590acfcbb8b03ab9f1838fb56cbc3 Mon Sep 17 00:00:00 2001
From: star-nox
Date: Mon, 29 Apr 2024 16:01:24 -0500
Subject: [PATCH 18/28] minor comment

---
 ai_ta_backend/utils/pubmed_extraction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py
index c5f50dd9..12a88de3 100644
--- a/ai_ta_backend/utils/pubmed_extraction.py
+++ b/ai_ta_backend/utils/pubmed_extraction.py
@@ -89,7 +89,7 @@ def extractPubmedData():
print("Time taken to download articles: ", round(time.time() - start_time, 2), "seconds") print("Total metadata extracted: ", len(complete_metadata)) - upload articles to bucket + # upload articles to bucket print("Uploading articles to storage...") article_upload = uploadToStorage("pubmed_abstracts") # need to parallelize upload print("Uploaded articles: ", article_upload) From 4cfc6b8046bebc5feeba667cca4cc238e058d41a Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 6 May 2024 11:36:33 -0500 Subject: [PATCH 19/28] minor changes in main loop --- ai_ta_backend/utils/pubmed_extraction.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 12a88de3..01731dc4 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -41,7 +41,7 @@ def extractPubmedData(): file_list = getFileList(ftp_address, ftp_path, ".gz") - for file in file_list[10:]: # already processed first 5 files + for file in file_list[18:20]: try: print("Processing file: ", file) @@ -65,7 +65,6 @@ def extractPubmedData(): metadata_with_ids = getArticleIDs(metadata) metadata_update_time = time.time() print("Time taken to get PMC ID and DOI for 100 articles: ", round(metadata_update_time - metadata_extract_start_time, 2), "seconds") - #print("Metadata with IDs: ", metadata_with_ids) # download the articles complete_metadata = downloadArticles(metadata_with_ids) @@ -95,28 +94,26 @@ def extractPubmedData(): print("Uploaded articles: ", article_upload) # upload metadata to SQL DB - csv_filepath = "metadata.csv" df = pd.read_csv(csv_filepath) - complete_metadata = df.to_dict('records') for item in complete_metadata: for key, value in item.items(): if pd.isna(value): # Or: math.isnan(value) item[key] = None - print("Metadata loaded into dataframe: ", len(complete_metadata)) + # continue with the rest of the code response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore print("Uploaded metadata to SQL DB.") - # delete files - os.remove(csv_filepath) - os.remove("pubmed_abstracts") - except Exception as e: print("Error processing file: ", e) - continue - exit() + + # delete files + shutil.rmtree("F:/MSIM/ML_Projects/ai-ta-backend/pubmed_abstracts") + os.remove("F:/MSIM/ML_Projects/ai-ta-backend/metadata.csv") + #os.remove(xml_filepath) + print("Finished file: ", file) return "success" From 8b533ba33c3e72d54434dc09f1f821c9ed263a8a Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 13 May 2024 11:14:33 -0500 Subject: [PATCH 20/28] minor changes --- ai_ta_backend/utils/pubmed_extraction.py | 118 +++++++++++------------ 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 01731dc4..46dbf2aa 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -41,77 +41,77 @@ def extractPubmedData(): file_list = getFileList(ftp_address, ftp_path, ".gz") - for file in file_list[18:20]: - try: - print("Processing file: ", file) + for file in file_list[20:21]: + # try: + # print("Processing file: ", file) - gz_filepath = downloadXML(ftp_address, ftp_path, file, "pubmed") - print("GZ Downloaded: ", gz_filepath) - print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") - gz_file_download_time = time.time() - - # extract the XML file - if not gz_filepath: - return "failure" - 
xml_filepath = extractXMLFile(gz_filepath) - print("XML Extracted: ", xml_filepath) - print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") + # gz_filepath = downloadXML(ftp_address, ftp_path, file, "pubmed") + # print("GZ Downloaded: ", gz_filepath) + # print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") + # gz_file_download_time = time.time() + + # # extract the XML file + # if not gz_filepath: + # return "failure" + # xml_filepath = extractXMLFile(gz_filepath) + # print("XML Extracted: ", xml_filepath) + # print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") - #xml_filepath = "pubmed/pubmed24n1217.xml" - for metadata in extractMetadataFromXML(xml_filepath): - metadata_extract_start_time = time.time() - - # find PMC ID and DOI for all articles - metadata_with_ids = getArticleIDs(metadata) - metadata_update_time = time.time() - print("Time taken to get PMC ID and DOI for 100 articles: ", round(metadata_update_time - metadata_extract_start_time, 2), "seconds") + # #xml_filepath = "pubmed/pubmed24n1217.xml" + # for metadata in extractMetadataFromXML(xml_filepath): + # metadata_extract_start_time = time.time() + + # # find PMC ID and DOI for all articles + # metadata_with_ids = getArticleIDs(metadata) + # metadata_update_time = time.time() + # print("Time taken to get PMC ID and DOI for 100 articles: ", round(metadata_update_time - metadata_extract_start_time, 2), "seconds") - # download the articles - complete_metadata = downloadArticles(metadata_with_ids) - print(complete_metadata) - print("Time taken to download articles for 100 articles: ", round(time.time() - metadata_update_time, 2), "seconds") - - # store metadata in csv file - print("\n") - print("Total articles retrieved: ", len(complete_metadata)) - df = pd.DataFrame(complete_metadata) - csv_filepath = "metadata.csv" - - if os.path.isfile(csv_filepath): - df.to_csv(csv_filepath, mode='a', header=False, index=False) - else: - df.to_csv(csv_filepath, index=False) + # # download the articles + # complete_metadata = downloadArticles(metadata_with_ids) + # print(complete_metadata) + # print("Time taken to download articles for 100 articles: ", round(time.time() - metadata_update_time, 2), "seconds") + + # # store metadata in csv file + # print("\n") + # print("Total articles retrieved: ", len(complete_metadata)) + # df = pd.DataFrame(complete_metadata) + # csv_filepath = "metadata.csv" + + # if os.path.isfile(csv_filepath): + # df.to_csv(csv_filepath, mode='a', header=False, index=False) + # else: + # df.to_csv(csv_filepath, index=False) - print("Time taken to extract metadata for 100 articles: ", round(time.time() - metadata_extract_start_time, 2), "seconds") + # print("Time taken to extract metadata for 100 articles: ", round(time.time() - metadata_extract_start_time, 2), "seconds") - print("Time taken to download articles: ", round(time.time() - start_time, 2), "seconds") - print("Total metadata extracted: ", len(complete_metadata)) + # print("Time taken to download articles: ", round(time.time() - start_time, 2), "seconds") + # print("Total metadata extracted: ", len(complete_metadata)) - # upload articles to bucket - print("Uploading articles to storage...") - article_upload = uploadToStorage("pubmed_abstracts") # need to parallelize upload - print("Uploaded articles: ", article_upload) + # # upload articles to bucket + # print("Uploading articles to storage...") + # article_upload = 
uploadToStorage("pubmed_abstracts") # need to parallelize upload + # print("Uploaded articles: ", article_upload) - # upload metadata to SQL DB - df = pd.read_csv(csv_filepath) - complete_metadata = df.to_dict('records') - for item in complete_metadata: - for key, value in item.items(): - if pd.isna(value): # Or: math.isnan(value) - item[key] = None - print("Metadata loaded into dataframe: ", len(complete_metadata)) + # # upload metadata to SQL DB + # df = pd.read_csv(csv_filepath) + # complete_metadata = df.to_dict('records') + # for item in complete_metadata: + # for key, value in item.items(): + # if pd.isna(value): # Or: math.isnan(value) + # item[key] = None + # print("Metadata loaded into dataframe: ", len(complete_metadata)) - # continue with the rest of the code - response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore - print("Uploaded metadata to SQL DB.") + # # continue with the rest of the code + # response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore + # print("Uploaded metadata to SQL DB.") - except Exception as e: - print("Error processing file: ", e) + # except Exception as e: + # print("Error processing file: ", e) # delete files - shutil.rmtree("F:/MSIM/ML_Projects/ai-ta-backend/pubmed_abstracts") - os.remove("F:/MSIM/ML_Projects/ai-ta-backend/metadata.csv") + shutil.rmtree("pubmed_abstracts") + os.remove("metadata.csv") #os.remove(xml_filepath) print("Finished file: ", file) From 6dfe50ba03940483b30a5c698675eac7d1d9b4d8 Mon Sep 17 00:00:00 2001 From: star-nox Date: Tue, 14 May 2024 11:01:17 -0500 Subject: [PATCH 21/28] parallelized processing --- ai_ta_backend/utils/pubmed_extraction.py | 175 +++++++++++++---------- 1 file changed, 96 insertions(+), 79 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 46dbf2aa..39276ade 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -39,83 +39,100 @@ def extractPubmedData(): ftp_address = "ftp.ncbi.nlm.nih.gov" ftp_path = "pubmed/baseline" file_list = getFileList(ftp_address, ftp_path, ".gz") + + with concurrent.futures.ProcessPoolExecutor() as executor: + futures = [executor.submit(processPubmedXML, file, ftp_address, ftp_path) for file in file_list[21:22]] + for future in concurrent.futures.as_completed(futures): + try: + future.result() + except Exception as e: + print("Error processing file: ", e) + + return "success" - for file in file_list[20:21]: - # try: - # print("Processing file: ", file) +def processPubmedXML(file:str, ftp_address:str, ftp_path:str): + """ + Main function to extract metadata and articles from the PubMed baseline folder. 
+ """ + start_time = time.monotonic() + try: + print("Processing file: ", file) + gz_filepath = downloadXML(ftp_address, ftp_path, file, "pubmed") + print("GZ Downloaded: ", gz_filepath) + print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") + gz_file_download_time = time.time() + + # extract the XML file + if not gz_filepath: + return "failure" + xml_filepath = extractXMLFile(gz_filepath) + print("XML Extracted: ", xml_filepath) + print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") - # gz_filepath = downloadXML(ftp_address, ftp_path, file, "pubmed") - # print("GZ Downloaded: ", gz_filepath) - # print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") - # gz_file_download_time = time.time() - - # # extract the XML file - # if not gz_filepath: - # return "failure" - # xml_filepath = extractXMLFile(gz_filepath) - # print("XML Extracted: ", xml_filepath) - # print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") - - # #xml_filepath = "pubmed/pubmed24n1217.xml" - # for metadata in extractMetadataFromXML(xml_filepath): - # metadata_extract_start_time = time.time() - - # # find PMC ID and DOI for all articles - # metadata_with_ids = getArticleIDs(metadata) - # metadata_update_time = time.time() - # print("Time taken to get PMC ID and DOI for 100 articles: ", round(metadata_update_time - metadata_extract_start_time, 2), "seconds") + xml_id = xml_filepath[7:-4].replace(".", "_") + destination_dir = xml_id + "_papers" + csv_filepath = xml_id + "_metadata.csv" + print("Destination directory: ", destination_dir) + print("CSV file path: ", csv_filepath) + #xml_filepath = "pubmed/pubmed24n1217.xml" + + for metadata in extractMetadataFromXML(xml_filepath, destination_dir): + metadata_extract_start_time = time.time() + + # find PMC ID and DOI for all articles + metadata_with_ids = getArticleIDs(metadata) + metadata_update_time = time.time() + print("Time taken to get PMC ID and DOI for 100 articles: ", round(metadata_update_time - metadata_extract_start_time, 2), "seconds") - # # download the articles - # complete_metadata = downloadArticles(metadata_with_ids) - # print(complete_metadata) - # print("Time taken to download articles for 100 articles: ", round(time.time() - metadata_update_time, 2), "seconds") - - # # store metadata in csv file - # print("\n") - # print("Total articles retrieved: ", len(complete_metadata)) - # df = pd.DataFrame(complete_metadata) - # csv_filepath = "metadata.csv" - - # if os.path.isfile(csv_filepath): - # df.to_csv(csv_filepath, mode='a', header=False, index=False) - # else: - # df.to_csv(csv_filepath, index=False) + # download the articles + complete_metadata = downloadArticles(metadata_with_ids, destination_dir) + print(complete_metadata) + print("Time taken to download articles for 100 articles: ", round(time.time() - metadata_update_time, 2), "seconds") + + # store metadata in csv file + print("\n") + print("Total articles retrieved: ", len(complete_metadata)) + df = pd.DataFrame(complete_metadata) + + if os.path.isfile(csv_filepath): + df.to_csv(csv_filepath, mode='a', header=False, index=False) + else: + df.to_csv(csv_filepath, index=False) - # print("Time taken to extract metadata for 100 articles: ", round(time.time() - metadata_extract_start_time, 2), "seconds") + print("Time taken to extract metadata for 100 articles: ", round(time.time() - metadata_extract_start_time, 2), "seconds") - # print("Time 
taken to download articles: ", round(time.time() - start_time, 2), "seconds") - # print("Total metadata extracted: ", len(complete_metadata)) + print("Time taken to download articles: ", round(time.time() - start_time, 2), "seconds") + print("Total metadata extracted: ", len(complete_metadata)) - # # upload articles to bucket - # print("Uploading articles to storage...") - # article_upload = uploadToStorage("pubmed_abstracts") # need to parallelize upload - # print("Uploaded articles: ", article_upload) - - # # upload metadata to SQL DB - # df = pd.read_csv(csv_filepath) - # complete_metadata = df.to_dict('records') - # for item in complete_metadata: - # for key, value in item.items(): - # if pd.isna(value): # Or: math.isnan(value) - # item[key] = None - # print("Metadata loaded into dataframe: ", len(complete_metadata)) + # upload articles to bucket + print("Uploading articles to storage...") + article_upload = uploadToStorage("pubmed_abstracts") # need to parallelize upload + print("Uploaded articles: ", article_upload) - # # continue with the rest of the code - # response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore - # print("Uploaded metadata to SQL DB.") + # upload metadata to SQL DB + df = pd.read_csv(csv_filepath) + complete_metadata = df.to_dict('records') + for item in complete_metadata: + for key, value in item.items(): + if pd.isna(value): # Or: math.isnan(value) + item[key] = None + print("Metadata loaded into dataframe: ", len(complete_metadata)) - # except Exception as e: - # print("Error processing file: ", e) - - # delete files - shutil.rmtree("pubmed_abstracts") - os.remove("metadata.csv") - #os.remove(xml_filepath) - print("Finished file: ", file) + # continue with the rest of the code + response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore + print("Uploaded metadata to SQL DB.") - return "success" + except Exception as e: + print("Error processing file: ", e) + + # delete files + shutil.rmtree(destination_dir) + os.remove(csv_filepath) + #os.remove(xml_filepath) + print("Finished file: ", file) + def downloadXML(ftp_address: str, ftp_path: str, file: str, local_dir: str): """ @@ -202,7 +219,7 @@ def extractXMLFile(gz_filepath: str): print("Error extracting XML file: ", e) return None -def extractMetadataFromXML(xml_filepath: str): +def extractMetadataFromXML(xml_filepath: str, dir: str): """ Extracts article details from the XML file and stores it in a dictionary. Details extracted: PMID, PMCID, DOI, ISSN, journal title, article title, @@ -215,7 +232,7 @@ def extractMetadataFromXML(xml_filepath: str): print("inside extractMetadataFromXML()") try: # create a directory to store abstracts - os.makedirs("pubmed_abstracts", exist_ok=True) + os.makedirs(dir, exist_ok=True) tree = ET.parse(xml_filepath) root = tree.getroot() @@ -227,7 +244,7 @@ def extractMetadataFromXML(xml_filepath: str): article_items = list(item for item in root.iter('PubmedArticle')) # Convert generator to list for item in article_items: - future = executor.submit(processArticleItem, item) + future = executor.submit(processArticleItem, item, dir) article_data = future.result() metadata.append(article_data) @@ -246,7 +263,7 @@ def extractMetadataFromXML(xml_filepath: str): return [] -def processArticleItem(item: ET.Element): +def processArticleItem(item: ET.Element, directory: str): """ Extracts article details from a single PubmedArticle XML element. This is used in the process pool executor. 
Args: @@ -311,7 +328,7 @@ def processArticleItem(item: ET.Element): abstract_text += abstract_text_element.text + "\n" # save abstract to a text file - abstract_filename = f"pubmed_abstracts/{article_data['pmid']}.txt" + abstract_filename = directory + "/" + article_data['pmid'] + ".txt" with open(abstract_filename, 'w') as f: if article_data['journal_title']: f.write("Journal title: " + article_data['journal_title'] + "\n\n") @@ -557,7 +574,7 @@ def updateArticleMetadata(shared_metadata, record): # print("Error downloading articles: ", e) # return metadata -def downloadArticles(metadata: list): +def downloadArticles(metadata: list, dir: str): """ Downloads articles from PMC and stores them in local directory. Args: @@ -572,7 +589,7 @@ def downloadArticles(metadata: list): updated_articles = {} # Use ThreadPoolExecutor to run download_article for each article in parallel - download_article_partial = partial(download_article, api_url=base_url) + download_article_partial = partial(download_article, api_url=base_url, dir=dir) with concurrent.futures.ProcessPoolExecutor() as executor: futures = [executor.submit(download_article_partial, article) for article in metadata] for future in concurrent.futures.as_completed(futures): @@ -598,7 +615,7 @@ def downloadArticles(metadata: list): print("Error downloading articles: ", e) return metadata -def download_article(article, api_url): +def download_article(article, api_url, dir): """ Downloads the article from given FTP link and updates metadata with license, FTP link, and downloaded filepath information. This function is used within downloadArticles() function. @@ -639,7 +656,7 @@ def download_article(article, api_url): print("FTP path: ", ftp_path) filename = ftp_path.split("/")[-1] - local_file = os.path.join("pubmed_abstracts", filename) + local_file = os.path.join(dir, filename) try: with open(local_file, 'wb') as f: @@ -649,7 +666,7 @@ def download_article(article, api_url): article['filepath'] = local_file if filename.endswith(".tar.gz"): - extracted_pdf_paths = extractPDF(local_file) + extracted_pdf_paths = extractPDF(local_file, dir) print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) article['filepath'] = ",".join(extracted_pdf_paths) os.remove(local_file) @@ -663,7 +680,7 @@ def download_article(article, api_url): return article -def extractPDF(tar_gz_filepath: str): +def extractPDF(tar_gz_filepath: str, dest_directory: str): """ Extracts PDF files from the downloaded .tar.gz file. The zipped folder contains other supplementary materials like images, etc. which are not extracted. 
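
The downloadArticles() hunks above switch to functools.partial plus a ProcessPoolExecutor so the per-article fetches run in parallel while the shared arguments (API URL, destination directory) stay fixed. A minimal self-contained sketch of that pattern follows; fetch_one/fetch_all and the example PMID are illustrative stand-ins, not the module's own code, and the stub body replaces the real FTP download:

import concurrent.futures
from functools import partial

def fetch_one(article: dict, api_url: str, dir: str) -> dict:
    # stand-in for the real per-article download; returns the updated record
    article["filepath"] = dir + "/" + article["pmid"] + ".txt"
    return article

def fetch_all(metadata: list, api_url: str, dir: str) -> list:
    # freeze the shared arguments, then fan the per-article work out over processes
    fetch_partial = partial(fetch_one, api_url=api_url, dir=dir)
    results = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = [executor.submit(fetch_partial, article) for article in metadata]
        for future in concurrent.futures.as_completed(futures):
            try:
                results.append(future.result())
            except Exception as e:
                print("Error downloading article: ", e)  # a bad record should not stop the batch
    return results

if __name__ == "__main__":  # guard required for process pools on spawn-based platforms
    print(fetch_all([{"pmid": "37417630"}], "https://example.org/api", "papers"))

Note that the worker and the partial must be picklable, which is why fetch_one is a module-level function rather than a lambda or closure.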
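
For the extractPDF() hunk that follows: the idea is to pull only the .pdf members out of the downloaded PMC .tar.gz package into the caller-supplied directory, skipping images and other supplementary files. A standalone sketch under the same assumptions (the example paths are hypothetical; a hardened version would also validate member names against path traversal before extracting):

import os
import tarfile

def extract_pdfs(tar_gz_filepath: str, dest_directory: str) -> list:
    """Extract only the .pdf members of a .tar.gz archive into dest_directory."""
    extracted_paths = []
    os.makedirs(dest_directory, exist_ok=True)
    with tarfile.open(tar_gz_filepath, "r:gz") as tar:
        for member in tar:
            # regular files only; directories and non-PDF supplements are skipped
            if member.isreg() and member.name.endswith(".pdf"):
                tar.extract(member, path=dest_directory)
                extracted_paths.append(os.path.join(dest_directory, member.name))
    return extracted_paths

# extract_pdfs("PMC10328620.tar.gz", "pubmed24n1219_papers")  # hypothetical example paths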
@@ -678,9 +695,9 @@ def extractPDF(tar_gz_filepath: str): with tarfile.open(tar_gz_filepath, "r:gz") as tar: for member in tar: if member.isreg() and member.name.endswith(".pdf"): - tar.extract(member, path="pubmed_abstracts") + tar.extract(member, path=dest_directory) print("Extracted: ", member.name) - extracted_paths.append(os.path.join("pubmed_abstracts", member.name)) + extracted_paths.append(os.path.join(dest_directory, member.name)) return extracted_paths except Exception as e: From e98f3dc04303c8aef2eb0b2fcdfd5ce4f72c2ade Mon Sep 17 00:00:00 2001 From: star-nox Date: Tue, 14 May 2024 12:53:46 -0500 Subject: [PATCH 22/28] added try-except in getArticleIds() --- ai_ta_backend/utils/pubmed_extraction.py | 87 ++++++++++--------- metadata.csv | 101 +++++++++++++++++++++++ 2 files changed, 144 insertions(+), 44 deletions(-) create mode 100644 metadata.csv diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 12a88de3..524c6e79 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -41,7 +41,7 @@ def extractPubmedData(): file_list = getFileList(ftp_address, ftp_path, ".gz") - for file in file_list[10:]: # already processed first 5 files + for file in file_list[22:23]: try: print("Processing file: ", file) @@ -65,7 +65,6 @@ def extractPubmedData(): metadata_with_ids = getArticleIDs(metadata) metadata_update_time = time.time() print("Time taken to get PMC ID and DOI for 100 articles: ", round(metadata_update_time - metadata_extract_start_time, 2), "seconds") - #print("Metadata with IDs: ", metadata_with_ids) # download the articles complete_metadata = downloadArticles(metadata_with_ids) @@ -95,28 +94,26 @@ def extractPubmedData(): print("Uploaded articles: ", article_upload) # upload metadata to SQL DB - csv_filepath = "metadata.csv" df = pd.read_csv(csv_filepath) - complete_metadata = df.to_dict('records') for item in complete_metadata: for key, value in item.items(): if pd.isna(value): # Or: math.isnan(value) item[key] = None - print("Metadata loaded into dataframe: ", len(complete_metadata)) + # continue with the rest of the code response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore print("Uploaded metadata to SQL DB.") - # delete files - os.remove(csv_filepath) - os.remove("pubmed_abstracts") - except Exception as e: print("Error processing file: ", e) - continue - exit() + + # delete files + shutil.rmtree("pubmed_abstracts") + os.remove("metadata.csv") + #os.remove(xml_filepath) + print("Finished file: ", file) return "success" @@ -420,39 +417,41 @@ def getArticleIDs(metadata: list): for i in range(0, len(metadata), batch_size): batch = metadata[i:i + batch_size] ids = ",".join([article['pmid'] for article in batch]) - response = requests.get(base_url + app_details + "&ids=" + ids) - data = response.json() - records = data['records'] - - # PARALLELIZE THIS FOR LOOP - UPDATES ADDITIONAL FIELDS FOR ALL ARTICLES AT ONCE - with Manager() as manager: - shared_metadata = manager.dict() # Use a shared dictionary - with concurrent.futures.ProcessPoolExecutor() as executor: - futures = { - executor.submit(updateArticleMetadata, shared_metadata, record): record - for record in records - } - concurrent.futures.wait(futures) - for future in concurrent.futures.as_completed(futures): - record = futures[future] - try: - future.result() - except Exception as exc: - print('%r generated an exception: %s' % (record, exc)) - - # Update original metadata after loop 
- for article in metadata: - if article['pmid'] in shared_metadata: - # print("Shared metadata: ", shared_metadata[article['pmid']]) - if 'errmsg' in shared_metadata[article['pmid']]: - article['live'] = False - else: - article['pmcid'] = shared_metadata[article['pmid']]['pmcid'] - article['doi'] = shared_metadata[article['pmid']]['doi'] - article['live'] = shared_metadata[article['pmid']]['live'] - article['release_date'] = shared_metadata[article['pmid']]['release_date'] - #print("Updated metadata: ", article) - + try: + response = requests.get(base_url + app_details + "&ids=" + ids) + data = response.json() + records = data['records'] + + # PARALLELIZE THIS FOR LOOP - UPDATES ADDITIONAL FIELDS FOR ALL ARTICLES AT ONCE + with Manager() as manager: + shared_metadata = manager.dict() # Use a shared dictionary + with concurrent.futures.ProcessPoolExecutor() as executor: + futures = { + executor.submit(updateArticleMetadata, shared_metadata, record): record + for record in records + } + concurrent.futures.wait(futures) + for future in concurrent.futures.as_completed(futures): + record = futures[future] + try: + future.result() + except Exception as exc: + print('%r generated an exception: %s' % (record, exc)) + + # Update original metadata after loop + for article in metadata: + if article['pmid'] in shared_metadata: + # print("Shared metadata: ", shared_metadata[article['pmid']]) + if 'errmsg' in shared_metadata[article['pmid']]: + article['live'] = False + else: + article['pmcid'] = shared_metadata[article['pmid']]['pmcid'] + article['doi'] = shared_metadata[article['pmid']]['doi'] + article['live'] = shared_metadata[article['pmid']]['live'] + article['release_date'] = shared_metadata[article['pmid']]['release_date'] + #print("Updated metadata: ", article) + except Exception as e: + print("Error: ", e) #print("Length of metadata after ID conversion: ", len(metadata)) return metadata diff --git a/metadata.csv b/metadata.csv new file mode 100644 index 00000000..6517828c --- /dev/null +++ b/metadata.csv @@ -0,0 +1,101 @@ +pmid,pmcid,doi,issn,journal_title,article_title,last_revised,published,live,release_date,license,pubmed_ftp_link,filepath +37417630,PMC10328620,10.1097/MD.0000000000034177,1536-5964,Medicine,Meningitis with septic shock resulting from odontogenic infection misdiagnosed as closed-lock in temporomandibular disorder: A case report and literature review.,2023-11-15,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/64/70/PMC10328620.tar.gz,pubmed_abstracts/PMC10328620/medi-102-e34177.pdf +37417631,PMC10328656,10.1097/MD.0000000000034223,1536-5964,Medicine,"The effect of progressive muscle relaxation technique and myofascial release technique on premenstrual symptoms, blood circulation, and quality of life in women with premenstrual syndrome: A single-blind randomized controlled study.",2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/76/4b/PMC10328656.tar.gz,pubmed_abstracts/PMC10328656/medi-102-e34223.pdf +37417634,PMC10328666,10.1097/MD.0000000000034239,1536-5964,Medicine,Case report: Plastic bronchitis associated with Bordetella parapertussis.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/ee/a6/PMC10328666.tar.gz,pubmed_abstracts/PMC10328666/medi-102-e34239.pdf +37417633,PMC10328702,10.1097/MD.0000000000034216,1536-5964,Medicine,Renal artery aneurysm induced by neurofibromatosis type 1: A case report and review of the endovascular interventions for this rare 
vasculopathy.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/85/3e/PMC10328702.tar.gz,pubmed_abstracts/PMC10328702/medi-102-e34216.pdf +37417632,PMC10328683,10.1097/MD.0000000000034221,1536-5964,Medicine,Intervention for burnout and irrational beliefs in parents of couples seeking a divorce: A critical reflection of Igbo-African marital discord.,2023-08-02,2023-Jul-07,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/5d/c7/medi-102-e34221.PMC10328683.pdf,pubmed_abstracts/medi-102-e34221.PMC10328683.pdf +37417636,PMC10328563,10.1097/MD.0000000000034197,1536-5964,Medicine,Hematomyelia associated with coronavirus disease 2019: A rare case report.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/97/4c/PMC10328563.tar.gz,pubmed_abstracts/PMC10328563/medi-102-e34197.pdf +37417635,PMC10328687,10.1097/MD.0000000000034194,1536-5964,Medicine,Marginal resection as a potential curative treatment option of infantile fibrosarcoma with good response after chemotherapy: A case report of an ETV6-NTRK3 positive infantile fibrosacroma of the distal tibia.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f1/67/PMC10328687.tar.gz,pubmed_abstracts/PMC10328687/medi-102-e34194.pdf +37417638,PMC10328619,10.1097/MD.0000000000034282,1536-5964,Medicine,"The effect of being married on heart rate variability, an indicator of autonomic dysfunction: A retrospective study.",2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/62/28/PMC10328619.tar.gz,pubmed_abstracts/PMC10328619/medi-102-e34282.pdf +37417637,PMC10328692,10.1097/MD.0000000000034238,1536-5964,Medicine,A case report of Ovarian hyperstimulation syndrome and corpus luteum rupture in twin pregnancies with IVF-ET.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/19/93/PMC10328692.tar.gz,pubmed_abstracts/PMC10328692/medi-102-e34238.pdf +37417639,PMC10328576,10.1097/MD.0000000000033936,1536-5964,Medicine,Transvenous embolization using the Amplatzer Vascular Plug II in patent ductus arteriosus concomitant with Stanford type B aortic dissection: A case report.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/52/2f/PMC10328576.tar.gz,pubmed_abstracts/PMC10328576/medi-102-e33936.pdf +37417640,PMC10328685,10.1097/MD.0000000000034250,1536-5964,Medicine,Quantitative chest CT imaging characteristics and outcome of patients with COVID-19 associated pulmonary artery thrombosis: A single-center retrospective cohort study.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/17/4e/PMC10328685.tar.gz,pubmed_abstracts/PMC10328685/medi-102-e34250.pdf +37417642,PMC10328710,10.1097/MD.0000000000033880,1536-5964,Medicine,Orelabrutinib versus ibrutinib for patients with refractory/relapsed primary central nervous system lymphoma: An efficacy and safety analysis.,2023-08-02,2023-Jul-07,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/cf/c9/medi-102-e33880.PMC10328710.pdf,pubmed_abstracts/medi-102-e33880.PMC10328710.pdf +37417641,PMC10328596,10.1097/MD.0000000000034248,1536-5964,Medicine,The influence of psychological factors on coronary heart disease: A review of the evidence and implications for psychological interventions.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f5/ec/PMC10328596.tar.gz,pubmed_abstracts/PMC10328596/medi-102-e34248.pdf +37417645,,,1440-1754,Journal of paediatrics and child health,Blue and red 
Doppler jet on the echocardiogram.,2023-11-16,2023-Jul-01,False,,,, +37417643,PMC10328582,10.1097/MD.0000000000034401,1536-5964,Medicine,Opioids for treating refractory dyspnea in patients with heart failure: A protocol for systematic review and meta-analysis: Retraction.,2023-11-16,2023-07-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/44/53/PMC10328582.tar.gz,pubmed_abstracts/PMC10328582/medi-102-e34401.pdf +37417644,,,1473-2165,Journal of cosmetic dermatology,Safety and efficacy of human platelet extract in skin recovery after fractional CO,2023-08-14,2023-Sep-01,False,,,,pubmed_abstracts/37417644.txt +37417646,,,1440-1754,Journal of paediatrics and child health,Hemi-atrophy of the face.,2023-11-16,2023-Jul-01,False,,,, +37417647,,,1440-1754,Journal of paediatrics and child health,An unexpected percutaneous gastro-jejunostomy obstruction.,2023-11-16,2023-Jul-01,False,,,, +37417649,,,1097-0347,Head & neck,Ultrasound-guided resection for squamous cell carcinoma of the buccal mucosa: A feasibility study.,2023-08-14,2023-09-01,False,,,, +37417648,,,1728-2985,"Urologiia (Moscow, Russia : 1999)",Androgenic status of men with severe COVID-19: the role of testosterone and dihydrotestosterone within the program FOUNDER (features of a new coronavirus infection course and options therapy depending on the androgenic status).,2023-07-18,2023-Jul-01,False,,,,pubmed_abstracts/37417648.txt +37417650,PMC10790315,10.1111/all.15797,1398-9995,Allergy,Diagnostic utility of allergy tests to predict baked egg and lightly cooked egg allergies compared to double-blind placebo-controlled food challenges.,2023-10-10,2023-09-01,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/4a/d7/PMC10790315.tar.gz,pubmed_abstracts/PMC10790315/ALL-78-2510.pdf +37417653,,,1879-1190,Journal of the American College of Surgeons,Statistical Power of Randomized Controlled Trials in Trauma Surgery.,2023-10-24,2023-11-01,False,,,,pubmed_abstracts/37417653.txt +37417652,,,1440-1746,Journal of gastroenterology and hepatology,A model for predicting poor survival in patients with cirrhosis undergoing portosystemic shunt embolization.,2023-09-18,2023-Sep-01,False,,,,pubmed_abstracts/37417652.txt +37417657,PMC11017731,10.1021/acssynbio.3c00061,2161-5063,ACS synthetic biology,Engineering Tissue-Scale Properties with Synthetic Cells: Forging One from Many.,2023-07-29,2023-07-21,False,2024-07-21,,,pubmed_abstracts/37417657.txt +37417659,,,1537-8918,Current sports medicine reports,To Protect and Serve: Preventable Collapse and Death of Police Trainees.,2023-11-21,2023-07-01,False,,,, +37417654,PMC10988698,10.1113/EP090989,1469-445X,Experimental physiology,Role of proprioceptors in chronic musculoskeletal pain.,2023-07-07,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/32/e5/PMC10988698.tar.gz,pubmed_abstracts/PMC10988698/EPH-109-45.pdf +37417660,,,1537-8918,Current sports medicine reports,Web Alerts.,2023-08-20,2023-Jul-01,False,,,, +37417662,,,1537-8918,Current sports medicine reports,Vitamin C Supplementation and Athletic Performance: A Review.,2023-08-20,2023-Jul-01,False,,,,pubmed_abstracts/37417662.txt +37417661,,,1537-8918,Current sports medicine reports,"Nutritional Strategies for Endurance Cyclists - Periodized Nutrition, Ketogenic Diets, and Other Considerations.",2023-09-11,2023-Jul-01,False,,,,pubmed_abstracts/37417661.txt +37417663,,,1537-8918,Current sports medicine reports,A Research and Clinical Framework for Understanding Achilles Injury in Female Collegiate 
Gymnasts.,2023-08-20,2023-Jul-01,False,,,,pubmed_abstracts/37417663.txt +37417664,,,1833-3575,Health information management : journal of the Health Information Management Association of Australia,Alpha NSW: What would it take to create a state-wide paediatric population-level learning health system?,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417664.txt +37417665,,,1476-8259,Computer methods in biomechanics and biomedical engineering,Automated detection of auditory response: non-detection stopping criterion and repeatability studies for multichannel EEG.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417665.txt +37417666,,,1944-8252,ACS applied materials & interfaces,Impact of Molecular Orientation on Lateral and Interfacial Electron Transfer at Oxide Interfaces.,2023-07-19,2023-Jul-19,False,,,,pubmed_abstracts/37417666.txt +37417658,PMC10331187,10.1177/17539447231184984,1753-9455,Therapeutic advances in cardiovascular disease,Evaluation of diuretic efficiency of intravenous furosemide in patients with advanced heart failure in a heart failure clinic.,2023-07-18,,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/a7/5e/10.1177_17539447231184984.PMC10331187.pdf,pubmed_abstracts/10.1177_17539447231184984.PMC10331187.pdf +37417667,PMC10373524,10.1021/acsnano.2c11904,1936-086X,ACS nano,"Insights into the Structure of Comirnaty Covid-19 Vaccine: A Theory on Soft, Partially Bilayer-Covered Nanoparticles with Hydrogen Bond-Stabilized mRNA-Lipid Complexes.",2023-07-31,2023-07-25,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/8a/ad/PMC10373524.tar.gz,"pubmed_abstracts/PMC10373524/nn2c11904.pdf,pubmed_abstracts/PMC10373524/nn2c11904_si_001.pdf" +37417668,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Be Someone's Betsy!,2023-11-03,,False,,,, +37417669,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Concomitant Mental Illnesses Diagnoses and Likelihood of Trauma Recidivism.,2023-08-21,,False,,,,pubmed_abstracts/37417669.txt +37417670,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Concomitant Mental Illnesses Diagnoses and Likelihood of Trauma Recidivism.,2023-12-05,,False,,,, +37417673,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Injury and Sociodemographic Characteristics of Intimate Partner Violence in Women in Israel: A Single-Center Retrospective Cohort Study.,2023-08-21,,False,,,,pubmed_abstracts/37417673.txt +37417672,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,"Effects of Case Management in Trauma Patients in Taiwan: A Randomized, Longitudinal Study.",2023-08-21,,False,,,,pubmed_abstracts/37417672.txt +37417671,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Inpatient Rehabilitation Falls: Comparing Patients With Traumatic Brain Injury Versus Patients With Stroke.,2023-08-21,,False,,,,pubmed_abstracts/37417671.txt +37417674,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Impact of Trauma Resuscitation Emergency Care Nurse Deployment in Trauma Activations in a Rural Trauma Center.,2023-08-21,,False,,,,pubmed_abstracts/37417674.txt +37417678,PMC10388677,10.1530/EDM-22-0383,2052-0573,"Endocrinology, diabetes & metabolism case reports",Clinical and molecular description of two cases of neonatal diabetes secondary to mutations in 
PDX1.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/b3/4c/PMC10388677.tar.gz,pubmed_abstracts/PMC10388677/EDM22-0383.pdf +37417680,,,1524-4725,Dermatologic surgery : official publication for American Society for Dermatologic Surgery [et al.],A Retrospective Analysis of Complications of Minimally Invasive Cosmetic Procedures Seen at a Referral Practice in Houston.,2023-10-02,2023-09-01,False,,,, +37417681,,,1552-7433,Personality & social psychology bulletin,"Masculinity Threats Sequentially Arouse Public Discomfort, Anger, and Positive Attitudes Toward Sexual Violence.",2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417681.txt +37417679,PMC10895403,10.1093/jpids/piad048,2048-7207,Journal of the Pediatric Infectious Diseases Society,Comparison of Administrative Database-Derived and Hospital-Derived Data for Monitoring Blood Culture Use in the Pediatric Intensive Care Unit.,2023-11-03,2023-Jul-31,True,,,,pubmed_abstracts/37417679.txt +37417677,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Postintensive Care Syndrome: Feasibly Bridging Care at a Tertiary Trauma Center.,2023-12-05,,False,,,, +37417682,PMC10374551,10.1049/nbt2.12144,1751-875X,IET nanobiotechnology,Natural compound chaetocin induced DNA damage and apoptosis through reactive oxygen species-dependent pathways in A549 lung cancer cells and in vitro evaluations.,2023-07-31,2023-Jul-01,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/7a/14/NBT2-17-465.PMC10374551.pdf,pubmed_abstracts/NBT2-17-465.PMC10374551.pdf +37417676,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Postintensive Care Syndrome: Feasibly Bridging Care at a Tertiary Trauma Center.,2023-08-21,,False,,,,pubmed_abstracts/37417676.txt +37417675,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Bringing Trauma Education to the Frontier: Overcoming Distance Barriers Utilizing a Virtual Platform.,2023-08-21,,False,,,,pubmed_abstracts/37417675.txt +37417683,,,2163-0097,Clinical advances in periodontics,Novel biomaterial advanced platelet-rich fibrin plus block for multiple gingival recession.,2023-07-19,2023-Jul-07,False,,,,pubmed_abstracts/37417683.txt +37417684,PMC10439496,10.1049/syb2.12070,1751-8857,IET systems biology,Comprehensive analysis of anoikis-related lncRNAs for predicting prognosis and response of immunotherapy in hepatocellular carcinoma.,2023-08-23,2023-08-01,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/e3/c1/SYB2-17-198.PMC10439496.pdf,pubmed_abstracts/SYB2-17-198.PMC10439496.pdf +37417685,,,1944-8252,ACS applied materials & interfaces,Influence of Interfering Ions and Adsorption Temperature on Radioactive Iodine Removal Efficiency and Stability of Ni-MOF-74 and Zr-UiO-66.,2023-07-20,2023-Jul-19,False,,,,pubmed_abstracts/37417685.txt +37417686,,,1532-7752,Journal of personality assessment,The HEXACO Personality Space Before and After Re-Rotation to Approximate the Big Five Dimensions.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417686.txt +37417687,,,1532-7752,Journal of personality assessment,New Versions of the MMPI and Rorschach: How Have Training Programs Responded?,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417687.txt +37417690,PMC10332179,10.1080/07853890.2023.2230888,1365-2060,Annals of medicine,Blinatumomab as salvage therapy in patients with relapsed/refractory B-ALL who have failed/progressed after anti-CD19-CAR T 
therapy.,2023-11-19,2023-12-01,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/f5/9b/IANN_55_2230888.PMC10332179.pdf,pubmed_abstracts/IANN_55_2230888.PMC10332179.pdf +37417688,PMC10407019,10.1111/aogs.14620,1600-0412,Acta obstetricia et gynecologica Scandinavica,Ultrasound examination of the pelvic floor during active labor: A longitudinal cohort study.,2023-08-10,2023-09-01,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/6d/0e/AOGS-102-1203.PMC10407019.pdf,pubmed_abstracts/AOGS-102-1203.PMC10407019.pdf +37417691,PMC10771532,10.1111/jgs.18505,1532-5415,Journal of the American Geriatrics Society,Factors associated with preventable hospitalizations after hospice live discharge among Medicare patients with Alzheimer's disease and related dementias.,2023-11-15,2023-Nov-01,False,2024-11-01,,, +37417692,,,1470-8744,Biotechnology and applied biochemistry,Deciphering the role of fungus in degradation of polypropylene from hospital waste.,2023-12-10,2023-Dec-01,False,,,,pubmed_abstracts/37417692.txt +37417689,PMC10527499,10.1097/BRS.0000000000004769,1528-1159,Spine,Association of Neighborhood Socioeconomic Deprivation With Utilization and Costs of Anterior Cervical Discectomy and Fusion.,2023-10-03,2023-Sep-15,False,2024-09-15,,,pubmed_abstracts/37417689.txt +37417694,,,1532-5040,Physiotherapy theory and practice,Effect of a structured early mobilization protocol on the level of mobilization and muscle strength in critical care patients: A randomized clinical trial.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417694.txt +37417693,PMC10735286,10.1210/clinem/dgad401,1945-7197,The Journal of clinical endocrinology and metabolism,Pheochromocytomas Most Commonly Present As Adrenal Incidentalomas: A Large Tertiary Center Experience.,2023-07-07,2023-Jul-07,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/21/a2/PMC10735286.tar.gz, +37417696,,,1745-3682,Acta orthopaedica,A comparison of uncemented short versus standard stem length in total hip arthroplasty: results from the Dutch Arthroplasty Register.,2023-11-16,2023-07-07,False,,,,pubmed_abstracts/37417696.txt +37417695,,,1528-1159,Spine,Quality of Life and Postoperative Satisfaction in Patients with Benign Extramedullary Spinal Tumors: A Multicenter Study.,2023-08-28,2023-Sep-15,False,,,,pubmed_abstracts/37417695.txt +37417697,PMC10484187,10.1097/BRS.0000000000004731,1528-1159,Spine,"Directed Versus Nondirected Standing Postures in Adolescent Idiopathic Scoliosis: Its Impact on Curve Magnitude, Alignment, and Clinical Decision-Making.",2023-09-13,2023-Oct-01,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/f2/95/brs-48-1354.PMC10484187.pdf,pubmed_abstracts/brs-48-1354.PMC10484187.pdf +37417700,PMC10331372,10.1177/01410768231184381,1758-1095,Journal of the Royal Society of Medicine,From ,2023-07-18,2023-Jun-01,False,2026-06-01,,, +37417698,,,1751-3766,Journal of biological dynamics,Threshold dynamics of a stochastic mathematical model for ,2023-11-16,2023-Dec-01,False,,,,pubmed_abstracts/37417698.txt +37417702,PMC10331365,10.1177/01410768231184373,1758-1095,Journal of the Royal Society of Medicine,Is an independent NHS an impossible dream?,2023-07-18,2023-Jun-01,True,,,, +37417701,PMC10331368,10.1177/01410768231182836,1758-1095,Journal of the Royal Society of Medicine,Facilitating genetic testing after death: the ongoing duty of care to the deceased and their relatives.,2023-11-16,2023-Jun-01,False,2026-06-01,,, +37417704,,,1744-764X,Expert opinion on drug safety,Proton pump inhibitors use prior to 
COVID-19 hospitalization is associated with higher C,2023-07-10,2023-Jul-10,False,,,,pubmed_abstracts/37417704.txt +37417706,,,1945-7197,The Journal of clinical endocrinology and metabolism,Microvascular complications are associated with coronary collateralization in type 2 diabetes and chronic occlusion.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417706.txt +37417705,PMC10332216,10.1080/07853890.2023.2231847,1365-2060,Annals of medicine,Life quality among psoriasis patients based on Dermatology Life Quality Index evaluation and its association with psoriasis severity in China: a cross-sectional study.,2023-11-19,2023-12-01,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/09/33/IANN_55_2231847.PMC10332216.pdf,pubmed_abstracts/IANN_55_2231847.PMC10332216.pdf +37417709,,,1528-1159,Spine,Cannabis Use is Associated with Higher Rates of Pseudarthrosis Following TLIF: A Multi-Institutional Matched-Cohort Study.,2023-07-07,2023-Jul-03,False,,,,pubmed_abstracts/37417709.txt +37417707,,,1523-4681,Journal of bone and mineral research : the official journal of the American Society for Bone and Mineral Research,3D Finite Element Models Reconstructed From 2D Dual-Energy X-Ray Absorptiometry (DXA) Images Improve Hip Fracture Prediction Compared to Areal BMD in Osteoporotic Fractures in Men (MrOS) Sweden Cohort.,2023-09-26,2023-09-01,False,,,,pubmed_abstracts/37417707.txt +37417710,,,1612-1880,Chemistry & biodiversity,Synthesis and Evaluation of Novel Metacetamol Derivatives with Hydrazone Moiety as Anticancer and Antimicrobial Agents.,2023-08-24,2023-Aug-01,False,,,,pubmed_abstracts/37417710.txt +37417712,PMC10337823,10.1093/europace/euad189,1532-2092,"Europace : European pacing, arrhythmias, and cardiac electrophysiology : journal of the working groups on cardiac pacing, arrhythmias, and cardiac cellular electrophysiology of the European Society of Cardiology",Very-early symptomatic recurrence is associated with late recurrence after radiofrequency ablation of atrial fibrillation.,2023-08-22,2023-07-04,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/4e/85/euad189.PMC10337823.pdf,pubmed_abstracts/euad189.PMC10337823.pdf +37417711,,,1528-0691,"Chemical record (New York, N.Y.)",Supported Noble Metal Catalysts and Adsorbents with Soft Lewis Acid Functions.,2023-11-21,2023-Nov-01,False,,,,pubmed_abstracts/37417711.txt +37417713,PMC10719214,10.1093/cei/uxad072,1365-2249,Clinical and experimental immunology,Effects of mesenchymal stem cells on Treg cells in rats with colitis.,2023-12-13,2023-Dec-13,False,2024-07-07,,,pubmed_abstracts/37417713.txt +37417714,PMC10577628,10.1111/aogs.14626,1600-0412,Acta obstetricia et gynecologica Scandinavica,Double-vs single-balloon catheter for induction of labor: Systematic review and individual participant data meta-analysis.,2023-10-24,2023-11-01,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/af/36/PMC10577628.tar.gz,pubmed_abstracts/PMC10577628/AOGS-102-1440.pdf +37417715,PMC10508478,10.1002/vms3.1180,2053-1095,Veterinary medicine and science,"Evaluating the effects of direct-fed microbial supplementation on the performance, milk quality and fatty acid of mid-lactating dairy cows.",2023-09-21,2023-09-01,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/8c/db/VMS3-9-2212.PMC10508478.pdf,pubmed_abstracts/VMS3-9-2212.PMC10508478.pdf +37417716,,,1538-9774,"Computers, informatics, nursing : CIN",The Disruptive Impacts of Next Generation Generative Artificial Intelligence.,2023-11-28,2023-07-01,False,,,, 
+37417718,,,1559-4106,Biointerphases,Development of electronic sum frequency generation spectrophotometer to assess the buried interfaces.,2023-11-18,2023-07-01,False,,,,pubmed_abstracts/37417718.txt +37417719,,,1559-4106,Biointerphases,Theoretical study of electronic sum frequency generation spectroscopy to assess the buried interfaces.,2023-11-18,2023-07-01,False,,,,pubmed_abstracts/37417719.txt +37417708,PMC10524881,10.1097/AUD.0000000000001396,1538-4667,Ear and hearing,Association Between Adult-Onset Hearing Loss and Income: A Systematic Review.,2023-11-06,,False,2024-07-06,,,pubmed_abstracts/37417708.txt +37417720,,,1943-278X,Suicide & life-threatening behavior,When safe firearm storage isn't enough: Examining risk profiles among firearm suicide decedents.,2023-08-16,2023-08-01,False,,,,pubmed_abstracts/37417720.txt +37417722,,,1754-9485,Journal of medical imaging and radiation oncology,Percutaneous treatment of renal tumours.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417722.txt +37417721,PMC10332182,10.1080/07853890.2023.2233556,1365-2060,Annals of medicine,Ultrasound-guided injection acupotomy as a minimally invasive intervention therapy for cervical spondylotic radiculopathy: a randomized control trial.,2023-11-19,2023-12-01,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/8f/eb/IANN_55_2233556.PMC10332182.pdf,pubmed_abstracts/IANN_55_2233556.PMC10332182.pdf +37417723,,,1528-1159,Spine,GSK-3β and β-Catenin Signaling Pathway is Involved in Myofibroblast Transition of Ligamentum Flavum in Lumbar Spinal Stenosis Patients.,2023-09-27,2023-Oct-15,False,,,,pubmed_abstracts/37417723.txt +37417724,,,1528-1159,Spine,Subclassification of Sanders Maturation Stage 3 Demonstrates Differences in Spine and Total Height Velocity Between 3A and 3B in Patients with Idiopathic Scoliosis.,2023-07-07,2023-Jul-06,False,,,,pubmed_abstracts/37417724.txt +37417725,,,1523-4681,Journal of bone and mineral research : the official journal of the American Society for Bone and Mineral Research,Efficacy and Safety of Transdermal Abaloparatide in Postmenopausal Women with Osteoporosis: A Randomized Study.,2023-10-26,2023-10-01,False,,,,pubmed_abstracts/37417725.txt +37417726,,,1521-4141,European journal of immunology,CKBA suppresses mast cell activation via ERK signaling pathway in murine atopic dermatitis.,2023-09-11,2023-09-01,False,,,,pubmed_abstracts/37417726.txt +37417728,,,1521-4095,"Advanced materials (Deerfield Beach, Fla.)",Reconstructed Hierarchically Structured Keratin Fibers with Shape-Memory Features Based on Reversible Secondary-Structure Transformation.,2023-10-23,2023-Oct-01,False,,,,pubmed_abstracts/37417728.txt +37417727,PMC10181040,10.3390/nu15092153,2072-6643,Nutrients,"Effectiveness of a Digitally Delivered Continuous Care Intervention (Defeat Diabetes) on Type 2 Diabetes Outcomes: A 12-Month Single-Arm, Pre-Post Intervention Study.",2023-07-19,2023-Apr-30,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f3/34/PMC10181040.tar.gz,pubmed_abstracts/PMC10181040/nutrients-15-02153.pdf +37417730,PMC10356134,10.7554/eLife.88310,2050-084X,eLife,Metformin regulates bone marrow stromal cells to accelerate bone healing in diabetic mice.,2023-07-21,2023-07-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/5e/82/PMC10356134.tar.gz,"pubmed_abstracts/PMC10356134/elife-88310-mdarchecklist1.pdf,pubmed_abstracts/PMC10356134/elife-88310.pdf" +37417729,PMC10508548,10.1002/vms3.1196,2053-1095,Veterinary medicine and science,Global prevalence of Neospora caninum in rodents: A 
systematic review and meta-analysis.,2023-09-21,2023-09-01,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/1d/b2/PMC10508548.tar.gz,pubmed_abstracts/PMC10508548/VMS3-9-2192.pdf +37417731,,,1520-6890,Chemical reviews,Enantioselective Transformations in the Synthesis of Therapeutic Agents.,2023-08-30,2023-08-09,False,,,,pubmed_abstracts/37417731.txt +37417732,,,1948-7185,The journal of physical chemistry letters,Reduction-Active Antisolvent: A Universal and Innovative Strategy of Further Ameliorating Additive Optimization for High Efficiency Perovskite Solar Cells.,2023-07-20,2023-Jul-20,False,,,,pubmed_abstracts/37417732.txt +37417734,PMC10328535,10.7554/eLife.86373,2050-084X,eLife,The Opto-inflammasome in zebrafish as a tool to study cell and tissue responses to speck formation and cell death.,2023-11-16,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/b9/9f/PMC10328535.tar.gz,pubmed_abstracts/PMC10328535/elife-86373.pdf +37417733,PMC10392983,10.7554/eLife.88058,2050-084X,eLife,Allosteric activation or inhibition of PI3Kγ mediated through conformational changes in the p110γ helical domain.,2023-11-16,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/91/25/PMC10392983.tar.gz,"pubmed_abstracts/PMC10392983/elife-88058.pdf,pubmed_abstracts/PMC10392983/elife-88058-mdarchecklist1.pdf" +37417737,PMC10731660,10.1021/acs.inorgchem.3c01620,1520-510X,Inorganic chemistry,Role of Pure Technetium Chemistry: Are There Still Links to Applications in Imaging?,2023-07-07,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/14/5f/PMC10731660.tar.gz,pubmed_abstracts/PMC10731660/ic3c01620.pdf From 68057df5765edcff5743634ade84cdb8a8ec163d Mon Sep 17 00:00:00 2001 From: star-nox Date: Tue, 14 May 2024 12:55:05 -0500 Subject: [PATCH 23/28] deleted csv file --- metadata.csv | 101 --------------------------------------------------- 1 file changed, 101 deletions(-) delete mode 100644 metadata.csv diff --git a/metadata.csv b/metadata.csv deleted file mode 100644 index 6517828c..00000000 --- a/metadata.csv +++ /dev/null @@ -1,101 +0,0 @@ -pmid,pmcid,doi,issn,journal_title,article_title,last_revised,published,live,release_date,license,pubmed_ftp_link,filepath -37417630,PMC10328620,10.1097/MD.0000000000034177,1536-5964,Medicine,Meningitis with septic shock resulting from odontogenic infection misdiagnosed as closed-lock in temporomandibular disorder: A case report and literature review.,2023-11-15,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/64/70/PMC10328620.tar.gz,pubmed_abstracts/PMC10328620/medi-102-e34177.pdf -37417631,PMC10328656,10.1097/MD.0000000000034223,1536-5964,Medicine,"The effect of progressive muscle relaxation technique and myofascial release technique on premenstrual symptoms, blood circulation, and quality of life in women with premenstrual syndrome: A single-blind randomized controlled study.",2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/76/4b/PMC10328656.tar.gz,pubmed_abstracts/PMC10328656/medi-102-e34223.pdf -37417634,PMC10328666,10.1097/MD.0000000000034239,1536-5964,Medicine,Case report: Plastic bronchitis associated with Bordetella parapertussis.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/ee/a6/PMC10328666.tar.gz,pubmed_abstracts/PMC10328666/medi-102-e34239.pdf -37417633,PMC10328702,10.1097/MD.0000000000034216,1536-5964,Medicine,Renal artery aneurysm induced by neurofibromatosis type 1: A case report and review of the endovascular 
interventions for this rare vasculopathy.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/85/3e/PMC10328702.tar.gz,pubmed_abstracts/PMC10328702/medi-102-e34216.pdf -37417632,PMC10328683,10.1097/MD.0000000000034221,1536-5964,Medicine,Intervention for burnout and irrational beliefs in parents of couples seeking a divorce: A critical reflection of Igbo-African marital discord.,2023-08-02,2023-Jul-07,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/5d/c7/medi-102-e34221.PMC10328683.pdf,pubmed_abstracts/medi-102-e34221.PMC10328683.pdf -37417636,PMC10328563,10.1097/MD.0000000000034197,1536-5964,Medicine,Hematomyelia associated with coronavirus disease 2019: A rare case report.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/97/4c/PMC10328563.tar.gz,pubmed_abstracts/PMC10328563/medi-102-e34197.pdf -37417635,PMC10328687,10.1097/MD.0000000000034194,1536-5964,Medicine,Marginal resection as a potential curative treatment option of infantile fibrosarcoma with good response after chemotherapy: A case report of an ETV6-NTRK3 positive infantile fibrosacroma of the distal tibia.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f1/67/PMC10328687.tar.gz,pubmed_abstracts/PMC10328687/medi-102-e34194.pdf -37417638,PMC10328619,10.1097/MD.0000000000034282,1536-5964,Medicine,"The effect of being married on heart rate variability, an indicator of autonomic dysfunction: A retrospective study.",2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/62/28/PMC10328619.tar.gz,pubmed_abstracts/PMC10328619/medi-102-e34282.pdf -37417637,PMC10328692,10.1097/MD.0000000000034238,1536-5964,Medicine,A case report of Ovarian hyperstimulation syndrome and corpus luteum rupture in twin pregnancies with IVF-ET.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/19/93/PMC10328692.tar.gz,pubmed_abstracts/PMC10328692/medi-102-e34238.pdf -37417639,PMC10328576,10.1097/MD.0000000000033936,1536-5964,Medicine,Transvenous embolization using the Amplatzer Vascular Plug II in patent ductus arteriosus concomitant with Stanford type B aortic dissection: A case report.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/52/2f/PMC10328576.tar.gz,pubmed_abstracts/PMC10328576/medi-102-e33936.pdf -37417640,PMC10328685,10.1097/MD.0000000000034250,1536-5964,Medicine,Quantitative chest CT imaging characteristics and outcome of patients with COVID-19 associated pulmonary artery thrombosis: A single-center retrospective cohort study.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/17/4e/PMC10328685.tar.gz,pubmed_abstracts/PMC10328685/medi-102-e34250.pdf -37417642,PMC10328710,10.1097/MD.0000000000033880,1536-5964,Medicine,Orelabrutinib versus ibrutinib for patients with refractory/relapsed primary central nervous system lymphoma: An efficacy and safety analysis.,2023-08-02,2023-Jul-07,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/cf/c9/medi-102-e33880.PMC10328710.pdf,pubmed_abstracts/medi-102-e33880.PMC10328710.pdf -37417641,PMC10328596,10.1097/MD.0000000000034248,1536-5964,Medicine,The influence of psychological factors on coronary heart disease: A review of the evidence and implications for psychological interventions.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f5/ec/PMC10328596.tar.gz,pubmed_abstracts/PMC10328596/medi-102-e34248.pdf -37417645,,,1440-1754,Journal of paediatrics and 
child health,Blue and red Doppler jet on the echocardiogram.,2023-11-16,2023-Jul-01,False,,,, -37417643,PMC10328582,10.1097/MD.0000000000034401,1536-5964,Medicine,Opioids for treating refractory dyspnea in patients with heart failure: A protocol for systematic review and meta-analysis: Retraction.,2023-11-16,2023-07-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/44/53/PMC10328582.tar.gz,pubmed_abstracts/PMC10328582/medi-102-e34401.pdf -37417644,,,1473-2165,Journal of cosmetic dermatology,Safety and efficacy of human platelet extract in skin recovery after fractional CO,2023-08-14,2023-Sep-01,False,,,,pubmed_abstracts/37417644.txt -37417646,,,1440-1754,Journal of paediatrics and child health,Hemi-atrophy of the face.,2023-11-16,2023-Jul-01,False,,,, -37417647,,,1440-1754,Journal of paediatrics and child health,An unexpected percutaneous gastro-jejunostomy obstruction.,2023-11-16,2023-Jul-01,False,,,, -37417649,,,1097-0347,Head & neck,Ultrasound-guided resection for squamous cell carcinoma of the buccal mucosa: A feasibility study.,2023-08-14,2023-09-01,False,,,, -37417648,,,1728-2985,"Urologiia (Moscow, Russia : 1999)",Androgenic status of men with severe COVID-19: the role of testosterone and dihydrotestosterone within the program FOUNDER (features of a new coronavirus infection course and options therapy depending on the androgenic status).,2023-07-18,2023-Jul-01,False,,,,pubmed_abstracts/37417648.txt -37417650,PMC10790315,10.1111/all.15797,1398-9995,Allergy,Diagnostic utility of allergy tests to predict baked egg and lightly cooked egg allergies compared to double-blind placebo-controlled food challenges.,2023-10-10,2023-09-01,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/4a/d7/PMC10790315.tar.gz,pubmed_abstracts/PMC10790315/ALL-78-2510.pdf -37417653,,,1879-1190,Journal of the American College of Surgeons,Statistical Power of Randomized Controlled Trials in Trauma Surgery.,2023-10-24,2023-11-01,False,,,,pubmed_abstracts/37417653.txt -37417652,,,1440-1746,Journal of gastroenterology and hepatology,A model for predicting poor survival in patients with cirrhosis undergoing portosystemic shunt embolization.,2023-09-18,2023-Sep-01,False,,,,pubmed_abstracts/37417652.txt -37417657,PMC11017731,10.1021/acssynbio.3c00061,2161-5063,ACS synthetic biology,Engineering Tissue-Scale Properties with Synthetic Cells: Forging One from Many.,2023-07-29,2023-07-21,False,2024-07-21,,,pubmed_abstracts/37417657.txt -37417659,,,1537-8918,Current sports medicine reports,To Protect and Serve: Preventable Collapse and Death of Police Trainees.,2023-11-21,2023-07-01,False,,,, -37417654,PMC10988698,10.1113/EP090989,1469-445X,Experimental physiology,Role of proprioceptors in chronic musculoskeletal pain.,2023-07-07,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/32/e5/PMC10988698.tar.gz,pubmed_abstracts/PMC10988698/EPH-109-45.pdf -37417660,,,1537-8918,Current sports medicine reports,Web Alerts.,2023-08-20,2023-Jul-01,False,,,, -37417662,,,1537-8918,Current sports medicine reports,Vitamin C Supplementation and Athletic Performance: A Review.,2023-08-20,2023-Jul-01,False,,,,pubmed_abstracts/37417662.txt -37417661,,,1537-8918,Current sports medicine reports,"Nutritional Strategies for Endurance Cyclists - Periodized Nutrition, Ketogenic Diets, and Other Considerations.",2023-09-11,2023-Jul-01,False,,,,pubmed_abstracts/37417661.txt -37417663,,,1537-8918,Current sports medicine reports,A Research and Clinical Framework for Understanding Achilles Injury in Female Collegiate 
Gymnasts.,2023-08-20,2023-Jul-01,False,,,,pubmed_abstracts/37417663.txt -37417664,,,1833-3575,Health information management : journal of the Health Information Management Association of Australia,Alpha NSW: What would it take to create a state-wide paediatric population-level learning health system?,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417664.txt -37417665,,,1476-8259,Computer methods in biomechanics and biomedical engineering,Automated detection of auditory response: non-detection stopping criterion and repeatability studies for multichannel EEG.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417665.txt -37417666,,,1944-8252,ACS applied materials & interfaces,Impact of Molecular Orientation on Lateral and Interfacial Electron Transfer at Oxide Interfaces.,2023-07-19,2023-Jul-19,False,,,,pubmed_abstracts/37417666.txt -37417658,PMC10331187,10.1177/17539447231184984,1753-9455,Therapeutic advances in cardiovascular disease,Evaluation of diuretic efficiency of intravenous furosemide in patients with advanced heart failure in a heart failure clinic.,2023-07-18,,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/a7/5e/10.1177_17539447231184984.PMC10331187.pdf,pubmed_abstracts/10.1177_17539447231184984.PMC10331187.pdf -37417667,PMC10373524,10.1021/acsnano.2c11904,1936-086X,ACS nano,"Insights into the Structure of Comirnaty Covid-19 Vaccine: A Theory on Soft, Partially Bilayer-Covered Nanoparticles with Hydrogen Bond-Stabilized mRNA-Lipid Complexes.",2023-07-31,2023-07-25,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/8a/ad/PMC10373524.tar.gz,"pubmed_abstracts/PMC10373524/nn2c11904.pdf,pubmed_abstracts/PMC10373524/nn2c11904_si_001.pdf" -37417668,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Be Someone's Betsy!,2023-11-03,,False,,,, -37417669,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Concomitant Mental Illnesses Diagnoses and Likelihood of Trauma Recidivism.,2023-08-21,,False,,,,pubmed_abstracts/37417669.txt -37417670,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Concomitant Mental Illnesses Diagnoses and Likelihood of Trauma Recidivism.,2023-12-05,,False,,,, -37417673,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Injury and Sociodemographic Characteristics of Intimate Partner Violence in Women in Israel: A Single-Center Retrospective Cohort Study.,2023-08-21,,False,,,,pubmed_abstracts/37417673.txt -37417672,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,"Effects of Case Management in Trauma Patients in Taiwan: A Randomized, Longitudinal Study.",2023-08-21,,False,,,,pubmed_abstracts/37417672.txt -37417671,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Inpatient Rehabilitation Falls: Comparing Patients With Traumatic Brain Injury Versus Patients With Stroke.,2023-08-21,,False,,,,pubmed_abstracts/37417671.txt -37417674,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Impact of Trauma Resuscitation Emergency Care Nurse Deployment in Trauma Activations in a Rural Trauma Center.,2023-08-21,,False,,,,pubmed_abstracts/37417674.txt -37417678,PMC10388677,10.1530/EDM-22-0383,2052-0573,"Endocrinology, diabetes & metabolism case reports",Clinical and molecular description of two cases of neonatal diabetes secondary to mutations in 
PDX1.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/b3/4c/PMC10388677.tar.gz,pubmed_abstracts/PMC10388677/EDM22-0383.pdf -37417680,,,1524-4725,Dermatologic surgery : official publication for American Society for Dermatologic Surgery [et al.],A Retrospective Analysis of Complications of Minimally Invasive Cosmetic Procedures Seen at a Referral Practice in Houston.,2023-10-02,2023-09-01,False,,,, -37417681,,,1552-7433,Personality & social psychology bulletin,"Masculinity Threats Sequentially Arouse Public Discomfort, Anger, and Positive Attitudes Toward Sexual Violence.",2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417681.txt -37417679,PMC10895403,10.1093/jpids/piad048,2048-7207,Journal of the Pediatric Infectious Diseases Society,Comparison of Administrative Database-Derived and Hospital-Derived Data for Monitoring Blood Culture Use in the Pediatric Intensive Care Unit.,2023-11-03,2023-Jul-31,True,,,,pubmed_abstracts/37417679.txt -37417677,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Postintensive Care Syndrome: Feasibly Bridging Care at a Tertiary Trauma Center.,2023-12-05,,False,,,, -37417682,PMC10374551,10.1049/nbt2.12144,1751-875X,IET nanobiotechnology,Natural compound chaetocin induced DNA damage and apoptosis through reactive oxygen species-dependent pathways in A549 lung cancer cells and in vitro evaluations.,2023-07-31,2023-Jul-01,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/7a/14/NBT2-17-465.PMC10374551.pdf,pubmed_abstracts/NBT2-17-465.PMC10374551.pdf -37417676,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Postintensive Care Syndrome: Feasibly Bridging Care at a Tertiary Trauma Center.,2023-08-21,,False,,,,pubmed_abstracts/37417676.txt -37417675,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Bringing Trauma Education to the Frontier: Overcoming Distance Barriers Utilizing a Virtual Platform.,2023-08-21,,False,,,,pubmed_abstracts/37417675.txt -37417683,,,2163-0097,Clinical advances in periodontics,Novel biomaterial advanced platelet-rich fibrin plus block for multiple gingival recession.,2023-07-19,2023-Jul-07,False,,,,pubmed_abstracts/37417683.txt -37417684,PMC10439496,10.1049/syb2.12070,1751-8857,IET systems biology,Comprehensive analysis of anoikis-related lncRNAs for predicting prognosis and response of immunotherapy in hepatocellular carcinoma.,2023-08-23,2023-08-01,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/e3/c1/SYB2-17-198.PMC10439496.pdf,pubmed_abstracts/SYB2-17-198.PMC10439496.pdf -37417685,,,1944-8252,ACS applied materials & interfaces,Influence of Interfering Ions and Adsorption Temperature on Radioactive Iodine Removal Efficiency and Stability of Ni-MOF-74 and Zr-UiO-66.,2023-07-20,2023-Jul-19,False,,,,pubmed_abstracts/37417685.txt -37417686,,,1532-7752,Journal of personality assessment,The HEXACO Personality Space Before and After Re-Rotation to Approximate the Big Five Dimensions.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417686.txt -37417687,,,1532-7752,Journal of personality assessment,New Versions of the MMPI and Rorschach: How Have Training Programs Responded?,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417687.txt -37417690,PMC10332179,10.1080/07853890.2023.2230888,1365-2060,Annals of medicine,Blinatumomab as salvage therapy in patients with relapsed/refractory B-ALL who have failed/progressed after anti-CD19-CAR T 
therapy.,2023-11-19,2023-12-01,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/f5/9b/IANN_55_2230888.PMC10332179.pdf,pubmed_abstracts/IANN_55_2230888.PMC10332179.pdf -37417688,PMC10407019,10.1111/aogs.14620,1600-0412,Acta obstetricia et gynecologica Scandinavica,Ultrasound examination of the pelvic floor during active labor: A longitudinal cohort study.,2023-08-10,2023-09-01,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/6d/0e/AOGS-102-1203.PMC10407019.pdf,pubmed_abstracts/AOGS-102-1203.PMC10407019.pdf -37417691,PMC10771532,10.1111/jgs.18505,1532-5415,Journal of the American Geriatrics Society,Factors associated with preventable hospitalizations after hospice live discharge among Medicare patients with Alzheimer's disease and related dementias.,2023-11-15,2023-Nov-01,False,2024-11-01,,, -37417692,,,1470-8744,Biotechnology and applied biochemistry,Deciphering the role of fungus in degradation of polypropylene from hospital waste.,2023-12-10,2023-Dec-01,False,,,,pubmed_abstracts/37417692.txt -37417689,PMC10527499,10.1097/BRS.0000000000004769,1528-1159,Spine,Association of Neighborhood Socioeconomic Deprivation With Utilization and Costs of Anterior Cervical Discectomy and Fusion.,2023-10-03,2023-Sep-15,False,2024-09-15,,,pubmed_abstracts/37417689.txt -37417694,,,1532-5040,Physiotherapy theory and practice,Effect of a structured early mobilization protocol on the level of mobilization and muscle strength in critical care patients: A randomized clinical trial.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417694.txt -37417693,PMC10735286,10.1210/clinem/dgad401,1945-7197,The Journal of clinical endocrinology and metabolism,Pheochromocytomas Most Commonly Present As Adrenal Incidentalomas: A Large Tertiary Center Experience.,2023-07-07,2023-Jul-07,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/21/a2/PMC10735286.tar.gz, -37417696,,,1745-3682,Acta orthopaedica,A comparison of uncemented short versus standard stem length in total hip arthroplasty: results from the Dutch Arthroplasty Register.,2023-11-16,2023-07-07,False,,,,pubmed_abstracts/37417696.txt -37417695,,,1528-1159,Spine,Quality of Life and Postoperative Satisfaction in Patients with Benign Extramedullary Spinal Tumors: A Multicenter Study.,2023-08-28,2023-Sep-15,False,,,,pubmed_abstracts/37417695.txt -37417697,PMC10484187,10.1097/BRS.0000000000004731,1528-1159,Spine,"Directed Versus Nondirected Standing Postures in Adolescent Idiopathic Scoliosis: Its Impact on Curve Magnitude, Alignment, and Clinical Decision-Making.",2023-09-13,2023-Oct-01,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/f2/95/brs-48-1354.PMC10484187.pdf,pubmed_abstracts/brs-48-1354.PMC10484187.pdf -37417700,PMC10331372,10.1177/01410768231184381,1758-1095,Journal of the Royal Society of Medicine,From ,2023-07-18,2023-Jun-01,False,2026-06-01,,, -37417698,,,1751-3766,Journal of biological dynamics,Threshold dynamics of a stochastic mathematical model for ,2023-11-16,2023-Dec-01,False,,,,pubmed_abstracts/37417698.txt -37417702,PMC10331365,10.1177/01410768231184373,1758-1095,Journal of the Royal Society of Medicine,Is an independent NHS an impossible dream?,2023-07-18,2023-Jun-01,True,,,, -37417701,PMC10331368,10.1177/01410768231182836,1758-1095,Journal of the Royal Society of Medicine,Facilitating genetic testing after death: the ongoing duty of care to the deceased and their relatives.,2023-11-16,2023-Jun-01,False,2026-06-01,,, -37417704,,,1744-764X,Expert opinion on drug safety,Proton pump inhibitors use prior to 
COVID-19 hospitalization is associated with higher C,2023-07-10,2023-Jul-10,False,,,,pubmed_abstracts/37417704.txt -37417706,,,1945-7197,The Journal of clinical endocrinology and metabolism,Microvascular complications are associated with coronary collateralization in type 2 diabetes and chronic occlusion.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417706.txt -37417705,PMC10332216,10.1080/07853890.2023.2231847,1365-2060,Annals of medicine,Life quality among psoriasis patients based on Dermatology Life Quality Index evaluation and its association with psoriasis severity in China: a cross-sectional study.,2023-11-19,2023-12-01,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/09/33/IANN_55_2231847.PMC10332216.pdf,pubmed_abstracts/IANN_55_2231847.PMC10332216.pdf -37417709,,,1528-1159,Spine,Cannabis Use is Associated with Higher Rates of Pseudarthrosis Following TLIF: A Multi-Institutional Matched-Cohort Study.,2023-07-07,2023-Jul-03,False,,,,pubmed_abstracts/37417709.txt -37417707,,,1523-4681,Journal of bone and mineral research : the official journal of the American Society for Bone and Mineral Research,3D Finite Element Models Reconstructed From 2D Dual-Energy X-Ray Absorptiometry (DXA) Images Improve Hip Fracture Prediction Compared to Areal BMD in Osteoporotic Fractures in Men (MrOS) Sweden Cohort.,2023-09-26,2023-09-01,False,,,,pubmed_abstracts/37417707.txt -37417710,,,1612-1880,Chemistry & biodiversity,Synthesis and Evaluation of Novel Metacetamol Derivatives with Hydrazone Moiety as Anticancer and Antimicrobial Agents.,2023-08-24,2023-Aug-01,False,,,,pubmed_abstracts/37417710.txt -37417712,PMC10337823,10.1093/europace/euad189,1532-2092,"Europace : European pacing, arrhythmias, and cardiac electrophysiology : journal of the working groups on cardiac pacing, arrhythmias, and cardiac cellular electrophysiology of the European Society of Cardiology",Very-early symptomatic recurrence is associated with late recurrence after radiofrequency ablation of atrial fibrillation.,2023-08-22,2023-07-04,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/4e/85/euad189.PMC10337823.pdf,pubmed_abstracts/euad189.PMC10337823.pdf -37417711,,,1528-0691,"Chemical record (New York, N.Y.)",Supported Noble Metal Catalysts and Adsorbents with Soft Lewis Acid Functions.,2023-11-21,2023-Nov-01,False,,,,pubmed_abstracts/37417711.txt -37417713,PMC10719214,10.1093/cei/uxad072,1365-2249,Clinical and experimental immunology,Effects of mesenchymal stem cells on Treg cells in rats with colitis.,2023-12-13,2023-Dec-13,False,2024-07-07,,,pubmed_abstracts/37417713.txt -37417714,PMC10577628,10.1111/aogs.14626,1600-0412,Acta obstetricia et gynecologica Scandinavica,Double-vs single-balloon catheter for induction of labor: Systematic review and individual participant data meta-analysis.,2023-10-24,2023-11-01,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/af/36/PMC10577628.tar.gz,pubmed_abstracts/PMC10577628/AOGS-102-1440.pdf -37417715,PMC10508478,10.1002/vms3.1180,2053-1095,Veterinary medicine and science,"Evaluating the effects of direct-fed microbial supplementation on the performance, milk quality and fatty acid of mid-lactating dairy cows.",2023-09-21,2023-09-01,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/8c/db/VMS3-9-2212.PMC10508478.pdf,pubmed_abstracts/VMS3-9-2212.PMC10508478.pdf -37417716,,,1538-9774,"Computers, informatics, nursing : CIN",The Disruptive Impacts of Next Generation Generative Artificial Intelligence.,2023-11-28,2023-07-01,False,,,, 
-37417718,,,1559-4106,Biointerphases,Development of electronic sum frequency generation spectrophotometer to assess the buried interfaces.,2023-11-18,2023-07-01,False,,,,pubmed_abstracts/37417718.txt -37417719,,,1559-4106,Biointerphases,Theoretical study of electronic sum frequency generation spectroscopy to assess the buried interfaces.,2023-11-18,2023-07-01,False,,,,pubmed_abstracts/37417719.txt -37417708,PMC10524881,10.1097/AUD.0000000000001396,1538-4667,Ear and hearing,Association Between Adult-Onset Hearing Loss and Income: A Systematic Review.,2023-11-06,,False,2024-07-06,,,pubmed_abstracts/37417708.txt -37417720,,,1943-278X,Suicide & life-threatening behavior,When safe firearm storage isn't enough: Examining risk profiles among firearm suicide decedents.,2023-08-16,2023-08-01,False,,,,pubmed_abstracts/37417720.txt -37417722,,,1754-9485,Journal of medical imaging and radiation oncology,Percutaneous treatment of renal tumours.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417722.txt -37417721,PMC10332182,10.1080/07853890.2023.2233556,1365-2060,Annals of medicine,Ultrasound-guided injection acupotomy as a minimally invasive intervention therapy for cervical spondylotic radiculopathy: a randomized control trial.,2023-11-19,2023-12-01,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/8f/eb/IANN_55_2233556.PMC10332182.pdf,pubmed_abstracts/IANN_55_2233556.PMC10332182.pdf -37417723,,,1528-1159,Spine,GSK-3β and β-Catenin Signaling Pathway is Involved in Myofibroblast Transition of Ligamentum Flavum in Lumbar Spinal Stenosis Patients.,2023-09-27,2023-Oct-15,False,,,,pubmed_abstracts/37417723.txt -37417724,,,1528-1159,Spine,Subclassification of Sanders Maturation Stage 3 Demonstrates Differences in Spine and Total Height Velocity Between 3A and 3B in Patients with Idiopathic Scoliosis.,2023-07-07,2023-Jul-06,False,,,,pubmed_abstracts/37417724.txt -37417725,,,1523-4681,Journal of bone and mineral research : the official journal of the American Society for Bone and Mineral Research,Efficacy and Safety of Transdermal Abaloparatide in Postmenopausal Women with Osteoporosis: A Randomized Study.,2023-10-26,2023-10-01,False,,,,pubmed_abstracts/37417725.txt -37417726,,,1521-4141,European journal of immunology,CKBA suppresses mast cell activation via ERK signaling pathway in murine atopic dermatitis.,2023-09-11,2023-09-01,False,,,,pubmed_abstracts/37417726.txt -37417728,,,1521-4095,"Advanced materials (Deerfield Beach, Fla.)",Reconstructed Hierarchically Structured Keratin Fibers with Shape-Memory Features Based on Reversible Secondary-Structure Transformation.,2023-10-23,2023-Oct-01,False,,,,pubmed_abstracts/37417728.txt -37417727,PMC10181040,10.3390/nu15092153,2072-6643,Nutrients,"Effectiveness of a Digitally Delivered Continuous Care Intervention (Defeat Diabetes) on Type 2 Diabetes Outcomes: A 12-Month Single-Arm, Pre-Post Intervention Study.",2023-07-19,2023-Apr-30,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f3/34/PMC10181040.tar.gz,pubmed_abstracts/PMC10181040/nutrients-15-02153.pdf -37417730,PMC10356134,10.7554/eLife.88310,2050-084X,eLife,Metformin regulates bone marrow stromal cells to accelerate bone healing in diabetic mice.,2023-07-21,2023-07-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/5e/82/PMC10356134.tar.gz,"pubmed_abstracts/PMC10356134/elife-88310-mdarchecklist1.pdf,pubmed_abstracts/PMC10356134/elife-88310.pdf" -37417729,PMC10508548,10.1002/vms3.1196,2053-1095,Veterinary medicine and science,Global prevalence of Neospora caninum in rodents: A 
systematic review and meta-analysis.,2023-09-21,2023-09-01,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/1d/b2/PMC10508548.tar.gz,pubmed_abstracts/PMC10508548/VMS3-9-2192.pdf -37417731,,,1520-6890,Chemical reviews,Enantioselective Transformations in the Synthesis of Therapeutic Agents.,2023-08-30,2023-08-09,False,,,,pubmed_abstracts/37417731.txt -37417732,,,1948-7185,The journal of physical chemistry letters,Reduction-Active Antisolvent: A Universal and Innovative Strategy of Further Ameliorating Additive Optimization for High Efficiency Perovskite Solar Cells.,2023-07-20,2023-Jul-20,False,,,,pubmed_abstracts/37417732.txt -37417734,PMC10328535,10.7554/eLife.86373,2050-084X,eLife,The Opto-inflammasome in zebrafish as a tool to study cell and tissue responses to speck formation and cell death.,2023-11-16,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/b9/9f/PMC10328535.tar.gz,pubmed_abstracts/PMC10328535/elife-86373.pdf -37417733,PMC10392983,10.7554/eLife.88058,2050-084X,eLife,Allosteric activation or inhibition of PI3Kγ mediated through conformational changes in the p110γ helical domain.,2023-11-16,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/91/25/PMC10392983.tar.gz,"pubmed_abstracts/PMC10392983/elife-88058.pdf,pubmed_abstracts/PMC10392983/elife-88058-mdarchecklist1.pdf" -37417737,PMC10731660,10.1021/acs.inorgchem.3c01620,1520-510X,Inorganic chemistry,Role of Pure Technetium Chemistry: Are There Still Links to Applications in Imaging?,2023-07-07,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/14/5f/PMC10731660.tar.gz,pubmed_abstracts/PMC10731660/ic3c01620.pdf From 4338931746bf4e04b138d56319b62c99a4a5d38b Mon Sep 17 00:00:00 2001 From: star-nox Date: Tue, 14 May 2024 12:59:40 -0500 Subject: [PATCH 24/28] test comment --- ai_ta_backend/utils/pubmed_extraction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 524c6e79..b5700174 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -417,6 +417,7 @@ def getArticleIDs(metadata: list): for i in range(0, len(metadata), batch_size): batch = metadata[i:i + batch_size] ids = ",".join([article['pmid'] for article in batch]) + # test comment try: response = requests.get(base_url + app_details + "&ids=" + ids) data = response.json() From 048f41de3503be289ca2b824ecf720096fe3e4c9 Mon Sep 17 00:00:00 2001 From: star-nox Date: Tue, 14 May 2024 13:00:53 -0500 Subject: [PATCH 25/28] print test comment --- ai_ta_backend/utils/pubmed_extraction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index b5700174..502f03f9 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -418,6 +418,7 @@ def getArticleIDs(metadata: list): batch = metadata[i:i + batch_size] ids = ",".join([article['pmid'] for article in batch]) # test comment + print("test comment") try: response = requests.get(base_url + app_details + "&ids=" + ids) data = response.json() From 472814ec9b122326b21cabab7b3cd20c65837026 Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Tue, 14 May 2024 11:16:53 -0700 Subject: [PATCH 26/28] Commented out prints for speed --- ai_ta_backend/utils/pubmed_extraction.py | 43 ++++++++++++------------ 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py 
b/ai_ta_backend/utils/pubmed_extraction.py index 502f03f9..a3dd3912 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -18,7 +18,6 @@ import json from functools import partial - SUPBASE_CLIENT = supabase.create_client( # type: ignore supabase_url=os.getenv('SUPABASE_URL'), # type: ignore supabase_key=os.getenv('SUPABASE_API_KEY') # type: ignore @@ -46,7 +45,7 @@ def extractPubmedData(): print("Processing file: ", file) gz_filepath = downloadXML(ftp_address, ftp_path, file, "pubmed") - print("GZ Downloaded: ", gz_filepath) + # print("GZ Downloaded: ", gz_filepath) print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") gz_file_download_time = time.time() @@ -54,7 +53,7 @@ def extractPubmedData(): if not gz_filepath: return "failure" xml_filepath = extractXMLFile(gz_filepath) - print("XML Extracted: ", xml_filepath) + # print("XML Extracted: ", xml_filepath) print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") #xml_filepath = "pubmed/pubmed24n1217.xml" @@ -68,7 +67,7 @@ def extractPubmedData(): # download the articles complete_metadata = downloadArticles(metadata_with_ids) - print(complete_metadata) + # print(complete_metadata) print("Time taken to download articles for 100 articles: ", round(time.time() - metadata_update_time, 2), "seconds") # store metadata in csv file @@ -141,7 +140,7 @@ def downloadXML(ftp_address: str, ftp_path: str, file: str, local_dir: str): with open(local_filepath, 'wb') as f: ftp.retrbinary('RETR ' + file, f.write) - print(f"Downloaded {file} to {local_filepath}") + # print(f"Downloaded {file} to {local_filepath}") ftp.quit() return local_filepath @@ -175,7 +174,7 @@ def getFileList(ftp_address: str, ftp_path: str, extension: str = ".gz"): # Filter for files with the specified extension gz_files = [entry for entry in file_listing if entry.endswith(extension)] gz_files.sort(reverse=True) - print(f"Found {len(gz_files)} files on {ftp_address}/{ftp_path}") + # print(f"Found {len(gz_files)} files on {ftp_address}/{ftp_path}") return gz_files except Exception as e: @@ -191,7 +190,7 @@ def extractXMLFile(gz_filepath: str): xml_filepath: Path to the extracted XML file. """ try: - print("Downloaded .gz file path: ", gz_filepath) + # print("Downloaded .gz file path: ", gz_filepath) xml_filepath = gz_filepath.replace(".gz", "") with gzip.open(gz_filepath, 'rb') as f_in: with open(xml_filepath, 'wb') as f_out: @@ -212,7 +211,7 @@ def extractMetadataFromXML(xml_filepath: str): Returns: metadata: List of dictionaries containing metadata for each article. 
""" - print("inside extractMetadataFromXML()") + # print("inside extractMetadataFromXML()") try: # create a directory to store abstracts os.makedirs("pubmed_abstracts", exist_ok=True) @@ -233,14 +232,14 @@ def extractMetadataFromXML(xml_filepath: str): metadata.append(article_data) if len(metadata) == 100: - print("collected 100 articles") + # print("collected 100 articles") yield metadata metadata = [] # reset metadata for next batch if metadata: yield metadata - print("Metadata extraction complete.") + # print("Metadata extraction complete.") except Exception as e: print("Error extracting metadata: ", e) return [] @@ -594,7 +593,7 @@ def downloadArticles(metadata: list): if article['pmid'] in updated_articles: article.update(updated_articles[article['pmid']]) - print("Updated metadata after download: ", metadata) + # print("Updated metadata after download: ", metadata) return metadata @@ -625,7 +624,7 @@ def download_article(article, api_url): if article['pmcid']: final_url = api_url + "id=" + article['pmcid'] - print("\nDownload URL: ", final_url) + # print("\nDownload URL: ", final_url) xml_response = requests.get(final_url) extracted_data = extractArticleData(xml_response.text) @@ -640,7 +639,7 @@ def download_article(article, api_url): ftp_url = urlparse(extracted_data[0]['href']) ftp_path = ftp_url.path[1:] - print("FTP path: ", ftp_path) + # print("FTP path: ", ftp_path) filename = ftp_path.split("/")[-1] local_file = os.path.join("pubmed_abstracts", filename) @@ -649,12 +648,12 @@ def download_article(article, api_url): with open(local_file, 'wb') as f: ftp.retrbinary('RETR ' + ftp_path, f.write) # Download directly to file - print("Downloaded FTP file: ", local_file) + # print("Downloaded FTP file: ", local_file) article['filepath'] = local_file if filename.endswith(".tar.gz"): extracted_pdf_paths = extractPDF(local_file) - print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) + # print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) article['filepath'] = ",".join(extracted_pdf_paths) os.remove(local_file) @@ -663,7 +662,7 @@ def download_article(article, api_url): ftp.quit() - print("\nUpdated metadata after download: ", article) + # print("\nUpdated metadata after download: ", article) return article @@ -677,7 +676,7 @@ def extractPDF(tar_gz_filepath: str): extracted_paths: List of paths to the extracted PDF files. """ try: - print("Extracting PDF from: ", tar_gz_filepath) + # print("Extracting PDF from: ", tar_gz_filepath) extracted_paths = [] with tarfile.open(tar_gz_filepath, "r:gz") as tar: for member in tar: @@ -700,7 +699,7 @@ def extractArticleData(xml_string: str): Returns: extracted_data: List of dictionaries containing license and download link for the article. """ - print("In extractArticleData") + # print("In extractArticleData") try: root = ET.fromstring(xml_string) # if there is an errors (article not open-access), return empty list (skip article) @@ -741,7 +740,7 @@ def upload_file(client, bucket_name, file_path, object_name): """ try: client.fput_object(bucket_name, object_name, file_path) - print(f"Uploaded: {object_name}") + # print(f"Uploaded: {object_name}") except Exception as e: print(f"Error uploading {object_name}: {e}") @@ -749,7 +748,7 @@ def uploadToStorage(filepath: str): """ Uploads all files present under given filepath to Minio bucket in parallel. 
""" - print("in uploadToStorage()") + # print("in uploadToStorage()") try: bucket_name = "pubmed" @@ -757,8 +756,8 @@ def uploadToStorage(filepath: str): if not found: MINIO_CLIENT.make_bucket(bucket_name) print("Created bucket", bucket_name) - else: - print("Bucket", bucket_name, "already exists") + # else: + # print("Bucket", bucket_name, "already exists") # Get all files to upload files = [] From 880ae975acc83ecd5a758f7aa23a62ae76ef173f Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Tue, 14 May 2024 11:17:27 -0700 Subject: [PATCH 27/28] Commented out prints for speed --- ai_ta_backend/utils/pubmed_extraction.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index a3dd3912..92e0461e 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -18,6 +18,7 @@ import json from functools import partial + SUPBASE_CLIENT = supabase.create_client( # type: ignore supabase_url=os.getenv('SUPABASE_URL'), # type: ignore supabase_key=os.getenv('SUPABASE_API_KEY') # type: ignore @@ -39,13 +40,12 @@ def extractPubmedData(): ftp_path = "pubmed/baseline" file_list = getFileList(ftp_address, ftp_path, ".gz") - for file in file_list[22:23]: try: print("Processing file: ", file) gz_filepath = downloadXML(ftp_address, ftp_path, file, "pubmed") - # print("GZ Downloaded: ", gz_filepath) + print("GZ Downloaded: ", gz_filepath) print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") gz_file_download_time = time.time() @@ -53,7 +53,7 @@ def extractPubmedData(): if not gz_filepath: return "failure" xml_filepath = extractXMLFile(gz_filepath) - # print("XML Extracted: ", xml_filepath) + print("XML Extracted: ", xml_filepath) print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") #xml_filepath = "pubmed/pubmed24n1217.xml" @@ -174,7 +174,7 @@ def getFileList(ftp_address: str, ftp_path: str, extension: str = ".gz"): # Filter for files with the specified extension gz_files = [entry for entry in file_listing if entry.endswith(extension)] gz_files.sort(reverse=True) - # print(f"Found {len(gz_files)} files on {ftp_address}/{ftp_path}") + print(f"Found {len(gz_files)} files on {ftp_address}/{ftp_path}") return gz_files except Exception as e: @@ -406,7 +406,7 @@ def getArticleIDs(metadata: list): Returns: metadata: Updated metadata with PMCID, DOI, release date, and live status information. """ - print("In getArticleIDs()") +# print("In getArticleIDs()") base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json" @@ -416,8 +416,6 @@ def getArticleIDs(metadata: list): for i in range(0, len(metadata), batch_size): batch = metadata[i:i + batch_size] ids = ",".join([article['pmid'] for article in batch]) - # test comment - print("test comment") try: response = requests.get(base_url + app_details + "&ids=" + ids) data = response.json() @@ -568,7 +566,7 @@ def downloadArticles(metadata: list): Returns: metadata: Updated metadata with license, FTP link, and downloaded filepath information. """ - print("In downloadArticles()") + # print("In downloadArticles()") try: base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?" 
@@ -580,11 +578,11 @@ def downloadArticles(metadata: list): futures = [executor.submit(download_article_partial, article) for article in metadata] for future in concurrent.futures.as_completed(futures): try: - print("Starting new download...") + # print("Starting new download...") updated_article = future.result(timeout=15*60) # Check result without blocking if updated_article: updated_articles[updated_article['pmid']] = updated_article - print("Updated article: ", updated_article) + # print("Updated article: ", updated_article) except Exception as e: print("Error downloading article:", e) @@ -613,7 +611,7 @@ def download_article(article, api_url): article: Updated metadata for the article. """ - print("Downloading articles...") + # print("Downloading articles...") if not article['live'] or article['pmcid'] is None: return @@ -628,7 +626,7 @@ def download_article(article, api_url): xml_response = requests.get(final_url) extracted_data = extractArticleData(xml_response.text) - print("Extracted license and link data: ", extracted_data) + # print("Extracted license and link data: ", extracted_data) if not extracted_data: article['live'] = False From 304ec5d62f08f9e2be3fdb00f2892ec6c6197f34 Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 15 May 2024 11:52:01 -0500 Subject: [PATCH 28/28] parallelized main for loop and added xml filename column --- ai_ta_backend/utils/pubmed_extraction.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index d5806d6a..0008147e 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -41,7 +41,7 @@ def extractPubmedData(): file_list = getFileList(ftp_address, ftp_path, ".gz") with concurrent.futures.ProcessPoolExecutor() as executor: - futures = [executor.submit(processPubmedXML, file, ftp_address, ftp_path) for file in file_list[21:22]] + futures = [executor.submit(processPubmedXML, file, ftp_address, ftp_path) for file in file_list[32:33]] for future in concurrent.futures.as_completed(futures): try: future.result() @@ -95,13 +95,16 @@ def processPubmedXML(file:str, ftp_address:str, ftp_path:str): print("Total articles retrieved: ", len(complete_metadata)) df = pd.DataFrame(complete_metadata) + # add a column for the XML file path + df['xml_filename'] = os.path.basename(xml_filepath) + if os.path.isfile(csv_filepath): df.to_csv(csv_filepath, mode='a', header=False, index=False) else: df.to_csv(csv_filepath, index=False) print("Time taken to extract metadata for 100 articles: ", round(time.time() - metadata_extract_start_time, 2), "seconds") - + exit() print("Time taken to download articles: ", round(time.time() - start_time, 2), "seconds") print("Total metadata extracted: ", len(complete_metadata))
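
Note on the pattern PATCH 28 introduces: the per-file work (processPubmedXML, which downloads the .gz file, extracts the XML, parses metadata, and writes the CSV) is fanned out across worker processes with concurrent.futures.ProcessPoolExecutor, and each future's result is read inside its own try/except so a failure on one baseline file cannot abort the whole batch. A minimal, self-contained sketch of that dispatch shape follows; process_one_file, process_all, the max_workers cap, and the __main__ guard are illustrative additions for this sketch, not code from this PR.

import concurrent.futures

def process_one_file(filename):
    # Illustrative stand-in for the PR's processPubmedXML worker:
    # download the .gz file, extract the XML, parse metadata, upload results.
    return f"done: {filename}"

def process_all(file_list, max_workers=4):
    # Fan each file out to its own worker process and collect results
    # as they complete, in whatever order they finish.
    results = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {executor.submit(process_one_file, f): f for f in file_list}
        for future in concurrent.futures.as_completed(future_to_file):
            filename = future_to_file[future]
            try:
                # Per-future try/except mirrors the loop in extractPubmedData():
                # one failed file logs an error instead of killing the batch.
                results.append(future.result())
            except Exception as e:
                print(f"Error processing {filename}: {e}")
    return results

if __name__ == "__main__":
    # The __main__ guard is required for ProcessPoolExecutor under the
    # spawn start method (the default on macOS and Windows), since worker
    # processes re-import this module.
    print(process_all(["pubmed24n1219.xml.gz", "pubmed24n1218.xml.gz"]))

Capping max_workers is a deliberate choice in this sketch: each worker opens its own FTP connection to ftp.ncbi.nlm.nih.gov, so an unbounded pool risks throttling or dropped connections from the NCBI host.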