From aefeac19b39e8917851e0f4a8644aa4d40a5e75d Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 1 Apr 2024 19:44:34 -0500 Subject: [PATCH 01/28] added functions for metadata extraction --- ai_ta_backend/main.py | 12 ++ ai_ta_backend/utils/pubmed_extraction.py | 172 +++++++++++++++++++++++ 2 files changed, 184 insertions(+) create mode 100644 ai_ta_backend/utils/pubmed_extraction.py diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py index 452792ac..b53206d9 100644 --- a/ai_ta_backend/main.py +++ b/ai_ta_backend/main.py @@ -39,6 +39,7 @@ from ai_ta_backend.service.sentry_service import SentryService from ai_ta_backend.beam.nomic_logging import create_document_map +from ai_ta_backend.utils.pubmed_extraction import extractPubmedData app = Flask(__name__) CORS(app) @@ -343,6 +344,17 @@ def getTopContextsWithMQR(service: RetrievalService, posthog_service: PosthogSer response.headers.add('Access-Control-Allow-Origin', '*') return response +@app.route('/pubmedExtraction', methods=['GET']) +def pubmedExtraction(): + """ + Extracts metadata and download papers from PubMed. + """ + result = extractPubmedData() + + response = jsonify(result) + response.headers.add('Access-Control-Allow-Origin', '*') + return response + def configure(binder: Binder) -> None: binder.bind(RetrievalService, to=RetrievalService, scope=RequestScope) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py new file mode 100644 index 00000000..4be2b4a8 --- /dev/null +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -0,0 +1,172 @@ +import os +import requests +import shutil +import json +import xml.etree.ElementTree as ET +import ftplib +import supabase +import gzip +import time +import concurrent.futures +import urllib.request + + +SUPBASE_CLIENT = supabase.create_client( # type: ignore + supabase_url=os.getenv('SUPABASE_URL'), # type: ignore + supabase_key=os.getenv('SUPABASE_API_KEY') # type: ignore +) + +def extractPubmedData(): + """ + Extracts metadata from the files listed in FTP folder and stores it in SQL DB. + """ + ftp_address = "ftp.ncbi.nlm.nih.gov" + ftp_path = "pubmed/baseline" + file_list = getFileList(ftp_address, ftp_path, ".gz") + + for file in file_list: + # download the .gz file + gz_filepath = downloadFromFTP(ftp_address, ftp_path, file, "pubmed") + print("Downloaded: ", gz_filepath) + + # extract the XML file + xml_filepath = extractXMLFile(gz_filepath) + print("XML Extracted: ", xml_filepath) + + # extract metadata from the XML file + metadata = extractMetadataFromXML(xml_filepath) + + # find PMC ID and DOI for all articles + for article in metadata: + pmid = article['pmid'] + article_ids = getArticleIDs(pmid) + + + # delete XML and .gz files + + + + return "success" + +def downloadFromFTP(ftp_address: str, ftp_path: str, file: str, local_dir: str): + """ + Downloads all .gz files from the FTP folder and stores it in the local directory. + """ + # create local directory if it doesn't exist + os.makedirs(local_dir, exist_ok=True) + + # connect to the FTP server + ftp = ftplib.FTP(ftp_address) + ftp.login() + ftp.cwd(ftp_path) + + local_filepath = os.path.join(local_dir, file) + + with open(local_filepath, 'wb') as f: + ftp.retrbinary('RETR ' + file, f.write) + + print(f"Downloaded {file} to {local_filepath}") + + ftp.quit() + + return "success" + +def getFileList(ftp_address: str, ftp_path: str, extension: str = ".gz"): + """ + Returns a list of files in the FTP folder. 
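+    Illustrative call (baseline archives are assumed to follow this naming scheme):
+        getFileList("ftp.ncbi.nlm.nih.gov", "pubmed/baseline")
+        -> ['pubmed24n1219.xml.gz', 'pubmed24n1218.xml.gz', ...]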
+ """ + # connect to the FTP server + ftp = ftplib.FTP(ftp_address) + ftp.login() + + # Change directory to the specified path + ftp.cwd(ftp_path) + + # Get list of file entries + file_listing = ftp.nlst() + + ftp.quit() + + # Filter for files with the specified extension + gz_files = [entry for entry in file_listing if entry.endswith(extension)] + gz_files.sort(reverse=True) + print(f"Found {len(gz_files)} files on {ftp_address}/{ftp_path}") + + return gz_files + +def extractXMLFile(gz_filepath: str): + """ + Extracts the XML file from the .gz file. + Args: + gz_filepath: Path to the .gz file. + Returns: + xml_filepath: Path to the extracted XML file. + """ + print("gz file path: ", gz_filepath) + xml_filepath = gz_filepath.replace(".gz", "") + with gzip.open(gz_filepath, 'rb') as f_in: + with open(xml_filepath, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + + return xml_filepath + +def extractMetadataFromXML(xml_filepath: str): + """ + Extracts metadata from the XML file and stores it in a dictionary. + Args: + xml_filepath: Path to the XML file. + Returns: + metadata: List of dictionaries containing metadata for each article. + """ + tree = ET.parse(xml_filepath) + root = tree.getroot() + metadata = [] + # Extract metadata from the XML file + for item in root.iter('PubmedArticle'): + article_data = {} + + publication_status = item.find('PubmedData/PublicationStatus').text + # ppublish articles are not present in PMC database + if publication_status == "epublish": + article_data['full_text'] = True + else: + article_data['full_text'] = False + + medline_citation = item.find('MedlineCitation') + article = medline_citation.find('Article') + journal = article.find('Journal') + issue = journal.find('JournalIssue') + + article_data['pmid'] = medline_citation.find('PMID').text + article_data['issn'] = journal.find('ISSN').text + article_data['journal_title'] = journal.find('Title').text + + article_title = article.find('ArticleTitle').text + article_data['article_title'] = article_title.replace('[', '').replace(']', '') + + article_data['last_revised'] = f"{medline_citation.find('DateRevised/Year').text}-{medline_citation.find('DateRevised/Month').text}-{medline_citation.find('DateRevised/Day').text}" + article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}-{issue.find('PubDate/Day').text}" + #article_data['date_completed'] = f"{medline_citation.find('DateCompleted/Year').text}-{medline_citation.find('DateCompleted/Month').text}-{medline_citation.find('DateCompleted/Day').text}" + + # extract and store abstract in a text file + + + + metadata.append(article_data) + + return metadata + +def getArticleIDs(pmid: str): + """ + Retrieves the PMC ID and DOI for an article. 
+ """ + base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" + app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com" + url = base_url + app_details + "&ids=" + id + + response = requests.get(url) + + + + + From b833524a68fc7f249a42a6c11c42d4907f4177aa Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 3 Apr 2024 15:37:15 -0500 Subject: [PATCH 02/28] completed all download functions --- ai_ta_backend/utils/pubmed_extraction.py | 321 +++++++++++++++++++---- 1 file changed, 275 insertions(+), 46 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 4be2b4a8..8daf4e45 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -1,14 +1,17 @@ import os import requests import shutil -import json import xml.etree.ElementTree as ET import ftplib import supabase import gzip -import time import concurrent.futures -import urllib.request +from urllib.parse import urlparse +import tarfile +import os +import shutil +from minio import Minio + SUPBASE_CLIENT = supabase.create_client( # type: ignore @@ -20,26 +23,45 @@ def extractPubmedData(): """ Extracts metadata from the files listed in FTP folder and stores it in SQL DB. """ - ftp_address = "ftp.ncbi.nlm.nih.gov" - ftp_path = "pubmed/baseline" - file_list = getFileList(ftp_address, ftp_path, ".gz") + xml_filepath = "pubmed/pubmed24n1219.xml" + metadata = extractMetadataFromXML(xml_filepath) + + # find PMC ID and DOI for all articles + metadata_with_ids = getArticleIDs(metadata) + + # download the articles + complete_metadata = downloadArticles(metadata_with_ids) + + print("Complete metadata: ", complete_metadata) + + # upload articles to bucket + article_upload = uploadToStorage("pubmed_abstracts") + + # upload metadata to SQL DB - for file in file_list: - # download the .gz file - gz_filepath = downloadFromFTP(ftp_address, ftp_path, file, "pubmed") - print("Downloaded: ", gz_filepath) + + + # ftp_address = "ftp.ncbi.nlm.nih.gov" + # ftp_path = "pubmed/baseline" + # file_list = getFileList(ftp_address, ftp_path, ".gz") - # extract the XML file - xml_filepath = extractXMLFile(gz_filepath) - print("XML Extracted: ", xml_filepath) + # for file in file_list: + # # download the .gz file + # gz_filepath = downloadFromFTP(ftp_address, ftp_path, file, "pubmed") + # print("Downloaded: ", gz_filepath) - # extract metadata from the XML file - metadata = extractMetadataFromXML(xml_filepath) + # # extract the XML file + # xml_filepath = extractXMLFile(gz_filepath) + # print("XML Extracted: ", xml_filepath) - # find PMC ID and DOI for all articles - for article in metadata: - pmid = article['pmid'] - article_ids = getArticleIDs(pmid) + # # extract metadata from the XML file + # xml_filepath = "pubmed/pubmed24n1219.xml" + # metadata = extractMetadataFromXML(xml_filepath) + + # # find PMC ID and DOI for all articles + # for article in metadata: + # pmid = article['pmid'] + # article_ids = getArticleIDs(pmid) # delete XML and .gz files @@ -50,7 +72,7 @@ def extractPubmedData(): def downloadFromFTP(ftp_address: str, ftp_path: str, file: str, local_dir: str): """ - Downloads all .gz files from the FTP folder and stores it in the local directory. + Downloads a .gz file from the FTP folder and stores it in the local directory. 
""" # create local directory if it doesn't exist os.makedirs(local_dir, exist_ok=True) @@ -61,19 +83,17 @@ def downloadFromFTP(ftp_address: str, ftp_path: str, file: str, local_dir: str): ftp.cwd(ftp_path) local_filepath = os.path.join(local_dir, file) - with open(local_filepath, 'wb') as f: ftp.retrbinary('RETR ' + file, f.write) print(f"Downloaded {file} to {local_filepath}") ftp.quit() - - return "success" + return local_filepath def getFileList(ftp_address: str, ftp_path: str, extension: str = ".gz"): """ - Returns a list of files in the FTP folder. + Returns a list of .gz files in the FTP folder. """ # connect to the FTP server ftp = ftplib.FTP(ftp_address) @@ -118,53 +138,262 @@ def extractMetadataFromXML(xml_filepath: str): Returns: metadata: List of dictionaries containing metadata for each article. """ + # create a directory to store abstracts + os.makedirs("pubmed_abstracts", exist_ok=True) + tree = ET.parse(xml_filepath) root = tree.getroot() metadata = [] + # Extract metadata from the XML file for item in root.iter('PubmedArticle'): article_data = {} - publication_status = item.find('PubmedData/PublicationStatus').text - # ppublish articles are not present in PMC database - if publication_status == "epublish": - article_data['full_text'] = True - else: - article_data['full_text'] = False - medline_citation = item.find('MedlineCitation') article = medline_citation.find('Article') journal = article.find('Journal') issue = journal.find('JournalIssue') - article_data['pmid'] = medline_citation.find('PMID').text - article_data['issn'] = journal.find('ISSN').text - article_data['journal_title'] = journal.find('Title').text + if medline_citation.find('PMID') is not None: + article_data['pmid'] = medline_citation.find('PMID').text + article_data['pmcid'] = None + article_data['doi'] = None + else: + continue - article_title = article.find('ArticleTitle').text - article_data['article_title'] = article_title.replace('[', '').replace(']', '') + if journal.find('ISSN') is not None: + article_data['issn'] = journal.find('ISSN').text + else: + article_data['issn'] = None - article_data['last_revised'] = f"{medline_citation.find('DateRevised/Year').text}-{medline_citation.find('DateRevised/Month').text}-{medline_citation.find('DateRevised/Day').text}" - article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}-{issue.find('PubDate/Day').text}" - #article_data['date_completed'] = f"{medline_citation.find('DateCompleted/Year').text}-{medline_citation.find('DateCompleted/Month').text}-{medline_citation.find('DateCompleted/Day').text}" + if journal.find('Title') is not None: + article_data['journal_title'] = journal.find('Title').text + else: + article_data['journal_title'] = None + + # some articles don't have an article title + article_title = article.find('ArticleTitle') + if article_title is not None and article_title.text is not None: + article_data['article_title'] = article_title.text.replace('[', '').replace(']', '') + else: + article_data['article_title'] = None + article_data['last_revised'] = f"{medline_citation.find('DateRevised/Year').text}-{medline_citation.find('DateRevised/Month').text}-{medline_citation.find('DateRevised/Day').text}" + + # some articles don't have all fields present for publication date + if issue.find('PubDate/Year') is not None and issue.find('PubDate/Month') is not None and issue.find('PubDate/Day') is not None: + article_data['published'] = 
f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}-{issue.find('PubDate/Day').text}" + elif issue.find('PubDate/Year') is not None and issue.find('PubDate/Month') is not None: + article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}" + elif issue.find('PubDate/Year') is not None: + article_data['published'] = f"{issue.find('PubDate/Year').text}" + else: + article_data['published'] = None + + # extract and store abstract in a text file + abstract = article.find('Abstract') + if abstract is not None: + abstract_text = "" + for abstract_text_element in abstract.iter('AbstractText'): + # if labels (objective, methods, etc.) are present, add them to the text (e.g. "OBJECTIVE: ") + if abstract_text_element.attrib.get('Label') is not None: + abstract_text += abstract_text_element.attrib.get('Label') + ": " + if abstract_text_element.text is not None: + abstract_text += abstract_text_element.text + "\n" + + # save abstract to a text file + abstract_filename = f"pubmed_abstracts/{article_data['pmid']}.txt" + with open(abstract_filename, 'w') as f: + if article_data['article_title']: + f.write("Article title: " + article_data['article_title'] + "\n") + if article_data['journal_title']: + f.write("Journal title: " + article_data['journal_title'] + "\n") + f.write("Abstract: " + abstract_text) - + # some articles are listed, but not released online yet. Adding fields for such articles to maintain uniformity. + article_data['live'] = True + article_data['release_date'] = None + article_data['license'] = None + article_data['pubmed_ftp_link'] = None + article_data['filepath'] = abstract_filename metadata.append(article_data) - + if len(metadata) == 300: + return metadata return metadata -def getArticleIDs(pmid: str): +def getArticleIDs(metadata: list): """ - Retrieves the PMC ID and DOI for an article. + Retrieves the PMC ID and DOI for given articles and updates the metadata. """ base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" - app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com" - url = base_url + app_details + "&ids=" + id + app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json" + + batch_size = 200 # maximum number of articles API can process in one request + for i in range(0, len(metadata), batch_size): + batch = metadata[i:i+batch_size] + ids = ",".join([article['pmid'] for article in batch]) + response = requests.get(base_url + app_details + "&ids=" + ids) + data = response.json() + records = data['records'] + + for record in records: + if 'errmsg' in record: + article['live'] = False + continue + else: + # find article with matching pmid and update pmcid, doi, live, and release date fields + for article in batch: + if article['pmid'] == record['pmid']: + article['pmcid'] = record['pmcid'] + article['doi'] = record['doi'] + article['live'] = False if 'live' in record and record['live'] == "false" else True + article['release_date'] = record.get('release-date', article['release_date']) + print("Updated metadata in ID converter: ", article) + break + return metadata + +def downloadArticles(metadata: list): + """ + Downloads articles from PMC and stores them in bucket. + Updates metadata with license information. + """ + + base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?" 
+ print("Downloading articles...") + + # connect to FTP server anonymously + ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov") + ftp.login() + + for article in metadata: + + if article['live'] is False or article['pmcid'] is None: + continue + + # else proceed with download + if article['pmcid']: + # download the article + final_url = base_url + "id=" + article['pmcid'] + print("Downloading: ", final_url) + + xml_response = requests.get(final_url) + extracted_data = extractArticleData(xml_response.text) + + print("\nExtracted data: ", extracted_data) + + # if no data extracted (reason: article not released/open-access), skip to next article + if not extracted_data: + article['live'] = False + continue + + # update metadata with license and ftp link information + article['license'] = extracted_data[0]['license'] + article['pubmed_ftp_link'] = extracted_data[0]['href'] if 'href' in extracted_data[0] else None + + # download the article + ftp_url = urlparse(extracted_data[0]['href']) + ftp_path = ftp_url.path[1:] + print("FTP path: ", ftp_path) + + filename = ftp_path.split("/")[-1] + local_file = os.path.join("pubmed_abstracts", filename) + with open(local_file, 'wb') as f: + ftp.retrbinary('RETR ' + ftp_path, f.write) + print("Downloaded: ", local_file) + article['filepath'] = local_file + + # if file is .tar.gz, extract the PDF and delete the tar.gz file + if filename.endswith(".tar.gz"): + extracted_pdf_paths = extractPDF(local_file) + print("Extracted PDF: ", extracted_pdf_paths) + article['filepath'] = ",".join(extracted_pdf_paths) + os.remove(local_file) + + print("\nUpdated metadata after download: ", article) + ftp.login() + return metadata + +def extractPDF(tar_gz_filepath: str): + """ + Extracts the PDF file from the .tar.gz file. + """ + print("Extracting PDF from: ", tar_gz_filepath) + extracted_paths = [] + with tarfile.open(tar_gz_filepath, "r:gz") as tar: + for member in tar: + if member.isreg() and member.name.endswith(".pdf"): + tar.extract(member, path="pubmed_abstracts") + print("Extracted: ", member.name) + extracted_paths.append(os.path.join("pubmed_abstracts", member.name)) + + return extracted_paths + +def extractArticleData(xml_string: str): + """ + Extracts license information and article download link from the XML response. + """ + root = ET.fromstring(xml_string) + + if root.find(".//error") is not None: + return [] + + records = root.findall(".//record") + extracted_data = [] + href = None + print("In extractArticleData") + for record in records: + record_id = record.get("id") + license = record.get("license") + links = record.findall(".//link") + + for link in links: + if link.get("format") == "pdf": + href = link.get("href") + break + # if PDF link not found, use the available tgz link + if not href: + href = links[0].get("href") + + extracted_data.append({ + "record_id": record_id, + "license": license, + "href": href + }) + + return extracted_data + +def uploadToStorage(filepath: str): + """ + Uploads all files present in given folder to Minio bucket. 
+ """ + minio_client = Minio( + endpoint=os.getenv('MINIO_ENDPOINT'), + access_key=os.getenv('MINIO_ACCESS_KEY'), + secret_key=os.getenv('MINIO_SECRET_KEY'), + secure=False + ) + + bucket_name = "pubmed" + for root, dirs, files in os.walk(filepath): + for file in files: + file_path = os.path.join(root, file) + print("Uploading: ", file_path) + + + return "success" + + + + + + + + + + - response = requests.get(url) From 8f14cf9b5137aeae326c7e3cff24150d562be64b Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 3 Apr 2024 17:23:05 -0500 Subject: [PATCH 03/28] added supabase upsert --- ai_ta_backend/utils/pubmed_extraction.py | 82 ++++++++++++------------ 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 8daf4e45..d3923eaf 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -23,6 +23,17 @@ def extractPubmedData(): """ Extracts metadata from the files listed in FTP folder and stores it in SQL DB. """ + ftp_address = "ftp.ncbi.nlm.nih.gov" + ftp_path = "pubmed/baseline" + file_list = getFileList(ftp_address, ftp_path, ".gz") + + gz_filepath = downloadXML(ftp_address, ftp_path, file_list[0], "pubmed") + print("GZ Downloaded: ", gz_filepath) + + # extract the XML file + xml_filepath = extractXMLFile(gz_filepath) + print("XML Extracted: ", xml_filepath) + xml_filepath = "pubmed/pubmed24n1219.xml" metadata = extractMetadataFromXML(xml_filepath) @@ -31,48 +42,22 @@ def extractPubmedData(): # download the articles complete_metadata = downloadArticles(metadata_with_ids) - print("Complete metadata: ", complete_metadata) # upload articles to bucket article_upload = uploadToStorage("pubmed_abstracts") + print("Uploaded articles: ", article_upload) # upload metadata to SQL DB + response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() + print("Supabase response: ", response) + exit() - - - # ftp_address = "ftp.ncbi.nlm.nih.gov" - # ftp_path = "pubmed/baseline" - # file_list = getFileList(ftp_address, ftp_path, ".gz") - - # for file in file_list: - # # download the .gz file - # gz_filepath = downloadFromFTP(ftp_address, ftp_path, file, "pubmed") - # print("Downloaded: ", gz_filepath) - - # # extract the XML file - # xml_filepath = extractXMLFile(gz_filepath) - # print("XML Extracted: ", xml_filepath) - - # # extract metadata from the XML file - # xml_filepath = "pubmed/pubmed24n1219.xml" - # metadata = extractMetadataFromXML(xml_filepath) - - # # find PMC ID and DOI for all articles - # for article in metadata: - # pmid = article['pmid'] - # article_ids = getArticleIDs(pmid) - - - # delete XML and .gz files - - - return "success" -def downloadFromFTP(ftp_address: str, ftp_path: str, file: str, local_dir: str): +def downloadXML(ftp_address: str, ftp_path: str, file: str, local_dir: str): """ - Downloads a .gz file from the FTP folder and stores it in the local directory. + Downloads a .gz XML file from the FTP baseline folder and stores it in the local directory. 
""" # create local directory if it doesn't exist os.makedirs(local_dir, exist_ok=True) @@ -190,7 +175,6 @@ def extractMetadataFromXML(xml_filepath: str): else: article_data['published'] = None - # extract and store abstract in a text file abstract = article.find('Abstract') if abstract is not None: @@ -219,7 +203,7 @@ def extractMetadataFromXML(xml_filepath: str): article_data['filepath'] = abstract_filename metadata.append(article_data) - if len(metadata) == 300: + if len(metadata) == 20: return metadata return metadata @@ -240,7 +224,11 @@ def getArticleIDs(metadata: list): for record in records: if 'errmsg' in record: - article['live'] = False + print("Error: ", record['errmsg']) + for article in batch: + if article['pmid'] == record['pmid']: + article['live'] = False + break continue else: # find article with matching pmid and update pmcid, doi, live, and release date fields @@ -368,20 +356,30 @@ def uploadToStorage(filepath: str): """ Uploads all files present in given folder to Minio bucket. """ - minio_client = Minio( - endpoint=os.getenv('MINIO_ENDPOINT'), - access_key=os.getenv('MINIO_ACCESS_KEY'), - secret_key=os.getenv('MINIO_SECRET_KEY'), + print("in uploadToStorage()") + + minio_client = Minio(os.environ['MINIO_URL'], + access_key=os.environ['MINIO_ACCESS_KEY'], + secret_key=os.environ['MINIO_SECRET_KEY'], secure=False ) bucket_name = "pubmed" + found = minio_client.bucket_exists(bucket_name) + if not found: + minio_client.make_bucket(bucket_name) + print("Created bucket", bucket_name) + else: + print("Bucket", bucket_name, "already exists") + for root, dirs, files in os.walk(filepath): + # can parallelize this upload for file in files: file_path = os.path.join(root, file) - print("Uploading: ", file_path) - - + object_name = file_path.split("/")[-1] + # insert local file into remote bucket + minio_client.fput_object(bucket_name, object_name, file_path) + print("Uploaded: ", object_name) return "success" From 6daa4df762021379bf75770ff5076edde470e736 Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 3 Apr 2024 18:30:36 -0500 Subject: [PATCH 04/28] updated comments --- ai_ta_backend/utils/pubmed_extraction.py | 98 +++++++++++++++++------- 1 file changed, 69 insertions(+), 29 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index d3923eaf..07adf0b4 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -19,14 +19,20 @@ supabase_key=os.getenv('SUPABASE_API_KEY') # type: ignore ) +MINIO_CLIENT = Minio(os.environ['MINIO_URL'], + access_key=os.environ['MINIO_ACCESS_KEY'], + secret_key=os.environ['MINIO_SECRET_KEY'], + secure=False +) + def extractPubmedData(): """ - Extracts metadata from the files listed in FTP folder and stores it in SQL DB. + Main function to extract metadata and articles from the PubMed baseline folder. """ ftp_address = "ftp.ncbi.nlm.nih.gov" ftp_path = "pubmed/baseline" file_list = getFileList(ftp_address, ftp_path, ".gz") - + gz_filepath = downloadXML(ftp_address, ftp_path, file_list[0], "pubmed") print("GZ Downloaded: ", gz_filepath) @@ -58,6 +64,13 @@ def extractPubmedData(): def downloadXML(ftp_address: str, ftp_path: str, file: str, local_dir: str): """ Downloads a .gz XML file from the FTP baseline folder and stores it in the local directory. + Args: + ftp_address: FTP server address. + ftp_path: Path to the FTP folder. + file: File to download. + local_dir: Local directory to store the downloaded file. 
+ Returns: + local_filepath: Path to the downloaded file. """ # create local directory if it doesn't exist os.makedirs(local_dir, exist_ok=True) @@ -78,7 +91,13 @@ def downloadXML(ftp_address: str, ftp_path: str, file: str, local_dir: str): def getFileList(ftp_address: str, ftp_path: str, extension: str = ".gz"): """ - Returns a list of .gz files in the FTP folder. + Returns a list of .gz files in the FTP baseline folder. + Args: + ftp_address: FTP server address. + ftp_path: Path to the FTP folder. + extension: File extension to filter for. + Returns: + gz_files: List of .gz files in the FTP folder. """ # connect to the FTP server ftp = ftplib.FTP(ftp_address) @@ -107,7 +126,7 @@ def extractXMLFile(gz_filepath: str): Returns: xml_filepath: Path to the extracted XML file. """ - print("gz file path: ", gz_filepath) + print("Downloaded .gz file path: ", gz_filepath) xml_filepath = gz_filepath.replace(".gz", "") with gzip.open(gz_filepath, 'rb') as f_in: with open(xml_filepath, 'wb') as f_out: @@ -117,7 +136,9 @@ def extractXMLFile(gz_filepath: str): def extractMetadataFromXML(xml_filepath: str): """ - Extracts metadata from the XML file and stores it in a dictionary. + Extracts article details from the XML file and stores it in a dictionary. + Details extracted: PMID, PMCID, DOI, ISSN, journal title, article title, + last revised date, published date, abstract. Args: xml_filepath: Path to the XML file. Returns: @@ -130,6 +151,7 @@ def extractMetadataFromXML(xml_filepath: str): root = tree.getroot() metadata = [] + # PARALLELIZE THE BELOW FOR LOOP AND EXTRACT METADATA FOR ALL ARTICLES AT ONCE - IN 1000s # Extract metadata from the XML file for item in root.iter('PubmedArticle'): article_data = {} @@ -209,7 +231,13 @@ def extractMetadataFromXML(xml_filepath: str): def getArticleIDs(metadata: list): """ - Retrieves the PMC ID and DOI for given articles and updates the metadata. + Uses the PubMed ID converter API to get PMCID and DOI for each article. + Queries the API in batches of 200 articles at a time. + Also updates the metadata with the release date and live status - some articles are yet to be released. + Args: + metadata: List of dictionaries containing metadata for each article. + Returns: + metadata: Updated metadata with PMCID, DOI, release date, and live status information. """ base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json" @@ -221,7 +249,8 @@ def getArticleIDs(metadata: list): response = requests.get(base_url + app_details + "&ids=" + ids) data = response.json() records = data['records'] - + + # PARALLELIZE THIS FOR LOOP - UPDATES ADDITIONAL FIELDS FOR ALL ARTICLES AT ONCE for record in records: if 'errmsg' in record: print("Error: ", record['errmsg']) @@ -244,8 +273,11 @@ def getArticleIDs(metadata: list): def downloadArticles(metadata: list): """ - Downloads articles from PMC and stores them in bucket. - Updates metadata with license information. + Downloads articles from PMC and stores them in local directory. + Args: + metadata: List of dictionaries containing metadata for each article. + Returns: + metadata: Updated metadata with license, FTP link, and downloaded filepath information. """ base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?" 
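    # A successful response record is assumed to have the shape that
    # extractArticleData() parses below, roughly:
    #   <record id="PMC1234567" license="CC BY">
    #     <link format="tgz" href="ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/..."/>
    #   </record>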
@@ -255,6 +287,7 @@ def downloadArticles(metadata: list): ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov") ftp.login() + # PARALLELIZE THIS FOR LOOP - DOWNLOAD + METADATA UPDATE for article in metadata: if article['live'] is False or article['pmcid'] is None: @@ -262,14 +295,15 @@ def downloadArticles(metadata: list): # else proceed with download if article['pmcid']: - # download the article + # query URL for article download final_url = base_url + "id=" + article['pmcid'] - print("Downloading: ", final_url) + print("Download URL: ", final_url) xml_response = requests.get(final_url) + # get license and FTP link extracted_data = extractArticleData(xml_response.text) - print("\nExtracted data: ", extracted_data) + print("\nExtracted license and link data: ", extracted_data) # if no data extracted (reason: article not released/open-access), skip to next article if not extracted_data: @@ -289,23 +323,28 @@ def downloadArticles(metadata: list): local_file = os.path.join("pubmed_abstracts", filename) with open(local_file, 'wb') as f: ftp.retrbinary('RETR ' + ftp_path, f.write) - print("Downloaded: ", local_file) + print("Downloaded PDF file: ", local_file) article['filepath'] = local_file # if file is .tar.gz, extract the PDF and delete the tar.gz file if filename.endswith(".tar.gz"): extracted_pdf_paths = extractPDF(local_file) - print("Extracted PDF: ", extracted_pdf_paths) + print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) article['filepath'] = ",".join(extracted_pdf_paths) os.remove(local_file) print("\nUpdated metadata after download: ", article) - ftp.login() + ftp.quit() return metadata def extractPDF(tar_gz_filepath: str): """ - Extracts the PDF file from the .tar.gz file. + Extracts PDF files from the downloaded .tar.gz file. The zipped folder contains other supplementary + materials like images, etc. which are not extracted. + Args: + tar_gz_filepath: Path to the .tar.gz file. + Returns: + extracted_paths: List of paths to the extracted PDF files. """ print("Extracting PDF from: ", tar_gz_filepath) extracted_paths = [] @@ -321,18 +360,25 @@ def extractPDF(tar_gz_filepath: str): def extractArticleData(xml_string: str): """ Extracts license information and article download link from the XML response. + This function process XML response for single article. + Args: + xml_string: XML response from PMC download API. + Returns: + extracted_data: List of dictionaries containing license and download link for the article. """ - root = ET.fromstring(xml_string) + print("In extractArticleData") + root = ET.fromstring(xml_string) + # if there is an errors (article not open-access), return empty list (skip article) if root.find(".//error") is not None: return [] records = root.findall(".//record") extracted_data = [] href = None - print("In extractArticleData") + for record in records: - record_id = record.get("id") + record_id = record.get("id") # pmcid license = record.get("license") links = record.findall(".//link") @@ -354,20 +400,14 @@ def extractArticleData(xml_string: str): def uploadToStorage(filepath: str): """ - Uploads all files present in given folder to Minio bucket. + Uploads all files present under given filepath to Minio bucket. 
""" print("in uploadToStorage()") - minio_client = Minio(os.environ['MINIO_URL'], - access_key=os.environ['MINIO_ACCESS_KEY'], - secret_key=os.environ['MINIO_SECRET_KEY'], - secure=False - ) - bucket_name = "pubmed" - found = minio_client.bucket_exists(bucket_name) + found = MINIO_CLIENT.bucket_exists(bucket_name) if not found: - minio_client.make_bucket(bucket_name) + MINIO_CLIENT.make_bucket(bucket_name) print("Created bucket", bucket_name) else: print("Bucket", bucket_name, "already exists") @@ -378,7 +418,7 @@ def uploadToStorage(filepath: str): file_path = os.path.join(root, file) object_name = file_path.split("/")[-1] # insert local file into remote bucket - minio_client.fput_object(bucket_name, object_name, file_path) + MINIO_CLIENT.fput_object(bucket_name, object_name, file_path) print("Uploaded: ", object_name) return "success" From c8c90560aae8e4192d923548f3aa4d731a1dd9c5 Mon Sep 17 00:00:00 2001 From: star-nox Date: Thu, 4 Apr 2024 15:54:34 -0500 Subject: [PATCH 05/28] added processpool to extractMetadataFromXML() --- ai_ta_backend/utils/pubmed_extraction.py | 153 ++++++++++++++++++++--- 1 file changed, 133 insertions(+), 20 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 07adf0b4..0ab4ade3 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -11,6 +11,7 @@ import os import shutil from minio import Minio +import time @@ -29,19 +30,27 @@ def extractPubmedData(): """ Main function to extract metadata and articles from the PubMed baseline folder. """ + start_time = time.time() + ftp_address = "ftp.ncbi.nlm.nih.gov" ftp_path = "pubmed/baseline" file_list = getFileList(ftp_address, ftp_path, ".gz") gz_filepath = downloadXML(ftp_address, ftp_path, file_list[0], "pubmed") print("GZ Downloaded: ", gz_filepath) + print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") # extract the XML file xml_filepath = extractXMLFile(gz_filepath) print("XML Extracted: ", xml_filepath) + print("Time taken to extract XML file: ", round(time.time() - start_time, 2), "seconds") xml_filepath = "pubmed/pubmed24n1219.xml" metadata = extractMetadataFromXML(xml_filepath) + print("Number of articles found in this file: ", len(metadata)) + print("\nSample metadata: ", metadata) + print("\n\nTime taken to extract metadata: ", round(time.time() - start_time, 2), "seconds") + exit() # find PMC ID and DOI for all articles metadata_with_ids = getArticleIDs(metadata) @@ -55,10 +64,9 @@ def extractPubmedData(): print("Uploaded articles: ", article_upload) # upload metadata to SQL DB - response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() + response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore print("Supabase response: ", response) - exit() - + return "success" def downloadXML(ftp_address: str, ftp_path: str, file: str, local_dir: str): @@ -144,16 +152,121 @@ def extractMetadataFromXML(xml_filepath: str): Returns: metadata: List of dictionaries containing metadata for each article. 
""" + print("inside extractMetadataFromXML()") + # create a directory to store abstracts os.makedirs("pubmed_abstracts", exist_ok=True) tree = ET.parse(xml_filepath) root = tree.getroot() metadata = [] + + + with concurrent.futures.ProcessPoolExecutor() as executor: + futures = [] + article_items = list(item for item in root.iter('PubmedArticle')) # Convert generator to list + total_items = len(article_items) # Use len() since article_items is now a list + article_items_100 = (article_items[i:i+50] for i in range(0, total_items, 50)) + for chunk in article_items_100: + for item in chunk: + future = executor.submit(processArticleItem, item) + futures.append(future) + + for future in concurrent.futures.as_completed(futures): + article_data = future.result() + metadata.append(article_data) + + print("Extracted metadata for 20 articles: ", metadata[:20]) + print("Total articles extracted: ", len(metadata)) + return metadata - # PARALLELIZE THE BELOW FOR LOOP AND EXTRACT METADATA FOR ALL ARTICLES AT ONCE - IN 1000s - # Extract metadata from the XML file - for item in root.iter('PubmedArticle'): + # # PARALLELIZE THE BELOW FOR LOOP AND EXTRACT METADATA FOR ALL ARTICLES AT ONCE - IN 1000s + # # Extract metadata from the XML file + # for item in root.iter('PubmedArticle'): + # article_data = {} + + # medline_citation = item.find('MedlineCitation') + # article = medline_citation.find('Article') + # journal = article.find('Journal') + # issue = journal.find('JournalIssue') + + # if medline_citation.find('PMID') is not None: + # article_data['pmid'] = medline_citation.find('PMID').text + # article_data['pmcid'] = None + # article_data['doi'] = None + # else: + # continue + + # if journal.find('ISSN') is not None: + # article_data['issn'] = journal.find('ISSN').text + # else: + # article_data['issn'] = None + + # if journal.find('Title') is not None: + # article_data['journal_title'] = journal.find('Title').text + # else: + # article_data['journal_title'] = None + + # # some articles don't have an article title + # article_title = article.find('ArticleTitle') + # if article_title is not None and article_title.text is not None: + # article_data['article_title'] = article_title.text.replace('[', '').replace(']', '') + # else: + # article_data['article_title'] = None + + # article_data['last_revised'] = f"{medline_citation.find('DateRevised/Year').text}-{medline_citation.find('DateRevised/Month').text}-{medline_citation.find('DateRevised/Day').text}" + + # # some articles don't have all fields present for publication date + # if issue.find('PubDate/Year') is not None and issue.find('PubDate/Month') is not None and issue.find('PubDate/Day') is not None: + # article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}-{issue.find('PubDate/Day').text}" + # elif issue.find('PubDate/Year') is not None and issue.find('PubDate/Month') is not None: + # article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}" + # elif issue.find('PubDate/Year') is not None: + # article_data['published'] = f"{issue.find('PubDate/Year').text}" + # else: + # article_data['published'] = None + + # # extract and store abstract in a text file + # abstract = article.find('Abstract') + # if abstract is not None: + # abstract_text = "" + # for abstract_text_element in abstract.iter('AbstractText'): + # # if labels (objective, methods, etc.) are present, add them to the text (e.g. 
"OBJECTIVE: ") + # if abstract_text_element.attrib.get('Label') is not None: + # abstract_text += abstract_text_element.attrib.get('Label') + ": " + # if abstract_text_element.text is not None: + # abstract_text += abstract_text_element.text + "\n" + + # # save abstract to a text file + # abstract_filename = f"pubmed_abstracts/{article_data['pmid']}.txt" + # with open(abstract_filename, 'w') as f: + # if article_data['article_title']: + # f.write("Article title: " + article_data['article_title'] + "\n") + # if article_data['journal_title']: + # f.write("Journal title: " + article_data['journal_title'] + "\n") + # f.write("Abstract: " + abstract_text) + + # # some articles are listed, but not released online yet. Adding fields for such articles to maintain uniformity. + # article_data['live'] = True + # article_data['release_date'] = None + # article_data['license'] = None + # article_data['pubmed_ftp_link'] = None + # article_data['filepath'] = abstract_filename + + # metadata.append(article_data) + # if len(metadata) == 20: + # return metadata + # return metadata + +def processArticleItem(item: ET.Element): + """ + Extracts article details from a single PubmedArticle XML element. This is used in the process pool executor. + Args: + item: PubmedArticle XML element. + Returns: + article_data: Dictionary containing metadata for the article. + """ + try: article_data = {} medline_citation = item.find('MedlineCitation') @@ -166,7 +279,7 @@ def extractMetadataFromXML(xml_filepath: str): article_data['pmcid'] = None article_data['doi'] = None else: - continue + return article_data if journal.find('ISSN') is not None: article_data['issn'] = journal.find('ISSN').text @@ -177,7 +290,7 @@ def extractMetadataFromXML(xml_filepath: str): article_data['journal_title'] = journal.find('Title').text else: article_data['journal_title'] = None - + # some articles don't have an article title article_title = article.find('ArticleTitle') if article_title is not None and article_title.text is not None: @@ -186,7 +299,7 @@ def extractMetadataFromXML(xml_filepath: str): article_data['article_title'] = None article_data['last_revised'] = f"{medline_citation.find('DateRevised/Year').text}-{medline_citation.find('DateRevised/Month').text}-{medline_citation.find('DateRevised/Day').text}" - + # some articles don't have all fields present for publication date if issue.find('PubDate/Year') is not None and issue.find('PubDate/Month') is not None and issue.find('PubDate/Day') is not None: article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}-{issue.find('PubDate/Day').text}" @@ -196,9 +309,10 @@ def extractMetadataFromXML(xml_filepath: str): article_data['published'] = f"{issue.find('PubDate/Year').text}" else: article_data['published'] = None - + # extract and store abstract in a text file abstract = article.find('Abstract') + abstract_filename = None if abstract is not None: abstract_text = "" for abstract_text_element in abstract.iter('AbstractText'): @@ -207,27 +321,26 @@ def extractMetadataFromXML(xml_filepath: str): abstract_text += abstract_text_element.attrib.get('Label') + ": " if abstract_text_element.text is not None: abstract_text += abstract_text_element.text + "\n" - + # save abstract to a text file abstract_filename = f"pubmed_abstracts/{article_data['pmid']}.txt" with open(abstract_filename, 'w') as f: - if article_data['article_title']: - f.write("Article title: " + article_data['article_title'] + "\n") if article_data['journal_title']: - f.write("Journal title: 
" + article_data['journal_title'] + "\n") + f.write("Journal title: " + article_data['journal_title'] + "\n\n") + if article_data['article_title']: + f.write("Article title: " + article_data['article_title'] + "\n\n") f.write("Abstract: " + abstract_text) - - # some articles are listed, but not released online yet. Adding fields for such articles to maintain uniformity. + + # some articles are listed, but not released yet. Adding fields for such articles to maintain uniformity. article_data['live'] = True article_data['release_date'] = None article_data['license'] = None article_data['pubmed_ftp_link'] = None article_data['filepath'] = abstract_filename - metadata.append(article_data) - if len(metadata) == 20: - return metadata - return metadata + return article_data + except Exception as e: + return {'error': str(e)} def getArticleIDs(metadata: list): """ From f8a4d23da927fd1ef670f71150308d8ad3d3761d Mon Sep 17 00:00:00 2001 From: star-nox Date: Sat, 6 Apr 2024 11:58:13 -0500 Subject: [PATCH 06/28] minor changes --- ai_ta_backend/utils/pubmed_extraction.py | 124 +++++++++++++++++------ 1 file changed, 91 insertions(+), 33 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 0ab4ade3..6e5ebe89 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -12,6 +12,8 @@ import shutil from minio import Minio import time +from multiprocessing import Manager + @@ -46,26 +48,28 @@ def extractPubmedData(): print("Time taken to extract XML file: ", round(time.time() - start_time, 2), "seconds") xml_filepath = "pubmed/pubmed24n1219.xml" - metadata = extractMetadataFromXML(xml_filepath) - print("Number of articles found in this file: ", len(metadata)) - print("\nSample metadata: ", metadata) - print("\n\nTime taken to extract metadata: ", round(time.time() - start_time, 2), "seconds") - exit() - - # find PMC ID and DOI for all articles - metadata_with_ids = getArticleIDs(metadata) - - # download the articles - complete_metadata = downloadArticles(metadata_with_ids) - print("Complete metadata: ", complete_metadata) - # upload articles to bucket - article_upload = uploadToStorage("pubmed_abstracts") - print("Uploaded articles: ", article_upload) - - # upload metadata to SQL DB - response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore - print("Supabase response: ", response) + for metadata in extractMetadataFromXML(xml_filepath): + print("Total articles retrieved: ", len(metadata)) + print("Time taken to extract metadata for 2000 articles: ", round(time.time() - start_time, 2), "seconds") + + # find PMC ID and DOI for all articles + metadata_with_ids = getArticleIDs(metadata) + print("Time taken to get PMC ID and DOI for 2000 articles: ", round(time.time() - start_time, 2), "seconds") + + # download the articles + complete_metadata = downloadArticles(metadata_with_ids) + print("Time taken to download articles for 2000 articles: ", round(time.time() - start_time, 2), "seconds") + print("Complete metadata: ", complete_metadata[:20]) + + # upload articles to bucket + # article_upload = uploadToStorage("pubmed_abstracts") + # print("Uploaded articles: ", article_upload) + + # upload metadata to SQL DB + response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore + print("Supabase response: ", response) + exit() return "success" @@ -165,20 +169,23 @@ def extractMetadataFromXML(xml_filepath: str): with 
concurrent.futures.ProcessPoolExecutor() as executor: futures = [] article_items = list(item for item in root.iter('PubmedArticle')) # Convert generator to list - total_items = len(article_items) # Use len() since article_items is now a list - article_items_100 = (article_items[i:i+50] for i in range(0, total_items, 50)) - for chunk in article_items_100: - for item in chunk: - future = executor.submit(processArticleItem, item) - futures.append(future) - - for future in concurrent.futures.as_completed(futures): - article_data = future.result() - metadata.append(article_data) + + for item in article_items: + future = executor.submit(processArticleItem, item) + article_data = future.result() + + metadata.append(article_data) + + if len(metadata) == 500: + print("collected 500 articles") + return metadata + metadata = [] # reset metadata for next batch + + if metadata: + yield metadata + + print("Metadata extraction complete.") - print("Extracted metadata for 20 articles: ", metadata[:20]) - print("Total articles extracted: ", len(metadata)) - return metadata # # PARALLELIZE THE BELOW FOR LOOP AND EXTRACT METADATA FOR ALL ARTICLES AT ONCE - IN 1000s # # Extract metadata from the XML file @@ -352,17 +359,45 @@ def getArticleIDs(metadata: list): Returns: metadata: Updated metadata with PMCID, DOI, release date, and live status information. """ + print("In getArticleIDs()") base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json" batch_size = 200 # maximum number of articles API can process in one request + + # # Create a shared list using multiprocessing.Manager + # manager = Manager() + # shared_metadata = manager.list(metadata) # Copy initial metadata into the shared list + + # for i in range(0, len(metadata), batch_size): + # batch = metadata[i:i+batch_size] + # ids = ",".join([article['pmid'] for article in batch]) + # response = requests.get(base_url + app_details + "&ids=" + ids) + # data = response.json() + # records = data['records'] + + # with concurrent.futures.ProcessPoolExecutor() as executor: + # futures = [] + # for record in records: + # future = executor.submit(updateArticleMetadata, shared_metadata, record) + # futures.append(future) + + # # process results from parallel tasks + # for future in futures: + # try: + # future.result() + # except Exception as e: + # print(f"Error updating metadata for article: {e}") + + # print("Updated metadata in ID converter: ", len(shared_metadata)) + # return shared_metadata + for i in range(0, len(metadata), batch_size): batch = metadata[i:i+batch_size] ids = ",".join([article['pmid'] for article in batch]) response = requests.get(base_url + app_details + "&ids=" + ids) data = response.json() records = data['records'] - # PARALLELIZE THIS FOR LOOP - UPDATES ADDITIONAL FIELDS FOR ALL ARTICLES AT ONCE for record in records: if 'errmsg' in record: @@ -384,6 +419,29 @@ def getArticleIDs(metadata: list): break return metadata +def updateArticleMetadata(shared_metadata: list, record: dict): + """ + Updates metadata with PMCID, DOI, release date, and live status information for given article. 
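+    Args:
+        shared_metadata: Manager-backed list shared across worker processes.
+        record: A single record from the ID converter API response.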
+ """ + if 'errmsg' in record: + print("Error: ", record['errmsg']) + for article in shared_metadata: + if article['pmid'] == record['pmid']: + article['live'] = False + break + + else: + # find article with matching pmid and update pmcid, doi, live, and release date fields + for article in shared_metadata: + if article['pmid'] == record['pmid']: + article['pmcid'] = record['pmcid'] + article['doi'] = record['doi'] + article['live'] = False if 'live' in record and record['live'] == "false" else True + article['release_date'] = record.get('release-date', article['release_date']) + print("Updated metadata in ID converter: ", article) + break + + def downloadArticles(metadata: list): """ Downloads articles from PMC and stores them in local directory. From fa83ddfe76fd6ddbb8bd9ae192f6bcba87354382 Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 8 Apr 2024 11:55:11 -0500 Subject: [PATCH 07/28] yielded metadata after collecting 100 articles --- ai_ta_backend/utils/pubmed_extraction.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 6e5ebe89..4c36ad73 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -51,11 +51,11 @@ def extractPubmedData(): for metadata in extractMetadataFromXML(xml_filepath): print("Total articles retrieved: ", len(metadata)) - print("Time taken to extract metadata for 2000 articles: ", round(time.time() - start_time, 2), "seconds") + print("Time taken to extract metadata for 100 articles: ", round(time.time() - start_time, 2), "seconds") # find PMC ID and DOI for all articles metadata_with_ids = getArticleIDs(metadata) - print("Time taken to get PMC ID and DOI for 2000 articles: ", round(time.time() - start_time, 2), "seconds") + print("Time taken to get PMC ID and DOI for 100 articles: ", round(time.time() - start_time, 2), "seconds") # download the articles complete_metadata = downloadArticles(metadata_with_ids) @@ -63,8 +63,8 @@ def extractPubmedData(): print("Complete metadata: ", complete_metadata[:20]) # upload articles to bucket - # article_upload = uploadToStorage("pubmed_abstracts") - # print("Uploaded articles: ", article_upload) + article_upload = uploadToStorage("pubmed_abstracts") + print("/n/nUploaded articles: ", article_upload) # upload metadata to SQL DB response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore @@ -176,9 +176,9 @@ def extractMetadataFromXML(xml_filepath: str): metadata.append(article_data) - if len(metadata) == 500: - print("collected 500 articles") - return metadata + if len(metadata) == 100: + print("collected 100 articles") + yield metadata metadata = [] # reset metadata for next batch if metadata: @@ -311,9 +311,9 @@ def processArticleItem(item: ET.Element): if issue.find('PubDate/Year') is not None and issue.find('PubDate/Month') is not None and issue.find('PubDate/Day') is not None: article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}-{issue.find('PubDate/Day').text}" elif issue.find('PubDate/Year') is not None and issue.find('PubDate/Month') is not None: - article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}" + article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}-01" elif issue.find('PubDate/Year') is not None: - article_data['published'] = f"{issue.find('PubDate/Year').text}" + 
article_data['published'] = f"{issue.find('PubDate/Year').text}-01-01" else: article_data['published'] = None From 64a4142833edcc82dbdc855b6cbd799a86e41360 Mon Sep 17 00:00:00 2001 From: star-nox Date: Thu, 11 Apr 2024 11:27:52 -0500 Subject: [PATCH 08/28] storing metadata into csv and upserting per XML file --- ai_ta_backend/utils/pubmed_extraction.py | 611 +++++++++++------------ 1 file changed, 291 insertions(+), 320 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 4c36ad73..a20057f0 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -13,9 +13,9 @@ from minio import Minio import time from multiprocessing import Manager - - - +import pandas as pd +import threading +import json SUPBASE_CLIENT = supabase.create_client( # type: ignore supabase_url=os.getenv('SUPABASE_URL'), # type: ignore @@ -25,7 +25,7 @@ MINIO_CLIENT = Minio(os.environ['MINIO_URL'], access_key=os.environ['MINIO_ACCESS_KEY'], secret_key=os.environ['MINIO_SECRET_KEY'], - secure=False + secure=True ) def extractPubmedData(): @@ -38,20 +38,23 @@ def extractPubmedData(): ftp_path = "pubmed/baseline" file_list = getFileList(ftp_address, ftp_path, ".gz") - gz_filepath = downloadXML(ftp_address, ftp_path, file_list[0], "pubmed") + gz_filepath = downloadXML(ftp_address, ftp_path, file_list[1], "pubmed") print("GZ Downloaded: ", gz_filepath) - print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") + gz_file_download_time = round(time.time() - start_time, 2) + print("Time taken to download .gz file: ", gz_file_download_time, "seconds") # extract the XML file + if not gz_filepath: + return "failure" xml_filepath = extractXMLFile(gz_filepath) print("XML Extracted: ", xml_filepath) - print("Time taken to extract XML file: ", round(time.time() - start_time, 2), "seconds") - - xml_filepath = "pubmed/pubmed24n1219.xml" + xml_extract_time = round(time.time() - gz_file_download_time, 2) + print("Time taken to extract XML file: ", xml_extract_time, "seconds") + + #xml_filepath = "pubmed/pubmed24n1219.xml" for metadata in extractMetadataFromXML(xml_filepath): - print("Total articles retrieved: ", len(metadata)) - print("Time taken to extract metadata for 100 articles: ", round(time.time() - start_time, 2), "seconds") + metadata_extract_start_time = time.time() # find PMC ID and DOI for all articles metadata_with_ids = getArticleIDs(metadata) @@ -59,17 +62,43 @@ def extractPubmedData(): # download the articles complete_metadata = downloadArticles(metadata_with_ids) - print("Time taken to download articles for 2000 articles: ", round(time.time() - start_time, 2), "seconds") - print("Complete metadata: ", complete_metadata[:20]) + + # store metadata in csv file + print("\n") + print("Total articles retrieved: ", len(complete_metadata)) + df = pd.DataFrame(complete_metadata) + csv_filepath = "metadata.csv" + + if os.path.isfile(csv_filepath): + df.to_csv(csv_filepath, mode='a', header=False, index=False) + else: + df.to_csv(csv_filepath, index=False) + + print("Time taken to extract metadata for 100 articles: ", round(time.time() - metadata_extract_start_time, 2), "seconds") + + + print("Time taken to download articles: ", round(time.time() - start_time, 2), "seconds") + print("Total metadata extracted: ", len(complete_metadata)) + + # upload articles to bucket + print("Uploading articles to storage...") + article_upload = uploadToStorage("pubmed_abstracts") + print("Uploaded articles: ", 
article_upload) - # upload articles to bucket - article_upload = uploadToStorage("pubmed_abstracts") - print("/n/nUploaded articles: ", article_upload) - - # upload metadata to SQL DB - response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore - print("Supabase response: ", response) - exit() + # upload metadata to SQL DB + df = pd.read_csv(csv_filepath) + + complete_metadata = df.to_dict('records') + for item in complete_metadata: + for key, value in item.items(): + if pd.isna(value): # Or: math.isnan(value) + item[key] = None + + print("Metadata loaded into dataframe: ", len(complete_metadata)) + # continue with the rest of the code + response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore + print("Supabase response: ", response) + return "success" @@ -84,22 +113,26 @@ def downloadXML(ftp_address: str, ftp_path: str, file: str, local_dir: str): Returns: local_filepath: Path to the downloaded file. """ - # create local directory if it doesn't exist - os.makedirs(local_dir, exist_ok=True) + try: + # create local directory if it doesn't exist + os.makedirs(local_dir, exist_ok=True) - # connect to the FTP server - ftp = ftplib.FTP(ftp_address) - ftp.login() - ftp.cwd(ftp_path) + # connect to the FTP server + ftp = ftplib.FTP(ftp_address) + ftp.login() + ftp.cwd(ftp_path) - local_filepath = os.path.join(local_dir, file) - with open(local_filepath, 'wb') as f: - ftp.retrbinary('RETR ' + file, f.write) - - print(f"Downloaded {file} to {local_filepath}") + local_filepath = os.path.join(local_dir, file) + with open(local_filepath, 'wb') as f: + ftp.retrbinary('RETR ' + file, f.write) + + print(f"Downloaded {file} to {local_filepath}") - ftp.quit() - return local_filepath + ftp.quit() + return local_filepath + except Exception as e: + print("Error downloading file: ", e) + return None def getFileList(ftp_address: str, ftp_path: str, extension: str = ".gz"): """ @@ -111,24 +144,28 @@ def getFileList(ftp_address: str, ftp_path: str, extension: str = ".gz"): Returns: gz_files: List of .gz files in the FTP folder. """ - # connect to the FTP server - ftp = ftplib.FTP(ftp_address) - ftp.login() + try: + # connect to the FTP server + ftp = ftplib.FTP(ftp_address) + ftp.login() - # Change directory to the specified path - ftp.cwd(ftp_path) + # Change directory to the specified path + ftp.cwd(ftp_path) - # Get list of file entries - file_listing = ftp.nlst() + # Get list of file entries + file_listing = ftp.nlst() - ftp.quit() + ftp.quit() - # Filter for files with the specified extension - gz_files = [entry for entry in file_listing if entry.endswith(extension)] - gz_files.sort(reverse=True) - print(f"Found {len(gz_files)} files on {ftp_address}/{ftp_path}") + # Filter for files with the specified extension + gz_files = [entry for entry in file_listing if entry.endswith(extension)] + gz_files.sort(reverse=True) + print(f"Found {len(gz_files)} files on {ftp_address}/{ftp_path}") - return gz_files + return gz_files + except Exception as e: + print("Error getting file list: ", e) + return [] def extractXMLFile(gz_filepath: str): """ @@ -138,13 +175,17 @@ def extractXMLFile(gz_filepath: str): Returns: xml_filepath: Path to the extracted XML file. 
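        (Returns None if extraction fails; the body below is wrapped in try/except.)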
""" - print("Downloaded .gz file path: ", gz_filepath) - xml_filepath = gz_filepath.replace(".gz", "") - with gzip.open(gz_filepath, 'rb') as f_in: - with open(xml_filepath, 'wb') as f_out: - shutil.copyfileobj(f_in, f_out) + try: + print("Downloaded .gz file path: ", gz_filepath) + xml_filepath = gz_filepath.replace(".gz", "") + with gzip.open(gz_filepath, 'rb') as f_in: + with open(xml_filepath, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) - return xml_filepath + return xml_filepath + except Exception as e: + print("Error extracting XML file: ", e) + return None def extractMetadataFromXML(xml_filepath: str): """ @@ -157,113 +198,38 @@ def extractMetadataFromXML(xml_filepath: str): metadata: List of dictionaries containing metadata for each article. """ print("inside extractMetadataFromXML()") + try: + # create a directory to store abstracts + os.makedirs("pubmed_abstracts", exist_ok=True) - # create a directory to store abstracts - os.makedirs("pubmed_abstracts", exist_ok=True) - - tree = ET.parse(xml_filepath) - root = tree.getroot() - metadata = [] - + tree = ET.parse(xml_filepath) + root = tree.getroot() + metadata = [] - with concurrent.futures.ProcessPoolExecutor() as executor: - futures = [] - article_items = list(item for item in root.iter('PubmedArticle')) # Convert generator to list - - for item in article_items: - future = executor.submit(processArticleItem, item) - article_data = future.result() - metadata.append(article_data) + with concurrent.futures.ProcessPoolExecutor() as executor: + futures = [] + article_items = list(item for item in root.iter('PubmedArticle')) # Convert generator to list + + for item in article_items: + future = executor.submit(processArticleItem, item) + article_data = future.result() - if len(metadata) == 100: - print("collected 100 articles") - yield metadata - metadata = [] # reset metadata for next batch + metadata.append(article_data) - if metadata: - yield metadata - - print("Metadata extraction complete.") + if len(metadata) == 100: + print("collected 100 articles") + yield metadata + metadata = [] # reset metadata for next batch - - # # PARALLELIZE THE BELOW FOR LOOP AND EXTRACT METADATA FOR ALL ARTICLES AT ONCE - IN 1000s - # # Extract metadata from the XML file - # for item in root.iter('PubmedArticle'): - # article_data = {} - - # medline_citation = item.find('MedlineCitation') - # article = medline_citation.find('Article') - # journal = article.find('Journal') - # issue = journal.find('JournalIssue') - - # if medline_citation.find('PMID') is not None: - # article_data['pmid'] = medline_citation.find('PMID').text - # article_data['pmcid'] = None - # article_data['doi'] = None - # else: - # continue - - # if journal.find('ISSN') is not None: - # article_data['issn'] = journal.find('ISSN').text - # else: - # article_data['issn'] = None - - # if journal.find('Title') is not None: - # article_data['journal_title'] = journal.find('Title').text - # else: - # article_data['journal_title'] = None + if metadata: + yield metadata - # # some articles don't have an article title - # article_title = article.find('ArticleTitle') - # if article_title is not None and article_title.text is not None: - # article_data['article_title'] = article_title.text.replace('[', '').replace(']', '') - # else: - # article_data['article_title'] = None - - # article_data['last_revised'] = f"{medline_citation.find('DateRevised/Year').text}-{medline_citation.find('DateRevised/Month').text}-{medline_citation.find('DateRevised/Day').text}" - - # # some articles don't have 
all fields present for publication date - # if issue.find('PubDate/Year') is not None and issue.find('PubDate/Month') is not None and issue.find('PubDate/Day') is not None: - # article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}-{issue.find('PubDate/Day').text}" - # elif issue.find('PubDate/Year') is not None and issue.find('PubDate/Month') is not None: - # article_data['published'] = f"{issue.find('PubDate/Year').text}-{issue.find('PubDate/Month').text}" - # elif issue.find('PubDate/Year') is not None: - # article_data['published'] = f"{issue.find('PubDate/Year').text}" - # else: - # article_data['published'] = None - - # # extract and store abstract in a text file - # abstract = article.find('Abstract') - # if abstract is not None: - # abstract_text = "" - # for abstract_text_element in abstract.iter('AbstractText'): - # # if labels (objective, methods, etc.) are present, add them to the text (e.g. "OBJECTIVE: ") - # if abstract_text_element.attrib.get('Label') is not None: - # abstract_text += abstract_text_element.attrib.get('Label') + ": " - # if abstract_text_element.text is not None: - # abstract_text += abstract_text_element.text + "\n" - - # # save abstract to a text file - # abstract_filename = f"pubmed_abstracts/{article_data['pmid']}.txt" - # with open(abstract_filename, 'w') as f: - # if article_data['article_title']: - # f.write("Article title: " + article_data['article_title'] + "\n") - # if article_data['journal_title']: - # f.write("Journal title: " + article_data['journal_title'] + "\n") - # f.write("Abstract: " + abstract_text) - - # # some articles are listed, but not released online yet. Adding fields for such articles to maintain uniformity. - # article_data['live'] = True - # article_data['release_date'] = None - # article_data['license'] = None - # article_data['pubmed_ftp_link'] = None - # article_data['filepath'] = abstract_filename - - # metadata.append(article_data) - # if len(metadata) == 20: - # return metadata - # return metadata + print("Metadata extraction complete.") + except Exception as e: + print("Error extracting metadata: ", e) + return [] + def processArticleItem(item: ET.Element): """ @@ -360,64 +326,46 @@ def getArticleIDs(metadata: list): metadata: Updated metadata with PMCID, DOI, release date, and live status information. 
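
    Example (an aside, not part of this commit; the two PMIDs are made up):
    this is the request the function issues, and the fields read from each
    record mirror the parsing code that follows.

        import requests

        url = ("https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
               "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json"
               "&ids=31452104,31452105")
        records = requests.get(url).json()["records"]
        for r in records:
            # records for released PMC articles carry 'pmcid' and 'doi';
            # unreleased ones carry 'live': "false" and a 'release-date';
            # unknown IDs carry an 'errmsg'
            print(r["pmid"], r.get("pmcid"), r.get("doi"),
                  r.get("live"), r.get("release-date"), r.get("errmsg"))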
""" print("In getArticleIDs()") - base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" - app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json" + try: + base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" + app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json" - batch_size = 200 # maximum number of articles API can process in one request - - # # Create a shared list using multiprocessing.Manager - # manager = Manager() - # shared_metadata = manager.list(metadata) # Copy initial metadata into the shared list - - # for i in range(0, len(metadata), batch_size): - # batch = metadata[i:i+batch_size] - # ids = ",".join([article['pmid'] for article in batch]) - # response = requests.get(base_url + app_details + "&ids=" + ids) - # data = response.json() - # records = data['records'] - - # with concurrent.futures.ProcessPoolExecutor() as executor: - # futures = [] - # for record in records: - # future = executor.submit(updateArticleMetadata, shared_metadata, record) - # futures.append(future) - - # # process results from parallel tasks - # for future in futures: - # try: - # future.result() - # except Exception as e: - # print(f"Error updating metadata for article: {e}") - - # print("Updated metadata in ID converter: ", len(shared_metadata)) - # return shared_metadata - - for i in range(0, len(metadata), batch_size): - batch = metadata[i:i+batch_size] - ids = ",".join([article['pmid'] for article in batch]) - response = requests.get(base_url + app_details + "&ids=" + ids) - data = response.json() - records = data['records'] - # PARALLELIZE THIS FOR LOOP - UPDATES ADDITIONAL FIELDS FOR ALL ARTICLES AT ONCE - for record in records: - if 'errmsg' in record: - print("Error: ", record['errmsg']) - for article in batch: - if article['pmid'] == record['pmid']: - article['live'] = False - break - continue - else: - # find article with matching pmid and update pmcid, doi, live, and release date fields - for article in batch: - if article['pmid'] == record['pmid']: - article['pmcid'] = record['pmcid'] - article['doi'] = record['doi'] - article['live'] = False if 'live' in record and record['live'] == "false" else True - article['release_date'] = record.get('release-date', article['release_date']) - print("Updated metadata in ID converter: ", article) - break - return metadata + batch_size = 200 # maximum number of articles API can process in one request + + for i in range(0, len(metadata), batch_size): + batch = metadata[i:i+batch_size] + ids = ",".join([article['pmid'] for article in batch]) + response = requests.get(base_url + app_details + "&ids=" + ids) + data = response.json() + records = data['records'] + # PARALLELIZE THIS FOR LOOP - UPDATES ADDITIONAL FIELDS FOR ALL ARTICLES AT ONCE + with concurrent.futures.ThreadPoolExecutor() as executor: + executor.map(updateArticleMetadata, batch, records) + + # for record in records: + # if 'errmsg' in record: + # print("Error: ", record['errmsg']) + # for article in batch: + # if article['pmid'] == record['pmid']: + # article['live'] = False + # break + # continue + # else: + # # find article with matching pmid and update pmcid, doi, live, and release date fields + # for article in batch: + # if article['pmid'] == record['pmid']: + # article['pmcid'] = record['pmcid'] + # article['doi'] = record['doi'] + # article['live'] = False if 'live' in record and record['live'] == "false" else True + # article['release_date'] = record.get('release-date', article['release_date']) + # print("Updated metadata in ID 
converter: ", article) + # break + + + return metadata + except Exception as e: + print("Error getting article IDs: ", e) + return metadata def updateArticleMetadata(shared_metadata: list, record: dict): """ @@ -450,63 +398,73 @@ def downloadArticles(metadata: list): Returns: metadata: Updated metadata with license, FTP link, and downloaded filepath information. """ + try: + base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?" + print("Downloading articles...") - base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?" - print("Downloading articles...") - - # connect to FTP server anonymously - ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov") - ftp.login() - - # PARALLELIZE THIS FOR LOOP - DOWNLOAD + METADATA UPDATE - for article in metadata: + # connect to FTP server anonymously + ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov") + ftp.login() - if article['live'] is False or article['pmcid'] is None: - continue - - # else proceed with download - if article['pmcid']: - # query URL for article download - final_url = base_url + "id=" + article['pmcid'] - print("Download URL: ", final_url) - - xml_response = requests.get(final_url) - # get license and FTP link - extracted_data = extractArticleData(xml_response.text) - - print("\nExtracted license and link data: ", extracted_data) + # PARALLELIZE THIS FOR LOOP - DOWNLOAD + METADATA UPDATE + for article in metadata: - # if no data extracted (reason: article not released/open-access), skip to next article - if not extracted_data: - article['live'] = False + if article['live'] is False or article['pmcid'] is None: continue - - # update metadata with license and ftp link information - article['license'] = extracted_data[0]['license'] - article['pubmed_ftp_link'] = extracted_data[0]['href'] if 'href' in extracted_data[0] else None - - # download the article - ftp_url = urlparse(extracted_data[0]['href']) - ftp_path = ftp_url.path[1:] - print("FTP path: ", ftp_path) - - filename = ftp_path.split("/")[-1] - local_file = os.path.join("pubmed_abstracts", filename) - with open(local_file, 'wb') as f: - ftp.retrbinary('RETR ' + ftp_path, f.write) - print("Downloaded PDF file: ", local_file) - article['filepath'] = local_file - - # if file is .tar.gz, extract the PDF and delete the tar.gz file - if filename.endswith(".tar.gz"): - extracted_pdf_paths = extractPDF(local_file) - print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) - article['filepath'] = ",".join(extracted_pdf_paths) - os.remove(local_file) - print("\nUpdated metadata after download: ", article) - ftp.quit() - return metadata + # else proceed with download + if article['pmcid']: + # query URL for article download + final_url = base_url + "id=" + article['pmcid'] + print("Download URL: ", final_url) + + xml_response = requests.get(final_url) + # get license and FTP link + extracted_data = extractArticleData(xml_response.text) + + print("\nExtracted license and link data: ", extracted_data) + + # if no data extracted (reason: article not released/open-access), skip to next article + if not extracted_data: + article['live'] = False + continue + + # update metadata with license and ftp link information + article['license'] = extracted_data[0]['license'] + article['pubmed_ftp_link'] = extracted_data[0]['href'] if 'href' in extracted_data[0] else None + + # download the article + ftp_url = urlparse(extracted_data[0]['href']) + ftp_path = ftp_url.path[1:] + print("FTP path: ", ftp_path) + + # Set a timeout of 15 minutes - some files take > 1 hour to download and everything hangs + 
timeout = threading.Timer(15 * 60, lambda: print("Download timed out!")) + timeout.start() + + filename = ftp_path.split("/")[-1] + local_file = os.path.join("pubmed_abstracts", filename) + try: + with open(local_file, 'wb') as f: + ftp.retrbinary('RETR ' + ftp_path, f.write) + print("Downloaded PDF file: ", local_file) + article['filepath'] = local_file + + # if file is .tar.gz, extract the PDF and delete the tar.gz file + if filename.endswith(".tar.gz"): + extracted_pdf_paths = extractPDF(local_file) + print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) + article['filepath'] = ",".join(extracted_pdf_paths) + os.remove(local_file) + finally: + timeout.cancel() # cancel the timer if download finishes before timeout + + print("\nUpdated metadata after download: ", article) + ftp.quit() + return metadata + except Exception as e: + print("Error downloading articles: ", e) + return metadata def extractPDF(tar_gz_filepath: str): """ @@ -517,17 +475,21 @@ def extractPDF(tar_gz_filepath: str): Returns: extracted_paths: List of paths to the extracted PDF files. """ - print("Extracting PDF from: ", tar_gz_filepath) - extracted_paths = [] - with tarfile.open(tar_gz_filepath, "r:gz") as tar: - for member in tar: - if member.isreg() and member.name.endswith(".pdf"): - tar.extract(member, path="pubmed_abstracts") - print("Extracted: ", member.name) - extracted_paths.append(os.path.join("pubmed_abstracts", member.name)) - - return extracted_paths - + try: + print("Extracting PDF from: ", tar_gz_filepath) + extracted_paths = [] + with tarfile.open(tar_gz_filepath, "r:gz") as tar: + for member in tar: + if member.isreg() and member.name.endswith(".pdf"): + tar.extract(member, path="pubmed_abstracts") + print("Extracted: ", member.name) + extracted_paths.append(os.path.join("pubmed_abstracts", member.name)) + + return extracted_paths + except Exception as e: + print("Error extracting PDF: ", e) + return [] + def extractArticleData(xml_string: str): """ Extracts license information and article download link from the XML response. @@ -538,60 +500,69 @@ def extractArticleData(xml_string: str): extracted_data: List of dictionaries containing license and download link for the article. 
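
    Example (an illustrative response, an aside reconstructed from the parsing
    logic below; identifiers, license, and paths are made up):

        <OA>
          <records>
            <record id="PMC1234567" license="CC BY">
              <link format="tgz" href="ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/ab/cd/PMC1234567.tar.gz"/>
              <link format="pdf" href="ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ab/cd/example.pdf"/>
            </record>
          </records>
        </OA>

    A request for a non-open-access ID yields an <error> element instead,
    which the code below treats as "skip this article".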
""" print("In extractArticleData") - - root = ET.fromstring(xml_string) - # if there is an errors (article not open-access), return empty list (skip article) - if root.find(".//error") is not None: - return [] - - records = root.findall(".//record") - extracted_data = [] - href = None - - for record in records: - record_id = record.get("id") # pmcid - license = record.get("license") - links = record.findall(".//link") - - for link in links: - if link.get("format") == "pdf": - href = link.get("href") - break - # if PDF link not found, use the available tgz link - if not href: - href = links[0].get("href") + try: + root = ET.fromstring(xml_string) + # if there is an errors (article not open-access), return empty list (skip article) + if root.find(".//error") is not None: + return [] + + records = root.findall(".//record") + extracted_data = [] + href = None + + for record in records: + record_id = record.get("id") # pmcid + license = record.get("license") + links = record.findall(".//link") + + for link in links: + if link.get("format") == "pdf": + href = link.get("href") + break + # if PDF link not found, use the available tgz link + if not href: + href = links[0].get("href") + + extracted_data.append({ + "record_id": record_id, + "license": license, + "href": href + }) - extracted_data.append({ - "record_id": record_id, - "license": license, - "href": href - }) + return extracted_data + except Exception as e: + print("Error extracting article data: ", e) + return [] - return extracted_data - def uploadToStorage(filepath: str): """ Uploads all files present under given filepath to Minio bucket. """ print("in uploadToStorage()") - - bucket_name = "pubmed" - found = MINIO_CLIENT.bucket_exists(bucket_name) - if not found: - MINIO_CLIENT.make_bucket(bucket_name) - print("Created bucket", bucket_name) - else: - print("Bucket", bucket_name, "already exists") - - for root, dirs, files in os.walk(filepath): - # can parallelize this upload - for file in files: - file_path = os.path.join(root, file) - object_name = file_path.split("/")[-1] - # insert local file into remote bucket - MINIO_CLIENT.fput_object(bucket_name, object_name, file_path) - print("Uploaded: ", object_name) - return "success" + try: + bucket_name = "pubmed" + print(os.environ['MINIO_URL']) + print(os.environ['MINIO_SECRET_KEY']) + print(os.environ['MINIO_ACCESS_KEY']) + found = MINIO_CLIENT.bucket_exists(bucket_name) + if not found: + MINIO_CLIENT.make_bucket(bucket_name) + print("Created bucket", bucket_name) + else: + print("Bucket", bucket_name, "already exists") + + for root, dirs, files in os.walk(filepath): + # can parallelize this upload + for file in files: + file_path = os.path.join(root, file) + object_name = file_path.split("/")[-1] + # insert local file into remote bucket + MINIO_CLIENT.fput_object(bucket_name, object_name, file_path) + print("Uploaded: ", object_name) + return "success" + except Exception as e: + print("Error uploading to storage: ", e) + return "failure" From 24425d455c6bd9ec32463171adf94be4b44310fd Mon Sep 17 00:00:00 2001 From: star-nox Date: Fri, 12 Apr 2024 09:37:45 -0500 Subject: [PATCH 09/28] parallelized metadata update --- ai_ta_backend/utils/pubmed_extraction.py | 286 ++++++++++++++--------- 1 file changed, 182 insertions(+), 104 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index a20057f0..16730f9b 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -40,18 +40,17 @@ def 
extractPubmedData(): gz_filepath = downloadXML(ftp_address, ftp_path, file_list[1], "pubmed") print("GZ Downloaded: ", gz_filepath) - gz_file_download_time = round(time.time() - start_time, 2) - print("Time taken to download .gz file: ", gz_file_download_time, "seconds") + print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") + gz_file_download_time = time.time() # extract the XML file if not gz_filepath: return "failure" xml_filepath = extractXMLFile(gz_filepath) print("XML Extracted: ", xml_filepath) - xml_extract_time = round(time.time() - gz_file_download_time, 2) - print("Time taken to extract XML file: ", xml_extract_time, "seconds") + print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") + - #xml_filepath = "pubmed/pubmed24n1219.xml" for metadata in extractMetadataFromXML(xml_filepath): metadata_extract_start_time = time.time() @@ -59,10 +58,12 @@ def extractPubmedData(): # find PMC ID and DOI for all articles metadata_with_ids = getArticleIDs(metadata) print("Time taken to get PMC ID and DOI for 100 articles: ", round(time.time() - start_time, 2), "seconds") - + #print("Metadata with IDs: ", metadata_with_ids) + # download the articles complete_metadata = downloadArticles(metadata_with_ids) - + print(complete_metadata) + # store metadata in csv file print("\n") print("Total articles retrieved: ", len(complete_metadata)) @@ -315,79 +316,153 @@ def processArticleItem(item: ET.Element): except Exception as e: return {'error': str(e)} -def getArticleIDs(metadata: list): - """ - Uses the PubMed ID converter API to get PMCID and DOI for each article. - Queries the API in batches of 200 articles at a time. - Also updates the metadata with the release date and live status - some articles are yet to be released. - Args: - metadata: List of dictionaries containing metadata for each article. - Returns: - metadata: Updated metadata with PMCID, DOI, release date, and live status information. - """ - print("In getArticleIDs()") - try: - base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" - app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json" +# def getArticleIDs(metadata: list): +# """ +# Uses the PubMed ID converter API to get PMCID and DOI for each article. +# Queries the API in batches of 200 articles at a time. +# Also updates the metadata with the release date and live status - some articles are yet to be released. +# Args: +# metadata: List of dictionaries containing metadata for each article. +# Returns: +# metadata: Updated metadata with PMCID, DOI, release date, and live status information. 
+# """ +# print("In getArticleIDs()") + +# base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" +# app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json" - batch_size = 200 # maximum number of articles API can process in one request +# batch_size = 200 # maximum number of articles API can process in one request - for i in range(0, len(metadata), batch_size): - batch = metadata[i:i+batch_size] - ids = ",".join([article['pmid'] for article in batch]) - response = requests.get(base_url + app_details + "&ids=" + ids) - data = response.json() - records = data['records'] - # PARALLELIZE THIS FOR LOOP - UPDATES ADDITIONAL FIELDS FOR ALL ARTICLES AT ONCE - with concurrent.futures.ThreadPoolExecutor() as executor: - executor.map(updateArticleMetadata, batch, records) - - # for record in records: - # if 'errmsg' in record: - # print("Error: ", record['errmsg']) - # for article in batch: - # if article['pmid'] == record['pmid']: - # article['live'] = False - # break - # continue - # else: - # # find article with matching pmid and update pmcid, doi, live, and release date fields - # for article in batch: - # if article['pmid'] == record['pmid']: - # article['pmcid'] = record['pmcid'] - # article['doi'] = record['doi'] - # article['live'] = False if 'live' in record and record['live'] == "false" else True - # article['release_date'] = record.get('release-date', article['release_date']) - # print("Updated metadata in ID converter: ", article) - # break - +# for i in range(0, len(metadata), batch_size): +# batch = metadata[i:i+batch_size] +# ids = ",".join([article['pmid'] for article in batch]) +# response = requests.get(base_url + app_details + "&ids=" + ids) +# data = response.json() +# records = data['records'] +# # PARALLELIZE THIS FOR LOOP - UPDATES ADDITIONAL FIELDS FOR ALL ARTICLES AT ONCE +# with Manager() as manager: +# shared_metadata = manager.list(batch) +# with concurrent.futures.ProcessPoolExecutor() as executor: +# futures = {executor.submit(updateArticleMetadata, shared_metadata, record): record for record in records} +# concurrent.futures.wait(futures) +# for future in concurrent.futures.as_completed(futures): +# record = futures[future] +# try: +# future.result() +# except Exception as exc: +# print('%r generated an exception: %s' % (record, exc)) - return metadata - except Exception as e: - print("Error getting article IDs: ", e) - return metadata +# print("Updated metadata: ", list(shared_metadata)) + +# print("Length of metadata after ID conversion: ", len(metadata)) + +# return metadata + + +# def updateArticleMetadata(shared_metadata, record): +# """ +# Updates metadata with PMCID, DOI, release date, and live status information for given article. +# Used withing getArticleIDs() function. 
+# """ + +# if 'errmsg' in record: +# print("Error: ", record['errmsg']) +# for article in shared_metadata: +# if article['pmid'] == record['pmid']: +# article['live'] = False +# break +# else: +# # find article with matching pmid and update pmcid, doi, live, and release date fields +# print("record: ", record) +# for article in shared_metadata: +# if article['pmid'] == record['pmid']: +# article['pmcid'] = record['pmcid'] +# article['doi'] = record['doi'] +# article['live'] = False if 'live' in record and record['live'] == "false" else True +# article['release_date'] = record.get('release-date', article['release_date']) +# print("Updated metadata in ID converter: ", article) +# break + +def getArticleIDs(metadata: list): + """ + Uses the PubMed ID converter API to get PMCID and DOI for each article. + Queries the API in batches of 200 articles at a time. + Also updates the metadata with the release date and live status - some articles are yet to be released. + Args: + metadata: List of dictionaries containing metadata for each article. + Returns: + metadata: Updated metadata with PMCID, DOI, release date, and live status information. + """ + print("In getArticleIDs()") + + base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" + app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json" + + batch_size = 200 # maximum number of articles API can process in one request + + for i in range(0, len(metadata), batch_size): + batch = metadata[i:i + batch_size] + ids = ",".join([article['pmid'] for article in batch]) + response = requests.get(base_url + app_details + "&ids=" + ids) + data = response.json() + records = data['records'] + + # PARALLELIZE THIS FOR LOOP - UPDATES ADDITIONAL FIELDS FOR ALL ARTICLES AT ONCE + with Manager() as manager: + shared_metadata = manager.dict() # Use a shared dictionary + with concurrent.futures.ProcessPoolExecutor() as executor: + futures = { + executor.submit(updateArticleMetadata, shared_metadata, record): record + for record in records + } + concurrent.futures.wait(futures) + for future in concurrent.futures.as_completed(futures): + record = futures[future] + try: + future.result() + except Exception as exc: + print('%r generated an exception: %s' % (record, exc)) + + # Update original metadata after loop + for article in metadata: + if article['pmid'] in shared_metadata: + # print("Shared metadata: ", shared_metadata[article['pmid']]) + if 'errmsg' in shared_metadata[article['pmid']]: + article['live'] = False + else: + article['pmcid'] = shared_metadata[article['pmid']]['pmcid'] + article['doi'] = shared_metadata[article['pmid']]['doi'] + article['live'] = shared_metadata[article['pmid']]['live'] + article['release_date'] = shared_metadata[article['pmid']]['release_date'] + #print("Updated metadata: ", article) + + #print("Length of metadata after ID conversion: ", len(metadata)) + return metadata + + +def updateArticleMetadata(shared_metadata, record): + """ + Updates metadata with PMCID, DOI, release date, and live status information for given article. + Used within getArticleIDs() function. 
+ """ + + if 'errmsg' in record: + print("Error: ", record['errmsg']) + shared_metadata[record['pmid']] = { + **record, # Create a copy with record data + 'live': False + } + else: + # Update shared dictionary with pmid as key and updated article data as value + shared_metadata[record['pmid']] = { + **record, # Create a copy with record data + 'pmcid': record['pmcid'], + 'doi': record['doi'], + 'live': False if 'live' in record and record['live'] == "false" else True, + 'release_date': record['release-date'] if 'release-date' in record else None, + } + #print("Updated metadata in ID converter: ", shared_metadata[record['pmid']]) -def updateArticleMetadata(shared_metadata: list, record: dict): - """ - Updates metadata with PMCID, DOI, release date, and live status information for given article. - """ - if 'errmsg' in record: - print("Error: ", record['errmsg']) - for article in shared_metadata: - if article['pmid'] == record['pmid']: - article['live'] = False - break - - else: - # find article with matching pmid and update pmcid, doi, live, and release date fields - for article in shared_metadata: - if article['pmid'] == record['pmid']: - article['pmcid'] = record['pmcid'] - article['doi'] = record['doi'] - article['live'] = False if 'live' in record and record['live'] == "false" else True - article['release_date'] = record.get('release-date', article['release_date']) - print("Updated metadata in ID converter: ", article) - break def downloadArticles(metadata: list): @@ -398,6 +473,7 @@ def downloadArticles(metadata: list): Returns: metadata: Updated metadata with license, FTP link, and downloaded filepath information. """ + print("In downloadArticles()") try: base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?" print("Downloading articles...") @@ -405,13 +481,11 @@ def downloadArticles(metadata: list): # connect to FTP server anonymously ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov") ftp.login() - - # PARALLELIZE THIS FOR LOOP - DOWNLOAD + METADATA UPDATE + for article in metadata: - if article['live'] is False or article['pmcid'] is None: continue - + # else proceed with download if article['pmcid']: # query URL for article download @@ -420,51 +494,55 @@ def downloadArticles(metadata: list): xml_response = requests.get(final_url) # get license and FTP link - extracted_data = extractArticleData(xml_response.text) - + extracted_data = extractArticleData(xml_response.text) print("\nExtracted license and link data: ", extracted_data) # if no data extracted (reason: article not released/open-access), skip to next article if not extracted_data: article['live'] = False continue - + # update metadata with license and ftp link information article['license'] = extracted_data[0]['license'] article['pubmed_ftp_link'] = extracted_data[0]['href'] if 'href' in extracted_data[0] else None - + # download the article ftp_url = urlparse(extracted_data[0]['href']) ftp_path = ftp_url.path[1:] print("FTP path: ", ftp_path) - + # Set a timeout of 15 minutes - some files take > 1 hour to download and everything hangs - timeout = threading.Timer(15 * 60, lambda: print("Download timed out!")) - timeout.start() - + # timeout = threading.Timer(15 * 60, lambda: print("Download timeout reached.")) + # timeout.start() + filename = ftp_path.split("/")[-1] local_file = os.path.join("pubmed_abstracts", filename) + try: - with open(local_file, 'wb') as f: - ftp.retrbinary('RETR ' + ftp_path, f.write) - print("Downloaded PDF file: ", local_file) - article['filepath'] = local_file - - # if file is .tar.gz, extract the 
PDF and delete the tar.gz file - if filename.endswith(".tar.gz"): - extracted_pdf_paths = extractPDF(local_file) - print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) - article['filepath'] = ",".join(extracted_pdf_paths) - os.remove(local_file) - finally: - timeout.cancel() # cancel the timer if download finishes before timeout - - print("\nUpdated metadata after download: ", article) + with concurrent.futures.ThreadPoolExecutor() as executor: + future = executor.submit(ftp.retrbinary, 'RETR ' + ftp_path, open(local_file, 'wb').write) + future.result(timeout=15*60) # Set a timeout of 15 minutes + print("Downloaded PDF file: ", local_file) + article['filepath'] = local_file + + # if file is .tar.gz, extract the PDF and delete the tar.gz file + if filename.endswith(".tar.gz"): + extracted_pdf_paths = extractPDF(local_file) + print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) + article['filepath'] = ",".join(extracted_pdf_paths) + os.remove(local_file) + except concurrent.futures.TimeoutError: + print("Download timeout reached.") + continue # Skip the download and continue with the rest of the code + + print("\nUpdated metadata after download: ", article) + ftp.quit() return metadata except Exception as e: print("Error downloading articles: ", e) - return metadata + return metadata + def extractPDF(tar_gz_filepath: str): """ From 3ddec3bd0a1cfd1297d392bd35ad9980a526d2de Mon Sep 17 00:00:00 2001 From: star-nox Date: Fri, 12 Apr 2024 09:54:11 -0500 Subject: [PATCH 10/28] added minio to requirements.txt --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 848c10d0..9855a94f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -39,6 +39,8 @@ supabase==2.0.2 posthog==3.1.0 sentry-sdk==1.39.1 +minio + # Not currently supporting coursera ingest # cs-dlp @ git+https://github.com/raffaem/cs-dlp.git@0.12.0b0 # previously called coursera-dl From e03fbf15e5daf71c7147768521b4c1044bb6ee68 Mon Sep 17 00:00:00 2001 From: star-nox Date: Fri, 12 Apr 2024 16:41:53 -0500 Subject: [PATCH 11/28] parallelized download --- ai_ta_backend/utils/pubmed_extraction.py | 245 ++++++++++++++++------- 1 file changed, 175 insertions(+), 70 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 16730f9b..16312b4a 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -16,6 +16,9 @@ import pandas as pd import threading import json +from functools import partial + + SUPBASE_CLIENT = supabase.create_client( # type: ignore supabase_url=os.getenv('SUPABASE_URL'), # type: ignore @@ -38,7 +41,7 @@ def extractPubmedData(): ftp_path = "pubmed/baseline" file_list = getFileList(ftp_address, ftp_path, ".gz") - gz_filepath = downloadXML(ftp_address, ftp_path, file_list[1], "pubmed") + gz_filepath = downloadXML(ftp_address, ftp_path, file_list[2], "pubmed") print("GZ Downloaded: ", gz_filepath) print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") gz_file_download_time = time.time() @@ -50,19 +53,20 @@ def extractPubmedData(): print("XML Extracted: ", xml_filepath) print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") - - for metadata in extractMetadataFromXML(xml_filepath): metadata_extract_start_time = time.time() # find PMC ID and DOI for all articles metadata_with_ids = getArticleIDs(metadata) - print("Time taken to get PMC ID and DOI for 100 articles: ", 
round(time.time() - start_time, 2), "seconds") + metadata_update_time = time.time() + print("Time taken to get PMC ID and DOI for 100 articles: ", round(metadata_update_time - metadata_extract_start_time, 2), "seconds") #print("Metadata with IDs: ", metadata_with_ids) # download the articles complete_metadata = downloadArticles(metadata_with_ids) print(complete_metadata) + print("Time taken to download articles for 100 articles: ", round(time.time() - metadata_update_time, 2), "seconds") + # store metadata in csv file print("\n") @@ -83,7 +87,7 @@ def extractPubmedData(): # upload articles to bucket print("Uploading articles to storage...") - article_upload = uploadToStorage("pubmed_abstracts") + article_upload = uploadToStorage("pubmed_abstracts") # need to parallelize upload print("Uploaded articles: ", article_upload) # upload metadata to SQL DB @@ -100,7 +104,6 @@ def extractPubmedData(): response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore print("Supabase response: ", response) - return "success" def downloadXML(ftp_address: str, ftp_path: str, file: str, local_dir: str): @@ -465,84 +468,188 @@ def updateArticleMetadata(shared_metadata, record): +# def downloadArticles(metadata: list): +# """ +# Downloads articles from PMC and stores them in local directory. +# Args: +# metadata: List of dictionaries containing metadata for each article. +# Returns: +# metadata: Updated metadata with license, FTP link, and downloaded filepath information. +# """ +# print("In downloadArticles()") +# try: +# base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?" +# print("Downloading articles...") + +# # connect to FTP server anonymously +# ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov") +# ftp.login() + +# for article in metadata: +# if article['live'] is False or article['pmcid'] is None: +# continue + +# # else proceed with download +# if article['pmcid']: +# # query URL for article download +# final_url = base_url + "id=" + article['pmcid'] +# print("Download URL: ", final_url) + +# xml_response = requests.get(final_url) +# # get license and FTP link +# extracted_data = extractArticleData(xml_response.text) +# print("\nExtracted license and link data: ", extracted_data) + +# # if no data extracted (reason: article not released/open-access), skip to next article +# if not extracted_data: +# article['live'] = False +# continue + +# # update metadata with license and ftp link information +# article['license'] = extracted_data[0]['license'] +# article['pubmed_ftp_link'] = extracted_data[0]['href'] if 'href' in extracted_data[0] else None + +# # download the article +# ftp_url = urlparse(extracted_data[0]['href']) +# ftp_path = ftp_url.path[1:] +# print("FTP path: ", ftp_path) + +# # Set a timeout of 15 minutes - some files take > 1 hour to download and everything hangs +# # timeout = threading.Timer(15 * 60, lambda: print("Download timeout reached.")) +# # timeout.start() + +# filename = ftp_path.split("/")[-1] +# local_file = os.path.join("pubmed_abstracts", filename) + +# try: +# with concurrent.futures.ThreadPoolExecutor() as executor: +# future = executor.submit(ftp.retrbinary, 'RETR ' + ftp_path, open(local_file, 'wb').write) +# future.result(timeout=15*60) # Set a timeout of 15 minutes +# print("Downloaded PDF file: ", local_file) +# article['filepath'] = local_file + +# # if file is .tar.gz, extract the PDF and delete the tar.gz file +# if filename.endswith(".tar.gz"): +# extracted_pdf_paths = extractPDF(local_file) +# print("Extracted PDFs from 
.tar.gz file: ", extracted_pdf_paths) +# article['filepath'] = ",".join(extracted_pdf_paths) +# os.remove(local_file) +# except concurrent.futures.TimeoutError: +# print("Download timeout reached.") +# continue # Skip the download and continue with the rest of the code + +# print("\nUpdated metadata after download: ", article) + +# ftp.quit() +# return metadata +# except Exception as e: +# print("Error downloading articles: ", e) +# return metadata + def downloadArticles(metadata: list): """ Downloads articles from PMC and stores them in local directory. Args: metadata: List of dictionaries containing metadata for each article. Returns: - metadata: Updated metadata with license, FTP link, and downloaded filepath information. + metadata: Updated metadata with license, FTP link, and downloaded filepath information. """ print("In downloadArticles()") try: base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?" - print("Downloading articles...") - # connect to FTP server anonymously - ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov") - ftp.login() + updated_articles = {} + # Use ThreadPoolExecutor to run download_article for each article in parallel + download_article_partial = partial(download_article, api_url=base_url) + with concurrent.futures.ProcessPoolExecutor() as executor: + futures = [executor.submit(download_article_partial, article) for article in metadata] + for future in concurrent.futures.as_completed(futures): + try: + updated_article = future.result(timeout=15*60) # Check result without blocking + if updated_article: + updated_articles[updated_article['pmid']] = updated_article + print("Updated article: ", updated_article) + except Exception as e: + print("Error downloading article:", e) + + # Update original metadata with updated articles for article in metadata: - if article['live'] is False or article['pmcid'] is None: - continue - - # else proceed with download - if article['pmcid']: - # query URL for article download - final_url = base_url + "id=" + article['pmcid'] - print("Download URL: ", final_url) - - xml_response = requests.get(final_url) - # get license and FTP link - extracted_data = extractArticleData(xml_response.text) - print("\nExtracted license and link data: ", extracted_data) - - # if no data extracted (reason: article not released/open-access), skip to next article - if not extracted_data: - article['live'] = False - continue + if article['pmid'] in updated_articles: + article.update(updated_articles[article['pmid']]) - # update metadata with license and ftp link information - article['license'] = extracted_data[0]['license'] - article['pubmed_ftp_link'] = extracted_data[0]['href'] if 'href' in extracted_data[0] else None - - # download the article - ftp_url = urlparse(extracted_data[0]['href']) - ftp_path = ftp_url.path[1:] - print("FTP path: ", ftp_path) - - # Set a timeout of 15 minutes - some files take > 1 hour to download and everything hangs - # timeout = threading.Timer(15 * 60, lambda: print("Download timeout reached.")) - # timeout.start() - - filename = ftp_path.split("/")[-1] - local_file = os.path.join("pubmed_abstracts", filename) - - try: - with concurrent.futures.ThreadPoolExecutor() as executor: - future = executor.submit(ftp.retrbinary, 'RETR ' + ftp_path, open(local_file, 'wb').write) - future.result(timeout=15*60) # Set a timeout of 15 minutes - print("Downloaded PDF file: ", local_file) - article['filepath'] = local_file - - # if file is .tar.gz, extract the PDF and delete the tar.gz file - if filename.endswith(".tar.gz"): - 
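      # Aside (annotation, not part of this commit): OA packages are .tar.gz
      # archives bundling the article PDF with images and source XML;
      # extractPDF() keeps only the .pdf members, then the archive is removed.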
extracted_pdf_paths = extractPDF(local_file) - print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) - article['filepath'] = ",".join(extracted_pdf_paths) - os.remove(local_file) - except concurrent.futures.TimeoutError: - print("Download timeout reached.") - continue # Skip the download and continue with the rest of the code - - print("\nUpdated metadata after download: ", article) + print("Updated metadata after download: ", metadata) - ftp.quit() - return metadata + return metadata + except Exception as e: print("Error downloading articles: ", e) - return metadata - + return metadata + +def download_article(article, api_url): + """ + Downloads the article from given FTP link and updates metadata with license, FTP link, and downloaded filepath information. + This function is used within downloadArticles() function. + Args: + article: Dictionary containing metadata for the article. + api_url: URL for the article download API. + ftp: FTP connection object. + Returns: + article: Updated metadata for the article. + """ + + print("Downloading articles...") + if not article['live'] or article['pmcid'] is None: + return + + # Proceed with download + # Connect to FTP server anonymously + ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov") + ftp.login() + + if article['pmcid']: + final_url = api_url + "id=" + article['pmcid'] + print("\nDownload URL: ", final_url) + + xml_response = requests.get(final_url) + extracted_data = extractArticleData(xml_response.text) + print("Extracted license and link data: ", extracted_data) + + if not extracted_data: + article['live'] = False + return + + article['license'] = extracted_data[0]['license'] + article['pubmed_ftp_link'] = extracted_data[0]['href'] if 'href' in extracted_data[0] else None + + ftp_url = urlparse(extracted_data[0]['href']) + ftp_path = ftp_url.path[1:] + print("FTP path: ", ftp_path) + + filename = ftp_path.split("/")[-1] + local_file = os.path.join("pubmed_abstracts", filename) + + try: + with open(local_file, 'wb') as f: + ftp.retrbinary('RETR ' + ftp_path, f.write) # Download directly to file + + print("Downloaded FTP file: ", local_file) + article['filepath'] = local_file + + if filename.endswith(".tar.gz"): + extracted_pdf_paths = extractPDF(local_file) + print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) + article['filepath'] = ",".join(extracted_pdf_paths) + os.remove(local_file) + + except concurrent.futures.TimeoutError: + print("Download timeout reached.") + + ftp.quit() + + print("\nUpdated metadata after download: ", article) + return article + def extractPDF(tar_gz_filepath: str): """ @@ -619,9 +726,7 @@ def uploadToStorage(filepath: str): print("in uploadToStorage()") try: bucket_name = "pubmed" - print(os.environ['MINIO_URL']) - print(os.environ['MINIO_SECRET_KEY']) - print(os.environ['MINIO_ACCESS_KEY']) + found = MINIO_CLIENT.bucket_exists(bucket_name) if not found: MINIO_CLIENT.make_bucket(bucket_name) From 8e5a1a0d84ea8430c3f83ad06b6b2d7504d58801 Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 15 Apr 2024 16:58:20 -0500 Subject: [PATCH 12/28] parallelized upload --- ai_ta_backend/utils/pubmed_extraction.py | 73 +++++++++++++++++++----- 1 file changed, 60 insertions(+), 13 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 16312b4a..9919ff5b 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -19,7 +19,6 @@ from functools import partial - SUPBASE_CLIENT = supabase.create_client( # type: 
ignore supabase_url=os.getenv('SUPABASE_URL'), # type: ignore supabase_key=os.getenv('SUPABASE_API_KEY') # type: ignore @@ -41,7 +40,7 @@ def extractPubmedData(): ftp_path = "pubmed/baseline" file_list = getFileList(ftp_address, ftp_path, ".gz") - gz_filepath = downloadXML(ftp_address, ftp_path, file_list[2], "pubmed") + gz_filepath = downloadXML(ftp_address, ftp_path, file_list[3], "pubmed") print("GZ Downloaded: ", gz_filepath) print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") gz_file_download_time = time.time() @@ -53,6 +52,7 @@ def extractPubmedData(): print("XML Extracted: ", xml_filepath) print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") + #xml_filepath = "pubmed/pubmed24n1217.xml" for metadata in extractMetadataFromXML(xml_filepath): metadata_extract_start_time = time.time() @@ -91,6 +91,7 @@ def extractPubmedData(): print("Uploaded articles: ", article_upload) # upload metadata to SQL DB + csv_filepath = "metadata.csv" df = pd.read_csv(csv_filepath) complete_metadata = df.to_dict('records') @@ -604,7 +605,7 @@ def download_article(article, api_url): # Proceed with download # Connect to FTP server anonymously - ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov") + ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov", timeout=15*60) ftp.login() if article['pmcid']: @@ -628,7 +629,7 @@ def download_article(article, api_url): filename = ftp_path.split("/")[-1] local_file = os.path.join("pubmed_abstracts", filename) - + try: with open(local_file, 'wb') as f: ftp.retrbinary('RETR ' + ftp_path, f.write) # Download directly to file @@ -718,15 +719,25 @@ def extractArticleData(xml_string: str): except Exception as e: print("Error extracting article data: ", e) return [] - + +def upload_file(client, bucket_name, file_path, object_name): + """ + Uploads a single file to the Minio bucket. + """ + try: + client.fput_object(bucket_name, object_name, file_path) + print(f"Uploaded: {object_name}") + except Exception as e: + print(f"Error uploading {object_name}: {e}") + def uploadToStorage(filepath: str): """ - Uploads all files present under given filepath to Minio bucket. + Uploads all files present under given filepath to Minio bucket in parallel. 
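
    Example (an aside, not part of this commit; bucket and object names are
    the ones used below, the local path is made up): the three Minio calls
    this function relies on are:

        import os
        from minio import Minio

        client = Minio(os.environ["MINIO_URL"],
                       access_key=os.environ["MINIO_ACCESS_KEY"],
                       secret_key=os.environ["MINIO_SECRET_KEY"],
                       secure=True)
        if not client.bucket_exists("pubmed"):
            client.make_bucket("pubmed")
        client.fput_object("pubmed", "12345.txt", "pubmed_abstracts/12345.txt")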
""" print("in uploadToStorage()") try: bucket_name = "pubmed" - + found = MINIO_CLIENT.bucket_exists(bucket_name) if not found: MINIO_CLIENT.make_bucket(bucket_name) @@ -734,18 +745,54 @@ def uploadToStorage(filepath: str): else: print("Bucket", bucket_name, "already exists") - for root, dirs, files in os.walk(filepath): - # can parallelize this upload - for file in files: + # Get all files to upload + files = [] + for root, _, files_ in os.walk(filepath): + for file in files_: file_path = os.path.join(root, file) object_name = file_path.split("/")[-1] - # insert local file into remote bucket - MINIO_CLIENT.fput_object(bucket_name, object_name, file_path) - print("Uploaded: ", object_name) + files.append((MINIO_CLIENT, bucket_name, file_path, object_name)) + + # Use concurrent.futures ThreadPoolExecutor for parallel upload + with concurrent.futures.ThreadPoolExecutor() as executor: + # Submit all upload tasks to the executor + futures = [executor.submit(upload_file, *args) for args in files] + # Wait for all tasks to complete + for future in futures: + future.result() # This will raise any exceptions from upload_file + return "success" except Exception as e: print("Error uploading to storage: ", e) return "failure" + +# def uploadToStorage(filepath: str): +# """ +# Uploads all files present under given filepath to Minio bucket. +# """ +# print("in uploadToStorage()") +# try: +# bucket_name = "pubmed" + +# found = MINIO_CLIENT.bucket_exists(bucket_name) +# if not found: +# MINIO_CLIENT.make_bucket(bucket_name) +# print("Created bucket", bucket_name) +# else: +# print("Bucket", bucket_name, "already exists") + +# for root, dirs, files in os.walk(filepath): +# # can parallelize this upload +# for file in files: +# file_path = os.path.join(root, file) +# object_name = file_path.split("/")[-1] +# # insert local file into remote bucket +# MINIO_CLIENT.fput_object(bucket_name, object_name, file_path) +# print("Uploaded: ", object_name) +# return "success" +# except Exception as e: +# print("Error uploading to storage: ", e) +# return "failure" From caac0bfb38f44cb50b40ae3364298464bd1bfe3b Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 17 Apr 2024 10:00:26 -0500 Subject: [PATCH 13/28] minor changes --- ai_ta_backend/utils/pubmed_extraction.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 9919ff5b..a025e025 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -40,7 +40,7 @@ def extractPubmedData(): ftp_path = "pubmed/baseline" file_list = getFileList(ftp_address, ftp_path, ".gz") - gz_filepath = downloadXML(ftp_address, ftp_path, file_list[3], "pubmed") + gz_filepath = downloadXML(ftp_address, ftp_path, file_list[4], "pubmed") print("GZ Downloaded: ", gz_filepath) print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") gz_file_download_time = time.time() @@ -567,6 +567,7 @@ def downloadArticles(metadata: list): futures = [executor.submit(download_article_partial, article) for article in metadata] for future in concurrent.futures.as_completed(futures): try: + print("Starting new download...") updated_article = future.result(timeout=15*60) # Check result without blocking if updated_article: updated_articles[updated_article['pmid']] = updated_article From fa605089169abb2663e2a0fe475e1142a5665de3 Mon Sep 17 00:00:00 2001 From: star-nox Date: Fri, 19 Apr 2024 10:38:04 -0500 Subject: [PATCH 14/28] 
restricted upload parallelization to 10 --- ai_ta_backend/utils/pubmed_extraction.py | 146 ++++++++++++----------- 1 file changed, 79 insertions(+), 67 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index a025e025..779b2cc2 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -39,72 +39,83 @@ def extractPubmedData(): ftp_address = "ftp.ncbi.nlm.nih.gov" ftp_path = "pubmed/baseline" file_list = getFileList(ftp_address, ftp_path, ".gz") - - gz_filepath = downloadXML(ftp_address, ftp_path, file_list[4], "pubmed") - print("GZ Downloaded: ", gz_filepath) - print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") - gz_file_download_time = time.time() - - # extract the XML file - if not gz_filepath: - return "failure" - xml_filepath = extractXMLFile(gz_filepath) - print("XML Extracted: ", xml_filepath) - print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") - - #xml_filepath = "pubmed/pubmed24n1217.xml" - for metadata in extractMetadataFromXML(xml_filepath): - metadata_extract_start_time = time.time() - - # find PMC ID and DOI for all articles - metadata_with_ids = getArticleIDs(metadata) - metadata_update_time = time.time() - print("Time taken to get PMC ID and DOI for 100 articles: ", round(metadata_update_time - metadata_extract_start_time, 2), "seconds") - #print("Metadata with IDs: ", metadata_with_ids) - - # download the articles - complete_metadata = downloadArticles(metadata_with_ids) - print(complete_metadata) - print("Time taken to download articles for 100 articles: ", round(time.time() - metadata_update_time, 2), "seconds") + for file in file_list[5:]: # already processed first 5 files + try: + print("Processing file: ", file) - # store metadata in csv file - print("\n") - print("Total articles retrieved: ", len(complete_metadata)) - df = pd.DataFrame(complete_metadata) - csv_filepath = "metadata.csv" - - if os.path.isfile(csv_filepath): - df.to_csv(csv_filepath, mode='a', header=False, index=False) - else: - df.to_csv(csv_filepath, index=False) - - print("Time taken to extract metadata for 100 articles: ", round(time.time() - metadata_extract_start_time, 2), "seconds") + gz_filepath = downloadXML(ftp_address, ftp_path, file, "pubmed") + print("GZ Downloaded: ", gz_filepath) + print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") + gz_file_download_time = time.time() + + # extract the XML file + if not gz_filepath: + return "failure" + xml_filepath = extractXMLFile(gz_filepath) + print("XML Extracted: ", xml_filepath) + print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") + + #xml_filepath = "pubmed/pubmed24n1217.xml" + for metadata in extractMetadataFromXML(xml_filepath): + metadata_extract_start_time = time.time() + + # find PMC ID and DOI for all articles + metadata_with_ids = getArticleIDs(metadata) + metadata_update_time = time.time() + print("Time taken to get PMC ID and DOI for 100 articles: ", round(metadata_update_time - metadata_extract_start_time, 2), "seconds") + #print("Metadata with IDs: ", metadata_with_ids) + + # download the articles + complete_metadata = downloadArticles(metadata_with_ids) + print(complete_metadata) + print("Time taken to download articles for 100 articles: ", round(time.time() - metadata_update_time, 2), "seconds") + + # store metadata in csv file + print("\n") + print("Total 
articles retrieved: ", len(complete_metadata)) + df = pd.DataFrame(complete_metadata) + csv_filepath = "metadata.csv" + + if os.path.isfile(csv_filepath): + df.to_csv(csv_filepath, mode='a', header=False, index=False) + else: + df.to_csv(csv_filepath, index=False) + + print("Time taken to extract metadata for 100 articles: ", round(time.time() - metadata_extract_start_time, 2), "seconds") - print("Time taken to download articles: ", round(time.time() - start_time, 2), "seconds") - print("Total metadata extracted: ", len(complete_metadata)) + print("Time taken to download articles: ", round(time.time() - start_time, 2), "seconds") + print("Total metadata extracted: ", len(complete_metadata)) - # upload articles to bucket - print("Uploading articles to storage...") - article_upload = uploadToStorage("pubmed_abstracts") # need to parallelize upload - print("Uploaded articles: ", article_upload) - - # upload metadata to SQL DB - csv_filepath = "metadata.csv" - df = pd.read_csv(csv_filepath) - - complete_metadata = df.to_dict('records') - for item in complete_metadata: - for key, value in item.items(): - if pd.isna(value): # Or: math.isnan(value) - item[key] = None - - print("Metadata loaded into dataframe: ", len(complete_metadata)) - # continue with the rest of the code - response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore - print("Supabase response: ", response) + # upload articles to bucket + print("Uploading articles to storage...") + article_upload = uploadToStorage("pubmed_abstracts") # need to parallelize upload + print("Uploaded articles: ", article_upload) + + # upload metadata to SQL DB + csv_filepath = "metadata.csv" + df = pd.read_csv(csv_filepath) + + complete_metadata = df.to_dict('records') + for item in complete_metadata: + for key, value in item.items(): + if pd.isna(value): # Or: math.isnan(value) + item[key] = None + print("Metadata loaded into dataframe: ", len(complete_metadata)) + # continue with the rest of the code + response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore + print("Uploaded metadata to SQL DB.") + + # delete files + os.remove(csv_filepath) + os.remove("pubmed_abstracts") + + except Exception as e: + print("Error processing file: ", e) + continue + return "success" def downloadXML(ftp_address: str, ftp_path: str, file: str, local_dir: str): @@ -754,13 +765,14 @@ def uploadToStorage(filepath: str): object_name = file_path.split("/")[-1] files.append((MINIO_CLIENT, bucket_name, file_path, object_name)) - # Use concurrent.futures ThreadPoolExecutor for parallel upload - with concurrent.futures.ThreadPoolExecutor() as executor: - # Submit all upload tasks to the executor - futures = [executor.submit(upload_file, *args) for args in files] - # Wait for all tasks to complete - for future in futures: - future.result() # This will raise any exceptions from upload_file + # Use concurrent.futures ThreadPoolExecutor with limited pool size + with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: + # Submit files in batches of 10 + for i in range(0, len(files), 10): + batch_files = files[i:i+10] + futures = [executor.submit(upload_file, *args) for args in batch_files] + for future in futures: + future.result() # This will raise any exceptions from upload_file return "success" except Exception as e: From a61255c4be724f272f334de0fab40a54551359d2 Mon Sep 17 00:00:00 2001 From: star-nox Date: Sun, 21 Apr 2024 09:01:58 -0500 Subject: [PATCH 15/28] changed starting XML 
file
---
 ai_ta_backend/utils/pubmed_extraction.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py
index 779b2cc2..03d19824 100644
--- a/ai_ta_backend/utils/pubmed_extraction.py
+++ b/ai_ta_backend/utils/pubmed_extraction.py
@@ -39,8 +39,9 @@ def extractPubmedData():
   ftp_address = "ftp.ncbi.nlm.nih.gov"
   ftp_path = "pubmed/baseline"
   file_list = getFileList(ftp_address, ftp_path, ".gz")
+
 
-  for file in file_list[5:]: # already processed first 5 files
+  for file in file_list[7:]: # already processed first 7 files
     try:
       print("Processing file: ", file)
 

From 4d86b85ed0857ffd9ac5fd1d91c31c0a3ce6fa81 Mon Sep 17 00:00:00 2001
From: star-nox
Date: Mon, 22 Apr 2024 12:11:18 -0500
Subject: [PATCH 16/28] changed starting XML file

---
 ai_ta_backend/utils/pubmed_extraction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py
index 03d19824..3d660e3f 100644
--- a/ai_ta_backend/utils/pubmed_extraction.py
+++ b/ai_ta_backend/utils/pubmed_extraction.py
@@ -41,7 +41,7 @@ def extractPubmedData():
   file_list = getFileList(ftp_address, ftp_path, ".gz")
 
 
-  for file in file_list[7:]: # already processed first 7 files
+  for file in file_list[8:]: # already processed first 8 files
     try:
       print("Processing file: ", file)
 

From 63a6cb60bb09ef5ffd99d2bedff6394149e131be Mon Sep 17 00:00:00 2001
From: star-nox
Date: Mon, 29 Apr 2024 15:53:14 -0500
Subject: [PATCH 17/28] changed start file

---
 ai_ta_backend/utils/pubmed_extraction.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py
index 3d660e3f..c5f50dd9 100644
--- a/ai_ta_backend/utils/pubmed_extraction.py
+++ b/ai_ta_backend/utils/pubmed_extraction.py
@@ -41,7 +41,7 @@ def extractPubmedData():
   file_list = getFileList(ftp_address, ftp_path, ".gz")
 
 
-  for file in file_list[8:]: # already processed first 8 files
+  for file in file_list[10:]: # already processed first 5 files
     try:
       print("Processing file: ", file)
 
@@ -89,7 +89,7 @@ def extractPubmedData():
       print("Time taken to download articles: ", round(time.time() - start_time, 2), "seconds")
       print("Total metadata extracted: ", len(complete_metadata))
 
-      # upload articles to bucket
+      upload articles to bucket
       print("Uploading articles to storage...")
       article_upload = uploadToStorage("pubmed_abstracts") # need to parallelize upload
       print("Uploaded articles: ", article_upload)
 
@@ -108,7 +108,7 @@ def extractPubmedData():
       # continue with the rest of the code
       response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore
       print("Uploaded metadata to SQL DB.")
-
+
       # delete files
       os.remove(csv_filepath)
       os.remove("pubmed_abstracts")
 
@@ -116,6 +116,7 @@ def extractPubmedData():
     except Exception as e:
       print("Error processing file: ", e)
       continue
+  exit()
 
   return "success"

From b17baeaa618590acfcbb8b03ab9f1838fb56cbc3 Mon Sep 17 00:00:00 2001
From: star-nox
Date: Mon, 29 Apr 2024 16:01:24 -0500
Subject: [PATCH 18/28] minor comment

---
 ai_ta_backend/utils/pubmed_extraction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py
index c5f50dd9..12a88de3 100644
--- a/ai_ta_backend/utils/pubmed_extraction.py
+++ b/ai_ta_backend/utils/pubmed_extraction.py
@@ -89,7 +89,7 @@ def extractPubmedData():
print("Time taken to download articles: ", round(time.time() - start_time, 2), "seconds") print("Total metadata extracted: ", len(complete_metadata)) - upload articles to bucket + # upload articles to bucket print("Uploading articles to storage...") article_upload = uploadToStorage("pubmed_abstracts") # need to parallelize upload print("Uploaded articles: ", article_upload) From 4cfc6b8046bebc5feeba667cca4cc238e058d41a Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 6 May 2024 11:36:33 -0500 Subject: [PATCH 19/28] minor changes in main loop --- ai_ta_backend/utils/pubmed_extraction.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 12a88de3..01731dc4 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -41,7 +41,7 @@ def extractPubmedData(): file_list = getFileList(ftp_address, ftp_path, ".gz") - for file in file_list[10:]: # already processed first 5 files + for file in file_list[18:20]: try: print("Processing file: ", file) @@ -65,7 +65,6 @@ def extractPubmedData(): metadata_with_ids = getArticleIDs(metadata) metadata_update_time = time.time() print("Time taken to get PMC ID and DOI for 100 articles: ", round(metadata_update_time - metadata_extract_start_time, 2), "seconds") - #print("Metadata with IDs: ", metadata_with_ids) # download the articles complete_metadata = downloadArticles(metadata_with_ids) @@ -95,28 +94,26 @@ def extractPubmedData(): print("Uploaded articles: ", article_upload) # upload metadata to SQL DB - csv_filepath = "metadata.csv" df = pd.read_csv(csv_filepath) - complete_metadata = df.to_dict('records') for item in complete_metadata: for key, value in item.items(): if pd.isna(value): # Or: math.isnan(value) item[key] = None - print("Metadata loaded into dataframe: ", len(complete_metadata)) + # continue with the rest of the code response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore print("Uploaded metadata to SQL DB.") - # delete files - os.remove(csv_filepath) - os.remove("pubmed_abstracts") - except Exception as e: print("Error processing file: ", e) - continue - exit() + + # delete files + shutil.rmtree("F:/MSIM/ML_Projects/ai-ta-backend/pubmed_abstracts") + os.remove("F:/MSIM/ML_Projects/ai-ta-backend/metadata.csv") + #os.remove(xml_filepath) + print("Finished file: ", file) return "success" From 8b533ba33c3e72d54434dc09f1f821c9ed263a8a Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 13 May 2024 11:14:33 -0500 Subject: [PATCH 20/28] minor changes --- ai_ta_backend/utils/pubmed_extraction.py | 118 +++++++++++------------ 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 01731dc4..46dbf2aa 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -41,77 +41,77 @@ def extractPubmedData(): file_list = getFileList(ftp_address, ftp_path, ".gz") - for file in file_list[18:20]: - try: - print("Processing file: ", file) + for file in file_list[20:21]: + # try: + # print("Processing file: ", file) - gz_filepath = downloadXML(ftp_address, ftp_path, file, "pubmed") - print("GZ Downloaded: ", gz_filepath) - print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") - gz_file_download_time = time.time() - - # extract the XML file - if not gz_filepath: - return "failure" - 
xml_filepath = extractXMLFile(gz_filepath) - print("XML Extracted: ", xml_filepath) - print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") + # gz_filepath = downloadXML(ftp_address, ftp_path, file, "pubmed") + # print("GZ Downloaded: ", gz_filepath) + # print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") + # gz_file_download_time = time.time() + + # # extract the XML file + # if not gz_filepath: + # return "failure" + # xml_filepath = extractXMLFile(gz_filepath) + # print("XML Extracted: ", xml_filepath) + # print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") - #xml_filepath = "pubmed/pubmed24n1217.xml" - for metadata in extractMetadataFromXML(xml_filepath): - metadata_extract_start_time = time.time() - - # find PMC ID and DOI for all articles - metadata_with_ids = getArticleIDs(metadata) - metadata_update_time = time.time() - print("Time taken to get PMC ID and DOI for 100 articles: ", round(metadata_update_time - metadata_extract_start_time, 2), "seconds") + # #xml_filepath = "pubmed/pubmed24n1217.xml" + # for metadata in extractMetadataFromXML(xml_filepath): + # metadata_extract_start_time = time.time() + + # # find PMC ID and DOI for all articles + # metadata_with_ids = getArticleIDs(metadata) + # metadata_update_time = time.time() + # print("Time taken to get PMC ID and DOI for 100 articles: ", round(metadata_update_time - metadata_extract_start_time, 2), "seconds") - # download the articles - complete_metadata = downloadArticles(metadata_with_ids) - print(complete_metadata) - print("Time taken to download articles for 100 articles: ", round(time.time() - metadata_update_time, 2), "seconds") - - # store metadata in csv file - print("\n") - print("Total articles retrieved: ", len(complete_metadata)) - df = pd.DataFrame(complete_metadata) - csv_filepath = "metadata.csv" - - if os.path.isfile(csv_filepath): - df.to_csv(csv_filepath, mode='a', header=False, index=False) - else: - df.to_csv(csv_filepath, index=False) + # # download the articles + # complete_metadata = downloadArticles(metadata_with_ids) + # print(complete_metadata) + # print("Time taken to download articles for 100 articles: ", round(time.time() - metadata_update_time, 2), "seconds") + + # # store metadata in csv file + # print("\n") + # print("Total articles retrieved: ", len(complete_metadata)) + # df = pd.DataFrame(complete_metadata) + # csv_filepath = "metadata.csv" + + # if os.path.isfile(csv_filepath): + # df.to_csv(csv_filepath, mode='a', header=False, index=False) + # else: + # df.to_csv(csv_filepath, index=False) - print("Time taken to extract metadata for 100 articles: ", round(time.time() - metadata_extract_start_time, 2), "seconds") + # print("Time taken to extract metadata for 100 articles: ", round(time.time() - metadata_extract_start_time, 2), "seconds") - print("Time taken to download articles: ", round(time.time() - start_time, 2), "seconds") - print("Total metadata extracted: ", len(complete_metadata)) + # print("Time taken to download articles: ", round(time.time() - start_time, 2), "seconds") + # print("Total metadata extracted: ", len(complete_metadata)) - # upload articles to bucket - print("Uploading articles to storage...") - article_upload = uploadToStorage("pubmed_abstracts") # need to parallelize upload - print("Uploaded articles: ", article_upload) + # # upload articles to bucket + # print("Uploading articles to storage...") + # article_upload = 
uploadToStorage("pubmed_abstracts") # need to parallelize upload + # print("Uploaded articles: ", article_upload) - # upload metadata to SQL DB - df = pd.read_csv(csv_filepath) - complete_metadata = df.to_dict('records') - for item in complete_metadata: - for key, value in item.items(): - if pd.isna(value): # Or: math.isnan(value) - item[key] = None - print("Metadata loaded into dataframe: ", len(complete_metadata)) + # # upload metadata to SQL DB + # df = pd.read_csv(csv_filepath) + # complete_metadata = df.to_dict('records') + # for item in complete_metadata: + # for key, value in item.items(): + # if pd.isna(value): # Or: math.isnan(value) + # item[key] = None + # print("Metadata loaded into dataframe: ", len(complete_metadata)) - # continue with the rest of the code - response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore - print("Uploaded metadata to SQL DB.") + # # continue with the rest of the code + # response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore + # print("Uploaded metadata to SQL DB.") - except Exception as e: - print("Error processing file: ", e) + # except Exception as e: + # print("Error processing file: ", e) # delete files - shutil.rmtree("F:/MSIM/ML_Projects/ai-ta-backend/pubmed_abstracts") - os.remove("F:/MSIM/ML_Projects/ai-ta-backend/metadata.csv") + shutil.rmtree("pubmed_abstracts") + os.remove("metadata.csv") #os.remove(xml_filepath) print("Finished file: ", file) From 6dfe50ba03940483b30a5c698675eac7d1d9b4d8 Mon Sep 17 00:00:00 2001 From: star-nox Date: Tue, 14 May 2024 11:01:17 -0500 Subject: [PATCH 21/28] parallelized processing --- ai_ta_backend/utils/pubmed_extraction.py | 175 +++++++++++++---------- 1 file changed, 96 insertions(+), 79 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 46dbf2aa..39276ade 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -39,83 +39,100 @@ def extractPubmedData(): ftp_address = "ftp.ncbi.nlm.nih.gov" ftp_path = "pubmed/baseline" file_list = getFileList(ftp_address, ftp_path, ".gz") + + with concurrent.futures.ProcessPoolExecutor() as executor: + futures = [executor.submit(processPubmedXML, file, ftp_address, ftp_path) for file in file_list[21:22]] + for future in concurrent.futures.as_completed(futures): + try: + future.result() + except Exception as e: + print("Error processing file: ", e) + + return "success" - for file in file_list[20:21]: - # try: - # print("Processing file: ", file) +def processPubmedXML(file:str, ftp_address:str, ftp_path:str): + """ + Main function to extract metadata and articles from the PubMed baseline folder. 
+ """ + start_time = time.monotonic() + try: + print("Processing file: ", file) + gz_filepath = downloadXML(ftp_address, ftp_path, file, "pubmed") + print("GZ Downloaded: ", gz_filepath) + print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") + gz_file_download_time = time.time() + + # extract the XML file + if not gz_filepath: + return "failure" + xml_filepath = extractXMLFile(gz_filepath) + print("XML Extracted: ", xml_filepath) + print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") - # gz_filepath = downloadXML(ftp_address, ftp_path, file, "pubmed") - # print("GZ Downloaded: ", gz_filepath) - # print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") - # gz_file_download_time = time.time() - - # # extract the XML file - # if not gz_filepath: - # return "failure" - # xml_filepath = extractXMLFile(gz_filepath) - # print("XML Extracted: ", xml_filepath) - # print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") - - # #xml_filepath = "pubmed/pubmed24n1217.xml" - # for metadata in extractMetadataFromXML(xml_filepath): - # metadata_extract_start_time = time.time() - - # # find PMC ID and DOI for all articles - # metadata_with_ids = getArticleIDs(metadata) - # metadata_update_time = time.time() - # print("Time taken to get PMC ID and DOI for 100 articles: ", round(metadata_update_time - metadata_extract_start_time, 2), "seconds") + xml_id = xml_filepath[7:-4].replace(".", "_") + destination_dir = xml_id + "_papers" + csv_filepath = xml_id + "_metadata.csv" + print("Destination directory: ", destination_dir) + print("CSV file path: ", csv_filepath) + #xml_filepath = "pubmed/pubmed24n1217.xml" + + for metadata in extractMetadataFromXML(xml_filepath, destination_dir): + metadata_extract_start_time = time.time() + + # find PMC ID and DOI for all articles + metadata_with_ids = getArticleIDs(metadata) + metadata_update_time = time.time() + print("Time taken to get PMC ID and DOI for 100 articles: ", round(metadata_update_time - metadata_extract_start_time, 2), "seconds") - # # download the articles - # complete_metadata = downloadArticles(metadata_with_ids) - # print(complete_metadata) - # print("Time taken to download articles for 100 articles: ", round(time.time() - metadata_update_time, 2), "seconds") - - # # store metadata in csv file - # print("\n") - # print("Total articles retrieved: ", len(complete_metadata)) - # df = pd.DataFrame(complete_metadata) - # csv_filepath = "metadata.csv" - - # if os.path.isfile(csv_filepath): - # df.to_csv(csv_filepath, mode='a', header=False, index=False) - # else: - # df.to_csv(csv_filepath, index=False) + # download the articles + complete_metadata = downloadArticles(metadata_with_ids, destination_dir) + print(complete_metadata) + print("Time taken to download articles for 100 articles: ", round(time.time() - metadata_update_time, 2), "seconds") + + # store metadata in csv file + print("\n") + print("Total articles retrieved: ", len(complete_metadata)) + df = pd.DataFrame(complete_metadata) + + if os.path.isfile(csv_filepath): + df.to_csv(csv_filepath, mode='a', header=False, index=False) + else: + df.to_csv(csv_filepath, index=False) - # print("Time taken to extract metadata for 100 articles: ", round(time.time() - metadata_extract_start_time, 2), "seconds") + print("Time taken to extract metadata for 100 articles: ", round(time.time() - metadata_extract_start_time, 2), "seconds") - # print("Time 
taken to download articles: ", round(time.time() - start_time, 2), "seconds") - # print("Total metadata extracted: ", len(complete_metadata)) + print("Time taken to download articles: ", round(time.time() - start_time, 2), "seconds") + print("Total metadata extracted: ", len(complete_metadata)) - # # upload articles to bucket - # print("Uploading articles to storage...") - # article_upload = uploadToStorage("pubmed_abstracts") # need to parallelize upload - # print("Uploaded articles: ", article_upload) - - # # upload metadata to SQL DB - # df = pd.read_csv(csv_filepath) - # complete_metadata = df.to_dict('records') - # for item in complete_metadata: - # for key, value in item.items(): - # if pd.isna(value): # Or: math.isnan(value) - # item[key] = None - # print("Metadata loaded into dataframe: ", len(complete_metadata)) + # upload articles to bucket + print("Uploading articles to storage...") + article_upload = uploadToStorage("pubmed_abstracts") # need to parallelize upload + print("Uploaded articles: ", article_upload) - # # continue with the rest of the code - # response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore - # print("Uploaded metadata to SQL DB.") + # upload metadata to SQL DB + df = pd.read_csv(csv_filepath) + complete_metadata = df.to_dict('records') + for item in complete_metadata: + for key, value in item.items(): + if pd.isna(value): # Or: math.isnan(value) + item[key] = None + print("Metadata loaded into dataframe: ", len(complete_metadata)) - # except Exception as e: - # print("Error processing file: ", e) - - # delete files - shutil.rmtree("pubmed_abstracts") - os.remove("metadata.csv") - #os.remove(xml_filepath) - print("Finished file: ", file) + # continue with the rest of the code + response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore + print("Uploaded metadata to SQL DB.") - return "success" + except Exception as e: + print("Error processing file: ", e) + + # delete files + shutil.rmtree(destination_dir) + os.remove(csv_filepath) + #os.remove(xml_filepath) + print("Finished file: ", file) + def downloadXML(ftp_address: str, ftp_path: str, file: str, local_dir: str): """ @@ -202,7 +219,7 @@ def extractXMLFile(gz_filepath: str): print("Error extracting XML file: ", e) return None -def extractMetadataFromXML(xml_filepath: str): +def extractMetadataFromXML(xml_filepath: str, dir: str): """ Extracts article details from the XML file and stores it in a dictionary. Details extracted: PMID, PMCID, DOI, ISSN, journal title, article title, @@ -215,7 +232,7 @@ def extractMetadataFromXML(xml_filepath: str): print("inside extractMetadataFromXML()") try: # create a directory to store abstracts - os.makedirs("pubmed_abstracts", exist_ok=True) + os.makedirs(dir, exist_ok=True) tree = ET.parse(xml_filepath) root = tree.getroot() @@ -227,7 +244,7 @@ def extractMetadataFromXML(xml_filepath: str): article_items = list(item for item in root.iter('PubmedArticle')) # Convert generator to list for item in article_items: - future = executor.submit(processArticleItem, item) + future = executor.submit(processArticleItem, item, dir) article_data = future.result() metadata.append(article_data) @@ -246,7 +263,7 @@ def extractMetadataFromXML(xml_filepath: str): return [] -def processArticleItem(item: ET.Element): +def processArticleItem(item: ET.Element, directory: str): """ Extracts article details from a single PubmedArticle XML element. This is used in the process pool executor. 
Args: @@ -311,7 +328,7 @@ def processArticleItem(item: ET.Element): abstract_text += abstract_text_element.text + "\n" # save abstract to a text file - abstract_filename = f"pubmed_abstracts/{article_data['pmid']}.txt" + abstract_filename = directory + "/" + article_data['pmid'] + ".txt" with open(abstract_filename, 'w') as f: if article_data['journal_title']: f.write("Journal title: " + article_data['journal_title'] + "\n\n") @@ -557,7 +574,7 @@ def updateArticleMetadata(shared_metadata, record): # print("Error downloading articles: ", e) # return metadata -def downloadArticles(metadata: list): +def downloadArticles(metadata: list, dir: str): """ Downloads articles from PMC and stores them in local directory. Args: @@ -572,7 +589,7 @@ def downloadArticles(metadata: list): updated_articles = {} # Use ThreadPoolExecutor to run download_article for each article in parallel - download_article_partial = partial(download_article, api_url=base_url) + download_article_partial = partial(download_article, api_url=base_url, dir=dir) with concurrent.futures.ProcessPoolExecutor() as executor: futures = [executor.submit(download_article_partial, article) for article in metadata] for future in concurrent.futures.as_completed(futures): @@ -598,7 +615,7 @@ def downloadArticles(metadata: list): print("Error downloading articles: ", e) return metadata -def download_article(article, api_url): +def download_article(article, api_url, dir): """ Downloads the article from given FTP link and updates metadata with license, FTP link, and downloaded filepath information. This function is used within downloadArticles() function. @@ -639,7 +656,7 @@ def download_article(article, api_url): print("FTP path: ", ftp_path) filename = ftp_path.split("/")[-1] - local_file = os.path.join("pubmed_abstracts", filename) + local_file = os.path.join(dir, filename) try: with open(local_file, 'wb') as f: @@ -649,7 +666,7 @@ def download_article(article, api_url): article['filepath'] = local_file if filename.endswith(".tar.gz"): - extracted_pdf_paths = extractPDF(local_file) + extracted_pdf_paths = extractPDF(local_file, dir) print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) article['filepath'] = ",".join(extracted_pdf_paths) os.remove(local_file) @@ -663,7 +680,7 @@ def download_article(article, api_url): return article -def extractPDF(tar_gz_filepath: str): +def extractPDF(tar_gz_filepath: str, dest_directory: str): """ Extracts PDF files from the downloaded .tar.gz file. The zipped folder contains other supplementary materials like images, etc. which are not extracted. 
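
The downloadArticles() hunks above switch to functools.partial plus a ProcessPoolExecutor so the per-article fetches run in parallel while the shared arguments (API URL, destination directory) stay fixed. A minimal self-contained sketch of that pattern follows; fetch_one/fetch_all and the example PMID are illustrative stand-ins, not the module's own code, and the stub body replaces the real FTP download:

import concurrent.futures
from functools import partial

def fetch_one(article: dict, api_url: str, dir: str) -> dict:
    # stand-in for the real per-article download; returns the updated record
    article["filepath"] = dir + "/" + article["pmid"] + ".txt"
    return article

def fetch_all(metadata: list, api_url: str, dir: str) -> list:
    # freeze the shared arguments, then fan the per-article work out over processes
    fetch_partial = partial(fetch_one, api_url=api_url, dir=dir)
    results = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = [executor.submit(fetch_partial, article) for article in metadata]
        for future in concurrent.futures.as_completed(futures):
            try:
                results.append(future.result())
            except Exception as e:
                print("Error downloading article: ", e)  # a bad record should not stop the batch
    return results

if __name__ == "__main__":  # guard required for process pools on spawn-based platforms
    print(fetch_all([{"pmid": "37417630"}], "https://example.org/api", "papers"))

Note that the worker and the partial must be picklable, which is why fetch_one is a module-level function rather than a lambda or closure.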
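
For the extractPDF() hunk that follows: the idea is to pull only the .pdf members out of the downloaded PMC .tar.gz package into the caller-supplied directory, skipping images and other supplementary files. A standalone sketch under the same assumptions (the example paths are hypothetical; a hardened version would also validate member names against path traversal before extracting):

import os
import tarfile

def extract_pdfs(tar_gz_filepath: str, dest_directory: str) -> list:
    """Extract only the .pdf members of a .tar.gz archive into dest_directory."""
    extracted_paths = []
    os.makedirs(dest_directory, exist_ok=True)
    with tarfile.open(tar_gz_filepath, "r:gz") as tar:
        for member in tar:
            # regular files only; directories and non-PDF supplements are skipped
            if member.isreg() and member.name.endswith(".pdf"):
                tar.extract(member, path=dest_directory)
                extracted_paths.append(os.path.join(dest_directory, member.name))
    return extracted_paths

# extract_pdfs("PMC10328620.tar.gz", "pubmed24n1219_papers")  # hypothetical example paths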
@@ -678,9 +695,9 @@ def extractPDF(tar_gz_filepath: str): with tarfile.open(tar_gz_filepath, "r:gz") as tar: for member in tar: if member.isreg() and member.name.endswith(".pdf"): - tar.extract(member, path="pubmed_abstracts") + tar.extract(member, path=dest_directory) print("Extracted: ", member.name) - extracted_paths.append(os.path.join("pubmed_abstracts", member.name)) + extracted_paths.append(os.path.join(dest_directory, member.name)) return extracted_paths except Exception as e: From e98f3dc04303c8aef2eb0b2fcdfd5ce4f72c2ade Mon Sep 17 00:00:00 2001 From: star-nox Date: Tue, 14 May 2024 12:53:46 -0500 Subject: [PATCH 22/28] added try-except in getArticleIds() --- ai_ta_backend/utils/pubmed_extraction.py | 87 ++++++++++--------- metadata.csv | 101 +++++++++++++++++++++++ 2 files changed, 144 insertions(+), 44 deletions(-) create mode 100644 metadata.csv diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 12a88de3..524c6e79 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -41,7 +41,7 @@ def extractPubmedData(): file_list = getFileList(ftp_address, ftp_path, ".gz") - for file in file_list[10:]: # already processed first 5 files + for file in file_list[22:23]: try: print("Processing file: ", file) @@ -65,7 +65,6 @@ def extractPubmedData(): metadata_with_ids = getArticleIDs(metadata) metadata_update_time = time.time() print("Time taken to get PMC ID and DOI for 100 articles: ", round(metadata_update_time - metadata_extract_start_time, 2), "seconds") - #print("Metadata with IDs: ", metadata_with_ids) # download the articles complete_metadata = downloadArticles(metadata_with_ids) @@ -95,28 +94,26 @@ def extractPubmedData(): print("Uploaded articles: ", article_upload) # upload metadata to SQL DB - csv_filepath = "metadata.csv" df = pd.read_csv(csv_filepath) - complete_metadata = df.to_dict('records') for item in complete_metadata: for key, value in item.items(): if pd.isna(value): # Or: math.isnan(value) item[key] = None - print("Metadata loaded into dataframe: ", len(complete_metadata)) + # continue with the rest of the code response = SUPBASE_CLIENT.table("publications").upsert(complete_metadata).execute() # type: ignore print("Uploaded metadata to SQL DB.") - # delete files - os.remove(csv_filepath) - os.remove("pubmed_abstracts") - except Exception as e: print("Error processing file: ", e) - continue - exit() + + # delete files + shutil.rmtree("pubmed_abstracts") + os.remove("metadata.csv") + #os.remove(xml_filepath) + print("Finished file: ", file) return "success" @@ -420,39 +417,41 @@ def getArticleIDs(metadata: list): for i in range(0, len(metadata), batch_size): batch = metadata[i:i + batch_size] ids = ",".join([article['pmid'] for article in batch]) - response = requests.get(base_url + app_details + "&ids=" + ids) - data = response.json() - records = data['records'] - - # PARALLELIZE THIS FOR LOOP - UPDATES ADDITIONAL FIELDS FOR ALL ARTICLES AT ONCE - with Manager() as manager: - shared_metadata = manager.dict() # Use a shared dictionary - with concurrent.futures.ProcessPoolExecutor() as executor: - futures = { - executor.submit(updateArticleMetadata, shared_metadata, record): record - for record in records - } - concurrent.futures.wait(futures) - for future in concurrent.futures.as_completed(futures): - record = futures[future] - try: - future.result() - except Exception as exc: - print('%r generated an exception: %s' % (record, exc)) - - # Update original metadata after loop 
- for article in metadata: - if article['pmid'] in shared_metadata: - # print("Shared metadata: ", shared_metadata[article['pmid']]) - if 'errmsg' in shared_metadata[article['pmid']]: - article['live'] = False - else: - article['pmcid'] = shared_metadata[article['pmid']]['pmcid'] - article['doi'] = shared_metadata[article['pmid']]['doi'] - article['live'] = shared_metadata[article['pmid']]['live'] - article['release_date'] = shared_metadata[article['pmid']]['release_date'] - #print("Updated metadata: ", article) - + try: + response = requests.get(base_url + app_details + "&ids=" + ids) + data = response.json() + records = data['records'] + + # PARALLELIZE THIS FOR LOOP - UPDATES ADDITIONAL FIELDS FOR ALL ARTICLES AT ONCE + with Manager() as manager: + shared_metadata = manager.dict() # Use a shared dictionary + with concurrent.futures.ProcessPoolExecutor() as executor: + futures = { + executor.submit(updateArticleMetadata, shared_metadata, record): record + for record in records + } + concurrent.futures.wait(futures) + for future in concurrent.futures.as_completed(futures): + record = futures[future] + try: + future.result() + except Exception as exc: + print('%r generated an exception: %s' % (record, exc)) + + # Update original metadata after loop + for article in metadata: + if article['pmid'] in shared_metadata: + # print("Shared metadata: ", shared_metadata[article['pmid']]) + if 'errmsg' in shared_metadata[article['pmid']]: + article['live'] = False + else: + article['pmcid'] = shared_metadata[article['pmid']]['pmcid'] + article['doi'] = shared_metadata[article['pmid']]['doi'] + article['live'] = shared_metadata[article['pmid']]['live'] + article['release_date'] = shared_metadata[article['pmid']]['release_date'] + #print("Updated metadata: ", article) + except Exception as e: + print("Error: ", e) #print("Length of metadata after ID conversion: ", len(metadata)) return metadata diff --git a/metadata.csv b/metadata.csv new file mode 100644 index 00000000..6517828c --- /dev/null +++ b/metadata.csv @@ -0,0 +1,101 @@ +pmid,pmcid,doi,issn,journal_title,article_title,last_revised,published,live,release_date,license,pubmed_ftp_link,filepath +37417630,PMC10328620,10.1097/MD.0000000000034177,1536-5964,Medicine,Meningitis with septic shock resulting from odontogenic infection misdiagnosed as closed-lock in temporomandibular disorder: A case report and literature review.,2023-11-15,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/64/70/PMC10328620.tar.gz,pubmed_abstracts/PMC10328620/medi-102-e34177.pdf +37417631,PMC10328656,10.1097/MD.0000000000034223,1536-5964,Medicine,"The effect of progressive muscle relaxation technique and myofascial release technique on premenstrual symptoms, blood circulation, and quality of life in women with premenstrual syndrome: A single-blind randomized controlled study.",2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/76/4b/PMC10328656.tar.gz,pubmed_abstracts/PMC10328656/medi-102-e34223.pdf +37417634,PMC10328666,10.1097/MD.0000000000034239,1536-5964,Medicine,Case report: Plastic bronchitis associated with Bordetella parapertussis.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/ee/a6/PMC10328666.tar.gz,pubmed_abstracts/PMC10328666/medi-102-e34239.pdf +37417633,PMC10328702,10.1097/MD.0000000000034216,1536-5964,Medicine,Renal artery aneurysm induced by neurofibromatosis type 1: A case report and review of the endovascular interventions for this rare 
vasculopathy.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/85/3e/PMC10328702.tar.gz,pubmed_abstracts/PMC10328702/medi-102-e34216.pdf +37417632,PMC10328683,10.1097/MD.0000000000034221,1536-5964,Medicine,Intervention for burnout and irrational beliefs in parents of couples seeking a divorce: A critical reflection of Igbo-African marital discord.,2023-08-02,2023-Jul-07,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/5d/c7/medi-102-e34221.PMC10328683.pdf,pubmed_abstracts/medi-102-e34221.PMC10328683.pdf +37417636,PMC10328563,10.1097/MD.0000000000034197,1536-5964,Medicine,Hematomyelia associated with coronavirus disease 2019: A rare case report.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/97/4c/PMC10328563.tar.gz,pubmed_abstracts/PMC10328563/medi-102-e34197.pdf +37417635,PMC10328687,10.1097/MD.0000000000034194,1536-5964,Medicine,Marginal resection as a potential curative treatment option of infantile fibrosarcoma with good response after chemotherapy: A case report of an ETV6-NTRK3 positive infantile fibrosacroma of the distal tibia.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f1/67/PMC10328687.tar.gz,pubmed_abstracts/PMC10328687/medi-102-e34194.pdf +37417638,PMC10328619,10.1097/MD.0000000000034282,1536-5964,Medicine,"The effect of being married on heart rate variability, an indicator of autonomic dysfunction: A retrospective study.",2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/62/28/PMC10328619.tar.gz,pubmed_abstracts/PMC10328619/medi-102-e34282.pdf +37417637,PMC10328692,10.1097/MD.0000000000034238,1536-5964,Medicine,A case report of Ovarian hyperstimulation syndrome and corpus luteum rupture in twin pregnancies with IVF-ET.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/19/93/PMC10328692.tar.gz,pubmed_abstracts/PMC10328692/medi-102-e34238.pdf +37417639,PMC10328576,10.1097/MD.0000000000033936,1536-5964,Medicine,Transvenous embolization using the Amplatzer Vascular Plug II in patent ductus arteriosus concomitant with Stanford type B aortic dissection: A case report.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/52/2f/PMC10328576.tar.gz,pubmed_abstracts/PMC10328576/medi-102-e33936.pdf +37417640,PMC10328685,10.1097/MD.0000000000034250,1536-5964,Medicine,Quantitative chest CT imaging characteristics and outcome of patients with COVID-19 associated pulmonary artery thrombosis: A single-center retrospective cohort study.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/17/4e/PMC10328685.tar.gz,pubmed_abstracts/PMC10328685/medi-102-e34250.pdf +37417642,PMC10328710,10.1097/MD.0000000000033880,1536-5964,Medicine,Orelabrutinib versus ibrutinib for patients with refractory/relapsed primary central nervous system lymphoma: An efficacy and safety analysis.,2023-08-02,2023-Jul-07,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/cf/c9/medi-102-e33880.PMC10328710.pdf,pubmed_abstracts/medi-102-e33880.PMC10328710.pdf +37417641,PMC10328596,10.1097/MD.0000000000034248,1536-5964,Medicine,The influence of psychological factors on coronary heart disease: A review of the evidence and implications for psychological interventions.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f5/ec/PMC10328596.tar.gz,pubmed_abstracts/PMC10328596/medi-102-e34248.pdf +37417645,,,1440-1754,Journal of paediatrics and child health,Blue and red 
Doppler jet on the echocardiogram.,2023-11-16,2023-Jul-01,False,,,, +37417643,PMC10328582,10.1097/MD.0000000000034401,1536-5964,Medicine,Opioids for treating refractory dyspnea in patients with heart failure: A protocol for systematic review and meta-analysis: Retraction.,2023-11-16,2023-07-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/44/53/PMC10328582.tar.gz,pubmed_abstracts/PMC10328582/medi-102-e34401.pdf +37417644,,,1473-2165,Journal of cosmetic dermatology,Safety and efficacy of human platelet extract in skin recovery after fractional CO,2023-08-14,2023-Sep-01,False,,,,pubmed_abstracts/37417644.txt +37417646,,,1440-1754,Journal of paediatrics and child health,Hemi-atrophy of the face.,2023-11-16,2023-Jul-01,False,,,, +37417647,,,1440-1754,Journal of paediatrics and child health,An unexpected percutaneous gastro-jejunostomy obstruction.,2023-11-16,2023-Jul-01,False,,,, +37417649,,,1097-0347,Head & neck,Ultrasound-guided resection for squamous cell carcinoma of the buccal mucosa: A feasibility study.,2023-08-14,2023-09-01,False,,,, +37417648,,,1728-2985,"Urologiia (Moscow, Russia : 1999)",Androgenic status of men with severe COVID-19: the role of testosterone and dihydrotestosterone within the program FOUNDER (features of a new coronavirus infection course and options therapy depending on the androgenic status).,2023-07-18,2023-Jul-01,False,,,,pubmed_abstracts/37417648.txt +37417650,PMC10790315,10.1111/all.15797,1398-9995,Allergy,Diagnostic utility of allergy tests to predict baked egg and lightly cooked egg allergies compared to double-blind placebo-controlled food challenges.,2023-10-10,2023-09-01,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/4a/d7/PMC10790315.tar.gz,pubmed_abstracts/PMC10790315/ALL-78-2510.pdf +37417653,,,1879-1190,Journal of the American College of Surgeons,Statistical Power of Randomized Controlled Trials in Trauma Surgery.,2023-10-24,2023-11-01,False,,,,pubmed_abstracts/37417653.txt +37417652,,,1440-1746,Journal of gastroenterology and hepatology,A model for predicting poor survival in patients with cirrhosis undergoing portosystemic shunt embolization.,2023-09-18,2023-Sep-01,False,,,,pubmed_abstracts/37417652.txt +37417657,PMC11017731,10.1021/acssynbio.3c00061,2161-5063,ACS synthetic biology,Engineering Tissue-Scale Properties with Synthetic Cells: Forging One from Many.,2023-07-29,2023-07-21,False,2024-07-21,,,pubmed_abstracts/37417657.txt +37417659,,,1537-8918,Current sports medicine reports,To Protect and Serve: Preventable Collapse and Death of Police Trainees.,2023-11-21,2023-07-01,False,,,, +37417654,PMC10988698,10.1113/EP090989,1469-445X,Experimental physiology,Role of proprioceptors in chronic musculoskeletal pain.,2023-07-07,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/32/e5/PMC10988698.tar.gz,pubmed_abstracts/PMC10988698/EPH-109-45.pdf +37417660,,,1537-8918,Current sports medicine reports,Web Alerts.,2023-08-20,2023-Jul-01,False,,,, +37417662,,,1537-8918,Current sports medicine reports,Vitamin C Supplementation and Athletic Performance: A Review.,2023-08-20,2023-Jul-01,False,,,,pubmed_abstracts/37417662.txt +37417661,,,1537-8918,Current sports medicine reports,"Nutritional Strategies for Endurance Cyclists - Periodized Nutrition, Ketogenic Diets, and Other Considerations.",2023-09-11,2023-Jul-01,False,,,,pubmed_abstracts/37417661.txt +37417663,,,1537-8918,Current sports medicine reports,A Research and Clinical Framework for Understanding Achilles Injury in Female Collegiate 
Gymnasts.,2023-08-20,2023-Jul-01,False,,,,pubmed_abstracts/37417663.txt +37417664,,,1833-3575,Health information management : journal of the Health Information Management Association of Australia,Alpha NSW: What would it take to create a state-wide paediatric population-level learning health system?,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417664.txt +37417665,,,1476-8259,Computer methods in biomechanics and biomedical engineering,Automated detection of auditory response: non-detection stopping criterion and repeatability studies for multichannel EEG.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417665.txt +37417666,,,1944-8252,ACS applied materials & interfaces,Impact of Molecular Orientation on Lateral and Interfacial Electron Transfer at Oxide Interfaces.,2023-07-19,2023-Jul-19,False,,,,pubmed_abstracts/37417666.txt +37417658,PMC10331187,10.1177/17539447231184984,1753-9455,Therapeutic advances in cardiovascular disease,Evaluation of diuretic efficiency of intravenous furosemide in patients with advanced heart failure in a heart failure clinic.,2023-07-18,,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/a7/5e/10.1177_17539447231184984.PMC10331187.pdf,pubmed_abstracts/10.1177_17539447231184984.PMC10331187.pdf +37417667,PMC10373524,10.1021/acsnano.2c11904,1936-086X,ACS nano,"Insights into the Structure of Comirnaty Covid-19 Vaccine: A Theory on Soft, Partially Bilayer-Covered Nanoparticles with Hydrogen Bond-Stabilized mRNA-Lipid Complexes.",2023-07-31,2023-07-25,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/8a/ad/PMC10373524.tar.gz,"pubmed_abstracts/PMC10373524/nn2c11904.pdf,pubmed_abstracts/PMC10373524/nn2c11904_si_001.pdf" +37417668,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Be Someone's Betsy!,2023-11-03,,False,,,, +37417669,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Concomitant Mental Illnesses Diagnoses and Likelihood of Trauma Recidivism.,2023-08-21,,False,,,,pubmed_abstracts/37417669.txt +37417670,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Concomitant Mental Illnesses Diagnoses and Likelihood of Trauma Recidivism.,2023-12-05,,False,,,, +37417673,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Injury and Sociodemographic Characteristics of Intimate Partner Violence in Women in Israel: A Single-Center Retrospective Cohort Study.,2023-08-21,,False,,,,pubmed_abstracts/37417673.txt +37417672,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,"Effects of Case Management in Trauma Patients in Taiwan: A Randomized, Longitudinal Study.",2023-08-21,,False,,,,pubmed_abstracts/37417672.txt +37417671,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Inpatient Rehabilitation Falls: Comparing Patients With Traumatic Brain Injury Versus Patients With Stroke.,2023-08-21,,False,,,,pubmed_abstracts/37417671.txt +37417674,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Impact of Trauma Resuscitation Emergency Care Nurse Deployment in Trauma Activations in a Rural Trauma Center.,2023-08-21,,False,,,,pubmed_abstracts/37417674.txt +37417678,PMC10388677,10.1530/EDM-22-0383,2052-0573,"Endocrinology, diabetes & metabolism case reports",Clinical and molecular description of two cases of neonatal diabetes secondary to mutations in 
PDX1.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/b3/4c/PMC10388677.tar.gz,pubmed_abstracts/PMC10388677/EDM22-0383.pdf +37417680,,,1524-4725,Dermatologic surgery : official publication for American Society for Dermatologic Surgery [et al.],A Retrospective Analysis of Complications of Minimally Invasive Cosmetic Procedures Seen at a Referral Practice in Houston.,2023-10-02,2023-09-01,False,,,, +37417681,,,1552-7433,Personality & social psychology bulletin,"Masculinity Threats Sequentially Arouse Public Discomfort, Anger, and Positive Attitudes Toward Sexual Violence.",2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417681.txt +37417679,PMC10895403,10.1093/jpids/piad048,2048-7207,Journal of the Pediatric Infectious Diseases Society,Comparison of Administrative Database-Derived and Hospital-Derived Data for Monitoring Blood Culture Use in the Pediatric Intensive Care Unit.,2023-11-03,2023-Jul-31,True,,,,pubmed_abstracts/37417679.txt +37417677,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Postintensive Care Syndrome: Feasibly Bridging Care at a Tertiary Trauma Center.,2023-12-05,,False,,,, +37417682,PMC10374551,10.1049/nbt2.12144,1751-875X,IET nanobiotechnology,Natural compound chaetocin induced DNA damage and apoptosis through reactive oxygen species-dependent pathways in A549 lung cancer cells and in vitro evaluations.,2023-07-31,2023-Jul-01,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/7a/14/NBT2-17-465.PMC10374551.pdf,pubmed_abstracts/NBT2-17-465.PMC10374551.pdf +37417676,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Postintensive Care Syndrome: Feasibly Bridging Care at a Tertiary Trauma Center.,2023-08-21,,False,,,,pubmed_abstracts/37417676.txt +37417675,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Bringing Trauma Education to the Frontier: Overcoming Distance Barriers Utilizing a Virtual Platform.,2023-08-21,,False,,,,pubmed_abstracts/37417675.txt +37417683,,,2163-0097,Clinical advances in periodontics,Novel biomaterial advanced platelet-rich fibrin plus block for multiple gingival recession.,2023-07-19,2023-Jul-07,False,,,,pubmed_abstracts/37417683.txt +37417684,PMC10439496,10.1049/syb2.12070,1751-8857,IET systems biology,Comprehensive analysis of anoikis-related lncRNAs for predicting prognosis and response of immunotherapy in hepatocellular carcinoma.,2023-08-23,2023-08-01,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/e3/c1/SYB2-17-198.PMC10439496.pdf,pubmed_abstracts/SYB2-17-198.PMC10439496.pdf +37417685,,,1944-8252,ACS applied materials & interfaces,Influence of Interfering Ions and Adsorption Temperature on Radioactive Iodine Removal Efficiency and Stability of Ni-MOF-74 and Zr-UiO-66.,2023-07-20,2023-Jul-19,False,,,,pubmed_abstracts/37417685.txt +37417686,,,1532-7752,Journal of personality assessment,The HEXACO Personality Space Before and After Re-Rotation to Approximate the Big Five Dimensions.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417686.txt +37417687,,,1532-7752,Journal of personality assessment,New Versions of the MMPI and Rorschach: How Have Training Programs Responded?,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417687.txt +37417690,PMC10332179,10.1080/07853890.2023.2230888,1365-2060,Annals of medicine,Blinatumomab as salvage therapy in patients with relapsed/refractory B-ALL who have failed/progressed after anti-CD19-CAR T 
therapy.,2023-11-19,2023-12-01,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/f5/9b/IANN_55_2230888.PMC10332179.pdf,pubmed_abstracts/IANN_55_2230888.PMC10332179.pdf +37417688,PMC10407019,10.1111/aogs.14620,1600-0412,Acta obstetricia et gynecologica Scandinavica,Ultrasound examination of the pelvic floor during active labor: A longitudinal cohort study.,2023-08-10,2023-09-01,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/6d/0e/AOGS-102-1203.PMC10407019.pdf,pubmed_abstracts/AOGS-102-1203.PMC10407019.pdf +37417691,PMC10771532,10.1111/jgs.18505,1532-5415,Journal of the American Geriatrics Society,Factors associated with preventable hospitalizations after hospice live discharge among Medicare patients with Alzheimer's disease and related dementias.,2023-11-15,2023-Nov-01,False,2024-11-01,,, +37417692,,,1470-8744,Biotechnology and applied biochemistry,Deciphering the role of fungus in degradation of polypropylene from hospital waste.,2023-12-10,2023-Dec-01,False,,,,pubmed_abstracts/37417692.txt +37417689,PMC10527499,10.1097/BRS.0000000000004769,1528-1159,Spine,Association of Neighborhood Socioeconomic Deprivation With Utilization and Costs of Anterior Cervical Discectomy and Fusion.,2023-10-03,2023-Sep-15,False,2024-09-15,,,pubmed_abstracts/37417689.txt +37417694,,,1532-5040,Physiotherapy theory and practice,Effect of a structured early mobilization protocol on the level of mobilization and muscle strength in critical care patients: A randomized clinical trial.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417694.txt +37417693,PMC10735286,10.1210/clinem/dgad401,1945-7197,The Journal of clinical endocrinology and metabolism,Pheochromocytomas Most Commonly Present As Adrenal Incidentalomas: A Large Tertiary Center Experience.,2023-07-07,2023-Jul-07,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/21/a2/PMC10735286.tar.gz, +37417696,,,1745-3682,Acta orthopaedica,A comparison of uncemented short versus standard stem length in total hip arthroplasty: results from the Dutch Arthroplasty Register.,2023-11-16,2023-07-07,False,,,,pubmed_abstracts/37417696.txt +37417695,,,1528-1159,Spine,Quality of Life and Postoperative Satisfaction in Patients with Benign Extramedullary Spinal Tumors: A Multicenter Study.,2023-08-28,2023-Sep-15,False,,,,pubmed_abstracts/37417695.txt +37417697,PMC10484187,10.1097/BRS.0000000000004731,1528-1159,Spine,"Directed Versus Nondirected Standing Postures in Adolescent Idiopathic Scoliosis: Its Impact on Curve Magnitude, Alignment, and Clinical Decision-Making.",2023-09-13,2023-Oct-01,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/f2/95/brs-48-1354.PMC10484187.pdf,pubmed_abstracts/brs-48-1354.PMC10484187.pdf +37417700,PMC10331372,10.1177/01410768231184381,1758-1095,Journal of the Royal Society of Medicine,From ,2023-07-18,2023-Jun-01,False,2026-06-01,,, +37417698,,,1751-3766,Journal of biological dynamics,Threshold dynamics of a stochastic mathematical model for ,2023-11-16,2023-Dec-01,False,,,,pubmed_abstracts/37417698.txt +37417702,PMC10331365,10.1177/01410768231184373,1758-1095,Journal of the Royal Society of Medicine,Is an independent NHS an impossible dream?,2023-07-18,2023-Jun-01,True,,,, +37417701,PMC10331368,10.1177/01410768231182836,1758-1095,Journal of the Royal Society of Medicine,Facilitating genetic testing after death: the ongoing duty of care to the deceased and their relatives.,2023-11-16,2023-Jun-01,False,2026-06-01,,, +37417704,,,1744-764X,Expert opinion on drug safety,Proton pump inhibitors use prior to 
COVID-19 hospitalization is associated with higher C,2023-07-10,2023-Jul-10,False,,,,pubmed_abstracts/37417704.txt +37417706,,,1945-7197,The Journal of clinical endocrinology and metabolism,Microvascular complications are associated with coronary collateralization in type 2 diabetes and chronic occlusion.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417706.txt +37417705,PMC10332216,10.1080/07853890.2023.2231847,1365-2060,Annals of medicine,Life quality among psoriasis patients based on Dermatology Life Quality Index evaluation and its association with psoriasis severity in China: a cross-sectional study.,2023-11-19,2023-12-01,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/09/33/IANN_55_2231847.PMC10332216.pdf,pubmed_abstracts/IANN_55_2231847.PMC10332216.pdf +37417709,,,1528-1159,Spine,Cannabis Use is Associated with Higher Rates of Pseudarthrosis Following TLIF: A Multi-Institutional Matched-Cohort Study.,2023-07-07,2023-Jul-03,False,,,,pubmed_abstracts/37417709.txt +37417707,,,1523-4681,Journal of bone and mineral research : the official journal of the American Society for Bone and Mineral Research,3D Finite Element Models Reconstructed From 2D Dual-Energy X-Ray Absorptiometry (DXA) Images Improve Hip Fracture Prediction Compared to Areal BMD in Osteoporotic Fractures in Men (MrOS) Sweden Cohort.,2023-09-26,2023-09-01,False,,,,pubmed_abstracts/37417707.txt +37417710,,,1612-1880,Chemistry & biodiversity,Synthesis and Evaluation of Novel Metacetamol Derivatives with Hydrazone Moiety as Anticancer and Antimicrobial Agents.,2023-08-24,2023-Aug-01,False,,,,pubmed_abstracts/37417710.txt +37417712,PMC10337823,10.1093/europace/euad189,1532-2092,"Europace : European pacing, arrhythmias, and cardiac electrophysiology : journal of the working groups on cardiac pacing, arrhythmias, and cardiac cellular electrophysiology of the European Society of Cardiology",Very-early symptomatic recurrence is associated with late recurrence after radiofrequency ablation of atrial fibrillation.,2023-08-22,2023-07-04,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/4e/85/euad189.PMC10337823.pdf,pubmed_abstracts/euad189.PMC10337823.pdf +37417711,,,1528-0691,"Chemical record (New York, N.Y.)",Supported Noble Metal Catalysts and Adsorbents with Soft Lewis Acid Functions.,2023-11-21,2023-Nov-01,False,,,,pubmed_abstracts/37417711.txt +37417713,PMC10719214,10.1093/cei/uxad072,1365-2249,Clinical and experimental immunology,Effects of mesenchymal stem cells on Treg cells in rats with colitis.,2023-12-13,2023-Dec-13,False,2024-07-07,,,pubmed_abstracts/37417713.txt +37417714,PMC10577628,10.1111/aogs.14626,1600-0412,Acta obstetricia et gynecologica Scandinavica,Double-vs single-balloon catheter for induction of labor: Systematic review and individual participant data meta-analysis.,2023-10-24,2023-11-01,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/af/36/PMC10577628.tar.gz,pubmed_abstracts/PMC10577628/AOGS-102-1440.pdf +37417715,PMC10508478,10.1002/vms3.1180,2053-1095,Veterinary medicine and science,"Evaluating the effects of direct-fed microbial supplementation on the performance, milk quality and fatty acid of mid-lactating dairy cows.",2023-09-21,2023-09-01,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/8c/db/VMS3-9-2212.PMC10508478.pdf,pubmed_abstracts/VMS3-9-2212.PMC10508478.pdf +37417716,,,1538-9774,"Computers, informatics, nursing : CIN",The Disruptive Impacts of Next Generation Generative Artificial Intelligence.,2023-11-28,2023-07-01,False,,,, 
+37417718,,,1559-4106,Biointerphases,Development of electronic sum frequency generation spectrophotometer to assess the buried interfaces.,2023-11-18,2023-07-01,False,,,,pubmed_abstracts/37417718.txt +37417719,,,1559-4106,Biointerphases,Theoretical study of electronic sum frequency generation spectroscopy to assess the buried interfaces.,2023-11-18,2023-07-01,False,,,,pubmed_abstracts/37417719.txt +37417708,PMC10524881,10.1097/AUD.0000000000001396,1538-4667,Ear and hearing,Association Between Adult-Onset Hearing Loss and Income: A Systematic Review.,2023-11-06,,False,2024-07-06,,,pubmed_abstracts/37417708.txt +37417720,,,1943-278X,Suicide & life-threatening behavior,When safe firearm storage isn't enough: Examining risk profiles among firearm suicide decedents.,2023-08-16,2023-08-01,False,,,,pubmed_abstracts/37417720.txt +37417722,,,1754-9485,Journal of medical imaging and radiation oncology,Percutaneous treatment of renal tumours.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417722.txt +37417721,PMC10332182,10.1080/07853890.2023.2233556,1365-2060,Annals of medicine,Ultrasound-guided injection acupotomy as a minimally invasive intervention therapy for cervical spondylotic radiculopathy: a randomized control trial.,2023-11-19,2023-12-01,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/8f/eb/IANN_55_2233556.PMC10332182.pdf,pubmed_abstracts/IANN_55_2233556.PMC10332182.pdf +37417723,,,1528-1159,Spine,GSK-3β and β-Catenin Signaling Pathway is Involved in Myofibroblast Transition of Ligamentum Flavum in Lumbar Spinal Stenosis Patients.,2023-09-27,2023-Oct-15,False,,,,pubmed_abstracts/37417723.txt +37417724,,,1528-1159,Spine,Subclassification of Sanders Maturation Stage 3 Demonstrates Differences in Spine and Total Height Velocity Between 3A and 3B in Patients with Idiopathic Scoliosis.,2023-07-07,2023-Jul-06,False,,,,pubmed_abstracts/37417724.txt +37417725,,,1523-4681,Journal of bone and mineral research : the official journal of the American Society for Bone and Mineral Research,Efficacy and Safety of Transdermal Abaloparatide in Postmenopausal Women with Osteoporosis: A Randomized Study.,2023-10-26,2023-10-01,False,,,,pubmed_abstracts/37417725.txt +37417726,,,1521-4141,European journal of immunology,CKBA suppresses mast cell activation via ERK signaling pathway in murine atopic dermatitis.,2023-09-11,2023-09-01,False,,,,pubmed_abstracts/37417726.txt +37417728,,,1521-4095,"Advanced materials (Deerfield Beach, Fla.)",Reconstructed Hierarchically Structured Keratin Fibers with Shape-Memory Features Based on Reversible Secondary-Structure Transformation.,2023-10-23,2023-Oct-01,False,,,,pubmed_abstracts/37417728.txt +37417727,PMC10181040,10.3390/nu15092153,2072-6643,Nutrients,"Effectiveness of a Digitally Delivered Continuous Care Intervention (Defeat Diabetes) on Type 2 Diabetes Outcomes: A 12-Month Single-Arm, Pre-Post Intervention Study.",2023-07-19,2023-Apr-30,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f3/34/PMC10181040.tar.gz,pubmed_abstracts/PMC10181040/nutrients-15-02153.pdf +37417730,PMC10356134,10.7554/eLife.88310,2050-084X,eLife,Metformin regulates bone marrow stromal cells to accelerate bone healing in diabetic mice.,2023-07-21,2023-07-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/5e/82/PMC10356134.tar.gz,"pubmed_abstracts/PMC10356134/elife-88310-mdarchecklist1.pdf,pubmed_abstracts/PMC10356134/elife-88310.pdf" +37417729,PMC10508548,10.1002/vms3.1196,2053-1095,Veterinary medicine and science,Global prevalence of Neospora caninum in rodents: A 
systematic review and meta-analysis.,2023-09-21,2023-09-01,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/1d/b2/PMC10508548.tar.gz,pubmed_abstracts/PMC10508548/VMS3-9-2192.pdf +37417731,,,1520-6890,Chemical reviews,Enantioselective Transformations in the Synthesis of Therapeutic Agents.,2023-08-30,2023-08-09,False,,,,pubmed_abstracts/37417731.txt +37417732,,,1948-7185,The journal of physical chemistry letters,Reduction-Active Antisolvent: A Universal and Innovative Strategy of Further Ameliorating Additive Optimization for High Efficiency Perovskite Solar Cells.,2023-07-20,2023-Jul-20,False,,,,pubmed_abstracts/37417732.txt +37417734,PMC10328535,10.7554/eLife.86373,2050-084X,eLife,The Opto-inflammasome in zebrafish as a tool to study cell and tissue responses to speck formation and cell death.,2023-11-16,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/b9/9f/PMC10328535.tar.gz,pubmed_abstracts/PMC10328535/elife-86373.pdf +37417733,PMC10392983,10.7554/eLife.88058,2050-084X,eLife,Allosteric activation or inhibition of PI3Kγ mediated through conformational changes in the p110γ helical domain.,2023-11-16,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/91/25/PMC10392983.tar.gz,"pubmed_abstracts/PMC10392983/elife-88058.pdf,pubmed_abstracts/PMC10392983/elife-88058-mdarchecklist1.pdf" +37417737,PMC10731660,10.1021/acs.inorgchem.3c01620,1520-510X,Inorganic chemistry,Role of Pure Technetium Chemistry: Are There Still Links to Applications in Imaging?,2023-07-07,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/14/5f/PMC10731660.tar.gz,pubmed_abstracts/PMC10731660/ic3c01620.pdf From 68057df5765edcff5743634ade84cdb8a8ec163d Mon Sep 17 00:00:00 2001 From: star-nox Date: Tue, 14 May 2024 12:55:05 -0500 Subject: [PATCH 23/28] deleted csv file --- metadata.csv | 101 --------------------------------------------------- 1 file changed, 101 deletions(-) delete mode 100644 metadata.csv diff --git a/metadata.csv b/metadata.csv deleted file mode 100644 index 6517828c..00000000 --- a/metadata.csv +++ /dev/null @@ -1,101 +0,0 @@ -pmid,pmcid,doi,issn,journal_title,article_title,last_revised,published,live,release_date,license,pubmed_ftp_link,filepath -37417630,PMC10328620,10.1097/MD.0000000000034177,1536-5964,Medicine,Meningitis with septic shock resulting from odontogenic infection misdiagnosed as closed-lock in temporomandibular disorder: A case report and literature review.,2023-11-15,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/64/70/PMC10328620.tar.gz,pubmed_abstracts/PMC10328620/medi-102-e34177.pdf -37417631,PMC10328656,10.1097/MD.0000000000034223,1536-5964,Medicine,"The effect of progressive muscle relaxation technique and myofascial release technique on premenstrual symptoms, blood circulation, and quality of life in women with premenstrual syndrome: A single-blind randomized controlled study.",2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/76/4b/PMC10328656.tar.gz,pubmed_abstracts/PMC10328656/medi-102-e34223.pdf -37417634,PMC10328666,10.1097/MD.0000000000034239,1536-5964,Medicine,Case report: Plastic bronchitis associated with Bordetella parapertussis.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/ee/a6/PMC10328666.tar.gz,pubmed_abstracts/PMC10328666/medi-102-e34239.pdf -37417633,PMC10328702,10.1097/MD.0000000000034216,1536-5964,Medicine,Renal artery aneurysm induced by neurofibromatosis type 1: A case report and review of the endovascular 
interventions for this rare vasculopathy.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/85/3e/PMC10328702.tar.gz,pubmed_abstracts/PMC10328702/medi-102-e34216.pdf -37417632,PMC10328683,10.1097/MD.0000000000034221,1536-5964,Medicine,Intervention for burnout and irrational beliefs in parents of couples seeking a divorce: A critical reflection of Igbo-African marital discord.,2023-08-02,2023-Jul-07,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/5d/c7/medi-102-e34221.PMC10328683.pdf,pubmed_abstracts/medi-102-e34221.PMC10328683.pdf -37417636,PMC10328563,10.1097/MD.0000000000034197,1536-5964,Medicine,Hematomyelia associated with coronavirus disease 2019: A rare case report.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/97/4c/PMC10328563.tar.gz,pubmed_abstracts/PMC10328563/medi-102-e34197.pdf -37417635,PMC10328687,10.1097/MD.0000000000034194,1536-5964,Medicine,Marginal resection as a potential curative treatment option of infantile fibrosarcoma with good response after chemotherapy: A case report of an ETV6-NTRK3 positive infantile fibrosacroma of the distal tibia.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f1/67/PMC10328687.tar.gz,pubmed_abstracts/PMC10328687/medi-102-e34194.pdf -37417638,PMC10328619,10.1097/MD.0000000000034282,1536-5964,Medicine,"The effect of being married on heart rate variability, an indicator of autonomic dysfunction: A retrospective study.",2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/62/28/PMC10328619.tar.gz,pubmed_abstracts/PMC10328619/medi-102-e34282.pdf -37417637,PMC10328692,10.1097/MD.0000000000034238,1536-5964,Medicine,A case report of Ovarian hyperstimulation syndrome and corpus luteum rupture in twin pregnancies with IVF-ET.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/19/93/PMC10328692.tar.gz,pubmed_abstracts/PMC10328692/medi-102-e34238.pdf -37417639,PMC10328576,10.1097/MD.0000000000033936,1536-5964,Medicine,Transvenous embolization using the Amplatzer Vascular Plug II in patent ductus arteriosus concomitant with Stanford type B aortic dissection: A case report.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/52/2f/PMC10328576.tar.gz,pubmed_abstracts/PMC10328576/medi-102-e33936.pdf -37417640,PMC10328685,10.1097/MD.0000000000034250,1536-5964,Medicine,Quantitative chest CT imaging characteristics and outcome of patients with COVID-19 associated pulmonary artery thrombosis: A single-center retrospective cohort study.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/17/4e/PMC10328685.tar.gz,pubmed_abstracts/PMC10328685/medi-102-e34250.pdf -37417642,PMC10328710,10.1097/MD.0000000000033880,1536-5964,Medicine,Orelabrutinib versus ibrutinib for patients with refractory/relapsed primary central nervous system lymphoma: An efficacy and safety analysis.,2023-08-02,2023-Jul-07,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/cf/c9/medi-102-e33880.PMC10328710.pdf,pubmed_abstracts/medi-102-e33880.PMC10328710.pdf -37417641,PMC10328596,10.1097/MD.0000000000034248,1536-5964,Medicine,The influence of psychological factors on coronary heart disease: A review of the evidence and implications for psychological interventions.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f5/ec/PMC10328596.tar.gz,pubmed_abstracts/PMC10328596/medi-102-e34248.pdf -37417645,,,1440-1754,Journal of paediatrics and 
child health,Blue and red Doppler jet on the echocardiogram.,2023-11-16,2023-Jul-01,False,,,, -37417643,PMC10328582,10.1097/MD.0000000000034401,1536-5964,Medicine,Opioids for treating refractory dyspnea in patients with heart failure: A protocol for systematic review and meta-analysis: Retraction.,2023-11-16,2023-07-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/44/53/PMC10328582.tar.gz,pubmed_abstracts/PMC10328582/medi-102-e34401.pdf -37417644,,,1473-2165,Journal of cosmetic dermatology,Safety and efficacy of human platelet extract in skin recovery after fractional CO,2023-08-14,2023-Sep-01,False,,,,pubmed_abstracts/37417644.txt -37417646,,,1440-1754,Journal of paediatrics and child health,Hemi-atrophy of the face.,2023-11-16,2023-Jul-01,False,,,, -37417647,,,1440-1754,Journal of paediatrics and child health,An unexpected percutaneous gastro-jejunostomy obstruction.,2023-11-16,2023-Jul-01,False,,,, -37417649,,,1097-0347,Head & neck,Ultrasound-guided resection for squamous cell carcinoma of the buccal mucosa: A feasibility study.,2023-08-14,2023-09-01,False,,,, -37417648,,,1728-2985,"Urologiia (Moscow, Russia : 1999)",Androgenic status of men with severe COVID-19: the role of testosterone and dihydrotestosterone within the program FOUNDER (features of a new coronavirus infection course and options therapy depending on the androgenic status).,2023-07-18,2023-Jul-01,False,,,,pubmed_abstracts/37417648.txt -37417650,PMC10790315,10.1111/all.15797,1398-9995,Allergy,Diagnostic utility of allergy tests to predict baked egg and lightly cooked egg allergies compared to double-blind placebo-controlled food challenges.,2023-10-10,2023-09-01,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/4a/d7/PMC10790315.tar.gz,pubmed_abstracts/PMC10790315/ALL-78-2510.pdf -37417653,,,1879-1190,Journal of the American College of Surgeons,Statistical Power of Randomized Controlled Trials in Trauma Surgery.,2023-10-24,2023-11-01,False,,,,pubmed_abstracts/37417653.txt -37417652,,,1440-1746,Journal of gastroenterology and hepatology,A model for predicting poor survival in patients with cirrhosis undergoing portosystemic shunt embolization.,2023-09-18,2023-Sep-01,False,,,,pubmed_abstracts/37417652.txt -37417657,PMC11017731,10.1021/acssynbio.3c00061,2161-5063,ACS synthetic biology,Engineering Tissue-Scale Properties with Synthetic Cells: Forging One from Many.,2023-07-29,2023-07-21,False,2024-07-21,,,pubmed_abstracts/37417657.txt -37417659,,,1537-8918,Current sports medicine reports,To Protect and Serve: Preventable Collapse and Death of Police Trainees.,2023-11-21,2023-07-01,False,,,, -37417654,PMC10988698,10.1113/EP090989,1469-445X,Experimental physiology,Role of proprioceptors in chronic musculoskeletal pain.,2023-07-07,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/32/e5/PMC10988698.tar.gz,pubmed_abstracts/PMC10988698/EPH-109-45.pdf -37417660,,,1537-8918,Current sports medicine reports,Web Alerts.,2023-08-20,2023-Jul-01,False,,,, -37417662,,,1537-8918,Current sports medicine reports,Vitamin C Supplementation and Athletic Performance: A Review.,2023-08-20,2023-Jul-01,False,,,,pubmed_abstracts/37417662.txt -37417661,,,1537-8918,Current sports medicine reports,"Nutritional Strategies for Endurance Cyclists - Periodized Nutrition, Ketogenic Diets, and Other Considerations.",2023-09-11,2023-Jul-01,False,,,,pubmed_abstracts/37417661.txt -37417663,,,1537-8918,Current sports medicine reports,A Research and Clinical Framework for Understanding Achilles Injury in Female Collegiate 
Gymnasts.,2023-08-20,2023-Jul-01,False,,,,pubmed_abstracts/37417663.txt -37417664,,,1833-3575,Health information management : journal of the Health Information Management Association of Australia,Alpha NSW: What would it take to create a state-wide paediatric population-level learning health system?,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417664.txt -37417665,,,1476-8259,Computer methods in biomechanics and biomedical engineering,Automated detection of auditory response: non-detection stopping criterion and repeatability studies for multichannel EEG.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417665.txt -37417666,,,1944-8252,ACS applied materials & interfaces,Impact of Molecular Orientation on Lateral and Interfacial Electron Transfer at Oxide Interfaces.,2023-07-19,2023-Jul-19,False,,,,pubmed_abstracts/37417666.txt -37417658,PMC10331187,10.1177/17539447231184984,1753-9455,Therapeutic advances in cardiovascular disease,Evaluation of diuretic efficiency of intravenous furosemide in patients with advanced heart failure in a heart failure clinic.,2023-07-18,,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/a7/5e/10.1177_17539447231184984.PMC10331187.pdf,pubmed_abstracts/10.1177_17539447231184984.PMC10331187.pdf -37417667,PMC10373524,10.1021/acsnano.2c11904,1936-086X,ACS nano,"Insights into the Structure of Comirnaty Covid-19 Vaccine: A Theory on Soft, Partially Bilayer-Covered Nanoparticles with Hydrogen Bond-Stabilized mRNA-Lipid Complexes.",2023-07-31,2023-07-25,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/8a/ad/PMC10373524.tar.gz,"pubmed_abstracts/PMC10373524/nn2c11904.pdf,pubmed_abstracts/PMC10373524/nn2c11904_si_001.pdf" -37417668,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Be Someone's Betsy!,2023-11-03,,False,,,, -37417669,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Concomitant Mental Illnesses Diagnoses and Likelihood of Trauma Recidivism.,2023-08-21,,False,,,,pubmed_abstracts/37417669.txt -37417670,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Concomitant Mental Illnesses Diagnoses and Likelihood of Trauma Recidivism.,2023-12-05,,False,,,, -37417673,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Injury and Sociodemographic Characteristics of Intimate Partner Violence in Women in Israel: A Single-Center Retrospective Cohort Study.,2023-08-21,,False,,,,pubmed_abstracts/37417673.txt -37417672,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,"Effects of Case Management in Trauma Patients in Taiwan: A Randomized, Longitudinal Study.",2023-08-21,,False,,,,pubmed_abstracts/37417672.txt -37417671,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Inpatient Rehabilitation Falls: Comparing Patients With Traumatic Brain Injury Versus Patients With Stroke.,2023-08-21,,False,,,,pubmed_abstracts/37417671.txt -37417674,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Impact of Trauma Resuscitation Emergency Care Nurse Deployment in Trauma Activations in a Rural Trauma Center.,2023-08-21,,False,,,,pubmed_abstracts/37417674.txt -37417678,PMC10388677,10.1530/EDM-22-0383,2052-0573,"Endocrinology, diabetes & metabolism case reports",Clinical and molecular description of two cases of neonatal diabetes secondary to mutations in 
PDX1.,2023-08-02,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/b3/4c/PMC10388677.tar.gz,pubmed_abstracts/PMC10388677/EDM22-0383.pdf -37417680,,,1524-4725,Dermatologic surgery : official publication for American Society for Dermatologic Surgery [et al.],A Retrospective Analysis of Complications of Minimally Invasive Cosmetic Procedures Seen at a Referral Practice in Houston.,2023-10-02,2023-09-01,False,,,, -37417681,,,1552-7433,Personality & social psychology bulletin,"Masculinity Threats Sequentially Arouse Public Discomfort, Anger, and Positive Attitudes Toward Sexual Violence.",2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417681.txt -37417679,PMC10895403,10.1093/jpids/piad048,2048-7207,Journal of the Pediatric Infectious Diseases Society,Comparison of Administrative Database-Derived and Hospital-Derived Data for Monitoring Blood Culture Use in the Pediatric Intensive Care Unit.,2023-11-03,2023-Jul-31,True,,,,pubmed_abstracts/37417679.txt -37417677,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Postintensive Care Syndrome: Feasibly Bridging Care at a Tertiary Trauma Center.,2023-12-05,,False,,,, -37417682,PMC10374551,10.1049/nbt2.12144,1751-875X,IET nanobiotechnology,Natural compound chaetocin induced DNA damage and apoptosis through reactive oxygen species-dependent pathways in A549 lung cancer cells and in vitro evaluations.,2023-07-31,2023-Jul-01,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/7a/14/NBT2-17-465.PMC10374551.pdf,pubmed_abstracts/NBT2-17-465.PMC10374551.pdf -37417676,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Postintensive Care Syndrome: Feasibly Bridging Care at a Tertiary Trauma Center.,2023-08-21,,False,,,,pubmed_abstracts/37417676.txt -37417675,,,1078-7496,Journal of trauma nursing : the official journal of the Society of Trauma Nurses,Bringing Trauma Education to the Frontier: Overcoming Distance Barriers Utilizing a Virtual Platform.,2023-08-21,,False,,,,pubmed_abstracts/37417675.txt -37417683,,,2163-0097,Clinical advances in periodontics,Novel biomaterial advanced platelet-rich fibrin plus block for multiple gingival recession.,2023-07-19,2023-Jul-07,False,,,,pubmed_abstracts/37417683.txt -37417684,PMC10439496,10.1049/syb2.12070,1751-8857,IET systems biology,Comprehensive analysis of anoikis-related lncRNAs for predicting prognosis and response of immunotherapy in hepatocellular carcinoma.,2023-08-23,2023-08-01,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/e3/c1/SYB2-17-198.PMC10439496.pdf,pubmed_abstracts/SYB2-17-198.PMC10439496.pdf -37417685,,,1944-8252,ACS applied materials & interfaces,Influence of Interfering Ions and Adsorption Temperature on Radioactive Iodine Removal Efficiency and Stability of Ni-MOF-74 and Zr-UiO-66.,2023-07-20,2023-Jul-19,False,,,,pubmed_abstracts/37417685.txt -37417686,,,1532-7752,Journal of personality assessment,The HEXACO Personality Space Before and After Re-Rotation to Approximate the Big Five Dimensions.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417686.txt -37417687,,,1532-7752,Journal of personality assessment,New Versions of the MMPI and Rorschach: How Have Training Programs Responded?,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417687.txt -37417690,PMC10332179,10.1080/07853890.2023.2230888,1365-2060,Annals of medicine,Blinatumomab as salvage therapy in patients with relapsed/refractory B-ALL who have failed/progressed after anti-CD19-CAR T 
therapy.,2023-11-19,2023-12-01,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/f5/9b/IANN_55_2230888.PMC10332179.pdf,pubmed_abstracts/IANN_55_2230888.PMC10332179.pdf -37417688,PMC10407019,10.1111/aogs.14620,1600-0412,Acta obstetricia et gynecologica Scandinavica,Ultrasound examination of the pelvic floor during active labor: A longitudinal cohort study.,2023-08-10,2023-09-01,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/6d/0e/AOGS-102-1203.PMC10407019.pdf,pubmed_abstracts/AOGS-102-1203.PMC10407019.pdf -37417691,PMC10771532,10.1111/jgs.18505,1532-5415,Journal of the American Geriatrics Society,Factors associated with preventable hospitalizations after hospice live discharge among Medicare patients with Alzheimer's disease and related dementias.,2023-11-15,2023-Nov-01,False,2024-11-01,,, -37417692,,,1470-8744,Biotechnology and applied biochemistry,Deciphering the role of fungus in degradation of polypropylene from hospital waste.,2023-12-10,2023-Dec-01,False,,,,pubmed_abstracts/37417692.txt -37417689,PMC10527499,10.1097/BRS.0000000000004769,1528-1159,Spine,Association of Neighborhood Socioeconomic Deprivation With Utilization and Costs of Anterior Cervical Discectomy and Fusion.,2023-10-03,2023-Sep-15,False,2024-09-15,,,pubmed_abstracts/37417689.txt -37417694,,,1532-5040,Physiotherapy theory and practice,Effect of a structured early mobilization protocol on the level of mobilization and muscle strength in critical care patients: A randomized clinical trial.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417694.txt -37417693,PMC10735286,10.1210/clinem/dgad401,1945-7197,The Journal of clinical endocrinology and metabolism,Pheochromocytomas Most Commonly Present As Adrenal Incidentalomas: A Large Tertiary Center Experience.,2023-07-07,2023-Jul-07,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/21/a2/PMC10735286.tar.gz, -37417696,,,1745-3682,Acta orthopaedica,A comparison of uncemented short versus standard stem length in total hip arthroplasty: results from the Dutch Arthroplasty Register.,2023-11-16,2023-07-07,False,,,,pubmed_abstracts/37417696.txt -37417695,,,1528-1159,Spine,Quality of Life and Postoperative Satisfaction in Patients with Benign Extramedullary Spinal Tumors: A Multicenter Study.,2023-08-28,2023-Sep-15,False,,,,pubmed_abstracts/37417695.txt -37417697,PMC10484187,10.1097/BRS.0000000000004731,1528-1159,Spine,"Directed Versus Nondirected Standing Postures in Adolescent Idiopathic Scoliosis: Its Impact on Curve Magnitude, Alignment, and Clinical Decision-Making.",2023-09-13,2023-Oct-01,True,,CC BY-NC-ND,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/f2/95/brs-48-1354.PMC10484187.pdf,pubmed_abstracts/brs-48-1354.PMC10484187.pdf -37417700,PMC10331372,10.1177/01410768231184381,1758-1095,Journal of the Royal Society of Medicine,From ,2023-07-18,2023-Jun-01,False,2026-06-01,,, -37417698,,,1751-3766,Journal of biological dynamics,Threshold dynamics of a stochastic mathematical model for ,2023-11-16,2023-Dec-01,False,,,,pubmed_abstracts/37417698.txt -37417702,PMC10331365,10.1177/01410768231184373,1758-1095,Journal of the Royal Society of Medicine,Is an independent NHS an impossible dream?,2023-07-18,2023-Jun-01,True,,,, -37417701,PMC10331368,10.1177/01410768231182836,1758-1095,Journal of the Royal Society of Medicine,Facilitating genetic testing after death: the ongoing duty of care to the deceased and their relatives.,2023-11-16,2023-Jun-01,False,2026-06-01,,, -37417704,,,1744-764X,Expert opinion on drug safety,Proton pump inhibitors use prior to 
COVID-19 hospitalization is associated with higher C,2023-07-10,2023-Jul-10,False,,,,pubmed_abstracts/37417704.txt -37417706,,,1945-7197,The Journal of clinical endocrinology and metabolism,Microvascular complications are associated with coronary collateralization in type 2 diabetes and chronic occlusion.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417706.txt -37417705,PMC10332216,10.1080/07853890.2023.2231847,1365-2060,Annals of medicine,Life quality among psoriasis patients based on Dermatology Life Quality Index evaluation and its association with psoriasis severity in China: a cross-sectional study.,2023-11-19,2023-12-01,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/09/33/IANN_55_2231847.PMC10332216.pdf,pubmed_abstracts/IANN_55_2231847.PMC10332216.pdf -37417709,,,1528-1159,Spine,Cannabis Use is Associated with Higher Rates of Pseudarthrosis Following TLIF: A Multi-Institutional Matched-Cohort Study.,2023-07-07,2023-Jul-03,False,,,,pubmed_abstracts/37417709.txt -37417707,,,1523-4681,Journal of bone and mineral research : the official journal of the American Society for Bone and Mineral Research,3D Finite Element Models Reconstructed From 2D Dual-Energy X-Ray Absorptiometry (DXA) Images Improve Hip Fracture Prediction Compared to Areal BMD in Osteoporotic Fractures in Men (MrOS) Sweden Cohort.,2023-09-26,2023-09-01,False,,,,pubmed_abstracts/37417707.txt -37417710,,,1612-1880,Chemistry & biodiversity,Synthesis and Evaluation of Novel Metacetamol Derivatives with Hydrazone Moiety as Anticancer and Antimicrobial Agents.,2023-08-24,2023-Aug-01,False,,,,pubmed_abstracts/37417710.txt -37417712,PMC10337823,10.1093/europace/euad189,1532-2092,"Europace : European pacing, arrhythmias, and cardiac electrophysiology : journal of the working groups on cardiac pacing, arrhythmias, and cardiac cellular electrophysiology of the European Society of Cardiology",Very-early symptomatic recurrence is associated with late recurrence after radiofrequency ablation of atrial fibrillation.,2023-08-22,2023-07-04,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/4e/85/euad189.PMC10337823.pdf,pubmed_abstracts/euad189.PMC10337823.pdf -37417711,,,1528-0691,"Chemical record (New York, N.Y.)",Supported Noble Metal Catalysts and Adsorbents with Soft Lewis Acid Functions.,2023-11-21,2023-Nov-01,False,,,,pubmed_abstracts/37417711.txt -37417713,PMC10719214,10.1093/cei/uxad072,1365-2249,Clinical and experimental immunology,Effects of mesenchymal stem cells on Treg cells in rats with colitis.,2023-12-13,2023-Dec-13,False,2024-07-07,,,pubmed_abstracts/37417713.txt -37417714,PMC10577628,10.1111/aogs.14626,1600-0412,Acta obstetricia et gynecologica Scandinavica,Double-vs single-balloon catheter for induction of labor: Systematic review and individual participant data meta-analysis.,2023-10-24,2023-11-01,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/af/36/PMC10577628.tar.gz,pubmed_abstracts/PMC10577628/AOGS-102-1440.pdf -37417715,PMC10508478,10.1002/vms3.1180,2053-1095,Veterinary medicine and science,"Evaluating the effects of direct-fed microbial supplementation on the performance, milk quality and fatty acid of mid-lactating dairy cows.",2023-09-21,2023-09-01,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/8c/db/VMS3-9-2212.PMC10508478.pdf,pubmed_abstracts/VMS3-9-2212.PMC10508478.pdf -37417716,,,1538-9774,"Computers, informatics, nursing : CIN",The Disruptive Impacts of Next Generation Generative Artificial Intelligence.,2023-11-28,2023-07-01,False,,,, 
-37417718,,,1559-4106,Biointerphases,Development of electronic sum frequency generation spectrophotometer to assess the buried interfaces.,2023-11-18,2023-07-01,False,,,,pubmed_abstracts/37417718.txt -37417719,,,1559-4106,Biointerphases,Theoretical study of electronic sum frequency generation spectroscopy to assess the buried interfaces.,2023-11-18,2023-07-01,False,,,,pubmed_abstracts/37417719.txt -37417708,PMC10524881,10.1097/AUD.0000000000001396,1538-4667,Ear and hearing,Association Between Adult-Onset Hearing Loss and Income: A Systematic Review.,2023-11-06,,False,2024-07-06,,,pubmed_abstracts/37417708.txt -37417720,,,1943-278X,Suicide & life-threatening behavior,When safe firearm storage isn't enough: Examining risk profiles among firearm suicide decedents.,2023-08-16,2023-08-01,False,,,,pubmed_abstracts/37417720.txt -37417722,,,1754-9485,Journal of medical imaging and radiation oncology,Percutaneous treatment of renal tumours.,2023-07-07,2023-Jul-07,False,,,,pubmed_abstracts/37417722.txt -37417721,PMC10332182,10.1080/07853890.2023.2233556,1365-2060,Annals of medicine,Ultrasound-guided injection acupotomy as a minimally invasive intervention therapy for cervical spondylotic radiculopathy: a randomized control trial.,2023-11-19,2023-12-01,True,,CC BY-NC,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/8f/eb/IANN_55_2233556.PMC10332182.pdf,pubmed_abstracts/IANN_55_2233556.PMC10332182.pdf -37417723,,,1528-1159,Spine,GSK-3β and β-Catenin Signaling Pathway is Involved in Myofibroblast Transition of Ligamentum Flavum in Lumbar Spinal Stenosis Patients.,2023-09-27,2023-Oct-15,False,,,,pubmed_abstracts/37417723.txt -37417724,,,1528-1159,Spine,Subclassification of Sanders Maturation Stage 3 Demonstrates Differences in Spine and Total Height Velocity Between 3A and 3B in Patients with Idiopathic Scoliosis.,2023-07-07,2023-Jul-06,False,,,,pubmed_abstracts/37417724.txt -37417725,,,1523-4681,Journal of bone and mineral research : the official journal of the American Society for Bone and Mineral Research,Efficacy and Safety of Transdermal Abaloparatide in Postmenopausal Women with Osteoporosis: A Randomized Study.,2023-10-26,2023-10-01,False,,,,pubmed_abstracts/37417725.txt -37417726,,,1521-4141,European journal of immunology,CKBA suppresses mast cell activation via ERK signaling pathway in murine atopic dermatitis.,2023-09-11,2023-09-01,False,,,,pubmed_abstracts/37417726.txt -37417728,,,1521-4095,"Advanced materials (Deerfield Beach, Fla.)",Reconstructed Hierarchically Structured Keratin Fibers with Shape-Memory Features Based on Reversible Secondary-Structure Transformation.,2023-10-23,2023-Oct-01,False,,,,pubmed_abstracts/37417728.txt -37417727,PMC10181040,10.3390/nu15092153,2072-6643,Nutrients,"Effectiveness of a Digitally Delivered Continuous Care Intervention (Defeat Diabetes) on Type 2 Diabetes Outcomes: A 12-Month Single-Arm, Pre-Post Intervention Study.",2023-07-19,2023-Apr-30,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f3/34/PMC10181040.tar.gz,pubmed_abstracts/PMC10181040/nutrients-15-02153.pdf -37417730,PMC10356134,10.7554/eLife.88310,2050-084X,eLife,Metformin regulates bone marrow stromal cells to accelerate bone healing in diabetic mice.,2023-07-21,2023-07-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/5e/82/PMC10356134.tar.gz,"pubmed_abstracts/PMC10356134/elife-88310-mdarchecklist1.pdf,pubmed_abstracts/PMC10356134/elife-88310.pdf" -37417729,PMC10508548,10.1002/vms3.1196,2053-1095,Veterinary medicine and science,Global prevalence of Neospora caninum in rodents: A 
systematic review and meta-analysis.,2023-09-21,2023-09-01,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/1d/b2/PMC10508548.tar.gz,pubmed_abstracts/PMC10508548/VMS3-9-2192.pdf -37417731,,,1520-6890,Chemical reviews,Enantioselective Transformations in the Synthesis of Therapeutic Agents.,2023-08-30,2023-08-09,False,,,,pubmed_abstracts/37417731.txt -37417732,,,1948-7185,The journal of physical chemistry letters,Reduction-Active Antisolvent: A Universal and Innovative Strategy of Further Ameliorating Additive Optimization for High Efficiency Perovskite Solar Cells.,2023-07-20,2023-Jul-20,False,,,,pubmed_abstracts/37417732.txt -37417734,PMC10328535,10.7554/eLife.86373,2050-084X,eLife,The Opto-inflammasome in zebrafish as a tool to study cell and tissue responses to speck formation and cell death.,2023-11-16,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/b9/9f/PMC10328535.tar.gz,pubmed_abstracts/PMC10328535/elife-86373.pdf -37417733,PMC10392983,10.7554/eLife.88058,2050-084X,eLife,Allosteric activation or inhibition of PI3Kγ mediated through conformational changes in the p110γ helical domain.,2023-11-16,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/91/25/PMC10392983.tar.gz,"pubmed_abstracts/PMC10392983/elife-88058.pdf,pubmed_abstracts/PMC10392983/elife-88058-mdarchecklist1.pdf" -37417737,PMC10731660,10.1021/acs.inorgchem.3c01620,1520-510X,Inorganic chemistry,Role of Pure Technetium Chemistry: Are There Still Links to Applications in Imaging?,2023-07-07,2023-Jul-07,True,,CC BY,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/14/5f/PMC10731660.tar.gz,pubmed_abstracts/PMC10731660/ic3c01620.pdf From 4338931746bf4e04b138d56319b62c99a4a5d38b Mon Sep 17 00:00:00 2001 From: star-nox Date: Tue, 14 May 2024 12:59:40 -0500 Subject: [PATCH 24/28] test comment --- ai_ta_backend/utils/pubmed_extraction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index 524c6e79..b5700174 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -417,6 +417,7 @@ def getArticleIDs(metadata: list): for i in range(0, len(metadata), batch_size): batch = metadata[i:i + batch_size] ids = ",".join([article['pmid'] for article in batch]) + # test comment try: response = requests.get(base_url + app_details + "&ids=" + ids) data = response.json() From 048f41de3503be289ca2b824ecf720096fe3e4c9 Mon Sep 17 00:00:00 2001 From: star-nox Date: Tue, 14 May 2024 13:00:53 -0500 Subject: [PATCH 25/28] print test comment --- ai_ta_backend/utils/pubmed_extraction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index b5700174..502f03f9 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -418,6 +418,7 @@ def getArticleIDs(metadata: list): batch = metadata[i:i + batch_size] ids = ",".join([article['pmid'] for article in batch]) # test comment + print("test comment") try: response = requests.get(base_url + app_details + "&ids=" + ids) data = response.json() From 472814ec9b122326b21cabab7b3cd20c65837026 Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Tue, 14 May 2024 11:16:53 -0700 Subject: [PATCH 26/28] Commented out prints for speed --- ai_ta_backend/utils/pubmed_extraction.py | 43 ++++++++++++------------ 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py 
b/ai_ta_backend/utils/pubmed_extraction.py index 502f03f9..a3dd3912 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -18,7 +18,6 @@ import json from functools import partial - SUPBASE_CLIENT = supabase.create_client( # type: ignore supabase_url=os.getenv('SUPABASE_URL'), # type: ignore supabase_key=os.getenv('SUPABASE_API_KEY') # type: ignore @@ -46,7 +45,7 @@ def extractPubmedData(): print("Processing file: ", file) gz_filepath = downloadXML(ftp_address, ftp_path, file, "pubmed") - print("GZ Downloaded: ", gz_filepath) + # print("GZ Downloaded: ", gz_filepath) print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") gz_file_download_time = time.time() @@ -54,7 +53,7 @@ def extractPubmedData(): if not gz_filepath: return "failure" xml_filepath = extractXMLFile(gz_filepath) - print("XML Extracted: ", xml_filepath) + # print("XML Extracted: ", xml_filepath) print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") #xml_filepath = "pubmed/pubmed24n1217.xml" @@ -68,7 +67,7 @@ def extractPubmedData(): # download the articles complete_metadata = downloadArticles(metadata_with_ids) - print(complete_metadata) + # print(complete_metadata) print("Time taken to download articles for 100 articles: ", round(time.time() - metadata_update_time, 2), "seconds") # store metadata in csv file @@ -141,7 +140,7 @@ def downloadXML(ftp_address: str, ftp_path: str, file: str, local_dir: str): with open(local_filepath, 'wb') as f: ftp.retrbinary('RETR ' + file, f.write) - print(f"Downloaded {file} to {local_filepath}") + # print(f"Downloaded {file} to {local_filepath}") ftp.quit() return local_filepath @@ -175,7 +174,7 @@ def getFileList(ftp_address: str, ftp_path: str, extension: str = ".gz"): # Filter for files with the specified extension gz_files = [entry for entry in file_listing if entry.endswith(extension)] gz_files.sort(reverse=True) - print(f"Found {len(gz_files)} files on {ftp_address}/{ftp_path}") + # print(f"Found {len(gz_files)} files on {ftp_address}/{ftp_path}") return gz_files except Exception as e: @@ -191,7 +190,7 @@ def extractXMLFile(gz_filepath: str): xml_filepath: Path to the extracted XML file. """ try: - print("Downloaded .gz file path: ", gz_filepath) + # print("Downloaded .gz file path: ", gz_filepath) xml_filepath = gz_filepath.replace(".gz", "") with gzip.open(gz_filepath, 'rb') as f_in: with open(xml_filepath, 'wb') as f_out: @@ -212,7 +211,7 @@ def extractMetadataFromXML(xml_filepath: str): Returns: metadata: List of dictionaries containing metadata for each article. 
""" - print("inside extractMetadataFromXML()") + # print("inside extractMetadataFromXML()") try: # create a directory to store abstracts os.makedirs("pubmed_abstracts", exist_ok=True) @@ -233,14 +232,14 @@ def extractMetadataFromXML(xml_filepath: str): metadata.append(article_data) if len(metadata) == 100: - print("collected 100 articles") + # print("collected 100 articles") yield metadata metadata = [] # reset metadata for next batch if metadata: yield metadata - print("Metadata extraction complete.") + # print("Metadata extraction complete.") except Exception as e: print("Error extracting metadata: ", e) return [] @@ -594,7 +593,7 @@ def downloadArticles(metadata: list): if article['pmid'] in updated_articles: article.update(updated_articles[article['pmid']]) - print("Updated metadata after download: ", metadata) + # print("Updated metadata after download: ", metadata) return metadata @@ -625,7 +624,7 @@ def download_article(article, api_url): if article['pmcid']: final_url = api_url + "id=" + article['pmcid'] - print("\nDownload URL: ", final_url) + # print("\nDownload URL: ", final_url) xml_response = requests.get(final_url) extracted_data = extractArticleData(xml_response.text) @@ -640,7 +639,7 @@ def download_article(article, api_url): ftp_url = urlparse(extracted_data[0]['href']) ftp_path = ftp_url.path[1:] - print("FTP path: ", ftp_path) + # print("FTP path: ", ftp_path) filename = ftp_path.split("/")[-1] local_file = os.path.join("pubmed_abstracts", filename) @@ -649,12 +648,12 @@ def download_article(article, api_url): with open(local_file, 'wb') as f: ftp.retrbinary('RETR ' + ftp_path, f.write) # Download directly to file - print("Downloaded FTP file: ", local_file) + # print("Downloaded FTP file: ", local_file) article['filepath'] = local_file if filename.endswith(".tar.gz"): extracted_pdf_paths = extractPDF(local_file) - print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) + # print("Extracted PDFs from .tar.gz file: ", extracted_pdf_paths) article['filepath'] = ",".join(extracted_pdf_paths) os.remove(local_file) @@ -663,7 +662,7 @@ def download_article(article, api_url): ftp.quit() - print("\nUpdated metadata after download: ", article) + # print("\nUpdated metadata after download: ", article) return article @@ -677,7 +676,7 @@ def extractPDF(tar_gz_filepath: str): extracted_paths: List of paths to the extracted PDF files. """ try: - print("Extracting PDF from: ", tar_gz_filepath) + # print("Extracting PDF from: ", tar_gz_filepath) extracted_paths = [] with tarfile.open(tar_gz_filepath, "r:gz") as tar: for member in tar: @@ -700,7 +699,7 @@ def extractArticleData(xml_string: str): Returns: extracted_data: List of dictionaries containing license and download link for the article. """ - print("In extractArticleData") + # print("In extractArticleData") try: root = ET.fromstring(xml_string) # if there is an errors (article not open-access), return empty list (skip article) @@ -741,7 +740,7 @@ def upload_file(client, bucket_name, file_path, object_name): """ try: client.fput_object(bucket_name, object_name, file_path) - print(f"Uploaded: {object_name}") + # print(f"Uploaded: {object_name}") except Exception as e: print(f"Error uploading {object_name}: {e}") @@ -749,7 +748,7 @@ def uploadToStorage(filepath: str): """ Uploads all files present under given filepath to Minio bucket in parallel. 
""" - print("in uploadToStorage()") + # print("in uploadToStorage()") try: bucket_name = "pubmed" @@ -757,8 +756,8 @@ def uploadToStorage(filepath: str): if not found: MINIO_CLIENT.make_bucket(bucket_name) print("Created bucket", bucket_name) - else: - print("Bucket", bucket_name, "already exists") + # else: + # print("Bucket", bucket_name, "already exists") # Get all files to upload files = [] From 880ae975acc83ecd5a758f7aa23a62ae76ef173f Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Tue, 14 May 2024 11:17:27 -0700 Subject: [PATCH 27/28] Commented out prints for speed --- ai_ta_backend/utils/pubmed_extraction.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index a3dd3912..92e0461e 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -18,6 +18,7 @@ import json from functools import partial + SUPBASE_CLIENT = supabase.create_client( # type: ignore supabase_url=os.getenv('SUPABASE_URL'), # type: ignore supabase_key=os.getenv('SUPABASE_API_KEY') # type: ignore @@ -39,13 +40,12 @@ def extractPubmedData(): ftp_path = "pubmed/baseline" file_list = getFileList(ftp_address, ftp_path, ".gz") - for file in file_list[22:23]: try: print("Processing file: ", file) gz_filepath = downloadXML(ftp_address, ftp_path, file, "pubmed") - # print("GZ Downloaded: ", gz_filepath) + print("GZ Downloaded: ", gz_filepath) print("Time taken to download .gz file: ", round(time.time() - start_time, 2), "seconds") gz_file_download_time = time.time() @@ -53,7 +53,7 @@ def extractPubmedData(): if not gz_filepath: return "failure" xml_filepath = extractXMLFile(gz_filepath) - # print("XML Extracted: ", xml_filepath) + print("XML Extracted: ", xml_filepath) print("Time taken to extract XML file: ", round(time.time() - gz_file_download_time, 2), "seconds") #xml_filepath = "pubmed/pubmed24n1217.xml" @@ -174,7 +174,7 @@ def getFileList(ftp_address: str, ftp_path: str, extension: str = ".gz"): # Filter for files with the specified extension gz_files = [entry for entry in file_listing if entry.endswith(extension)] gz_files.sort(reverse=True) - # print(f"Found {len(gz_files)} files on {ftp_address}/{ftp_path}") + print(f"Found {len(gz_files)} files on {ftp_address}/{ftp_path}") return gz_files except Exception as e: @@ -406,7 +406,7 @@ def getArticleIDs(metadata: list): Returns: metadata: Updated metadata with PMCID, DOI, release date, and live status information. """ - print("In getArticleIDs()") +# print("In getArticleIDs()") base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/" app_details = "?tool=ncsa_uiuc&email=caiincsa@gmail.com&format=json" @@ -416,8 +416,6 @@ def getArticleIDs(metadata: list): for i in range(0, len(metadata), batch_size): batch = metadata[i:i + batch_size] ids = ",".join([article['pmid'] for article in batch]) - # test comment - print("test comment") try: response = requests.get(base_url + app_details + "&ids=" + ids) data = response.json() @@ -568,7 +566,7 @@ def downloadArticles(metadata: list): Returns: metadata: Updated metadata with license, FTP link, and downloaded filepath information. """ - print("In downloadArticles()") + # print("In downloadArticles()") try: base_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?" 
@@ -580,11 +578,11 @@ def downloadArticles(metadata: list): futures = [executor.submit(download_article_partial, article) for article in metadata] for future in concurrent.futures.as_completed(futures): try: - print("Starting new download...") + # print("Starting new download...") updated_article = future.result(timeout=15*60) # Check result without blocking if updated_article: updated_articles[updated_article['pmid']] = updated_article - print("Updated article: ", updated_article) + # print("Updated article: ", updated_article) except Exception as e: print("Error downloading article:", e) @@ -613,7 +611,7 @@ def download_article(article, api_url): article: Updated metadata for the article. """ - print("Downloading articles...") + # print("Downloading articles...") if not article['live'] or article['pmcid'] is None: return @@ -628,7 +626,7 @@ def download_article(article, api_url): xml_response = requests.get(final_url) extracted_data = extractArticleData(xml_response.text) - print("Extracted license and link data: ", extracted_data) + # print("Extracted license and link data: ", extracted_data) if not extracted_data: article['live'] = False From 304ec5d62f08f9e2be3fdb00f2892ec6c6197f34 Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 15 May 2024 11:52:01 -0500 Subject: [PATCH 28/28] parallelized main for loop and added xml filename column --- ai_ta_backend/utils/pubmed_extraction.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ai_ta_backend/utils/pubmed_extraction.py b/ai_ta_backend/utils/pubmed_extraction.py index d5806d6a..0008147e 100644 --- a/ai_ta_backend/utils/pubmed_extraction.py +++ b/ai_ta_backend/utils/pubmed_extraction.py @@ -41,7 +41,7 @@ def extractPubmedData(): file_list = getFileList(ftp_address, ftp_path, ".gz") with concurrent.futures.ProcessPoolExecutor() as executor: - futures = [executor.submit(processPubmedXML, file, ftp_address, ftp_path) for file in file_list[21:22]] + futures = [executor.submit(processPubmedXML, file, ftp_address, ftp_path) for file in file_list[32:33]] for future in concurrent.futures.as_completed(futures): try: future.result() @@ -95,13 +95,16 @@ def processPubmedXML(file:str, ftp_address:str, ftp_path:str): print("Total articles retrieved: ", len(complete_metadata)) df = pd.DataFrame(complete_metadata) + # add a column for the XML file path + df['xml_filename'] = os.path.basename(xml_filepath) + if os.path.isfile(csv_filepath): df.to_csv(csv_filepath, mode='a', header=False, index=False) else: df.to_csv(csv_filepath, index=False) print("Time taken to extract metadata for 100 articles: ", round(time.time() - metadata_extract_start_time, 2), "seconds") - + exit() print("Time taken to download articles: ", round(time.time() - start_time, 2), "seconds") print("Total metadata extracted: ", len(complete_metadata))
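
Note on the pattern PATCH 28 introduces: the per-file work (processPubmedXML, which downloads the .gz file, extracts the XML, parses metadata, and writes the CSV) is fanned out across worker processes with concurrent.futures.ProcessPoolExecutor, and each future's result is read inside its own try/except so a failure on one baseline file cannot abort the whole batch. A minimal, self-contained sketch of that dispatch shape follows; process_one_file, process_all, the max_workers cap, and the __main__ guard are illustrative additions for this sketch, not code from this PR.

import concurrent.futures

def process_one_file(filename):
    # Illustrative stand-in for the PR's processPubmedXML worker:
    # download the .gz file, extract the XML, parse metadata, upload results.
    return f"done: {filename}"

def process_all(file_list, max_workers=4):
    # Fan each file out to its own worker process and collect results
    # as they complete, in whatever order they finish.
    results = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {executor.submit(process_one_file, f): f for f in file_list}
        for future in concurrent.futures.as_completed(future_to_file):
            filename = future_to_file[future]
            try:
                # Per-future try/except mirrors the loop in extractPubmedData():
                # one failed file logs an error instead of killing the batch.
                results.append(future.result())
            except Exception as e:
                print(f"Error processing {filename}: {e}")
    return results

if __name__ == "__main__":
    # The __main__ guard is required for ProcessPoolExecutor under the
    # spawn start method (the default on macOS and Windows), since worker
    # processes re-import this module.
    print(process_all(["pubmed24n1219.xml.gz", "pubmed24n1218.xml.gz"]))

Capping max_workers is a deliberate choice in this sketch: each worker opens its own FTP connection to ftp.ncbi.nlm.nih.gov, so an unbounded pool risks throttling or dropped connections from the NCBI host.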