updated pubmed eutils function
star-nox committed Mar 8, 2024
1 parent 79001e4 commit 064d4d0
Showing 1 changed file with 29 additions and 31 deletions.
60 changes: 29 additions & 31 deletions ai_ta_backend/journal_ingest.py
@@ -631,7 +631,7 @@ def searchPubmedArticlesWithEutils(course: str, search: str, title: str, journal
This function is used for a text-based search in PubMed using the E-Utilities API.
Args:
course: course name
query: search query
search: search query
title: article title
journal: journal title
"""
@@ -640,7 +640,7 @@ def searchPubmedArticlesWithEutils(course: str, search: str, title: str, journal
os.makedirs(directory)

base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
database = "db=pmc"
database = "db=pubmed"
final_query = "term="

title_query = journal_query = search_query = ""
@@ -661,45 +661,43 @@ def searchPubmedArticlesWithEutils(course: str, search: str, title: str, journal
final_url = base_url + database + "&" + final_query + "&retmode=json&retmax=100"
print("Final URL: ", final_url)
response = requests.get(final_url)
data = response.json()

print(data)
if response.status_code != 200:
return "Error: " + str(response.status_code) + " - " + response.text

data = response.json()

total_records = int(data['esearchresult']['count'])
current_records = len(data['esearchresult']['idlist'])
id_list = data['esearchresult']['idlist']
current_records = 0

print("Total Records: ", total_records)
print("Current Records: ", current_records)
print("ID List: ", id_list)

while current_records < total_records:
# extract ID and convert them to PMC ID
id_list = data['esearchresult']['idlist']
print("Number of records in current page: ", len(id_list))
id_str = ",".join(id_list)
current_pmc_ids = pubmed_id_converter(id_str)
print("Number of PMC IDs: ", len(current_pmc_ids))

# call pubmed download here - parallel processing
with concurrent.futures.ProcessPoolExecutor() as executor:
results = [executor.submit(downloadPubmedArticles, id, course) for id in current_pmc_ids]

# update current records count
current_records += len(id_list)

# if next page exists, update next page url and call the API again
retstart = current_records
final_url = base_url + database + "&" + final_query + "&retmode=json&retmax=100&retstart=" + str(retstart)
print("Final URL: ", final_url)
response = requests.get(final_url)
next_page_url = base_url + database + "&" + final_query + "&retmode=json&retmax=100&retstart=" + str(retstart)
print("Next page URL: ", next_page_url)
response = requests.get(next_page_url)
if response.status_code != 200:
return "Error in next page: " + str(response.status_code) + " - " + response.text
data = response.json()

current_ids = data['esearchresult']['idlist']
id_list += current_ids
current_records += len(current_ids)
print("Current Records: ", current_records)

id_str = ",".join(id_list)
current_pmc_ids = pubmed_id_converter(id_str)

# call pubmed download here
for pmc_id in current_pmc_ids:
downloadPubmedArticles(id=pmc_id, course_name=course)

id_str = ",".join(id_list)
current_pmc_ids = pubmed_id_converter(id_str)
print("Current PMC IDs: ", current_pmc_ids)
# upload to supabase bucket

# call pubmed download here
for pmc_id in current_pmc_ids:
downloadPubmedArticles(id=pmc_id, course_name=course)

return "success"


@@ -714,7 +712,7 @@ def pubmed_id_converter(id: str):
url = base_url + app_details + "&ids=" + id

response = requests.get(url)
print("Response: ", response.text)
#print("Response: ", response.text)
root = ET.fromstring(response.text)
records = root.findall(".//record")
for record in records:
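The rewritten loop pages through ESearch results with retstart, accumulating IDs until the reported count is reached, and only then converts and downloads them. A minimal standalone sketch of that paging pattern, assuming the same esearch.fcgi endpoint and parameters that appear in the diff (the example query is a placeholder, not from the repository):

import requests

ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

def fetch_all_pubmed_ids(term, page_size=100):
    """Collect every PubMed ID matching a query, paging with retstart."""
    ids = []
    retstart = 0
    while True:
        params = {
            "db": "pubmed",
            "term": term,
            "retmode": "json",
            "retmax": page_size,
            "retstart": retstart,
        }
        resp = requests.get(ESEARCH_URL, params=params)
        resp.raise_for_status()  # same idea as the status-code checks the commit adds
        result = resp.json()["esearchresult"]
        ids.extend(result["idlist"])
        retstart += len(result["idlist"])
        if not result["idlist"] or retstart >= int(result["count"]):
            break
    return ids

# e.g. fetch_all_pubmed_ids("CRISPR[Title]") -> list of PubMed ID strings (hypothetical query)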

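pubmed_id_converter builds a request with an ids= parameter and parses <record> elements out of an XML reply, which matches the NCBI ID Converter service. A rough sketch of that conversion under that assumption; the tool and email values are placeholders, and the pmcid attribute is how the converter labels a mapped ID:

import requests
import xml.etree.ElementTree as ET

IDCONV_URL = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"

def pmids_to_pmcids(pmids):
    """Map PubMed IDs to PMC IDs; records without a PMC ID are skipped."""
    params = {
        "tool": "example-tool",       # placeholder app details
        "email": "user@example.org",  # placeholder contact address
        "ids": ",".join(pmids),
    }
    resp = requests.get(IDCONV_URL, params=params)
    resp.raise_for_status()
    root = ET.fromstring(resp.text)
    return [rec.get("pmcid") for rec in root.findall(".//record") if rec.get("pmcid")]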