From 5bb0a1ec688260c5cb7163d827339bf4dd2f2190 Mon Sep 17 00:00:00 2001
From: star-nox <dabholkar.asmita@gmail.com>
Date: Tue, 23 Apr 2024 14:02:11 -0500
Subject: [PATCH 01/18] added Springer open-access fulltext download endpoint

---
 ai_ta_backend/main.py             |  27 ++++++
 ai_ta_backend/utils/pub_ingest.py | 154 ++++++++++++++++++++++++++++++
 2 files changed, 181 insertions(+)
 create mode 100644 ai_ta_backend/utils/pub_ingest.py

diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py
index c70466c0..a563fb3c 100644
--- a/ai_ta_backend/main.py
+++ b/ai_ta_backend/main.py
@@ -40,6 +40,7 @@
 from ai_ta_backend.service.sentry_service import SentryService
 
 from ai_ta_backend.beam.nomic_logging import create_document_map
+from ai_ta_backend.utils.pub_ingest import downloadSpringerFulltext
 
 app = Flask(__name__)
 CORS(app)
@@ -382,6 +383,32 @@ def getTopContextsWithMQR(service: RetrievalService, posthog_service: PosthogSer
   return response
 
 
+@app.route('/get-springer-fulltext', methods=['GET'])
+def get_springer_data():
+  course_name: str = request.args.get('course_name', default='', type=str)
+  issn = request.args.get('issn', default='', type=str)
+  subject = request.args.get('subject', default='', type=str)
+  journal = request.args.get('journal', default='', type=str)
+  title = request.args.get('title', default='', type=str)
+  doi = request.args.get('doi', default='', type=str)
+
+  print("In /get-springer-fulltext")
+
+  if (issn == '' and subject == '' and journal == '' and title == '' and doi == '') or course_name == '':
+    # proper web error "400 Bad request"
+    abort(
+        400,
+        description=
+        f"Missing required parameters: 'issn' or 'subject' or 'title' or 'journal' or 'doi' and 'course_name' must be provided."
+    )
+
+  fulltext = downloadSpringerFulltext(issn, subject, journal, title, doi, course_name)
+
+  response = jsonify(fulltext)
+  response.headers.add('Access-Control-Allow-Origin', '*')
+  return response
+
+
 def configure(binder: Binder) -> None:
   binder.bind(RetrievalService, to=RetrievalService, scope=RequestScope)
   binder.bind(PosthogService, to=PosthogService, scope=SingletonScope)
diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py
new file mode 100644
index 00000000..ed7a9813
--- /dev/null
+++ b/ai_ta_backend/utils/pub_ingest.py
@@ -0,0 +1,154 @@
+import os
+import shutil
+import requests
+import json
+import arxiv
+import crossref_commons.retrieval
+import xml.etree.ElementTree as ET
+import ftplib
+from urllib.parse import urlparse
+import urllib.parse
+import supabase
+import tarfile
+import concurrent.futures
+import time
+
+SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY')
+
+SUPABASE_CLIENT = supabase.create_client(  # type: ignore
+      supabase_url=os.getenv('SUPABASE_URL'),  # type: ignore
+      supabase_key=os.getenv('SUPABASE_API_KEY'))  # type: ignore
+
+
+def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, doi=None, course_name=None):
+    """
+    This function uses the Springer Nature API to download openaccess journal articles.
+    Args:
+        issn: limit to ISSN number of the journal/book
+        subject: limit articles to a specific subject - Chemistry, Physics, etc.
+        journal: limit to keywords occuring in journal title
+        title: limit to keywords occuring in article title
+    The initial API response returns a list of articles with metadata.
+    
+    """
+    # create directory to store files
+    directory = os.path.join(os.getcwd(), 'springer_papers')
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+
+    # set headers
+    api_url = "http://api.springernature.com/openaccess/json?q="
+    headers = {'Accept': 'application/json'}
+
+    # form the query URL based on the input parameters received
+    if doi:
+        query_str = "doi:" + doi
+    elif issn:
+        query_str = "issn:" + issn
+    elif journal:   
+        journal = "%22" + journal.replace(" ", "%20") + "%22"
+        query_str = "journal:" + journal
+    elif title:
+        title = "%22" + title.replace(" ", "%20") + "%22"
+        query_str = "title:" + title
+    elif subject:
+        query_str = "subject:" + subject
+    else:
+        return "No query parameters provided"
+    
+    main_url = api_url + query_str + "&api_key=" + str(SPRINGER_API_KEY)
+    print("Full URL: ", main_url)
+    
+    response = requests.get(main_url, headers=headers)
+    print("Status: ", response.status_code)
+
+    if response.status_code != 200:
+        return "Error: " + str(response.status_code) + " - " + response.text
+
+    data = response.json()
+    # check for total number of records 
+    total_records = int(data['result'][0]['total'])
+    print("Total records: ", total_records)
+    current_records = 0
+
+    while current_records < total_records:
+        # check if nextPage exists
+        if 'nextPage' in data:
+            next_page_url = "http://api.springernature.com" + data['nextPage']
+        else:
+            next_page_url = None
+
+        # multi-process all records in current page
+        with concurrent.futures.ProcessPoolExecutor() as executor:
+            results = [executor.submit(downloadPDFSpringer, record, directory) for record in data['records']]
+            for f in concurrent.futures.as_completed(results):
+                print(f.result())
+
+        # update current records count
+        current_records += int(len(data['records']))
+
+        # if next page exists, update next page url and call the API again
+        if next_page_url:
+            # API key is already present in the URL
+            response = requests.get(next_page_url, headers=headers)
+            if response.status_code != 200:
+                return "Error in next page: " + str(response.status_code) + " - " + response.text
+            
+            data = response.json()
+
+    # call ingest function here
+    
+
+    # call document groups API       
+    
+
+    # # Delete files from local directory
+    # shutil.rmtree(directory)
+                                
+    return "success"
+
+def downloadPDFSpringer(record: dict, directory: str):
+    """
+    This function takes a record from the Springer API response and downloads the PDF file.
+    It is called in a multi-process loop in downloadSpringerFulltext().
+    Args:
+        record: dictionary containing DOI and other metadata
+        directory: local directory to save the files
+    """
+    headers = {'Accept': 'application/json'}
+
+    if len(record['url']) < 1:
+        return "No download link found for DOI: " + record['doi']
+
+    # extract URL
+    url = record['url'][0]['value'] + "?api_key=" + str(SPRINGER_API_KEY)
+    url_response = requests.get(url, headers=headers)
+    if url_response.status_code != 200:
+        return "Error in accessing article link: " + str(url_response.status_code) + " - " + url_response.text
+    url_data = url_response.json()
+
+    # extract PDF link
+    pdf_link = None
+    links = url_data['link']
+    for link in links:
+        if link['content-type'] == 'application/pdf' and link['intended-application'] == 'text-mining':
+            pdf_link = link['URL']
+            #print("PDF Link: ", pdf_link)
+            break
+    if not pdf_link:
+        return "No PDF link found for DOI: " + record['doi']
+    
+    # download PDF
+    filename = record['doi'].replace("/", "_")
+    try:
+        response = requests.get(pdf_link)
+        if response.status_code != 200:
+            return "Error in downloading PDF: " + str(response.status_code) + " - " + response.text
+        
+        with open(directory + "/" + filename + ".pdf", "wb") as f:  # Open a file in binary write mode ("wb")
+            for chunk in response.iter_content(chunk_size=1024):  # Download in chunks
+                f.write(chunk)
+        print("Downloaded: ", filename)
+        return "success"
+    except Exception as e:
+        return "Error in downloading PDF: " + str(e)

From 970703be9c7978a7962b04de5c58fce6e4d8f8ac Mon Sep 17 00:00:00 2001
From: star-nox <dabholkar.asmita@gmail.com>
Date: Fri, 26 Apr 2024 15:44:32 -0500
Subject: [PATCH 02/18] added beam ingest

---
 ai_ta_backend/database/sql.py     |  4 +-
 ai_ta_backend/utils/pub_ingest.py | 69 +++++++++++++++++++++++--------
 2 files changed, 55 insertions(+), 18 deletions(-)

diff --git a/ai_ta_backend/database/sql.py b/ai_ta_backend/database/sql.py
index caf0ac51..cbee1a66 100644
--- a/ai_ta_backend/database/sql.py
+++ b/ai_ta_backend/database/sql.py
@@ -11,7 +11,7 @@ def __init__(self):
     # Create a Supabase client
     self.supabase_client = supabase.create_client(  # type: ignore
         supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY'])
-
+    
   def getAllMaterialsForCourse(self, course_name: str):
     return self.supabase_client.table(
         os.environ['SUPABASE_DOCUMENTS_TABLE']).select('course_name, s3_path, readable_filename, url, base_url').eq(
@@ -110,3 +110,5 @@ def updateProjects(self, course_name: str, data: dict):
   def getConversation(self, course_name: str, key: str, value: str):
     return self.supabase_client.table("llm-convo-monitor").select("*").eq(key, value).eq("course_name", course_name).execute()
   
+  def getCourseDocumentByS3Path(self, course_name: str, s3_path: str):
+    return self.supabase_client.table("documents").select("id, course_name, readable_filename, url, base_url, s3_path, created_at").eq("course_name", course_name).eq("s3_path", s3_path).execute()
\ No newline at end of file
diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py
index ed7a9813..9651cfae 100644
--- a/ai_ta_backend/utils/pub_ingest.py
+++ b/ai_ta_backend/utils/pub_ingest.py
@@ -1,23 +1,20 @@
 import os
+import json
+import pandas as pd
 import shutil
 import requests
-import json
-import arxiv
-import crossref_commons.retrieval
+import supabase
 import xml.etree.ElementTree as ET
-import ftplib
 from urllib.parse import urlparse
-import urllib.parse
-import supabase
-import tarfile
 import concurrent.futures
-import time
+from ai_ta_backend.database import aws, sql
 
 SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY')
 
-SUPABASE_CLIENT = supabase.create_client(  # type: ignore
-      supabase_url=os.getenv('SUPABASE_URL'),  # type: ignore
-      supabase_key=os.getenv('SUPABASE_API_KEY'))  # type: ignore
+s3_client = aws.AWSStorage()
+aws_bucket = os.getenv('S3_BUCKET_NAME')
+supabase_client = supabase.create_client(  # type: ignore
+      supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY'])
 
 
 def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, doi=None, course_name=None):
@@ -31,6 +28,7 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None,
     The initial API response returns a list of articles with metadata.
     
     """
+    print("in downloadSpringerFulltext")
     # create directory to store files
     directory = os.path.join(os.getcwd(), 'springer_papers')
     if not os.path.exists(directory):
@@ -96,14 +94,51 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None,
             
             data = response.json()
 
-    # call ingest function here
-    
-
-    # call document groups API       
+    print("Course name: ", course_name)
+    # prep payload for beam ingest
+    ingest_data = []
+
+    # upload files to S3 bucket
+    for file in os.listdir(directory):
+        data = {
+            "course_name": course_name,
+            "s3_paths": "",
+            "readable_filename": "",
+            "base_url": "",
+            "url": "",
+            "issn": issn
+        }
+        s3_path = "courses/" + course_name + "/" + file # type: ignore
+        data["s3_paths"] = s3_path
+        data["readable_filename"] = file
+        s3_client.upload_file(directory + "/" + file, aws_bucket, s3_path)  # type: ignore
+        ingest_data.append(data)
     
+    # save ingest data to csv
+    ingest_df = pd.DataFrame(ingest_data)
+    csv_file = "publications_data.csv"
+    if not os.path.exists(csv_file):
+        ingest_df.to_csv(csv_file, index=False)
+    else:
+        ingest_df.to_csv(csv_file, mode='a', header=False, index=False)
+
+
+    # call ingest
+    beam_url = "https://41kgx.apps.beam.cloud"
+    headers = {
+    "Content-Type": "application/json",
+    "Authorization": "Basic " + os.getenv('BEAM_AUTH_TOKEN')    # type: ignore
+    }
+    for data in ingest_data:
+        payload = json.dumps(data)
+        response = requests.post(beam_url, headers=headers, data=payload)
+        if response.status_code == 200:
+            print("Task status retrieved successfully!")
+        else:
+            print(f"Error: {response.status_code}. {response.text}")
 
-    # # Delete files from local directory
-    # shutil.rmtree(directory)
+    # Delete files from local directory
+    shutil.rmtree(directory)
                                 
     return "success"
 

From 042264c2471b5b75a14891f9582ffdbbcf43c6a4 Mon Sep 17 00:00:00 2001
From: star-nox <dabholkar.asmita@gmail.com>
Date: Sun, 28 Apr 2024 17:19:39 -0500
Subject: [PATCH 03/18] commented out Springer download loop, added group to ingest payload

---
 ai_ta_backend/utils/pub_ingest.py | 119 ++++++++++++++++--------------
 1 file changed, 62 insertions(+), 57 deletions(-)

diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py
index 9651cfae..f77fdc9b 100644
--- a/ai_ta_backend/utils/pub_ingest.py
+++ b/ai_ta_backend/utils/pub_ingest.py
@@ -35,64 +35,66 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None,
         os.makedirs(directory)
 
     # set headers
-    api_url = "http://api.springernature.com/openaccess/json?q="
-    headers = {'Accept': 'application/json'}
-
-    # form the query URL based on the input parameters received
-    if doi:
-        query_str = "doi:" + doi
-    elif issn:
-        query_str = "issn:" + issn
-    elif journal:   
-        journal = "%22" + journal.replace(" ", "%20") + "%22"
-        query_str = "journal:" + journal
-    elif title:
-        title = "%22" + title.replace(" ", "%20") + "%22"
-        query_str = "title:" + title
-    elif subject:
-        query_str = "subject:" + subject
-    else:
-        return "No query parameters provided"
+    # api_url = "http://api.springernature.com/openaccess/json?q="
+    # headers = {'Accept': 'application/json'}
+
+    # # form the query URL based on the input parameters received
+    # if doi:
+    #     query_str = "doi:" + doi
+    # elif issn:
+    #     query_str = "issn:" + issn
+    # elif journal:   
+    #     journal = "%22" + journal.replace(" ", "%20") + "%22"
+    #     query_str = "journal:" + journal
+    # elif title:
+    #     title = "%22" + title.replace(" ", "%20") + "%22"
+    #     query_str = "title:" + title
+    # elif subject:
+    #     query_str = "subject:" + subject
+    # else:
+    #     return "No query parameters provided"
     
-    main_url = api_url + query_str + "&api_key=" + str(SPRINGER_API_KEY)
-    print("Full URL: ", main_url)
+    # main_url = api_url + query_str + "&api_key=" + str(SPRINGER_API_KEY)
+    # print("Full URL: ", main_url)
     
-    response = requests.get(main_url, headers=headers)
-    print("Status: ", response.status_code)
-
-    if response.status_code != 200:
-        return "Error: " + str(response.status_code) + " - " + response.text
-
-    data = response.json()
-    # check for total number of records 
-    total_records = int(data['result'][0]['total'])
-    print("Total records: ", total_records)
-    current_records = 0
-
-    while current_records < total_records:
-        # check if nextPage exists
-        if 'nextPage' in data:
-            next_page_url = "http://api.springernature.com" + data['nextPage']
-        else:
-            next_page_url = None
-
-        # multi-process all records in current page
-        with concurrent.futures.ProcessPoolExecutor() as executor:
-            results = [executor.submit(downloadPDFSpringer, record, directory) for record in data['records']]
-            for f in concurrent.futures.as_completed(results):
-                print(f.result())
-
-        # update current records count
-        current_records += int(len(data['records']))
-
-        # if next page exists, update next page url and call the API again
-        if next_page_url:
-            # API key is already present in the URL
-            response = requests.get(next_page_url, headers=headers)
-            if response.status_code != 200:
-                return "Error in next page: " + str(response.status_code) + " - " + response.text
-            
-            data = response.json()
+    # response = requests.get(main_url, headers=headers)
+    # print("Status: ", response.status_code)
+
+    # if response.status_code != 200:
+    #     return "Error: " + str(response.status_code) + " - " + response.text
+
+    # data = response.json()
+    # # check for total number of records 
+    # total_records = int(data['result'][0]['total'])
+    # print("Total records: ", total_records)
+    # current_records = 0
+    # while current_records < total_records:
+    #     # check if nextPage exists
+    #     try:
+    #         if 'nextPage' in data:
+    #             next_page_url = "http://api.springernature.com" + data['nextPage']
+    #         else:
+    #             next_page_url = None
+
+    #         # multi-process all records in current page
+    #         with concurrent.futures.ProcessPoolExecutor() as executor:
+    #             results = [executor.submit(downloadPDFSpringer, record, directory) for record in data['records']]
+    #             for f in concurrent.futures.as_completed(results):
+    #                 print(f.result())
+
+    #         # update current records count
+    #         current_records += int(len(data['records']))
+
+    #         # if next page exists, update next page url and call the API again
+    #         if next_page_url:
+    #             # API key is already present in the URL
+    #             response = requests.get(next_page_url, headers=headers)
+    #             if response.status_code != 200:
+    #                 return "Error in next page: " + str(response.status_code) + " - " + response.text
+                
+    #             data = response.json()
+    #     except Exception as e:
+    #         print(e)
 
     print("Course name: ", course_name)
     # prep payload for beam ingest
@@ -102,11 +104,12 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None,
     for file in os.listdir(directory):
         data = {
             "course_name": course_name,
+            "group": "springer_open",
             "s3_paths": "",
             "readable_filename": "",
             "base_url": "",
             "url": "",
-            "issn": issn
+            "journal": "rice",
         }
         s3_path = "courses/" + course_name + "/" + file # type: ignore
         data["s3_paths"] = s3_path
@@ -175,6 +178,8 @@ def downloadPDFSpringer(record: dict, directory: str):
     
     # download PDF
     filename = record['doi'].replace("/", "_")
+    if filename in ['10.1186_2196-5641-1-1', '10.1186_s40538-014-0009-x']:
+        return "Skipping: " + filename
     try:
         response = requests.get(pdf_link)
         if response.status_code != 200:

From 5d2d7a25c51a7fcdcb9eb6063d9f9dcc5add9882 Mon Sep 17 00:00:00 2001
From: star-nox <dabholkar.asmita@gmail.com>
Date: Mon, 6 May 2024 15:30:43 -0500
Subject: [PATCH 04/18] added doi link in metadata

---
 ai_ta_backend/utils/pub_ingest.py | 131 +++++++++++++++---------------
 1 file changed, 66 insertions(+), 65 deletions(-)

diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py
index f77fdc9b..b2668702 100644
--- a/ai_ta_backend/utils/pub_ingest.py
+++ b/ai_ta_backend/utils/pub_ingest.py
@@ -35,85 +35,86 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None,
         os.makedirs(directory)
 
     # set headers
-    # api_url = "http://api.springernature.com/openaccess/json?q="
-    # headers = {'Accept': 'application/json'}
-
-    # # form the query URL based on the input parameters received
-    # if doi:
-    #     query_str = "doi:" + doi
-    # elif issn:
-    #     query_str = "issn:" + issn
-    # elif journal:   
-    #     journal = "%22" + journal.replace(" ", "%20") + "%22"
-    #     query_str = "journal:" + journal
-    # elif title:
-    #     title = "%22" + title.replace(" ", "%20") + "%22"
-    #     query_str = "title:" + title
-    # elif subject:
-    #     query_str = "subject:" + subject
-    # else:
-    #     return "No query parameters provided"
+    api_url = "http://api.springernature.com/openaccess/json?q="
+    headers = {'Accept': 'application/json'}
+
+    # form the query URL based on the input parameters received
+    if doi:
+        query_str = "doi:" + doi
+    elif issn:
+        query_str = "issn:" + issn
+    elif journal:   
+        journal = "%22" + journal.replace(" ", "%20") + "%22"
+        query_str = "journal:" + journal
+    elif title:
+        title = "%22" + title.replace(" ", "%20") + "%22"
+        query_str = "title:" + title
+    elif subject:
+        query_str = "subject:" + subject
+    else:
+        return "No query parameters provided"
     
-    # main_url = api_url + query_str + "&api_key=" + str(SPRINGER_API_KEY)
-    # print("Full URL: ", main_url)
+    main_url = api_url + query_str + "&api_key=" + str(SPRINGER_API_KEY)
+    print("Full URL: ", main_url)
     
-    # response = requests.get(main_url, headers=headers)
-    # print("Status: ", response.status_code)
-
-    # if response.status_code != 200:
-    #     return "Error: " + str(response.status_code) + " - " + response.text
-
-    # data = response.json()
-    # # check for total number of records 
-    # total_records = int(data['result'][0]['total'])
-    # print("Total records: ", total_records)
-    # current_records = 0
-    # while current_records < total_records:
-    #     # check if nextPage exists
-    #     try:
-    #         if 'nextPage' in data:
-    #             next_page_url = "http://api.springernature.com" + data['nextPage']
-    #         else:
-    #             next_page_url = None
-
-    #         # multi-process all records in current page
-    #         with concurrent.futures.ProcessPoolExecutor() as executor:
-    #             results = [executor.submit(downloadPDFSpringer, record, directory) for record in data['records']]
-    #             for f in concurrent.futures.as_completed(results):
-    #                 print(f.result())
-
-    #         # update current records count
-    #         current_records += int(len(data['records']))
-
-    #         # if next page exists, update next page url and call the API again
-    #         if next_page_url:
-    #             # API key is already present in the URL
-    #             response = requests.get(next_page_url, headers=headers)
-    #             if response.status_code != 200:
-    #                 return "Error in next page: " + str(response.status_code) + " - " + response.text
+    response = requests.get(main_url, headers=headers)
+    print("Status: ", response.status_code)
+
+    if response.status_code != 200:
+        return "Error: " + str(response.status_code) + " - " + response.text
+
+    data = response.json()
+    # check for total number of records 
+    total_records = int(data['result'][0]['total'])
+    print("Total records: ", total_records)
+    current_records = 0
+    while current_records < total_records:
+        # check if nextPage exists
+        try:
+            if 'nextPage' in data:
+                next_page_url = "http://api.springernature.com" + data['nextPage']
+            else:
+                next_page_url = None
+
+            # multi-process all records in current page
+            with concurrent.futures.ProcessPoolExecutor() as executor:
+                results = [executor.submit(downloadPDFSpringer, record, directory) for record in data['records']]
+                for f in concurrent.futures.as_completed(results):
+                    print(f.result())
+
+            # update current records count
+            current_records += int(len(data['records']))
+
+            # if next page exists, update next page url and call the API again
+            if next_page_url:
+                # API key is already present in the URL
+                response = requests.get(next_page_url, headers=headers)
+                if response.status_code != 200:
+                    return "Error in next page: " + str(response.status_code) + " - " + response.text
                 
-    #             data = response.json()
-    #     except Exception as e:
-    #         print(e)
+                data = response.json()
+        except Exception as e:
+            print(e)
 
     print("Course name: ", course_name)
     # prep payload for beam ingest
     ingest_data = []
-
+    
     # upload files to S3 bucket
     for file in os.listdir(directory):
+        doi = file[:-4]
+        doi = doi.replace("_", "/")
+        doi_link = f"https://doi.org/{doi}"
         data = {
             "course_name": course_name,
             "group": "springer_open",
-            "s3_paths": "",
-            "readable_filename": "",
+            "s3_paths": "courses/" + course_name + "/" + file, # type: ignore
+            "readable_filename": file,
             "base_url": "",
-            "url": "",
-            "journal": "rice",
+            "url": doi_link,
+            "journal": "",
         }
-        s3_path = "courses/" + course_name + "/" + file # type: ignore
-        data["s3_paths"] = s3_path
-        data["readable_filename"] = file
+        
         s3_client.upload_file(directory + "/" + file, aws_bucket, s3_path)  # type: ignore
         ingest_data.append(data)
     

From 8c4d76df26348ff51b97178f434a3e9c02343c1d Mon Sep 17 00:00:00 2001
From: star-nox <dabholkar.asmita@gmail.com>
Date: Thu, 9 May 2024 10:51:30 -0500
Subject: [PATCH 05/18] added Wiley fulltext endpoint using Crossref metadata, restored s3_path

---
 ai_ta_backend/main.py             |  24 +++-
 ai_ta_backend/utils/pub_ingest.py | 232 +++++++++++++++++++++++++++++-
 requirements.txt                  |   4 +
 3 files changed, 256 insertions(+), 4 deletions(-)

diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py
index a563fb3c..c30a7a7b 100644
--- a/ai_ta_backend/main.py
+++ b/ai_ta_backend/main.py
@@ -40,7 +40,7 @@
 from ai_ta_backend.service.sentry_service import SentryService
 
 from ai_ta_backend.beam.nomic_logging import create_document_map
-from ai_ta_backend.utils.pub_ingest import downloadSpringerFulltext
+from ai_ta_backend.utils.pub_ingest import downloadSpringerFulltext, downloadWileyFulltext
 
 app = Flask(__name__)
 CORS(app)
@@ -408,6 +408,28 @@ def get_springer_data():
   response.headers.add('Access-Control-Allow-Origin', '*')
   return response
 
+@app.route('/get-wiley-fulltext', methods=['GET'])
+def get_wiley_data():
+  course_name: str = request.args.get('course_name', default='', type=str)
+  issn = request.args.get('issn', default='', type=str)
+  #doi = request.args.get('doi', default='', type=str)
+
+  print("In /get-wiley-fulltext")
+
+  if issn == '' or course_name == '':
+    # proper web error "400 Bad request"
+    abort(
+        400,
+        description=
+        f"Missing required parameters: 'issn' or 'doi' and 'course_name' must be provided."
+    )
+
+  fulltext = downloadWileyFulltext(course_name, issn)
+
+  response = jsonify(fulltext)
+  response.headers.add('Access-Control-Allow-Origin', '*')
+  return response
+
 
 def configure(binder: Binder) -> None:
   binder.bind(RetrievalService, to=RetrievalService, scope=RequestScope)
diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py
index b2668702..7c73c303 100644
--- a/ai_ta_backend/utils/pub_ingest.py
+++ b/ai_ta_backend/utils/pub_ingest.py
@@ -1,15 +1,22 @@
 import os
 import json
+import time
 import pandas as pd
 import shutil
 import requests
 import supabase
-import xml.etree.ElementTree as ET
-from urllib.parse import urlparse
 import concurrent.futures
+from crossref.restful import Works, Journals
 from ai_ta_backend.database import aws, sql
 
 SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY')
+LICENSES = {
+    "http://onlinelibrary.wiley.com/termsAndConditions#vor": "closed_access",
+    "http://creativecommons.org/licenses/by/4.0/": "CC BY",
+    "http://creativecommons.org/licenses/by-nc/4.0/": "CC BY-NC",
+    "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC BY-NC-ND",
+    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA",
+}
 
 s3_client = aws.AWSStorage()
 aws_bucket = os.getenv('S3_BUCKET_NAME')
@@ -114,7 +121,7 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None,
             "url": doi_link,
             "journal": "",
         }
-        
+        s3_path = "courses/" + course_name + "/" + file # type: ignore
         s3_client.upload_file(directory + "/" + file, aws_bucket, s3_path)  # type: ignore
         ingest_data.append(data)
     
@@ -193,3 +200,222 @@ def downloadPDFSpringer(record: dict, directory: str):
         return "success"
     except Exception as e:
         return "Error in downloading PDF: " + str(e)
+
+
+def downloadWileyFulltext(course_name=None, issn=None):
+    """
+    This function fetches metadata from Crossref and downloads 
+    full-text articles from a given journal from Wiley.
+    """
+    # create directory to store files
+    directory = os.path.join(os.getcwd(), 'wiley_papers')
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+
+    api_key = os.environ.get("WILEY_TDM_TOKEN")
+    metadata = []
+
+    # get journal metadata
+    journals = Journals()
+    works = journals.works(issn=issn)
+    count = 0
+    for item in works:
+        open_access = True
+        count += 1
+        article_metadata = {}
+        # check if the license is open access - variant of CC
+        if 'license' not in item:
+            continue
+            
+        for license in item['license']:
+            print("License URL: ", license['URL'])
+            if license['URL'] in LICENSES:
+                if LICENSES[license['URL']] == "closed_access":
+                    #print("Article is not open access: ", item['DOI'])
+                    open_access = False
+                else:
+                    print("Article is open access: ", item['DOI'])
+                    article_metadata['license'] = LICENSES[license['URL']]
+                    article_metadata['license_link'] = license['URL']
+            else:
+                article_metadata['license_link'] = license['URL']
+            
+        if not open_access:
+            continue
+
+        article_metadata['doi'] = item['DOI']
+        article_metadata['title'] = item['title'][0]
+        article_metadata['journal'] = item['container-title'][0]
+        article_metadata['publisher'] = item['publisher']
+        article_metadata['issn'] = item['ISSN'][0]
+        article_metadata['url'] = item['URL']
+        article_metadata['filename'] = item['DOI'].replace("/", "_") + ".pdf"
+
+        print("Article Metadata: ", article_metadata)
+
+        # download PDF based on doi
+        download_status = downloadWileyPDF(item['DOI'])
+        print("Download status: ", download_status)
+        metadata.append(article_metadata)
+    
+    print("Download complete.")
+    print("Total articles: ", count)
+    metadata_csv = "wiley_metadata.csv"
+    metadata_df = pd.DataFrame(metadata)
+    if not os.path.exists(metadata_csv):
+        metadata_df.to_csv(metadata_csv, index=False)
+    else:
+        metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False)
+    # prep payload for beam ingest
+    # ingest_data = []
+        
+    # # upload files to S3 bucket
+    # for file in os.listdir(directory):
+    #     doi = file[:-4]
+    #     doi = doi.replace("_", "/")
+    #     doi_link = f"https://doi.org/{doi}"
+    #     data = {
+    #         "course_name": course_name,
+    #         "group": "wiley",
+    #         "s3_paths": "courses/" + course_name + "/" + file, # type: ignore
+    #         "readable_filename": file,
+    #         "base_url": "",
+    #         "url": doi_link,
+    #         "journal": "",
+    #     }
+    #     s3_path = "courses/" + course_name + "/" + file # type: ignore
+    #     s3_client.upload_file(directory + "/" + file, aws_bucket, s3_path)  # type: ignore
+    #     ingest_data.append(data)
+        
+    # # save ingest data to csv
+    # ingest_df = pd.DataFrame(ingest_data)
+    # csv_file = "publications_data.csv"
+    # if not os.path.exists(csv_file):
+    #     ingest_df.to_csv(csv_file, index=False)
+    # else:
+    #     ingest_df.to_csv(csv_file, mode='a', header=False, index=False)
+
+
+    # # call ingest
+    # beam_url = "https://41kgx.apps.beam.cloud"
+    # headers = {
+    # "Content-Type": "application/json",
+    # "Authorization": "Basic " + os.getenv('BEAM_AUTH_TOKEN')    # type: ignore
+    # }
+    # for data in ingest_data:
+    #     payload = json.dumps(data)
+    #     response = requests.post(beam_url, headers=headers, data=payload)
+    #     if response.status_code == 200:
+    #         print("Task status retrieved successfully!")
+    #     else:
+    #         print(f"Error: {response.status_code}. {response.text}")
+
+    # Delete files from local directory
+    #shutil.rmtree(directory)
+                
+
+def downloadWileyPDF(doi=None):
+    """
+    This function downloads a PDF file from Wiley based on the DOI.
+    """
+    # create directory to store files
+    directory = os.path.join(os.getcwd(), 'wiley_papers')
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+
+    api_key = os.environ.get("WILEY_TDM_TOKEN")
+
+    # download PDF based on doi
+    base_url = "https://api.wiley.com/onlinelibrary/tdm/v1/articles/"
+    url = base_url + str(doi)
+    print("URL: ", url)
+
+    headers = {
+        'Wiley-TDM-Client-Token': api_key,
+        'Content-Type': 'application/json'
+    }
+    time.sleep(3)
+    response = requests.get(url, headers=headers)
+    if response.status_code != 200:
+        return "Error in accessing article link: " + str(response.status_code) + " - " + response.text
+        
+    filename = str(doi).replace("/", "_") + ".pdf"
+    with open(directory + "/" + filename, "wb") as f:  # Open a file in binary write mode ("wb")
+        for chunk in response.iter_content(chunk_size=1024):  # Download in chunks
+            f.write(chunk)
+    print("Downloaded: ", filename)
+    
+    return "success"
+
+
+def downloadWileyArticle(doi=None):
+    """
+    This function fetches metadata from Crossref and downloads open access full text articles from Wiley.
+    """
+    # create directory to store files
+    directory = os.path.join(os.getcwd(), 'wiley_papers')
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+
+    api_key = os.environ.get("WILEY_TDM_TOKEN")
+    metadata = {}
+    
+    # get metadata from Crossref
+    if doi:
+        # get article metadata
+        works = Works()
+        article_data = works.doi(doi)
+        print("Article license: ", article_data['license'])
+        
+        article_licenses = []
+        
+        for item in article_data['license']:
+            article_licenses.append(item['URL'])
+        print("Licenses: ", article_licenses)
+        # check if the license is open access - variant of CC
+        for license in article_licenses:
+            if license in LICENSES:
+                print("License found: ", license)
+                if LICENSES[license] == "closed_access":
+                    return "Article is not open access."
+                else:
+                    metadata['license'] = LICENSES[license]
+                    break
+            else:
+                return "License not found."
+        
+        metadata['doi'] = doi
+        metadata['title'] = article_data['title'][0]
+        metadata['journal'] = article_data['container-title'][0]
+        metadata['publisher'] = article_data['publisher']
+        metadata['issn'] = article_data['ISSN'][0]
+        metadata['url'] = article_data['URL']
+
+        print("Metadata: ", metadata)
+
+        # download PDF based on doi
+        base_url = "https://api.wiley.com/onlinelibrary/tdm/v1/articles/"
+        url = base_url + str(doi)
+
+        print("URL: ", url)
+
+        headers = {
+            'Wiley-TDM-Client-Token': api_key,
+            'Content-Type': 'application/json'
+        }
+
+        response = requests.get(url, headers=headers)
+        if response.status_code != 200:
+            return "Error in accessing article link: " + str(response.status_code) + " - " + response.text
+        
+        filename = str(doi).replace("/", "_")
+        with open(directory + "/" + filename + ".pdf", "wb") as f:  # Open a file in binary write mode ("wb")
+            for chunk in response.iter_content(chunk_size=1024):  # Download in chunks
+                f.write(chunk)
+        print("Downloaded: ", filename)
+
+        # upload file to S3 bucket
+
+        # prep payload for beam ingest
+
+        return "success"
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 848c10d0..4d75f5b7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -39,6 +39,10 @@ supabase==2.0.2
 posthog==3.1.0
 sentry-sdk==1.39.1
 
+# Publications
+crossrefapi
+
+
 # Not currently supporting coursera ingest
 # cs-dlp @ git+https://github.com/raffaem/cs-dlp.git@0.12.0b0 # previously called coursera-dl
 

From fac8c291cdab60aa2b440cb8cc3e3bfe20c32035 Mon Sep 17 00:00:00 2001
From: star-nox <dabholkar.asmita@gmail.com>
Date: Sat, 11 May 2024 16:33:01 -0500
Subject: [PATCH 06/18] retry Wiley PDF download with exponential backoff

---
 ai_ta_backend/utils/pub_ingest.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py
index 7c73c303..a7884303 100644
--- a/ai_ta_backend/utils/pub_ingest.py
+++ b/ai_ta_backend/utils/pub_ingest.py
@@ -8,6 +8,7 @@
 import concurrent.futures
 from crossref.restful import Works, Journals
 from ai_ta_backend.database import aws, sql
+import backoff
 
 SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY')
 LICENSES = {
@@ -252,11 +253,12 @@ def downloadWileyFulltext(course_name=None, issn=None):
         article_metadata['filename'] = item['DOI'].replace("/", "_") + ".pdf"
 
         print("Article Metadata: ", article_metadata)
+        metadata.append(article_metadata)
 
         # download PDF based on doi
         download_status = downloadWileyPDF(item['DOI'])
         print("Download status: ", download_status)
-        metadata.append(article_metadata)
+        
     
     print("Download complete.")
     print("Total articles: ", count)
@@ -313,7 +315,7 @@ def downloadWileyFulltext(course_name=None, issn=None):
     # Delete files from local directory
     #shutil.rmtree(directory)
                 
-
+@backoff.on_exception(backoff.expo, requests.exceptions.HTTPError, max_tries=20)
 def downloadWileyPDF(doi=None):
     """
     This function downloads a PDF file from Wiley based on the DOI.
@@ -334,10 +336,9 @@ def downloadWileyPDF(doi=None):
         'Wiley-TDM-Client-Token': api_key,
         'Content-Type': 'application/json'
     }
-    time.sleep(3)
+    
     response = requests.get(url, headers=headers)
-    if response.status_code != 200:
-        return "Error in accessing article link: " + str(response.status_code) + " - " + response.text
+    response.raise_for_status()
         
     filename = str(doi).replace("/", "_") + ".pdf"
     with open(directory + "/" + filename, "wb") as f:  # Open a file in binary write mode ("wb")
@@ -406,6 +407,9 @@ def downloadWileyArticle(doi=None):
 
         response = requests.get(url, headers=headers)
         if response.status_code != 200:
+            # exponential backoff logic
+            print("Error in accessing article link, retrying: ", response.text)
+
             return "Error in accessing article link: " + str(response.status_code) + " - " + response.text
         
         filename = str(doi).replace("/", "_")

From c2a01fa1b47054e65a6002a0baf164c91e24eb82 Mon Sep 17 00:00:00 2001
From: star-nox <dabholkar.asmita@gmail.com>
Date: Tue, 21 May 2024 11:20:03 -0500
Subject: [PATCH 07/18] separated metadata extraction and download

---
 ai_ta_backend/utils/pub_ingest.py | 119 +++++++++++++++++++-----------
 1 file changed, 77 insertions(+), 42 deletions(-)

diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py
index a7884303..c30452df 100644
--- a/ai_ta_backend/utils/pub_ingest.py
+++ b/ai_ta_backend/utils/pub_ingest.py
@@ -11,12 +11,17 @@
 import backoff
 
 SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY')
-LICENSES = {
-    "http://onlinelibrary.wiley.com/termsAndConditions#vor": "closed_access",
+CC_LICENSES = {
     "http://creativecommons.org/licenses/by/4.0/": "CC BY",
     "http://creativecommons.org/licenses/by-nc/4.0/": "CC BY-NC",
     "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC BY-NC-ND",
-    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA",
+    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA"
+}
+
+OTHER_LICENSES = {
+    "http://onlinelibrary.wiley.com/termsAndConditions#vor": "wiley_tnc",
+    "http://onlinelibrary.wiley.com/termsAndConditions#am": "wiley_tnc",
+    "http://doi.wiley.com/10.1002/tdm_license_1.1": "wiley_tdm"
 }
 
 s3_client = aws.AWSStorage()
@@ -202,65 +207,51 @@ def downloadPDFSpringer(record: dict, directory: str):
     except Exception as e:
         return "Error in downloading PDF: " + str(e)
 
-
-def downloadWileyFulltext(course_name=None, issn=None):
+def getCrossrefMetadata(issn: str):
     """
-    This function fetches metadata from Crossref and downloads 
-    full-text articles from a given journal from Wiley.
+    Creates a csv file with metadata of all articles for given journal (ISSN)
     """
-    # create directory to store files
-    directory = os.path.join(os.getcwd(), 'wiley_papers')
-    if not os.path.exists(directory):
-        os.makedirs(directory)
-
-    api_key = os.environ.get("WILEY_TDM_TOKEN")
     metadata = []
-
     # get journal metadata
     journals = Journals()
     works = journals.works(issn=issn)
     count = 0
+    no_license = 0
     for item in works:
-        open_access = True
         count += 1
         article_metadata = {}
         # check if the license is open access - variant of CC
         if 'license' not in item:
+            no_license += 1
             continue
-            
-        for license in item['license']:
-            print("License URL: ", license['URL'])
-            if license['URL'] in LICENSES:
-                if LICENSES[license['URL']] == "closed_access":
-                    #print("Article is not open access: ", item['DOI'])
-                    open_access = False
+        else:
+            for license in item['license']:
+                # check for creative commons license
+                if license['URL'] in CC_LICENSES:
+                    article_metadata['license'] = CC_LICENSES[license['URL']]
+                    article_metadata['license_url'] = license['URL']
+                    break
+                elif license['URL'] in OTHER_LICENSES:
+                    article_metadata['license'] = OTHER_LICENSES[license['URL']]
+                    article_metadata['license_url'] = license['URL']
                 else:
-                    print("Article is open access: ", item['DOI'])
-                    article_metadata['license'] = LICENSES[license['URL']]
-                    article_metadata['license_link'] = license['URL']
-            else:
-                article_metadata['license_link'] = license['URL']
-            
-        if not open_access:
-            continue
-
+                    article_metadata['license'] = "unknown"
+                    article_metadata['license_url'] = license['URL']
+                    
         article_metadata['doi'] = item['DOI']
-        article_metadata['title'] = item['title'][0]
+        if 'title' not in item:
+            article_metadata['title'] = "No title found"
+        else:
+            article_metadata['title'] = item['title'][0]
         article_metadata['journal'] = item['container-title'][0]
         article_metadata['publisher'] = item['publisher']
         article_metadata['issn'] = item['ISSN'][0]
         article_metadata['url'] = item['URL']
         article_metadata['filename'] = item['DOI'].replace("/", "_") + ".pdf"
-
-        print("Article Metadata: ", article_metadata)
+        article_metadata['downloaded'] = "no"
         metadata.append(article_metadata)
-
-        # download PDF based on doi
-        download_status = downloadWileyPDF(item['DOI'])
-        print("Download status: ", download_status)
-        
+        print("Processed: ", article_metadata['doi'])
     
-    print("Download complete.")
     print("Total articles: ", count)
     metadata_csv = "wiley_metadata.csv"
     metadata_df = pd.DataFrame(metadata)
@@ -268,7 +259,49 @@ def downloadWileyFulltext(course_name=None, issn=None):
         metadata_df.to_csv(metadata_csv, index=False)
     else:
         metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False)
-    # prep payload for beam ingest
+    
+    return "success"
+
+
+def downloadWileyFulltext(course_name=None, issn=None):
+    """
+    This function fetches metadata from Crossref and downloads 
+    full-text articles from a given journal from Wiley.
+    """
+    # create directory to store files
+    directory = os.path.join(os.getcwd(), 'wiley_papers')
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+
+    # fetch metadata
+    # metadata_status = getCrossrefMetadata(issn)
+    # print("Metadata status: ", metadata_status)
+
+    
+    # download PDFs based on metadata
+    metadata_csv = "wiley_metadata.csv"
+    if os.path.exists(metadata_csv):
+        metadata_df = pd.read_csv(metadata_csv)
+        metadata = metadata_df.to_dict(orient='records')
+
+    for item in metadata:
+        try:
+            if item['license'] in ['CC BY', 'CC BY-NC', 'CC BY-NC-ND', 'CC BY-NC-SA'] and item['downloaded'] == 'no':
+                status = downloadWileyPDF(item['doi'])
+                print("Download status: ", status)
+                if status == "success":
+                    item['downloaded'] = 'yes'
+                    
+        except Exception as e:
+            print(e)
+    
+    metadata_df = pd.DataFrame(metadata)
+    metadata_df.to_csv(metadata_csv, index=False)
+
+    return "success"
+
+    
+    # # prep payload for beam ingest
     # ingest_data = []
         
     # # upload files to S3 bucket
@@ -314,8 +347,10 @@ def downloadWileyFulltext(course_name=None, issn=None):
 
     # Delete files from local directory
     #shutil.rmtree(directory)
+
+    
                 
-@backoff.on_exception(backoff.expo, requests.exceptions.HTTPError, max_tries=20)
+@backoff.on_exception(backoff.expo, requests.exceptions.HTTPError, max_tries=7)
 def downloadWileyPDF(doi=None):
     """
     This function downloads a PDF file from Wiley based on the DOI.

From bdaf9760170e1ec7f1e9d74088b376acf762f32b Mon Sep 17 00:00:00 2001
From: star-nox <dabholkar.asmita@gmail.com>
Date: Tue, 28 May 2024 12:23:16 -0500
Subject: [PATCH 08/18] accept ISSN list via POST; wrap Wiley helpers in try/except

---
 ai_ta_backend/main.py             |  14 ++-
 ai_ta_backend/utils/pub_ingest.py | 198 ++++++++++++++++--------------
 2 files changed, 112 insertions(+), 100 deletions(-)

diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py
index c30a7a7b..ec8b6a73 100644
--- a/ai_ta_backend/main.py
+++ b/ai_ta_backend/main.py
@@ -408,15 +408,17 @@ def get_springer_data():
   response.headers.add('Access-Control-Allow-Origin', '*')
   return response
 
-@app.route('/get-wiley-fulltext', methods=['GET'])
+@app.route('/get-wiley-fulltext', methods=['POST'])
 def get_wiley_data():
-  course_name: str = request.args.get('course_name', default='', type=str)
-  issn = request.args.get('issn', default='', type=str)
-  #doi = request.args.get('doi', default='', type=str)
-
+  data = request.get_json()
+  print(data)
+  
+  course_name = data['course_name']
+  issn = data['issn']
+  
   print("In /get-wiley-fulltext")
 
-  if issn == '' or course_name == '':
+  if issn == [] or course_name == '':
     # proper web error "400 Bad request"
     abort(
         400,
diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py
index c30452df..2fe45ac2 100644
--- a/ai_ta_backend/utils/pub_ingest.py
+++ b/ai_ta_backend/utils/pub_ingest.py
@@ -211,59 +211,62 @@ def getCrossrefMetadata(issn: str):
     """
     Creates a csv file with metadata of all articles for given journal (ISSN)
     """
-    metadata = []
-    # get journal metadata
-    journals = Journals()
-    works = journals.works(issn=issn)
-    count = 0
-    no_license = 0
-    for item in works:
-        count += 1
-        article_metadata = {}
-        # check if the license is open access - variant of CC
-        if 'license' not in item:
-            no_license += 1
-            continue
-        else:
-            for license in item['license']:
-                # check for creative commons license
-                if license['URL'] in CC_LICENSES:
-                    article_metadata['license'] = CC_LICENSES[license['URL']]
-                    article_metadata['license_url'] = license['URL']
-                    break
-                elif license['URL'] in OTHER_LICENSES:
-                    article_metadata['license'] = OTHER_LICENSES[license['URL']]
-                    article_metadata['license_url'] = license['URL']
-                else:
-                    article_metadata['license'] = "unknown"
-                    article_metadata['license_url'] = license['URL']
-                    
-        article_metadata['doi'] = item['DOI']
-        if 'title' not in item:
-            article_metadata['title'] = "No title found"
+    try:
+        metadata = []
+        # get journal metadata
+        journals = Journals()
+        works = journals.works(issn=issn)
+        count = 0
+        no_license = 0
+        for item in works:
+            count += 1
+            article_metadata = {}
+            # check if the license is open access - variant of CC
+            if 'license' not in item:
+                no_license += 1
+                continue
+            else:
+                for license in item['license']:
+                    # check for creative commons license
+                    if license['URL'] in CC_LICENSES:
+                        article_metadata['license'] = CC_LICENSES[license['URL']]
+                        article_metadata['license_url'] = license['URL']
+                        break
+                    elif license['URL'] in OTHER_LICENSES:
+                        article_metadata['license'] = OTHER_LICENSES[license['URL']]
+                        article_metadata['license_url'] = license['URL']
+                    else:
+                        article_metadata['license'] = "unknown"
+                        article_metadata['license_url'] = license['URL']
+                        
+            article_metadata['doi'] = item['DOI']
+            if 'title' not in item:
+                article_metadata['title'] = "No title found"
+            else:
+                article_metadata['title'] = item['title'][0]
+            article_metadata['journal'] = item['container-title'][0]
+            article_metadata['publisher'] = item['publisher']
+            article_metadata['issn'] = item['ISSN'][0]
+            article_metadata['url'] = item['URL']
+            article_metadata['filename'] = item['DOI'].replace("/", "_") + ".pdf"
+            article_metadata['downloaded'] = "no"
+            metadata.append(article_metadata)
+            print("Processed: ", article_metadata['doi'])
+        
+        print("Total articles: ", count)
+        metadata_csv = "wiley_metadata.csv"
+        metadata_df = pd.DataFrame(metadata)
+        if not os.path.exists(metadata_csv):
+            metadata_df.to_csv(metadata_csv, index=False)
         else:
-            article_metadata['title'] = item['title'][0]
-        article_metadata['journal'] = item['container-title'][0]
-        article_metadata['publisher'] = item['publisher']
-        article_metadata['issn'] = item['ISSN'][0]
-        article_metadata['url'] = item['URL']
-        article_metadata['filename'] = item['DOI'].replace("/", "_") + ".pdf"
-        article_metadata['downloaded'] = "no"
-        metadata.append(article_metadata)
-        print("Processed: ", article_metadata['doi'])
-    
-    print("Total articles: ", count)
-    metadata_csv = "wiley_metadata.csv"
-    metadata_df = pd.DataFrame(metadata)
-    if not os.path.exists(metadata_csv):
-        metadata_df.to_csv(metadata_csv, index=False)
-    else:
-        metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False)
-    
-    return "success"
+            metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False)
+        
+        return "success"
+    except Exception as e:
+        return "Error: " + str(e)
 
 
-def downloadWileyFulltext(course_name=None, issn=None):
+def downloadWileyFulltext(course_name=None, issn=[]):
     """
     This function fetches metadata from Crossref and downloads 
     full-text articles from a given journal from Wiley.
@@ -273,30 +276,33 @@ def downloadWileyFulltext(course_name=None, issn=None):
     if not os.path.exists(directory):
         os.makedirs(directory)
 
-    # fetch metadata
-    # metadata_status = getCrossrefMetadata(issn)
-    # print("Metadata status: ", metadata_status)
 
+    # fetch metadata
+    for item in issn:
+        metadata_status = getCrossrefMetadata(item)
+        print("Metadata status: ", metadata_status)
     
     # download PDFs based on metadata
-    metadata_csv = "wiley_metadata.csv"
-    if os.path.exists(metadata_csv):
-        metadata_df = pd.read_csv(metadata_csv)
-        metadata = metadata_df.to_dict(orient='records')
-
-    for item in metadata:
-        try:
-            if item['license'] in ['CC BY', 'CC BY-NC', 'CC BY-NC-ND', 'CC BY-NC-SA'] and item['downloaded'] == 'no':
-                status = downloadWileyPDF(item['doi'])
-                print("Download status: ", status)
-                if status == "success":
-                    item['downloaded'] = 'yes'
-                    
-        except Exception as e:
-            print(e)
+    # metadata_csv = "wiley_metadata.csv"
+    # if os.path.exists(metadata_csv):
+    #     metadata_df = pd.read_csv(metadata_csv)
+    #     metadata = metadata_df.to_dict(orient='records')
+
+    # for item in metadata:
+    #     try:
+    #         if item['license'] in ['CC BY', 'CC BY-NC', 'CC BY-NC-ND', 'CC BY-NC-SA'] and item['downloaded'] == 'no' and item['publisher'] == 'Wiley':
+    #             status = downloadWileyPDF(item['doi'])
+    #             print("Download status: ", status)
+    #             if status == "success":
+    #                 item['downloaded'] = 'yes'
+    #             time.sleep(5)    
+    #     except Exception as e:
+    #         print(e)
+        
+    #     #time.sleep(10)
     
-    metadata_df = pd.DataFrame(metadata)
-    metadata_df.to_csv(metadata_csv, index=False)
+    # metadata_df = pd.DataFrame(metadata)
+    # metadata_df.to_csv(metadata_csv, index=False) 
 
     return "success"
 
@@ -350,38 +356,42 @@ def downloadWileyFulltext(course_name=None, issn=None):
 
     
                 
-@backoff.on_exception(backoff.expo, requests.exceptions.HTTPError, max_tries=7)
+#@backoff.on_exception(backoff.expo, requests.exceptions.HTTPError, max_tries=1)
 def downloadWileyPDF(doi=None):
     """
     This function downloads a PDF file from Wiley based on the DOI.
     """
-    # create directory to store files
-    directory = os.path.join(os.getcwd(), 'wiley_papers')
-    if not os.path.exists(directory):
-        os.makedirs(directory)
+    try:
+        # create directory to store files
+        directory = os.path.join(os.getcwd(), 'wiley_papers')
+        if not os.path.exists(directory):
+            os.makedirs(directory)
 
-    api_key = os.environ.get("WILEY_TDM_TOKEN")
+        api_key = os.environ.get("WILEY_TDM_TOKEN")
 
-    # download PDF based on doi
-    base_url = "https://api.wiley.com/onlinelibrary/tdm/v1/articles/"
-    url = base_url + str(doi)
-    print("URL: ", url)
+        # download PDF based on doi
+        base_url = "https://api.wiley.com/onlinelibrary/tdm/v1/articles/"
+        url = base_url + str(doi)
+        print("URL: ", url)
 
-    headers = {
-        'Wiley-TDM-Client-Token': api_key,
-        'Content-Type': 'application/json'
-    }
-    
-    response = requests.get(url, headers=headers)
-    response.raise_for_status()
+        headers = {
+            'Wiley-TDM-Client-Token': api_key,
+            'Content-Type': 'application/json'
+        }
         
-    filename = str(doi).replace("/", "_") + ".pdf"
-    with open(directory + "/" + filename, "wb") as f:  # Open a file in binary write mode ("wb")
-        for chunk in response.iter_content(chunk_size=1024):  # Download in chunks
-            f.write(chunk)
-    print("Downloaded: ", filename)
-    
-    return "success"
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()
+            
+        filename = str(doi).replace("/", "_") + ".pdf"
+        with open(directory + "/" + filename, "wb") as f:  # Open a file in binary write mode ("wb")
+            for chunk in response.iter_content(chunk_size=1024):  # Download in chunks
+                f.write(chunk)
+        print("Downloaded: ", filename)
+        
+        return "success"
+    except Exception as e:
+        print("Error: ", e)
+        return "error"
 
 
 def downloadWileyArticle(doi=None):

From 2943aef5e11cf9da409c984475d155ee4f551b56 Mon Sep 17 00:00:00 2001
From: star-nox <dabholkar.asmita@gmail.com>
Date: Tue, 28 May 2024 14:44:20 -0500
Subject: [PATCH 09/18] adding metadata to csv file line by line

---
 ai_ta_backend/utils/pub_ingest.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py
index 2fe45ac2..8e2793d5 100644
--- a/ai_ta_backend/utils/pub_ingest.py
+++ b/ai_ta_backend/utils/pub_ingest.py
@@ -250,16 +250,22 @@ def getCrossrefMetadata(issn: str):
             article_metadata['url'] = item['URL']
             article_metadata['filename'] = item['DOI'].replace("/", "_") + ".pdf"
             article_metadata['downloaded'] = "no"
-            metadata.append(article_metadata)
+            metadata_csv = "wiley_metadata.csv"
+            metadata_df = pd.DataFrame([article_metadata])
+            if not os.path.exists(metadata_csv):
+                metadata_df.to_csv(metadata_csv, index=False)
+            else:
+                metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False)
+            #metadata.append(article_metadata)
             print("Processed: ", article_metadata['doi'])
         
         print("Total articles: ", count)
-        metadata_csv = "wiley_metadata.csv"
-        metadata_df = pd.DataFrame(metadata)
-        if not os.path.exists(metadata_csv):
-            metadata_df.to_csv(metadata_csv, index=False)
-        else:
-            metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False)
+        # metadata_csv = "wiley_metadata.csv"
+        # metadata_df = pd.DataFrame(metadata)
+        # if not os.path.exists(metadata_csv):
+        #     metadata_df.to_csv(metadata_csv, index=False)
+        # else:
+        #     metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False)
         
         return "success"
     except Exception as e:

From 54b7a6f219b03716dc4f96c9c98c6e8f0d84a053 Mon Sep 17 00:00:00 2001
From: star-nox <dabholkar.asmita@gmail.com>
Date: Thu, 30 May 2024 14:58:32 -0500
Subject: [PATCH 10/18] minor changes

---
 ai_ta_backend/utils/pub_ingest.py | 124 +++++++++++++++---------------
 1 file changed, 63 insertions(+), 61 deletions(-)

diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py
index 8e2793d5..a4971713 100644
--- a/ai_ta_backend/utils/pub_ingest.py
+++ b/ai_ta_backend/utils/pub_ingest.py
@@ -219,46 +219,48 @@ def getCrossrefMetadata(issn: str):
         count = 0
         no_license = 0
         for item in works:
-            count += 1
-            article_metadata = {}
-            # check if the license is open access - variant of CC
-            if 'license' not in item:
-                no_license += 1
-                continue
-            else:
-                for license in item['license']:
-                    # check for creative commons license
-                    if license['URL'] in CC_LICENSES:
-                        article_metadata['license'] = CC_LICENSES[license['URL']]
-                        article_metadata['license_url'] = license['URL']
-                        break
-                    elif license['URL'] in OTHER_LICENSES:
-                        article_metadata['license'] = OTHER_LICENSES[license['URL']]
-                        article_metadata['license_url'] = license['URL']
-                    else:
-                        article_metadata['license'] = "unknown"
-                        article_metadata['license_url'] = license['URL']
-                        
-            article_metadata['doi'] = item['DOI']
-            if 'title' not in item:
-                article_metadata['title'] = "No title found"
-            else:
-                article_metadata['title'] = item['title'][0]
-            article_metadata['journal'] = item['container-title'][0]
-            article_metadata['publisher'] = item['publisher']
-            article_metadata['issn'] = item['ISSN'][0]
-            article_metadata['url'] = item['URL']
-            article_metadata['filename'] = item['DOI'].replace("/", "_") + ".pdf"
-            article_metadata['downloaded'] = "no"
-            metadata_csv = "wiley_metadata.csv"
-            metadata_df = pd.DataFrame([article_metadata])
-            if not os.path.exists(metadata_csv):
-                metadata_df.to_csv(metadata_csv, index=False)
-            else:
-                metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False)
-            #metadata.append(article_metadata)
-            print("Processed: ", article_metadata['doi'])
-        
+            try:  # isolate failures per article so one malformed Crossref item does not stop the loop
+                count += 1
+                article_metadata = {}
+                # check if the license is open access - variant of CC
+                if 'license' not in item:
+                    no_license += 1
+                    continue
+                else:
+                    for license in item['license']:
+                        # check for creative commons license
+                        if license['URL'] in CC_LICENSES:
+                            article_metadata['license'] = CC_LICENSES[license['URL']]
+                            article_metadata['license_url'] = license['URL']
+                            break
+                        elif license['URL'] in OTHER_LICENSES:
+                            article_metadata['license'] = OTHER_LICENSES[license['URL']]
+                            article_metadata['license_url'] = license['URL']
+                        else:
+                            article_metadata['license'] = "unknown"
+                            article_metadata['license_url'] = license['URL']
+                            
+                article_metadata['doi'] = item['DOI']
+                if 'title' not in item:
+                    article_metadata['title'] = "No title found"
+                else:
+                    article_metadata['title'] = item['title'][0]
+                article_metadata['journal'] = item['container-title'][0]
+                article_metadata['publisher'] = item['publisher']
+                article_metadata['issn'] = item['ISSN'][0]
+                article_metadata['url'] = item['URL']
+                article_metadata['filename'] = item['DOI'].replace("/", "_") + ".pdf"
+                article_metadata['downloaded'] = "no"
+                metadata_csv = "wiley_metadata.csv"
+                metadata_df = pd.DataFrame([article_metadata])
+                if not os.path.exists(metadata_csv):
+                    metadata_df.to_csv(metadata_csv, index=False)
+                else:
+                    metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False)
+                #metadata.append(article_metadata)
+                print("Processed: ", article_metadata['doi'])
+            except Exception as e:
+                print("Error processing article: ", item.get('DOI'), e)  # read the DOI from the raw item: article_metadata['doi'] is not set yet when the failure happens early
         print("Total articles: ", count)
         # metadata_csv = "wiley_metadata.csv"
         # metadata_df = pd.DataFrame(metadata)
@@ -284,31 +286,31 @@ def downloadWileyFulltext(course_name=None, issn=[]):
 
 
     # fetch metadata
-    for item in issn:
-        metadata_status = getCrossrefMetadata(item)
-        print("Metadata status: ", metadata_status)
+    # for item in issn:
+    #     metadata_status = getCrossrefMetadata(item)
+    #     print("Metadata status: ", metadata_status)
     
     # download PDFs based on metadata
-    # metadata_csv = "wiley_metadata.csv"
-    # if os.path.exists(metadata_csv):
-    #     metadata_df = pd.read_csv(metadata_csv)
-    #     metadata = metadata_df.to_dict(orient='records')
-
-    # for item in metadata:
-    #     try:
-    #         if item['license'] in ['CC BY', 'CC BY-NC', 'CC BY-NC-ND', 'CC BY-NC-SA'] and item['downloaded'] == 'no' and item['publisher'] == 'Wiley':
-    #             status = downloadWileyPDF(item['doi'])
-    #             print("Download status: ", status)
-    #             if status == "success":
-    #                 item['downloaded'] = 'yes'
-    #             time.sleep(5)    
-    #     except Exception as e:
-    #         print(e)
+    metadata_csv = "wiley_metadata.csv"
+    if os.path.exists(metadata_csv):
+        metadata_df = pd.read_csv(metadata_csv)
+        metadata = metadata_df.to_dict(orient='records')
+
+    for item in metadata:
+        try:
+            if item['license'] in ['CC BY', 'CC BY-NC', 'CC BY-NC-ND', 'CC BY-NC-SA'] and item['downloaded'] == 'no' and item['publisher'] == 'Wiley':
+                status = downloadWileyPDF(item['doi'])
+                print("Download status: ", status)
+                if status == "success":
+                    item['downloaded'] = 'yes'
+                time.sleep(5)    
+        except Exception as e:
+            print(e)
         
-    #     #time.sleep(10)
+        #time.sleep(10)
     
-    # metadata_df = pd.DataFrame(metadata)
-    # metadata_df.to_csv(metadata_csv, index=False) 
+    metadata_df = pd.DataFrame(metadata)
+    metadata_df.to_csv(metadata_csv, index=False) 
 
     return "success"
 

From 74cca6e07261b32e349c9c618114e75003c0e985 Mon Sep 17 00:00:00 2001
From: star-nox <dabholkar.asmita@gmail.com>
Date: Thu, 6 Jun 2024 16:44:19 -0500
Subject: [PATCH 11/18] minor changes

---
 ai_ta_backend/utils/pub_ingest.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py
index a4971713..2665c1fd 100644
--- a/ai_ta_backend/utils/pub_ingest.py
+++ b/ai_ta_backend/utils/pub_ingest.py
@@ -286,9 +286,9 @@ def downloadWileyFulltext(course_name=None, issn=[]):
 
 
     # fetch metadata
-    # for item in issn:
-    #     metadata_status = getCrossrefMetadata(item)
-    #     print("Metadata status: ", metadata_status)
+    for item in issn:
+        metadata_status = getCrossrefMetadata(item)
+        print("Metadata status: ", metadata_status)
     
     # download PDFs based on metadata
     metadata_csv = "wiley_metadata.csv"

From 5f2be3c469c775ed14bfc3ef243c983e964265d2 Mon Sep 17 00:00:00 2001
From: star-nox <dabholkar.asmita@gmail.com>
Date: Tue, 11 Jun 2024 11:05:36 -0500
Subject: [PATCH 12/18] minor download changes

---
 ai_ta_backend/utils/pub_ingest.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py
index 2665c1fd..a4971713 100644
--- a/ai_ta_backend/utils/pub_ingest.py
+++ b/ai_ta_backend/utils/pub_ingest.py
@@ -286,9 +286,9 @@ def downloadWileyFulltext(course_name=None, issn=[]):
 
 
     # fetch metadata
-    for item in issn:
-        metadata_status = getCrossrefMetadata(item)
-        print("Metadata status: ", metadata_status)
+    # for item in issn:
+    #     metadata_status = getCrossrefMetadata(item)
+    #     print("Metadata status: ", metadata_status)
     
     # download PDFs based on metadata
     metadata_csv = "wiley_metadata.csv"

From 1ed6b1474558f926a77c3b5fa38cf0c11b31aefd Mon Sep 17 00:00:00 2001
From: star-nox <dabholkar.asmita@gmail.com>
Date: Mon, 1 Jul 2024 10:10:34 -0500
Subject: [PATCH 13/18] minor changes for download

---
 ai_ta_backend/utils/pub_ingest.py | 45 ++++++++++++++++++-------------
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py
index a4971713..9b8d038e 100644
--- a/ai_ta_backend/utils/pub_ingest.py
+++ b/ai_ta_backend/utils/pub_ingest.py
@@ -90,10 +90,19 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None,
                 next_page_url = None
 
             # multi-process all records in current page
-            with concurrent.futures.ProcessPoolExecutor() as executor:
-                results = [executor.submit(downloadPDFSpringer, record, directory) for record in data['records']]
-                for f in concurrent.futures.as_completed(results):
-                    print(f.result())
+            # with concurrent.futures.ProcessPoolExecutor() as executor:
+            #     results = []
+            #     for i in range(0, len(data['records']), 3):
+            #         batch = data['records'][i:i+3]
+            #         batch_results = [executor.submit(downloadPDFSpringer, record, directory) for record in batch]
+            #         results.extend(batch_results)
+            #     for f in concurrent.futures.as_completed(results):
+            #         print(f.result())
+
+            for i in range(len(data['records'])):
+                status = downloadPDFSpringer(data['records'][i], directory)
+                print("Status: ", status)
+                
 
             # update current records count
             current_records += int(len(data['records']))
@@ -141,21 +150,21 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None,
 
 
     # call ingest
-    beam_url = "https://41kgx.apps.beam.cloud"
-    headers = {
-    "Content-Type": "application/json",
-    "Authorization": "Basic " + os.getenv('BEAM_AUTH_TOKEN')    # type: ignore
-    }
-    for data in ingest_data:
-        payload = json.dumps(data)
-        response = requests.post(beam_url, headers=headers, data=payload)
-        if response.status_code == 200:
-            print("Task status retrieved successfully!")
-        else:
-            print(f"Error: {response.status_code}. {response.text}")
+    # beam_url = "https://41kgx.apps.beam.cloud"
+    # headers = {
+    # "Content-Type": "application/json",
+    # "Authorization": "Basic " + os.getenv('BEAM_AUTH_TOKEN')    # type: ignore
+    # }
+    # for data in ingest_data:
+    #     payload = json.dumps(data)
+    #     response = requests.post(beam_url, headers=headers, data=payload)
+    #     if response.status_code == 200:
+    #         print("Task status retrieved successfully!")
+    #     else:
+    #         print(f"Error: {response.status_code}. {response.text}")
 
-    # Delete files from local directory
-    shutil.rmtree(directory)
+    # # Delete files from local directory
+    # shutil.rmtree(directory)
                                 
     return "success"
 

From 11056b3970dee808111a5d3cde6ca3b8ab9a5dfe Mon Sep 17 00:00:00 2001
From: star-nox <dabholkar.asmita@gmail.com>
Date: Thu, 1 Aug 2024 11:02:20 -0500
Subject: [PATCH 14/18] minor changes

---
 ai_ta_backend/utils/pub_ingest.py | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py
index 9b8d038e..e771fca2 100644
--- a/ai_ta_backend/utils/pub_ingest.py
+++ b/ai_ta_backend/utils/pub_ingest.py
@@ -129,7 +129,7 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None,
         doi_link = f"https://doi.org/{doi}"
         data = {
             "course_name": course_name,
-            "group": "springer_open",
+            "groups": "springer_open",
             "s3_paths": "courses/" + course_name + "/" + file, # type: ignore
             "readable_filename": file,
             "base_url": "",
@@ -149,22 +149,36 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None,
         ingest_df.to_csv(csv_file, mode='a', header=False, index=False)
 
 
-    # call ingest
-    # beam_url = "https://41kgx.apps.beam.cloud"
+    # # call ingest
+    # beam_url = "https://3xn8l.apps.beam.cloud"
     # headers = {
     # "Content-Type": "application/json",
     # "Authorization": "Basic " + os.getenv('BEAM_AUTH_TOKEN')    # type: ignore
     # }
-    # for data in ingest_data:
+
+    # pubs_data = pd.read_csv(csv_file)
+
+    # for row in pubs_data.iterrows():
+    #     payload = {
+    #     "course_name": "cropwizard-pro",
+    #     "s3_paths": [row[1]["s3_paths"]],
+    #     "readable_filename": row[1]["readable_filename"],
+    #     "base_url": "",
+    #     "url": row[1]["url"],
+    #     "groups": ["Springer", "CC-BY", "Research Paper"]
+    #     }
+    #     print(payload)
     #     payload = json.dumps(data)
     #     response = requests.post(beam_url, headers=headers, data=payload)
+
     #     if response.status_code == 200:
     #         print("Task status retrieved successfully!")
     #     else:
     #         print(f"Error: {response.status_code}. {response.text}")
 
-    # # Delete files from local directory
+    # Delete files from local directory
     # shutil.rmtree(directory)
+    # os.remove(csv_file)
                                 
     return "success"
 

From f1a9090f2378802ecd32dc584fe5acf0b5f21afc Mon Sep 17 00:00:00 2001
From: star-nox <dabholkar.asmita@gmail.com>
Date: Wed, 21 Aug 2024 11:47:28 -0400
Subject: [PATCH 15/18] minor changes

---
 ai_ta_backend/utils/pub_ingest.py | 110 +++++++++++++++++++-----------
 1 file changed, 72 insertions(+), 38 deletions(-)

diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py
index e771fca2..7aa22a58 100644
--- a/ai_ta_backend/utils/pub_ingest.py
+++ b/ai_ta_backend/utils/pub_ingest.py
@@ -12,10 +12,10 @@
 
 SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY')
 CC_LICENSES = {
-    "http://creativecommons.org/licenses/by/4.0/": "CC BY",
-    "http://creativecommons.org/licenses/by-nc/4.0/": "CC BY-NC",
-    "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC BY-NC-ND",
-    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA"
+    "https://creativecommons.org/licenses/by/4.0/": "CC BY",
+    "https://creativecommons.org/licenses/by-nc/4.0/": "CC BY-NC",
+    "https://creativecommons.org/licenses/by-nc-nd/4.0/": "CC BY-NC-ND",
+    "https://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA"
 }
 
 OTHER_LICENSES = {
@@ -100,10 +100,17 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None,
             #         print(f.result())
 
             for i in range(len(data['records'])):
-                status = downloadPDFSpringer(data['records'][i], directory)
-                print("Status: ", status)
-                
+                article_metadata = downloadPDFSpringer(data['records'][i], directory)
+                article_metadata['issn'] = issn
 
+                # write article metadata to CSV file
+                metadata_csv = "springer_metadata.csv"
+                metadata_df = pd.DataFrame([article_metadata])
+                if not os.path.exists(metadata_csv):
+                    metadata_df.to_csv(metadata_csv, index=False)
+                else:
+                    metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False)
+                
             # update current records count
             current_records += int(len(data['records']))
 
@@ -118,35 +125,35 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None,
         except Exception as e:
             print(e)
 
-    print("Course name: ", course_name)
+    # print("Course name: ", course_name)
     # prep payload for beam ingest
-    ingest_data = []
+    # ingest_data = []
     
     # upload files to S3 bucket
-    for file in os.listdir(directory):
-        doi = file[:-4]
-        doi = doi.replace("_", "/")
-        doi_link = f"https://doi.org/{doi}"
-        data = {
-            "course_name": course_name,
-            "groups": "springer_open",
-            "s3_paths": "courses/" + course_name + "/" + file, # type: ignore
-            "readable_filename": file,
-            "base_url": "",
-            "url": doi_link,
-            "journal": "",
-        }
-        s3_path = "courses/" + course_name + "/" + file # type: ignore
-        s3_client.upload_file(directory + "/" + file, aws_bucket, s3_path)  # type: ignore
-        ingest_data.append(data)
+    # for file in os.listdir(directory):
+    #     doi = file[:-4]
+    #     doi = doi.replace("_", "/")
+    #     doi_link = f"https://doi.org/{doi}"
+    #     data = {
+    #         "course_name": course_name,
+    #         "groups": "springer_open",
+    #         "s3_paths": "courses/" + course_name + "/" + file, # type: ignore
+    #         "readable_filename": file,
+    #         "base_url": "",
+    #         "url": doi_link,
+    #         "journal": "",
+    #     }
+    #     s3_path = "courses/" + course_name + "/" + file # type: ignore
+    #     s3_client.upload_file(directory + "/" + file, aws_bucket, s3_path)  # type: ignore
+    #     ingest_data.append(data)
     
-    # save ingest data to csv
-    ingest_df = pd.DataFrame(ingest_data)
-    csv_file = "publications_data.csv"
-    if not os.path.exists(csv_file):
-        ingest_df.to_csv(csv_file, index=False)
-    else:
-        ingest_df.to_csv(csv_file, mode='a', header=False, index=False)
+    # # save ingest data to csv
+    # ingest_df = pd.DataFrame(ingest_data)
+    # csv_file = "publications_data.csv"
+    # if not os.path.exists(csv_file):
+    #     ingest_df.to_csv(csv_file, index=False)
+    # else:
+    #     ingest_df.to_csv(csv_file, mode='a', header=False, index=False)
 
 
     # # call ingest
@@ -190,6 +197,7 @@ def downloadPDFSpringer(record: dict, directory: str):
         record: dictionary containing DOI and other metadata
         directory: local directory to save the files
     """
+    print("in downloadPDFSpringer")
     headers = {'Accept': 'application/json'}
 
     if len(record['url']) < 1:
@@ -197,26 +205,42 @@ def downloadPDFSpringer(record: dict, directory: str):
 
     # extract URL
     url = record['url'][0]['value'] + "?api_key=" + str(SPRINGER_API_KEY)
+    print("URL: ", url)
     url_response = requests.get(url, headers=headers)
     if url_response.status_code != 200:
         return "Error in accessing article link: " + str(url_response.status_code) + " - " + url_response.text
     url_data = url_response.json()
 
+    if 'license' in url_data:
+        license_url = url_data['license'][0]['URL']
+        license = CC_LICENSES.get(license_url, license_url)
+        print("License: ", license)
+    else:
+        license = "unknown"
+        license_url = "unknown"
+
     # extract PDF link
     pdf_link = None
     links = url_data['link']
     for link in links:
         if link['content-type'] == 'application/pdf' and link['intended-application'] == 'text-mining':
             pdf_link = link['URL']
-            #print("PDF Link: ", pdf_link)
+            print("PDF Link: ", pdf_link)
             break
+        
     if not pdf_link:
-        return "No PDF link found for DOI: " + record['doi']
+        pdf_link = links[0]['URL']
+        print("PDF Link: ", pdf_link)
+        if not pdf_link:
+            return "No PDF link found for DOI: " + record['doi']
     
     # download PDF
-    filename = record['doi'].replace("/", "_")
-    if filename in ['10.1186_2196-5641-1-1', '10.1186_s40538-014-0009-x']:
-        return "Skipping: " + filename
+    print("Downloading PDF: ", record['doi'])
+    if 'doi' in record:
+        filename = record['doi'].replace("/", "_")
+    else:
+        filename = url_data['DOI'].replace("/", "_")
+
     try:
         response = requests.get(pdf_link)
         if response.status_code != 200:
@@ -226,7 +250,17 @@ def downloadPDFSpringer(record: dict, directory: str):
             for chunk in response.iter_content(chunk_size=1024):  # Download in chunks
                 f.write(chunk)
         print("Downloaded: ", filename)
-        return "success"
+
+        # form metadata
+        metadata = {
+            "doi": record['doi'],
+            "publisher": record['publisher'],
+            "issn": record['issn'],
+            "license": license,
+            "license_url": license_url,
+            "metadata": url_data,
+        }
+        return metadata
     except Exception as e:
         return "Error in downloading PDF: " + str(e)
 

From 5fed7ae80f59689e1bce5e171918f4ffb33a3816 Mon Sep 17 00:00:00 2001
From: star-nox <dabholkar.asmita@gmail.com>
Date: Mon, 9 Sep 2024 23:07:42 -0500
Subject: [PATCH 16/18] minor changes for download

---
 ai_ta_backend/utils/pub_ingest.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py
index 7aa22a58..699ae1bc 100644
--- a/ai_ta_backend/utils/pub_ingest.py
+++ b/ai_ta_backend/utils/pub_ingest.py
@@ -12,10 +12,10 @@
 
 SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY')
 CC_LICENSES = {
-    "https://creativecommons.org/licenses/by/4.0/": "CC BY",
-    "https://creativecommons.org/licenses/by-nc/4.0/": "CC BY-NC",
-    "https://creativecommons.org/licenses/by-nc-nd/4.0/": "CC BY-NC-ND",
-    "https://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA"
+    "http://creativecommons.org/licenses/by/4.0/": "CC BY",
+    "http://creativecommons.org/licenses/by-nc/4.0/": "CC BY-NC",
+    "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC BY-NC-ND",
+    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA"
 }
 
 OTHER_LICENSES = {

From acda1a1cfffe5128c465b3c4978864726939ec37 Mon Sep 17 00:00:00 2001
From: star-nox <dabholkar.asmita@gmail.com>
Date: Thu, 12 Sep 2024 14:00:08 -0500
Subject: [PATCH 17/18] minor changes

---
 ai_ta_backend/utils/pub_ingest.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py
index 699ae1bc..cc59f928 100644
--- a/ai_ta_backend/utils/pub_ingest.py
+++ b/ai_ta_backend/utils/pub_ingest.py
@@ -98,8 +98,12 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None,
             #         results.extend(batch_results)
             #     for f in concurrent.futures.as_completed(results):
             #         print(f.result())
-
+            print("Total records: ", len(data['records']))
+            
             for i in range(len(data['records'])):
+                print("i: ", i)
+                print("Processing record: ", data['records'][i])
+                print("\n")
                 article_metadata = downloadPDFSpringer(data['records'][i], directory)
                 article_metadata['issn'] = issn
 
@@ -207,8 +211,10 @@ def downloadPDFSpringer(record: dict, directory: str):
     url = record['url'][0]['value'] + "?api_key=" + str(SPRINGER_API_KEY)
     print("URL: ", url)
     url_response = requests.get(url, headers=headers)
+    print("URL response: ", url_response.status_code)
     if url_response.status_code != 200:
         return "Error in accessing article link: " + str(url_response.status_code) + " - " + url_response.text
+    
     url_data = url_response.json()
 
     if 'license' in url_data:
@@ -343,9 +349,9 @@ def downloadWileyFulltext(course_name=None, issn=[]):
 
 
     # fetch metadata
-    # for item in issn:
-    #     metadata_status = getCrossrefMetadata(item)
-    #     print("Metadata status: ", metadata_status)
+    for item in issn:
+        metadata_status = getCrossrefMetadata(item)
+        print("Metadata status: ", metadata_status)
     
     # download PDFs based on metadata
     metadata_csv = "wiley_metadata.csv"

From 183c181e77b9de8bd674dc6428493bafa42bbc3c Mon Sep 17 00:00:00 2001
From: star-nox <dabholkar.asmita@gmail.com>
Date: Mon, 4 Nov 2024 14:38:52 -0600
Subject: [PATCH 18/18] added nal download script

---
 ai_ta_backend/utils/nal_data_mining.py |   0
 ai_ta_backend/utils/nal_download.py    | 506 +++++++++++++++++++++++++
 2 files changed, 506 insertions(+)
 create mode 100644 ai_ta_backend/utils/nal_data_mining.py
 create mode 100644 ai_ta_backend/utils/nal_download.py

diff --git a/ai_ta_backend/utils/nal_data_mining.py b/ai_ta_backend/utils/nal_data_mining.py
new file mode 100644
index 00000000..e69de29b
diff --git a/ai_ta_backend/utils/nal_download.py b/ai_ta_backend/utils/nal_download.py
new file mode 100644
index 00000000..ca4d128c
--- /dev/null
+++ b/ai_ta_backend/utils/nal_download.py
@@ -0,0 +1,506 @@
+import os
+from supabase import create_client, Client
+import requests
+import boto3
+from dotenv import load_dotenv
+import datetime
+import time
+
+load_dotenv()
+print("Supabase URL: ", os.getenv("SUPABASE_URL"))
+# Report only whether the key is configured; echoing the secret would leak it into logs.
+print("Supabase API key set: ", bool(os.getenv("SUPABASE_API_KEY")))
+# Initialize the Supabase client
+SUPABASE_CLIENT: Client = create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_API_KEY"))
+SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY')
+DOWNLOAD_LOG = "download_log.txt"
+
+S3_CLIENT = boto3.client('s3', aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'))
+
+AWS_BUCKET = os.getenv('S3_BUCKET_NAME')
+
+CC_LICENSES = {
+    "http://creativecommons.org/licenses/by/4.0/": "CC BY",
+    "http://creativecommons.org/licenses/by-nc/4.0/": "CC BY-NC",
+    "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC BY-NC-ND",
+    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA"
+}
+
+OTHER_LICENSES = {
+    "http://onlinelibrary.wiley.com/termsAndConditions#vor": "wiley_tnc",
+    "http://onlinelibrary.wiley.com/termsAndConditions#am": "wiley_tnc",
+    "http://doi.wiley.com/10.1002/tdm_license_1.1": "wiley_tdm"
+}
+
+
+def main():
+    """Download and ingest every pending NAL publication, one batch at a time.
+
+    Returns "Success" when no pending row is left or a batch could not be advanced.
+    """
+    while True:
+        # fetch the next batch of pending, downloadable records from Supabase
+        response = SUPABASE_CLIENT.table("nal_publications").select("doi_number, publisher, metadata").eq("ingested", False).eq("downloadable", True).neq("publisher", "Wiley").limit(1000).execute()
+        data = response.data
+        print("No. of records: ", len(data))
+        handled = 0  # rows this pass marked as ingested or not downloadable
+        for record in data:
+            if 'Springer' in record['publisher']:
+                downloadSpringerFulltext(doi=record['doi_number'])
+            elif 'Wiley' in record['publisher']:
+                # Wiley downloads (downloadWileyPDF) are disabled; the row stays pending.
+                print('Wiley')
+                continue
+            elif 'Elsevier' in record['publisher']:
+                update_info = {"notes": "Elsevier articles not downloadable.", "downloadable": False, "modified_date": datetime.datetime.now().isoformat()}
+                SUPABASE_CLIENT.table("nal_publications").update(update_info).eq("doi_number", record['doi_number']).execute()
+            else:
+                print("publisher name: " , record['publisher'])
+                result = download_article_from_url(record['doi_number'], record['metadata'])
+                print(result)
+            handled += 1
+        # Skipped rows match the query again, so a batch that changed nothing would repeat forever.
+        if handled == 0:
+            break
+    return "Success"
+
+
+def download_article_from_url(doi, metadata):
+    """Download the PDF linked in `metadata`, then upload and ingest it.
+
+    Args:
+        doi: DOI identifying the row in the nal_publications table.
+        metadata: dict read for the 'link', 'license' and 'publisher' keys.
+    Returns:
+        "No download link present" when no link exists, otherwise a dict holding an
+        "error" or a "success" message. Every failure is also noted on the Supabase row.
+    """
+    print("in download_article_from_url: ", doi)
+    # A missing, null or empty 'link' / 'license' entry all count as absent.
+    if not metadata or not metadata.get('link'):
+        print("No link")
+        update_info = {"notes": "Download link absent.", "downloadable": False, "modified_date": datetime.datetime.now().isoformat()}
+        SUPABASE_CLIENT.table("nal_publications").update(update_info).eq("doi_number", doi).execute()
+        return "No download link present"
+
+    print("Link found")
+    pdf_link = metadata['link'][0]['URL']
+    if not metadata.get('license'):
+        print("No license")
+        update_info = {"notes": "License absent.", "downloadable": False, "modified_date": datetime.datetime.now().isoformat()}
+        SUPABASE_CLIENT.table("nal_publications").update(update_info).eq("doi_number", doi).execute()
+        return {"error": "License not found."}
+
+    license_label = get_license(metadata['license'][0]['URL'])
+    status = download_pdf_in_chunks(url=pdf_link, doi=doi)
+    if 'failed' in status:
+        print("Error in PDF download: ", status['failed'])
+        update_info = {"notes": str(status['failed']), "downloadable": False, "modified_date": datetime.datetime.now().isoformat()}
+        SUPABASE_CLIENT.table("nal_publications").update(update_info).eq("doi_number", doi).execute()
+        return {"error": "Error in PDF download."}
+
+    filepath = status['success']
+    updated_metadata = {
+        "doi": doi,
+        "filename": filepath.split("/")[-1],
+        "file_path": filepath,
+        "publisher": metadata['publisher'],
+        "license": license_label,
+    }
+    print("Updated metadata: ", updated_metadata)
+    ingest_status = upload_and_ingest(filepath, updated_metadata, doi)
+    print(ingest_status)
+    return {"success": "Downloaded and ingested successfully."}
+
+def upload_and_ingest(filepath, metadata, doi):
+    """Push a downloaded PDF to S3 and queue it for ingestion into cropwizard-1.5.
+
+    Reads metadata['publisher'] and the optional metadata['license'], posts the
+    ingest request, marks the Supabase row for `doi` as ingested, returns "success".
+    NOTE(review): the ingest response is not inspected before the row is marked.
+    """
+    course = "cropwizard-1.5"
+    pdf_name = os.path.basename(filepath)
+    s3_key = f"courses/{course}/{pdf_name}"
+    S3_CLIENT.upload_file(filepath, AWS_BUCKET, s3_key)
+
+    # collapse publisher name variants into a single group label
+    group_label = metadata['publisher']
+    for known in ("Springer", "Wiley"):
+        if known in group_label:
+            group_label = known
+            break
+
+    groups = ["Research Papers", "NAL", group_label]
+    # a license that could not be identified is not used as a group
+    if metadata.get('license', 'Unknown') not in ('Unknown', 'unknown'):
+        groups.append(metadata['license'])
+
+    request_headers = {
+          'Accept': '*/*',
+          'Accept-Encoding': 'gzip, deflate',
+          'Authorization': f"Bearer {os.environ['BEAM_API_KEY']}",
+          'Content-Type': 'application/json',
+    }
+    ingest_payload = {
+        "course_name": course,
+        "s3_paths": [s3_key],
+        "readable_filename": pdf_name,
+        "url": f"https://doi.org/{doi}",
+        "base_url": "",
+        "groups": groups,
+    }
+    print("FINAL INGEST PAYLOAD: ", ingest_payload)
+    requests.post("https://ingest-task-queue-6ee4a59-v12.app.beam.cloud", headers=request_headers, json=ingest_payload)
+
+    # mark the row as ingested in Supabase
+    row_update = {"ingested": True, "modified_date": datetime.datetime.now().isoformat()}
+    SUPABASE_CLIENT.table("nal_publications").update(row_update).eq("doi_number", doi).execute()
+    return "success"
+
+
+def download_pdf_in_chunks(url, doi, chunk_size=1024, timeout=60):
+    """Stream the PDF at `url` into other_papers/<doi with "/" replaced by "_">.pdf.
+
+    Args:
+        url: direct link to the PDF.
+        doi: DOI used for the file name and for the Supabase row lookup.
+        chunk_size: bytes written per chunk.
+        timeout: seconds to wait on the server before the request is abandoned.
+    Returns:
+        {"success": filepath} on HTTP 200; otherwise {"failed": <status code or
+        request exception>}. A non-200 status is also noted on the Supabase row.
+    """
+    # create directory to store files
+    directory = "other_papers"
+    os.makedirs(directory, exist_ok=True)
+    filepath = directory + "/" + doi.replace("/", "_") + ".pdf"
+
+    try:
+        # The context manager releases the streamed connection on every exit path.
+        with requests.get(url, stream=True, timeout=timeout) as response:
+            if response.status_code != 200:
+                print(f"Failed to download PDF. Status code: {response.status_code}")
+                # update supabase
+                update_info = {"notes": f"Failed to download PDF (anti-bot). Status code: {response.status_code}",
+                               "downloadable": False, "modified_date": datetime.datetime.now().isoformat()}
+                SUPABASE_CLIENT.table("nal_publications").update(update_info).eq("doi_number", doi).execute()
+                return {"failed": response.status_code}
+
+            with open(filepath, 'wb') as file:
+                for chunk in response.iter_content(chunk_size=chunk_size):
+                    if chunk:  # Filter out keep-alive chunks
+                        file.write(chunk)
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred: {e}")
+        return {"failed": e}
+
+    print(f"PDF successfully downloaded and saved as {filepath}")
+    return {"success": filepath}
+    
+
+############# SPRINGER DOWNLOAD #############
+
+def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, doi=None, course_name=None):
+    """
+    Query the Springer Nature open-access API, then download and ingest the first matching article.
+    Args (the first non-empty one, in this order, forms the query):
+        doi: exact DOI; also identifies the nal_publications row that receives status notes
+        issn: ISSN number of the journal/book
+        journal / title: keywords occurring in the journal title / article title
+        subject: subject area - Chemistry, Physics, etc.
+        course_name: accepted for compatibility; not used here
+    Returns a short status string; failures are recorded as notes on the Supabase row.
+    """
+    print("in downloadSpringerFulltext")
+    # create directory to store files
+    directory = os.path.join(os.getcwd(), 'springer_papers')
+    os.makedirs(directory, exist_ok=True)
+
+    def mark_not_downloadable(note):  # record the failure on the article's Supabase row
+        update_info = {"notes": note, "downloadable": False, "modified_date": datetime.datetime.now().isoformat()}
+        SUPABASE_CLIENT.table("nal_publications").update(update_info).eq("doi_number", doi).execute()
+
+    # set headers
+    api_url = "http://api.springernature.com/openaccess/json?q="
+    headers = {'Accept': 'application/json'}
+
+    # form the query URL based on the input parameters received
+    if doi:
+        query_str = "doi:" + doi
+    elif issn:
+        query_str = "issn:" + issn
+    elif journal:
+        journal = "%22" + journal.replace(" ", "%20") + "%22"
+        query_str = "journal:" + journal
+    elif title:
+        title = "%22" + title.replace(" ", "%20") + "%22"
+        query_str = "title:" + title
+    elif subject:
+        query_str = "subject:" + subject
+    else:
+        return "No query parameters provided"
+
+    main_url = api_url + query_str + "&api_key=" + str(SPRINGER_API_KEY)
+    # Log the query without the key: the full URL carries the API secret.
+    print("Full URL: ", api_url + query_str + "&api_key=<redacted>")
+    response = requests.get(main_url, headers=headers)
+    if response.status_code != 200:
+        print("Error in accessing Springer API: ", response.text)
+        mark_not_downloadable(f"Error in accessing Springer API. Status code: {response.status_code} - {response.text}")
+        return "Error"
+
+    data = response.json()
+    # check for total number of records
+    total_records = int(data['result'][0]['total'])
+    if total_records == 0 or not data.get('records'):
+        mark_not_downloadable("Article is not OA.")
+        return "Article not OA."
+
+    # download paper
+    download_info = downloadPDFSpringer(data['records'][0], directory)
+    # Anything other than a metadata dict without an 'error' key is a failed download.
+    if not isinstance(download_info, dict):
+        mark_not_downloadable(str(download_info))
+    elif 'error' in download_info:
+        mark_not_downloadable(download_info['error'])
+    else:
+        print("Download info: ", download_info)
+        upload_and_ingest(download_info['file_path'], download_info, doi or download_info.get('doi'))
+    return "success"
+
+def downloadPDFSpringer(record: dict, directory: str):
+    """
+    Download the PDF for one record of the Springer API response.
+    Looks the article up through the first entry of record['url'], picks a PDF link,
+    and writes <directory>/<doi with "/" replaced by "_">.pdf.
+    Args:
+        record: dictionary containing DOI and other metadata ('url', 'publisher', 'issn')
+        directory: local directory to save the files
+    Returns:
+        dict with doi, publisher, issn, license, license_url, filename and file_path on
+        success; {"error": <message>} on every failure, so callers can test for 'error'.
+    """
+    print("in downloadPDFSpringer")
+    headers = {'Accept': 'application/json'}
+
+    if not record.get('url'):
+        return {"error": "No download link found for DOI: " + str(record.get('doi'))}
+
+    # extract URL
+    url = record['url'][0]['value'] + "?api_key=" + str(SPRINGER_API_KEY)
+    url_response = requests.get(url, headers=headers)
+    if url_response.status_code != 200:
+        return {"error": "Error in accessing article link: " + str(url_response.status_code) + " - " + url_response.text}
+
+    url_data = url_response.json()
+    # Use one DOI everywhere: the record's own, else the one from the article lookup.
+    doi = record.get('doi') or url_data.get('DOI')
+    if not doi:
+        return {"error": "No DOI found in Springer record."}
+
+    if url_data.get('license'):
+        license_url = url_data['license'][0]['URL']
+        license_label = get_license(license_url)
+        print("License: ", license_label)
+    else:
+        license_label = "unknown"
+        license_url = "unknown"
+
+    # extract PDF link
+    links = url_data.get('link')
+    if not links:
+        return {"error": "No link found for DOI: " + doi}
+
+    # prefer the PDF flagged for text mining, otherwise fall back to the first link
+    pdf_link = None
+    for link in links:
+        if link.get('content-type') == 'application/pdf' and link.get('intended-application') == 'text-mining':
+            pdf_link = link.get('URL')
+            break
+
+    if not pdf_link:
+        pdf_link = links[0].get('URL')
+    if not pdf_link:
+        return {"error": "No PDF link found for DOI: " + doi}
+
+    # download PDF
+    filename = doi.replace("/", "_") + ".pdf"
+    file_path = directory + "/" + filename
+
+    try:
+        response = requests.get(pdf_link)
+        if response.status_code != 200:
+            return {"error": "Error in downloading PDF: " + str(response.status_code) + " - " + response.text}
+
+        with open(file_path, "wb") as f:
+            for chunk in response.iter_content(chunk_size=1024):  # write in chunks
+                f.write(chunk)
+
+        # form metadata
+        return {
+            "doi": doi,
+            "publisher": record['publisher'],
+            "issn": record['issn'],
+            "license": license_label,
+            "license_url": license_url,
+            "filename": filename,
+            "file_path": file_path,
+        }
+    except Exception as e:
+        return {"error": "Error in downloading PDF: " + str(e)}
+
+
+def downloadWileyPDF(doi, metadata):
+    """
+    Download a PDF from the Wiley TDM API by DOI, then upload and ingest it.
+    Args:
+        doi: DOI of the article; also identifies the nal_publications row
+        metadata: dict read for the 'license' and 'publisher' keys
+    Returns:
+        {"success": ...} after ingest; {"error": <reason>} on any failure, in
+        which case the row is marked not downloadable with the same reason.
+    """
+    print("in downloadWileyPDF")
+    try:
+        # create directory to store files
+        directory = "wiley_papers"
+        os.makedirs(directory, exist_ok=True)
+
+        # download PDF based on doi
+        url = "https://api.wiley.com/onlinelibrary/tdm/v1/articles/" + str(doi)
+        print("URL: ", url)
+        headers = {
+            'Wiley-TDM-Client-Token': os.environ.get("WILEY_TDM_TOKEN"),
+            'Content-Type': 'application/json'
+        }
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()
+
+        filename = str(doi).replace("/", "_") + ".pdf"
+        file_path = directory + "/" + filename
+        with open(file_path, "wb") as f:
+            for chunk in response.iter_content(chunk_size=1024):  # write in chunks
+                f.write(chunk)
+        print("Downloaded: ", filename)
+
+        # get license
+        license_label = get_license(metadata['license'][0]['URL'])
+        print("License: ", license_label)
+
+        # route to upload and ingest
+        updated_metadata = {
+            "doi": doi,
+            "filename": filename,
+            "file_path": file_path,
+            "publisher": metadata['publisher'],
+            "license": license_label,
+        }
+        print("Updated metadata: ", updated_metadata)
+        upload_and_ingest(file_path, updated_metadata, doi)
+        return {"success": "Downloaded and ingested successfully."}
+    except Exception as e:
+        print("Error: ", e)
+        # Record what actually failed: HTTP errors such as 403, missing metadata
+        # keys and upload problems all end up here, so do not assume a 403.
+        reason = f"Error in Wiley PDF download: {e}"
+        update_info = {"notes": reason, "downloadable": False, "modified_date": datetime.datetime.now().isoformat()}
+        SUPABASE_CLIENT.table("nal_publications").update(update_info).eq("doi_number", doi).execute()
+        return {"error": reason}
+
+
+def downloadWileyArticle(doi=None):
+    """Fetch Crossref metadata for `doi` and save the article PDF from the Wiley TDM API.
+
+    Returns "success" once the PDF is written, a message string when the license check or
+    the download fails, and None when `doi` is empty. Nothing is uploaded or ingested here."""
+    directory = os.path.join(os.getcwd(), 'wiley_papers')
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+
+    api_key = os.environ.get("WILEY_TDM_TOKEN")
+    metadata = {}
+    # NOTE(review): `Works` and `LICENSES` are never imported or defined in this file, so the
+    # branch below raises NameError as written; `Works` is presumably crossref.restful.Works.
+    if doi:
+        # get article metadata from Crossref
+        works = Works()
+        article_data = works.doi(doi)
+        print("Article license: ", article_data['license'])
+        # collect the URL of every license listed for the article
+        article_licenses = []
+        
+        for item in article_data['license']:
+            article_licenses.append(item['URL'])
+        print("Licenses: ", article_licenses)
+        # map the first license through LICENSES; an unknown or "closed_access" one aborts
+        for license in article_licenses:
+            if license in LICENSES:
+                print("License found: ", license)
+                if LICENSES[license] == "closed_access":
+                    return "Article is not open access."
+                else:
+                    metadata['license'] = LICENSES[license]
+                    break
+            else:
+                return "License not found."
+        # NOTE(review): the `else` above returns on the first unknown URL, so later licenses are never checked.
+        metadata['doi'] = doi
+        metadata['title'] = article_data['title'][0]
+        metadata['journal'] = article_data['container-title'][0]
+        metadata['publisher'] = article_data['publisher']
+        metadata['issn'] = article_data['ISSN'][0]
+        metadata['url'] = article_data['URL']
+
+        print("Metadata: ", metadata)
+
+        # download PDF based on doi
+        base_url = "https://api.wiley.com/onlinelibrary/tdm/v1/articles/"
+        url = base_url + str(doi)
+
+        print("URL: ", url)
+
+        headers = {
+            'Wiley-TDM-Client-Token': api_key,
+            'Content-Type': 'application/json'
+        }
+
+        response = requests.get(url, headers=headers)
+        if response.status_code != 200:
+            # no retry or backoff is implemented yet: log the failure and give up
+            print("Error in accessing article link, retrying: ", response.text)
+
+            return "Error in accessing article link: " + str(response.status_code) + " - " + response.text
+        
+        filename = str(doi).replace("/", "_")
+        with open(directory + "/" + filename + ".pdf", "wb") as f:  # Open a file in binary write mode ("wb")
+            for chunk in response.iter_content(chunk_size=1024):  # Download in chunks
+                f.write(chunk)
+        print("Downloaded: ", filename)
+
+        # TODO: upload file to S3 bucket
+
+        # TODO: prep payload for beam ingest
+
+        return "success"
+
+
+def get_license(url: str) -> str:
+    """Return the Creative Commons label ("CC BY", "CC BY-NC-SA", ...) named in a license URL.
+
+    The "by..." code must stand alone (not inside a longer word); matching ignores case.
+    BY-SA and BY-ND get their own labels instead of being reported as plain "CC BY".
+    Returns "Unknown" when `url` is empty or names no recognised license.
+    """
+    import re  # local import: the module's import block does not provide `re`
+
+    code = re.search(r"(?<![a-z0-9])by(-nc)?(-nd|-sa)?(?![a-z0-9])", (url or "").lower())
+    if code is None:
+        # Return 'Unknown' if no match is found
+        return "Unknown"
+
+    # Rebuild the label from the optional parts that matched, e.g. "by-nc-nd" -> "CC BY-NC-ND".
+    return "CC BY" + "".join(part.upper() for part in code.groups() if part)
+# Run the batch download only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
\ No newline at end of file