From 5bb0a1ec688260c5cb7163d827339bf4dd2f2190 Mon Sep 17 00:00:00 2001 From: star-nox Date: Tue, 23 Apr 2024 14:02:11 -0500 Subject: [PATCH 01/18] minor changes --- ai_ta_backend/main.py | 27 ++++++ ai_ta_backend/utils/pub_ingest.py | 154 ++++++++++++++++++++++++++++++ 2 files changed, 181 insertions(+) create mode 100644 ai_ta_backend/utils/pub_ingest.py diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py index c70466c0..a563fb3c 100644 --- a/ai_ta_backend/main.py +++ b/ai_ta_backend/main.py @@ -40,6 +40,7 @@ from ai_ta_backend.service.sentry_service import SentryService from ai_ta_backend.beam.nomic_logging import create_document_map +from ai_ta_backend.utils.pub_ingest import downloadSpringerFulltext app = Flask(__name__) CORS(app) @@ -382,6 +383,32 @@ def getTopContextsWithMQR(service: RetrievalService, posthog_service: PosthogSer return response +@app.route('/get-springer-fulltext', methods=['GET']) +def get_springer_data(): + course_name: str = request.args.get('course_name', default='', type=str) + issn = request.args.get('issn', default='', type=str) + subject = request.args.get('subject', default='', type=str) + journal = request.args.get('journal', default='', type=str) + title = request.args.get('title', default='', type=str) + doi = request.args.get('doi', default='', type=str) + + print("In /get-springer-fulltext") + + if (issn == '' and subject == '' and journal == '' and title == '' and doi == '') or course_name == '': + # proper web error "400 Bad request" + abort( + 400, + description= + f"Missing required parameters: 'issn' or 'subject' or 'title' or 'journal' or 'doi' and 'course_name' must be provided." 
+ ) + + fulltext = downloadSpringerFulltext(issn, subject, journal, title, doi, course_name) + + response = jsonify(fulltext) + response.headers.add('Access-Control-Allow-Origin', '*') + return response + + def configure(binder: Binder) -> None: binder.bind(RetrievalService, to=RetrievalService, scope=RequestScope) binder.bind(PosthogService, to=PosthogService, scope=SingletonScope) diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py new file mode 100644 index 00000000..ed7a9813 --- /dev/null +++ b/ai_ta_backend/utils/pub_ingest.py @@ -0,0 +1,154 @@ +import os +import shutil +import requests +import json +import arxiv +import crossref_commons.retrieval +import xml.etree.ElementTree as ET +import ftplib +from urllib.parse import urlparse +import urllib.parse +import supabase +import tarfile +import concurrent.futures +import time + +SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY') + +SUPABASE_CLIENT = supabase.create_client( # type: ignore + supabase_url=os.getenv('SUPABASE_URL'), # type: ignore + supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore + + +def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, doi=None, course_name=None): + """ + This function uses the Springer Nature API to download openaccess journal articles. + Args: + issn: limit to ISSN number of the journal/book + subject: limit articles to a specific subject - Chemistry, Physics, etc. + journal: limit to keywords occuring in journal title + title: limit to keywords occuring in article title + The initial API response returns a list of articles with metadata. 
+ + """ + # create directory to store files + directory = os.path.join(os.getcwd(), 'springer_papers') + if not os.path.exists(directory): + os.makedirs(directory) + + # set headers + api_url = "http://api.springernature.com/openaccess/json?q=" + headers = {'Accept': 'application/json'} + + # form the query URL based on the input parameters received + if doi: + query_str = "doi:" + doi + elif issn: + query_str = "issn:" + issn + elif journal: + journal = "%22" + journal.replace(" ", "%20") + "%22" + query_str = "journal:" + journal + elif title: + title = "%22" + title.replace(" ", "%20") + "%22" + query_str = "title:" + title + elif subject: + query_str = "subject:" + subject + else: + return "No query parameters provided" + + main_url = api_url + query_str + "&api_key=" + str(SPRINGER_API_KEY) + print("Full URL: ", main_url) + + response = requests.get(main_url, headers=headers) + print("Status: ", response.status_code) + + if response.status_code != 200: + return "Error: " + str(response.status_code) + " - " + response.text + + data = response.json() + # check for total number of records + total_records = int(data['result'][0]['total']) + print("Total records: ", total_records) + current_records = 0 + + while current_records < total_records: + # check if nextPage exists + if 'nextPage' in data: + next_page_url = "http://api.springernature.com" + data['nextPage'] + else: + next_page_url = None + + # multi-process all records in current page + with concurrent.futures.ProcessPoolExecutor() as executor: + results = [executor.submit(downloadPDFSpringer, record, directory) for record in data['records']] + for f in concurrent.futures.as_completed(results): + print(f.result()) + + # update current records count + current_records += int(len(data['records'])) + + # if next page exists, update next page url and call the API again + if next_page_url: + # API key is already present in the URL + response = requests.get(next_page_url, headers=headers) + if response.status_code 
!= 200: + return "Error in next page: " + str(response.status_code) + " - " + response.text + + data = response.json() + + # call ingest function here + + + # call document groups API + + + # # Delete files from local directory + # shutil.rmtree(directory) + + return "success" + +def downloadPDFSpringer(record: dict, directory: str): + """ + This function takes a record from the Springer API response and downloads the PDF file. + It is called in a multi-process loop in downloadSpringerFulltext(). + Args: + record: dictionary containing DOI and other metadata + directory: local directory to save the files + """ + headers = {'Accept': 'application/json'} + + if len(record['url']) < 1: + return "No download link found for DOI: " + record['doi'] + + # extract URL + url = record['url'][0]['value'] + "?api_key=" + str(SPRINGER_API_KEY) + url_response = requests.get(url, headers=headers) + if url_response.status_code != 200: + return "Error in accessing article link: " + str(url_response.status_code) + " - " + url_response.text + url_data = url_response.json() + + # extract PDF link + pdf_link = None + links = url_data['link'] + for link in links: + if link['content-type'] == 'application/pdf' and link['intended-application'] == 'text-mining': + pdf_link = link['URL'] + #print("PDF Link: ", pdf_link) + break + if not pdf_link: + return "No PDF link found for DOI: " + record['doi'] + + # download PDF + filename = record['doi'].replace("/", "_") + try: + response = requests.get(pdf_link) + if response.status_code != 200: + return "Error in downloading PDF: " + str(response.status_code) + " - " + response.text + + with open(directory + "/" + filename + ".pdf", "wb") as f: # Open a file in binary write mode ("wb") + for chunk in response.iter_content(chunk_size=1024): # Download in chunks + f.write(chunk) + print("Downloaded: ", filename) + return "success" + except Exception as e: + return "Error in downloading PDF: " + str(e) From 970703be9c7978a7962b04de5c58fce6e4d8f8ac 
Mon Sep 17 00:00:00 2001 From: star-nox Date: Fri, 26 Apr 2024 15:44:32 -0500 Subject: [PATCH 02/18] added beam ingest --- ai_ta_backend/database/sql.py | 4 +- ai_ta_backend/utils/pub_ingest.py | 69 +++++++++++++++++++++++-------- 2 files changed, 55 insertions(+), 18 deletions(-) diff --git a/ai_ta_backend/database/sql.py b/ai_ta_backend/database/sql.py index caf0ac51..cbee1a66 100644 --- a/ai_ta_backend/database/sql.py +++ b/ai_ta_backend/database/sql.py @@ -11,7 +11,7 @@ def __init__(self): # Create a Supabase client self.supabase_client = supabase.create_client( # type: ignore supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY']) - + def getAllMaterialsForCourse(self, course_name: str): return self.supabase_client.table( os.environ['SUPABASE_DOCUMENTS_TABLE']).select('course_name, s3_path, readable_filename, url, base_url').eq( @@ -110,3 +110,5 @@ def updateProjects(self, course_name: str, data: dict): def getConversation(self, course_name: str, key: str, value: str): return self.supabase_client.table("llm-convo-monitor").select("*").eq(key, value).eq("course_name", course_name).execute() + def getCourseDocumentByS3Path(self, course_name: str, s3_path: str): + return self.supabase_client.table("documents").select("id, course_name, readable_filename, url, base_url, s3_path, created_at").eq("course_name", course_name).eq("s3_path", s3_path).execute() \ No newline at end of file diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py index ed7a9813..9651cfae 100644 --- a/ai_ta_backend/utils/pub_ingest.py +++ b/ai_ta_backend/utils/pub_ingest.py @@ -1,23 +1,20 @@ import os +import json +import pandas as pd import shutil import requests -import json -import arxiv -import crossref_commons.retrieval +import supabase import xml.etree.ElementTree as ET -import ftplib from urllib.parse import urlparse -import urllib.parse -import supabase -import tarfile import concurrent.futures -import time +from 
ai_ta_backend.database import aws, sql SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY') -SUPABASE_CLIENT = supabase.create_client( # type: ignore - supabase_url=os.getenv('SUPABASE_URL'), # type: ignore - supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore +s3_client = aws.AWSStorage() +aws_bucket = os.getenv('S3_BUCKET_NAME') +supabase_client = supabase.create_client( # type: ignore + supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY']) def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, doi=None, course_name=None): @@ -31,6 +28,7 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, The initial API response returns a list of articles with metadata. """ + print("in downloadSpringerFulltext") # create directory to store files directory = os.path.join(os.getcwd(), 'springer_papers') if not os.path.exists(directory): @@ -96,14 +94,51 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, data = response.json() - # call ingest function here - - - # call document groups API + print("Course name: ", course_name) + # prep payload for beam ingest + ingest_data = [] + + # upload files to S3 bucket + for file in os.listdir(directory): + data = { + "course_name": course_name, + "s3_paths": "", + "readable_filename": "", + "base_url": "", + "url": "", + "issn": issn + } + s3_path = "courses/" + course_name + "/" + file # type: ignore + data["s3_paths"] = s3_path + data["readable_filename"] = file + s3_client.upload_file(directory + "/" + file, aws_bucket, s3_path) # type: ignore + ingest_data.append(data) + # save ingest data to csv + ingest_df = pd.DataFrame(ingest_data) + csv_file = "publications_data.csv" + if not os.path.exists(csv_file): + ingest_df.to_csv(csv_file, index=False) + else: + ingest_df.to_csv(csv_file, mode='a', header=False, index=False) + + + # call ingest + beam_url = "https://41kgx.apps.beam.cloud" + headers = { + 
"Content-Type": "application/json", + "Authorization": "Basic " + os.getenv('BEAM_AUTH_TOKEN') # type: ignore + } + for data in ingest_data: + payload = json.dumps(data) + response = requests.post(beam_url, headers=headers, data=payload) + if response.status_code == 200: + print("Task status retrieved successfully!") + else: + print(f"Error: {response.status_code}. {response.text}") - # # Delete files from local directory - # shutil.rmtree(directory) + # Delete files from local directory + shutil.rmtree(directory) return "success" From 042264c2471b5b75a14891f9582ffdbbcf43c6a4 Mon Sep 17 00:00:00 2001 From: star-nox Date: Sun, 28 Apr 2024 17:19:39 -0500 Subject: [PATCH 03/18] minor changes --- ai_ta_backend/utils/pub_ingest.py | 119 ++++++++++++++++-------------- 1 file changed, 62 insertions(+), 57 deletions(-) diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py index 9651cfae..f77fdc9b 100644 --- a/ai_ta_backend/utils/pub_ingest.py +++ b/ai_ta_backend/utils/pub_ingest.py @@ -35,64 +35,66 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, os.makedirs(directory) # set headers - api_url = "http://api.springernature.com/openaccess/json?q=" - headers = {'Accept': 'application/json'} - - # form the query URL based on the input parameters received - if doi: - query_str = "doi:" + doi - elif issn: - query_str = "issn:" + issn - elif journal: - journal = "%22" + journal.replace(" ", "%20") + "%22" - query_str = "journal:" + journal - elif title: - title = "%22" + title.replace(" ", "%20") + "%22" - query_str = "title:" + title - elif subject: - query_str = "subject:" + subject - else: - return "No query parameters provided" + # api_url = "http://api.springernature.com/openaccess/json?q=" + # headers = {'Accept': 'application/json'} + + # # form the query URL based on the input parameters received + # if doi: + # query_str = "doi:" + doi + # elif issn: + # query_str = "issn:" + issn + # elif journal: + # journal 
= "%22" + journal.replace(" ", "%20") + "%22" + # query_str = "journal:" + journal + # elif title: + # title = "%22" + title.replace(" ", "%20") + "%22" + # query_str = "title:" + title + # elif subject: + # query_str = "subject:" + subject + # else: + # return "No query parameters provided" - main_url = api_url + query_str + "&api_key=" + str(SPRINGER_API_KEY) - print("Full URL: ", main_url) + # main_url = api_url + query_str + "&api_key=" + str(SPRINGER_API_KEY) + # print("Full URL: ", main_url) - response = requests.get(main_url, headers=headers) - print("Status: ", response.status_code) - - if response.status_code != 200: - return "Error: " + str(response.status_code) + " - " + response.text - - data = response.json() - # check for total number of records - total_records = int(data['result'][0]['total']) - print("Total records: ", total_records) - current_records = 0 - - while current_records < total_records: - # check if nextPage exists - if 'nextPage' in data: - next_page_url = "http://api.springernature.com" + data['nextPage'] - else: - next_page_url = None - - # multi-process all records in current page - with concurrent.futures.ProcessPoolExecutor() as executor: - results = [executor.submit(downloadPDFSpringer, record, directory) for record in data['records']] - for f in concurrent.futures.as_completed(results): - print(f.result()) - - # update current records count - current_records += int(len(data['records'])) - - # if next page exists, update next page url and call the API again - if next_page_url: - # API key is already present in the URL - response = requests.get(next_page_url, headers=headers) - if response.status_code != 200: - return "Error in next page: " + str(response.status_code) + " - " + response.text - - data = response.json() + # response = requests.get(main_url, headers=headers) + # print("Status: ", response.status_code) + + # if response.status_code != 200: + # return "Error: " + str(response.status_code) + " - " + response.text + + # 
data = response.json() + # # check for total number of records + # total_records = int(data['result'][0]['total']) + # print("Total records: ", total_records) + # current_records = 0 + # while current_records < total_records: + # # check if nextPage exists + # try: + # if 'nextPage' in data: + # next_page_url = "http://api.springernature.com" + data['nextPage'] + # else: + # next_page_url = None + + # # multi-process all records in current page + # with concurrent.futures.ProcessPoolExecutor() as executor: + # results = [executor.submit(downloadPDFSpringer, record, directory) for record in data['records']] + # for f in concurrent.futures.as_completed(results): + # print(f.result()) + + # # update current records count + # current_records += int(len(data['records'])) + + # # if next page exists, update next page url and call the API again + # if next_page_url: + # # API key is already present in the URL + # response = requests.get(next_page_url, headers=headers) + # if response.status_code != 200: + # return "Error in next page: " + str(response.status_code) + " - " + response.text + + # data = response.json() + # except Exception as e: + # print(e) print("Course name: ", course_name) # prep payload for beam ingest @@ -102,11 +104,12 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, for file in os.listdir(directory): data = { "course_name": course_name, + "group": "springer_open", "s3_paths": "", "readable_filename": "", "base_url": "", "url": "", - "issn": issn + "journal": "rice", } s3_path = "courses/" + course_name + "/" + file # type: ignore data["s3_paths"] = s3_path @@ -175,6 +178,8 @@ def downloadPDFSpringer(record: dict, directory: str): # download PDF filename = record['doi'].replace("/", "_") + if filename in ['10.1186_2196-5641-1-1', '10.1186_s40538-014-0009-x']: + return "Skipping: " + filename try: response = requests.get(pdf_link) if response.status_code != 200: From 5d2d7a25c51a7fcdcb9eb6063d9f9dcc5add9882 Mon Sep 17 
00:00:00 2001 From: star-nox Date: Mon, 6 May 2024 15:30:43 -0500 Subject: [PATCH 04/18] added doi link in metadata --- ai_ta_backend/utils/pub_ingest.py | 131 +++++++++++++++--------------- 1 file changed, 66 insertions(+), 65 deletions(-) diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py index f77fdc9b..b2668702 100644 --- a/ai_ta_backend/utils/pub_ingest.py +++ b/ai_ta_backend/utils/pub_ingest.py @@ -35,85 +35,86 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, os.makedirs(directory) # set headers - # api_url = "http://api.springernature.com/openaccess/json?q=" - # headers = {'Accept': 'application/json'} - - # # form the query URL based on the input parameters received - # if doi: - # query_str = "doi:" + doi - # elif issn: - # query_str = "issn:" + issn - # elif journal: - # journal = "%22" + journal.replace(" ", "%20") + "%22" - # query_str = "journal:" + journal - # elif title: - # title = "%22" + title.replace(" ", "%20") + "%22" - # query_str = "title:" + title - # elif subject: - # query_str = "subject:" + subject - # else: - # return "No query parameters provided" + api_url = "http://api.springernature.com/openaccess/json?q=" + headers = {'Accept': 'application/json'} + + # form the query URL based on the input parameters received + if doi: + query_str = "doi:" + doi + elif issn: + query_str = "issn:" + issn + elif journal: + journal = "%22" + journal.replace(" ", "%20") + "%22" + query_str = "journal:" + journal + elif title: + title = "%22" + title.replace(" ", "%20") + "%22" + query_str = "title:" + title + elif subject: + query_str = "subject:" + subject + else: + return "No query parameters provided" - # main_url = api_url + query_str + "&api_key=" + str(SPRINGER_API_KEY) - # print("Full URL: ", main_url) + main_url = api_url + query_str + "&api_key=" + str(SPRINGER_API_KEY) + print("Full URL: ", main_url) - # response = requests.get(main_url, headers=headers) - # print("Status: ", 
response.status_code) - - # if response.status_code != 200: - # return "Error: " + str(response.status_code) + " - " + response.text - - # data = response.json() - # # check for total number of records - # total_records = int(data['result'][0]['total']) - # print("Total records: ", total_records) - # current_records = 0 - # while current_records < total_records: - # # check if nextPage exists - # try: - # if 'nextPage' in data: - # next_page_url = "http://api.springernature.com" + data['nextPage'] - # else: - # next_page_url = None - - # # multi-process all records in current page - # with concurrent.futures.ProcessPoolExecutor() as executor: - # results = [executor.submit(downloadPDFSpringer, record, directory) for record in data['records']] - # for f in concurrent.futures.as_completed(results): - # print(f.result()) - - # # update current records count - # current_records += int(len(data['records'])) - - # # if next page exists, update next page url and call the API again - # if next_page_url: - # # API key is already present in the URL - # response = requests.get(next_page_url, headers=headers) - # if response.status_code != 200: - # return "Error in next page: " + str(response.status_code) + " - " + response.text + response = requests.get(main_url, headers=headers) + print("Status: ", response.status_code) + + if response.status_code != 200: + return "Error: " + str(response.status_code) + " - " + response.text + + data = response.json() + # check for total number of records + total_records = int(data['result'][0]['total']) + print("Total records: ", total_records) + current_records = 0 + while current_records < total_records: + # check if nextPage exists + try: + if 'nextPage' in data: + next_page_url = "http://api.springernature.com" + data['nextPage'] + else: + next_page_url = None + + # multi-process all records in current page + with concurrent.futures.ProcessPoolExecutor() as executor: + results = [executor.submit(downloadPDFSpringer, record, directory) 
for record in data['records']] + for f in concurrent.futures.as_completed(results): + print(f.result()) + + # update current records count + current_records += int(len(data['records'])) + + # if next page exists, update next page url and call the API again + if next_page_url: + # API key is already present in the URL + response = requests.get(next_page_url, headers=headers) + if response.status_code != 200: + return "Error in next page: " + str(response.status_code) + " - " + response.text - # data = response.json() - # except Exception as e: - # print(e) + data = response.json() + except Exception as e: + print(e) print("Course name: ", course_name) # prep payload for beam ingest ingest_data = [] - + # upload files to S3 bucket for file in os.listdir(directory): + doi = file[:-4] + doi = doi.replace("_", "/") + doi_link = f"https://doi.org/{doi}" data = { "course_name": course_name, "group": "springer_open", - "s3_paths": "", - "readable_filename": "", + "s3_paths": "courses/" + course_name + "/" + file, # type: ignore + "readable_filename": file, "base_url": "", - "url": "", - "journal": "rice", + "url": doi_link, + "journal": "", } - s3_path = "courses/" + course_name + "/" + file # type: ignore - data["s3_paths"] = s3_path - data["readable_filename"] = file + s3_client.upload_file(directory + "/" + file, aws_bucket, s3_path) # type: ignore ingest_data.append(data) From 8c4d76df26348ff51b97178f434a3e9c02343c1d Mon Sep 17 00:00:00 2001 From: star-nox Date: Thu, 9 May 2024 10:51:30 -0500 Subject: [PATCH 05/18] minor changes --- ai_ta_backend/main.py | 24 +++- ai_ta_backend/utils/pub_ingest.py | 232 +++++++++++++++++++++++++++++- requirements.txt | 4 + 3 files changed, 256 insertions(+), 4 deletions(-) diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py index a563fb3c..c30a7a7b 100644 --- a/ai_ta_backend/main.py +++ b/ai_ta_backend/main.py @@ -40,7 +40,7 @@ from ai_ta_backend.service.sentry_service import SentryService from ai_ta_backend.beam.nomic_logging 
import create_document_map -from ai_ta_backend.utils.pub_ingest import downloadSpringerFulltext +from ai_ta_backend.utils.pub_ingest import downloadSpringerFulltext, downloadWileyFulltext app = Flask(__name__) CORS(app) @@ -408,6 +408,28 @@ def get_springer_data(): response.headers.add('Access-Control-Allow-Origin', '*') return response +@app.route('/get-wiley-fulltext', methods=['GET']) +def get_wiley_data(): + course_name: str = request.args.get('course_name', default='', type=str) + issn = request.args.get('issn', default='', type=str) + #doi = request.args.get('doi', default='', type=str) + + print("In /get-wiley-fulltext") + + if issn == '' or course_name == '': + # proper web error "400 Bad request" + abort( + 400, + description= + f"Missing required parameters: 'issn' or 'doi' and 'course_name' must be provided." + ) + + fulltext = downloadWileyFulltext(course_name, issn) + + response = jsonify(fulltext) + response.headers.add('Access-Control-Allow-Origin', '*') + return response + def configure(binder: Binder) -> None: binder.bind(RetrievalService, to=RetrievalService, scope=RequestScope) diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py index b2668702..7c73c303 100644 --- a/ai_ta_backend/utils/pub_ingest.py +++ b/ai_ta_backend/utils/pub_ingest.py @@ -1,15 +1,22 @@ import os import json +import time import pandas as pd import shutil import requests import supabase -import xml.etree.ElementTree as ET -from urllib.parse import urlparse import concurrent.futures +from crossref.restful import Works, Journals from ai_ta_backend.database import aws, sql SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY') +LICENSES = { + "http://onlinelibrary.wiley.com/termsAndConditions#vor": "closed_access", + "http://creativecommons.org/licenses/by/4.0/": "CC BY", + "http://creativecommons.org/licenses/by-nc/4.0/": "CC BY-NC", + "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC BY-NC-ND", + 
"http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA", +} s3_client = aws.AWSStorage() aws_bucket = os.getenv('S3_BUCKET_NAME') @@ -114,7 +121,7 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, "url": doi_link, "journal": "", } - + s3_path = "courses/" + course_name + "/" + file # type: ignore s3_client.upload_file(directory + "/" + file, aws_bucket, s3_path) # type: ignore ingest_data.append(data) @@ -193,3 +200,222 @@ def downloadPDFSpringer(record: dict, directory: str): return "success" except Exception as e: return "Error in downloading PDF: " + str(e) + + +def downloadWileyFulltext(course_name=None, issn=None): + """ + This function fetches metadata from Crossref and downloads + full-text articles from a given journal from Wiley. + """ + # create directory to store files + directory = os.path.join(os.getcwd(), 'wiley_papers') + if not os.path.exists(directory): + os.makedirs(directory) + + api_key = os.environ.get("WILEY_TDM_TOKEN") + metadata = [] + + # get journal metadata + journals = Journals() + works = journals.works(issn=issn) + count = 0 + for item in works: + open_access = True + count += 1 + article_metadata = {} + # check if the license is open access - variant of CC + if 'license' not in item: + continue + + for license in item['license']: + print("License URL: ", license['URL']) + if license['URL'] in LICENSES: + if LICENSES[license['URL']] == "closed_access": + #print("Article is not open access: ", item['DOI']) + open_access = False + else: + print("Article is open access: ", item['DOI']) + article_metadata['license'] = LICENSES[license['URL']] + article_metadata['license_link'] = license['URL'] + else: + article_metadata['license_link'] = license['URL'] + + if not open_access: + continue + + article_metadata['doi'] = item['DOI'] + article_metadata['title'] = item['title'][0] + article_metadata['journal'] = item['container-title'][0] + article_metadata['publisher'] = item['publisher'] + 
article_metadata['issn'] = item['ISSN'][0] + article_metadata['url'] = item['URL'] + article_metadata['filename'] = item['DOI'].replace("/", "_") + ".pdf" + + print("Article Metadata: ", article_metadata) + + # download PDF based on doi + download_status = downloadWileyPDF(item['DOI']) + print("Download status: ", download_status) + metadata.append(article_metadata) + + print("Download complete.") + print("Total articles: ", count) + metadata_csv = "wiley_metadata.csv" + metadata_df = pd.DataFrame(metadata) + if not os.path.exists(metadata_csv): + metadata_df.to_csv(metadata_csv, index=False) + else: + metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False) + # prep payload for beam ingest + # ingest_data = [] + + # # upload files to S3 bucket + # for file in os.listdir(directory): + # doi = file[:-4] + # doi = doi.replace("_", "/") + # doi_link = f"https://doi.org/{doi}" + # data = { + # "course_name": course_name, + # "group": "wiley", + # "s3_paths": "courses/" + course_name + "/" + file, # type: ignore + # "readable_filename": file, + # "base_url": "", + # "url": doi_link, + # "journal": "", + # } + # s3_path = "courses/" + course_name + "/" + file # type: ignore + # s3_client.upload_file(directory + "/" + file, aws_bucket, s3_path) # type: ignore + # ingest_data.append(data) + + # # save ingest data to csv + # ingest_df = pd.DataFrame(ingest_data) + # csv_file = "publications_data.csv" + # if not os.path.exists(csv_file): + # ingest_df.to_csv(csv_file, index=False) + # else: + # ingest_df.to_csv(csv_file, mode='a', header=False, index=False) + + + # # call ingest + # beam_url = "https://41kgx.apps.beam.cloud" + # headers = { + # "Content-Type": "application/json", + # "Authorization": "Basic " + os.getenv('BEAM_AUTH_TOKEN') # type: ignore + # } + # for data in ingest_data: + # payload = json.dumps(data) + # response = requests.post(beam_url, headers=headers, data=payload) + # if response.status_code == 200: + # print("Task status retrieved 
successfully!") + # else: + # print(f"Error: {response.status_code}. {response.text}") + + # Delete files from local directory + #shutil.rmtree(directory) + + +def downloadWileyPDF(doi=None): + """ + This function downloads a PDF file from Wiley based on the DOI. + """ + # create directory to store files + directory = os.path.join(os.getcwd(), 'wiley_papers') + if not os.path.exists(directory): + os.makedirs(directory) + + api_key = os.environ.get("WILEY_TDM_TOKEN") + + # download PDF based on doi + base_url = "https://api.wiley.com/onlinelibrary/tdm/v1/articles/" + url = base_url + str(doi) + print("URL: ", url) + + headers = { + 'Wiley-TDM-Client-Token': api_key, + 'Content-Type': 'application/json' + } + time.sleep(3) + response = requests.get(url, headers=headers) + if response.status_code != 200: + return "Error in accessing article link: " + str(response.status_code) + " - " + response.text + + filename = str(doi).replace("/", "_") + ".pdf" + with open(directory + "/" + filename, "wb") as f: # Open a file in binary write mode ("wb") + for chunk in response.iter_content(chunk_size=1024): # Download in chunks + f.write(chunk) + print("Downloaded: ", filename) + + return "success" + + +def downloadWileyArticle(doi=None): + """ + This function fetches metadata from Crossref and downloads open access full text articles from Wiley. 
+ """ + # create directory to store files + directory = os.path.join(os.getcwd(), 'wiley_papers') + if not os.path.exists(directory): + os.makedirs(directory) + + api_key = os.environ.get("WILEY_TDM_TOKEN") + metadata = {} + + # get metadata from Crossref + if doi: + # get article metadata + works = Works() + article_data = works.doi(doi) + print("Article license: ", article_data['license']) + + article_licenses = [] + + for item in article_data['license']: + article_licenses.append(item['URL']) + print("Licenses: ", article_licenses) + # check if the license is open access - variant of CC + for license in article_licenses: + if license in LICENSES: + print("License found: ", license) + if LICENSES[license] == "closed_access": + return "Article is not open access." + else: + metadata['license'] = LICENSES[license] + break + else: + return "License not found." + + metadata['doi'] = doi + metadata['title'] = article_data['title'][0] + metadata['journal'] = article_data['container-title'][0] + metadata['publisher'] = article_data['publisher'] + metadata['issn'] = article_data['ISSN'][0] + metadata['url'] = article_data['URL'] + + print("Metadata: ", metadata) + + # download PDF based on doi + base_url = "https://api.wiley.com/onlinelibrary/tdm/v1/articles/" + url = base_url + str(doi) + + print("URL: ", url) + + headers = { + 'Wiley-TDM-Client-Token': api_key, + 'Content-Type': 'application/json' + } + + response = requests.get(url, headers=headers) + if response.status_code != 200: + return "Error in accessing article link: " + str(response.status_code) + " - " + response.text + + filename = str(doi).replace("/", "_") + with open(directory + "/" + filename + ".pdf", "wb") as f: # Open a file in binary write mode ("wb") + for chunk in response.iter_content(chunk_size=1024): # Download in chunks + f.write(chunk) + print("Downloaded: ", filename) + + # upload file to S3 bucket + + # prep payload for beam ingest + + return "success" \ No newline at end of file diff --git 
a/requirements.txt b/requirements.txt index 848c10d0..4d75f5b7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -39,6 +39,10 @@ supabase==2.0.2 posthog==3.1.0 sentry-sdk==1.39.1 +# Publications +crossrefapi + + # Not currently supporting coursera ingest # cs-dlp @ git+https://github.com/raffaem/cs-dlp.git@0.12.0b0 # previously called coursera-dl From fac8c291cdab60aa2b440cb8cc3e3bfe20c32035 Mon Sep 17 00:00:00 2001 From: star-nox Date: Sat, 11 May 2024 16:33:01 -0500 Subject: [PATCH 06/18] minor changes --- ai_ta_backend/utils/pub_ingest.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py index 7c73c303..a7884303 100644 --- a/ai_ta_backend/utils/pub_ingest.py +++ b/ai_ta_backend/utils/pub_ingest.py @@ -8,6 +8,7 @@ import concurrent.futures from crossref.restful import Works, Journals from ai_ta_backend.database import aws, sql +import backoff SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY') LICENSES = { @@ -252,11 +253,12 @@ def downloadWileyFulltext(course_name=None, issn=None): article_metadata['filename'] = item['DOI'].replace("/", "_") + ".pdf" print("Article Metadata: ", article_metadata) + metadata.append(article_metadata) # download PDF based on doi download_status = downloadWileyPDF(item['DOI']) print("Download status: ", download_status) - metadata.append(article_metadata) + print("Download complete.") print("Total articles: ", count) @@ -313,7 +315,7 @@ def downloadWileyFulltext(course_name=None, issn=None): # Delete files from local directory #shutil.rmtree(directory) - +@backoff.on_exception(backoff.expo, requests.exceptions.HTTPError, max_tries=20) def downloadWileyPDF(doi=None): """ This function downloads a PDF file from Wiley based on the DOI. 
@@ -334,10 +336,9 @@ def downloadWileyPDF(doi=None): 'Wiley-TDM-Client-Token': api_key, 'Content-Type': 'application/json' } - time.sleep(3) + response = requests.get(url, headers=headers) - if response.status_code != 200: - return "Error in accessing article link: " + str(response.status_code) + " - " + response.text + response.raise_for_status() filename = str(doi).replace("/", "_") + ".pdf" with open(directory + "/" + filename, "wb") as f: # Open a file in binary write mode ("wb") @@ -406,6 +407,9 @@ def downloadWileyArticle(doi=None): response = requests.get(url, headers=headers) if response.status_code != 200: + # exponential backoff logic + print("Error in accessing article link, retrying: ", response.text) + return "Error in accessing article link: " + str(response.status_code) + " - " + response.text filename = str(doi).replace("/", "_") From c2a01fa1b47054e65a6002a0baf164c91e24eb82 Mon Sep 17 00:00:00 2001 From: star-nox Date: Tue, 21 May 2024 11:20:03 -0500 Subject: [PATCH 07/18] separated metadata extraction and download --- ai_ta_backend/utils/pub_ingest.py | 119 +++++++++++++++++++----------- 1 file changed, 77 insertions(+), 42 deletions(-) diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py index a7884303..c30452df 100644 --- a/ai_ta_backend/utils/pub_ingest.py +++ b/ai_ta_backend/utils/pub_ingest.py @@ -11,12 +11,17 @@ import backoff SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY') -LICENSES = { - "http://onlinelibrary.wiley.com/termsAndConditions#vor": "closed_access", +CC_LICENSES = { "http://creativecommons.org/licenses/by/4.0/": "CC BY", "http://creativecommons.org/licenses/by-nc/4.0/": "CC BY-NC", "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC BY-NC-ND", - "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA", + "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA" +} + +OTHER_LICENSES = { + "http://onlinelibrary.wiley.com/termsAndConditions#vor": "wiley_tnc", + 
"http://onlinelibrary.wiley.com/termsAndConditions#am": "wiley_tnc", + "http://doi.wiley.com/10.1002/tdm_license_1.1": "wiley_tdm" } s3_client = aws.AWSStorage() @@ -202,65 +207,51 @@ def downloadPDFSpringer(record: dict, directory: str): except Exception as e: return "Error in downloading PDF: " + str(e) - -def downloadWileyFulltext(course_name=None, issn=None): +def getCrossrefMetadata(issn: str): """ - This function fetches metadata from Crossref and downloads - full-text articles from a given journal from Wiley. + Creates a csv file with metadata of all articles for given journal (ISSN) """ - # create directory to store files - directory = os.path.join(os.getcwd(), 'wiley_papers') - if not os.path.exists(directory): - os.makedirs(directory) - - api_key = os.environ.get("WILEY_TDM_TOKEN") metadata = [] - # get journal metadata journals = Journals() works = journals.works(issn=issn) count = 0 + no_license = 0 for item in works: - open_access = True count += 1 article_metadata = {} # check if the license is open access - variant of CC if 'license' not in item: + no_license += 1 continue - - for license in item['license']: - print("License URL: ", license['URL']) - if license['URL'] in LICENSES: - if LICENSES[license['URL']] == "closed_access": - #print("Article is not open access: ", item['DOI']) - open_access = False + else: + for license in item['license']: + # check for creative commons license + if license['URL'] in CC_LICENSES: + article_metadata['license'] = CC_LICENSES[license['URL']] + article_metadata['license_url'] = license['URL'] + break + elif license['URL'] in OTHER_LICENSES: + article_metadata['license'] = OTHER_LICENSES[license['URL']] + article_metadata['license_url'] = license['URL'] else: - print("Article is open access: ", item['DOI']) - article_metadata['license'] = LICENSES[license['URL']] - article_metadata['license_link'] = license['URL'] - else: - article_metadata['license_link'] = license['URL'] - - if not open_access: - continue - + 
article_metadata['license'] = "unknown" + article_metadata['license_url'] = license['URL'] + article_metadata['doi'] = item['DOI'] - article_metadata['title'] = item['title'][0] + if 'title' not in item: + article_metadata['title'] = "No title found" + else: + article_metadata['title'] = item['title'][0] article_metadata['journal'] = item['container-title'][0] article_metadata['publisher'] = item['publisher'] article_metadata['issn'] = item['ISSN'][0] article_metadata['url'] = item['URL'] article_metadata['filename'] = item['DOI'].replace("/", "_") + ".pdf" - - print("Article Metadata: ", article_metadata) + article_metadata['downloaded'] = "no" metadata.append(article_metadata) - - # download PDF based on doi - download_status = downloadWileyPDF(item['DOI']) - print("Download status: ", download_status) - + print("Processed: ", article_metadata['doi']) - print("Download complete.") print("Total articles: ", count) metadata_csv = "wiley_metadata.csv" metadata_df = pd.DataFrame(metadata) @@ -268,7 +259,49 @@ def downloadWileyFulltext(course_name=None, issn=None): metadata_df.to_csv(metadata_csv, index=False) else: metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False) - # prep payload for beam ingest + + return "success" + + +def downloadWileyFulltext(course_name=None, issn=None): + """ + This function fetches metadata from Crossref and downloads + full-text articles from a given journal from Wiley. 
+ """ + # create directory to store files + directory = os.path.join(os.getcwd(), 'wiley_papers') + if not os.path.exists(directory): + os.makedirs(directory) + + # fetch metadata + # metadata_status = getCrossrefMetadata(issn) + # print("Metadata status: ", metadata_status) + + + # download PDFs based on metadata + metadata_csv = "wiley_metadata.csv" + if os.path.exists(metadata_csv): + metadata_df = pd.read_csv(metadata_csv) + metadata = metadata_df.to_dict(orient='records') + + for item in metadata: + try: + if item['license'] in ['CC BY', 'CC BY-NC', 'CC BY-NC-ND', 'CC BY-NC-SA'] and item['downloaded'] == 'no': + status = downloadWileyPDF(item['doi']) + print("Download status: ", status) + if status == "success": + item['downloaded'] = 'yes' + + except Exception as e: + print(e) + + metadata_df = pd.DataFrame(metadata) + metadata_df.to_csv(metadata_csv, index=False) + + return "success" + + + # # prep payload for beam ingest # ingest_data = [] # # upload files to S3 bucket @@ -314,8 +347,10 @@ def downloadWileyFulltext(course_name=None, issn=None): # Delete files from local directory #shutil.rmtree(directory) + + -@backoff.on_exception(backoff.expo, requests.exceptions.HTTPError, max_tries=20) +@backoff.on_exception(backoff.expo, requests.exceptions.HTTPError, max_tries=7) def downloadWileyPDF(doi=None): """ This function downloads a PDF file from Wiley based on the DOI. 
From bdaf9760170e1ec7f1e9d74088b376acf762f32b Mon Sep 17 00:00:00 2001 From: star-nox Date: Tue, 28 May 2024 12:23:16 -0500 Subject: [PATCH 08/18] minor download code changes --- ai_ta_backend/main.py | 14 ++- ai_ta_backend/utils/pub_ingest.py | 198 ++++++++++++++++-------------- 2 files changed, 112 insertions(+), 100 deletions(-) diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py index c30a7a7b..ec8b6a73 100644 --- a/ai_ta_backend/main.py +++ b/ai_ta_backend/main.py @@ -408,15 +408,17 @@ def get_springer_data(): response.headers.add('Access-Control-Allow-Origin', '*') return response -@app.route('/get-wiley-fulltext', methods=['GET']) +@app.route('/get-wiley-fulltext', methods=['POST']) def get_wiley_data(): - course_name: str = request.args.get('course_name', default='', type=str) - issn = request.args.get('issn', default='', type=str) - #doi = request.args.get('doi', default='', type=str) - + data = request.get_json() + print(data) + + course_name = data['course_name'] + issn = data['issn'] + print("In /get-wiley-fulltext") - if issn == '' or course_name == '': + if issn == [] or course_name == '': # proper web error "400 Bad request" abort( 400, diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py index c30452df..2fe45ac2 100644 --- a/ai_ta_backend/utils/pub_ingest.py +++ b/ai_ta_backend/utils/pub_ingest.py @@ -211,59 +211,62 @@ def getCrossrefMetadata(issn: str): """ Creates a csv file with metadata of all articles for given journal (ISSN) """ - metadata = [] - # get journal metadata - journals = Journals() - works = journals.works(issn=issn) - count = 0 - no_license = 0 - for item in works: - count += 1 - article_metadata = {} - # check if the license is open access - variant of CC - if 'license' not in item: - no_license += 1 - continue - else: - for license in item['license']: - # check for creative commons license - if license['URL'] in CC_LICENSES: - article_metadata['license'] = CC_LICENSES[license['URL']] - 
article_metadata['license_url'] = license['URL'] - break - elif license['URL'] in OTHER_LICENSES: - article_metadata['license'] = OTHER_LICENSES[license['URL']] - article_metadata['license_url'] = license['URL'] - else: - article_metadata['license'] = "unknown" - article_metadata['license_url'] = license['URL'] - - article_metadata['doi'] = item['DOI'] - if 'title' not in item: - article_metadata['title'] = "No title found" + try: + metadata = [] + # get journal metadata + journals = Journals() + works = journals.works(issn=issn) + count = 0 + no_license = 0 + for item in works: + count += 1 + article_metadata = {} + # check if the license is open access - variant of CC + if 'license' not in item: + no_license += 1 + continue + else: + for license in item['license']: + # check for creative commons license + if license['URL'] in CC_LICENSES: + article_metadata['license'] = CC_LICENSES[license['URL']] + article_metadata['license_url'] = license['URL'] + break + elif license['URL'] in OTHER_LICENSES: + article_metadata['license'] = OTHER_LICENSES[license['URL']] + article_metadata['license_url'] = license['URL'] + else: + article_metadata['license'] = "unknown" + article_metadata['license_url'] = license['URL'] + + article_metadata['doi'] = item['DOI'] + if 'title' not in item: + article_metadata['title'] = "No title found" + else: + article_metadata['title'] = item['title'][0] + article_metadata['journal'] = item['container-title'][0] + article_metadata['publisher'] = item['publisher'] + article_metadata['issn'] = item['ISSN'][0] + article_metadata['url'] = item['URL'] + article_metadata['filename'] = item['DOI'].replace("/", "_") + ".pdf" + article_metadata['downloaded'] = "no" + metadata.append(article_metadata) + print("Processed: ", article_metadata['doi']) + + print("Total articles: ", count) + metadata_csv = "wiley_metadata.csv" + metadata_df = pd.DataFrame(metadata) + if not os.path.exists(metadata_csv): + metadata_df.to_csv(metadata_csv, index=False) else: - 
article_metadata['title'] = item['title'][0] - article_metadata['journal'] = item['container-title'][0] - article_metadata['publisher'] = item['publisher'] - article_metadata['issn'] = item['ISSN'][0] - article_metadata['url'] = item['URL'] - article_metadata['filename'] = item['DOI'].replace("/", "_") + ".pdf" - article_metadata['downloaded'] = "no" - metadata.append(article_metadata) - print("Processed: ", article_metadata['doi']) - - print("Total articles: ", count) - metadata_csv = "wiley_metadata.csv" - metadata_df = pd.DataFrame(metadata) - if not os.path.exists(metadata_csv): - metadata_df.to_csv(metadata_csv, index=False) - else: - metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False) - - return "success" + metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False) + + return "success" + except Exception as e: + return "Error: " + str(e) -def downloadWileyFulltext(course_name=None, issn=None): +def downloadWileyFulltext(course_name=None, issn=[]): """ This function fetches metadata from Crossref and downloads full-text articles from a given journal from Wiley. 
@@ -273,30 +276,33 @@ def downloadWileyFulltext(course_name=None, issn=None): if not os.path.exists(directory): os.makedirs(directory) - # fetch metadata - # metadata_status = getCrossrefMetadata(issn) - # print("Metadata status: ", metadata_status) + # fetch metadata + for item in issn: + metadata_status = getCrossrefMetadata(item) + print("Metadata status: ", metadata_status) # download PDFs based on metadata - metadata_csv = "wiley_metadata.csv" - if os.path.exists(metadata_csv): - metadata_df = pd.read_csv(metadata_csv) - metadata = metadata_df.to_dict(orient='records') - - for item in metadata: - try: - if item['license'] in ['CC BY', 'CC BY-NC', 'CC BY-NC-ND', 'CC BY-NC-SA'] and item['downloaded'] == 'no': - status = downloadWileyPDF(item['doi']) - print("Download status: ", status) - if status == "success": - item['downloaded'] = 'yes' - - except Exception as e: - print(e) + # metadata_csv = "wiley_metadata.csv" + # if os.path.exists(metadata_csv): + # metadata_df = pd.read_csv(metadata_csv) + # metadata = metadata_df.to_dict(orient='records') + + # for item in metadata: + # try: + # if item['license'] in ['CC BY', 'CC BY-NC', 'CC BY-NC-ND', 'CC BY-NC-SA'] and item['downloaded'] == 'no' and item['publisher'] == 'Wiley': + # status = downloadWileyPDF(item['doi']) + # print("Download status: ", status) + # if status == "success": + # item['downloaded'] = 'yes' + # time.sleep(5) + # except Exception as e: + # print(e) + + # #time.sleep(10) - metadata_df = pd.DataFrame(metadata) - metadata_df.to_csv(metadata_csv, index=False) + # metadata_df = pd.DataFrame(metadata) + # metadata_df.to_csv(metadata_csv, index=False) return "success" @@ -350,38 +356,42 @@ def downloadWileyFulltext(course_name=None, issn=None): -@backoff.on_exception(backoff.expo, requests.exceptions.HTTPError, max_tries=7) +#@backoff.on_exception(backoff.expo, requests.exceptions.HTTPError, max_tries=1) def downloadWileyPDF(doi=None): """ This function downloads a PDF file from Wiley based on the 
DOI. """ - # create directory to store files - directory = os.path.join(os.getcwd(), 'wiley_papers') - if not os.path.exists(directory): - os.makedirs(directory) + try: + # create directory to store files + directory = os.path.join(os.getcwd(), 'wiley_papers') + if not os.path.exists(directory): + os.makedirs(directory) - api_key = os.environ.get("WILEY_TDM_TOKEN") + api_key = os.environ.get("WILEY_TDM_TOKEN") - # download PDF based on doi - base_url = "https://api.wiley.com/onlinelibrary/tdm/v1/articles/" - url = base_url + str(doi) - print("URL: ", url) + # download PDF based on doi + base_url = "https://api.wiley.com/onlinelibrary/tdm/v1/articles/" + url = base_url + str(doi) + print("URL: ", url) - headers = { - 'Wiley-TDM-Client-Token': api_key, - 'Content-Type': 'application/json' - } - - response = requests.get(url, headers=headers) - response.raise_for_status() + headers = { + 'Wiley-TDM-Client-Token': api_key, + 'Content-Type': 'application/json' + } - filename = str(doi).replace("/", "_") + ".pdf" - with open(directory + "/" + filename, "wb") as f: # Open a file in binary write mode ("wb") - for chunk in response.iter_content(chunk_size=1024): # Download in chunks - f.write(chunk) - print("Downloaded: ", filename) - - return "success" + response = requests.get(url, headers=headers) + response.raise_for_status() + + filename = str(doi).replace("/", "_") + ".pdf" + with open(directory + "/" + filename, "wb") as f: # Open a file in binary write mode ("wb") + for chunk in response.iter_content(chunk_size=1024): # Download in chunks + f.write(chunk) + print("Downloaded: ", filename) + + return "success" + except Exception as e: + print("Error: ", e) + return "error" def downloadWileyArticle(doi=None): From 2943aef5e11cf9da409c984475d155ee4f551b56 Mon Sep 17 00:00:00 2001 From: star-nox Date: Tue, 28 May 2024 14:44:20 -0500 Subject: [PATCH 09/18] adding metadata to csv file line by line --- ai_ta_backend/utils/pub_ingest.py | 20 +++++++++++++------- 1 file 
changed, 13 insertions(+), 7 deletions(-) diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py index 2fe45ac2..8e2793d5 100644 --- a/ai_ta_backend/utils/pub_ingest.py +++ b/ai_ta_backend/utils/pub_ingest.py @@ -250,16 +250,22 @@ def getCrossrefMetadata(issn: str): article_metadata['url'] = item['URL'] article_metadata['filename'] = item['DOI'].replace("/", "_") + ".pdf" article_metadata['downloaded'] = "no" - metadata.append(article_metadata) + metadata_csv = "wiley_metadata.csv" + metadata_df = pd.DataFrame([article_metadata]) + if not os.path.exists(metadata_csv): + metadata_df.to_csv(metadata_csv, index=False) + else: + metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False) + #metadata.append(article_metadata) print("Processed: ", article_metadata['doi']) print("Total articles: ", count) - metadata_csv = "wiley_metadata.csv" - metadata_df = pd.DataFrame(metadata) - if not os.path.exists(metadata_csv): - metadata_df.to_csv(metadata_csv, index=False) - else: - metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False) + # metadata_csv = "wiley_metadata.csv" + # metadata_df = pd.DataFrame(metadata) + # if not os.path.exists(metadata_csv): + # metadata_df.to_csv(metadata_csv, index=False) + # else: + # metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False) return "success" except Exception as e: From 54b7a6f219b03716dc4f96c9c98c6e8f0d84a053 Mon Sep 17 00:00:00 2001 From: star-nox Date: Thu, 30 May 2024 14:58:32 -0500 Subject: [PATCH 10/18] minor changes --- ai_ta_backend/utils/pub_ingest.py | 124 +++++++++++++++--------------- 1 file changed, 63 insertions(+), 61 deletions(-) diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py index 8e2793d5..a4971713 100644 --- a/ai_ta_backend/utils/pub_ingest.py +++ b/ai_ta_backend/utils/pub_ingest.py @@ -219,46 +219,48 @@ def getCrossrefMetadata(issn: str): count = 0 no_license = 0 for item in works: - count += 1 - 
article_metadata = {} - # check if the license is open access - variant of CC - if 'license' not in item: - no_license += 1 - continue - else: - for license in item['license']: - # check for creative commons license - if license['URL'] in CC_LICENSES: - article_metadata['license'] = CC_LICENSES[license['URL']] - article_metadata['license_url'] = license['URL'] - break - elif license['URL'] in OTHER_LICENSES: - article_metadata['license'] = OTHER_LICENSES[license['URL']] - article_metadata['license_url'] = license['URL'] - else: - article_metadata['license'] = "unknown" - article_metadata['license_url'] = license['URL'] - - article_metadata['doi'] = item['DOI'] - if 'title' not in item: - article_metadata['title'] = "No title found" - else: - article_metadata['title'] = item['title'][0] - article_metadata['journal'] = item['container-title'][0] - article_metadata['publisher'] = item['publisher'] - article_metadata['issn'] = item['ISSN'][0] - article_metadata['url'] = item['URL'] - article_metadata['filename'] = item['DOI'].replace("/", "_") + ".pdf" - article_metadata['downloaded'] = "no" - metadata_csv = "wiley_metadata.csv" - metadata_df = pd.DataFrame([article_metadata]) - if not os.path.exists(metadata_csv): - metadata_df.to_csv(metadata_csv, index=False) - else: - metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False) - #metadata.append(article_metadata) - print("Processed: ", article_metadata['doi']) - + try: + count += 1 + article_metadata = {} + # check if the license is open access - variant of CC + if 'license' not in item: + no_license += 1 + continue + else: + for license in item['license']: + # check for creative commons license + if license['URL'] in CC_LICENSES: + article_metadata['license'] = CC_LICENSES[license['URL']] + article_metadata['license_url'] = license['URL'] + break + elif license['URL'] in OTHER_LICENSES: + article_metadata['license'] = OTHER_LICENSES[license['URL']] + article_metadata['license_url'] = license['URL'] + 
else: + article_metadata['license'] = "unknown" + article_metadata['license_url'] = license['URL'] + + article_metadata['doi'] = item['DOI'] + if 'title' not in item: + article_metadata['title'] = "No title found" + else: + article_metadata['title'] = item['title'][0] + article_metadata['journal'] = item['container-title'][0] + article_metadata['publisher'] = item['publisher'] + article_metadata['issn'] = item['ISSN'][0] + article_metadata['url'] = item['URL'] + article_metadata['filename'] = item['DOI'].replace("/", "_") + ".pdf" + article_metadata['downloaded'] = "no" + metadata_csv = "wiley_metadata.csv" + metadata_df = pd.DataFrame([article_metadata]) + if not os.path.exists(metadata_csv): + metadata_df.to_csv(metadata_csv, index=False) + else: + metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False) + #metadata.append(article_metadata) + print("Processed: ", article_metadata['doi']) + except Exception as e: + print("Error processing article: ", article_metadata['doi'], e) print("Total articles: ", count) # metadata_csv = "wiley_metadata.csv" # metadata_df = pd.DataFrame(metadata) @@ -284,31 +286,31 @@ def downloadWileyFulltext(course_name=None, issn=[]): # fetch metadata - for item in issn: - metadata_status = getCrossrefMetadata(item) - print("Metadata status: ", metadata_status) + # for item in issn: + # metadata_status = getCrossrefMetadata(item) + # print("Metadata status: ", metadata_status) # download PDFs based on metadata - # metadata_csv = "wiley_metadata.csv" - # if os.path.exists(metadata_csv): - # metadata_df = pd.read_csv(metadata_csv) - # metadata = metadata_df.to_dict(orient='records') - - # for item in metadata: - # try: - # if item['license'] in ['CC BY', 'CC BY-NC', 'CC BY-NC-ND', 'CC BY-NC-SA'] and item['downloaded'] == 'no' and item['publisher'] == 'Wiley': - # status = downloadWileyPDF(item['doi']) - # print("Download status: ", status) - # if status == "success": - # item['downloaded'] = 'yes' - # time.sleep(5) - # except 
Exception as e: - # print(e) + metadata_csv = "wiley_metadata.csv" + if os.path.exists(metadata_csv): + metadata_df = pd.read_csv(metadata_csv) + metadata = metadata_df.to_dict(orient='records') + + for item in metadata: + try: + if item['license'] in ['CC BY', 'CC BY-NC', 'CC BY-NC-ND', 'CC BY-NC-SA'] and item['downloaded'] == 'no' and item['publisher'] == 'Wiley': + status = downloadWileyPDF(item['doi']) + print("Download status: ", status) + if status == "success": + item['downloaded'] = 'yes' + time.sleep(5) + except Exception as e: + print(e) - # #time.sleep(10) + #time.sleep(10) - # metadata_df = pd.DataFrame(metadata) - # metadata_df.to_csv(metadata_csv, index=False) + metadata_df = pd.DataFrame(metadata) + metadata_df.to_csv(metadata_csv, index=False) return "success" From 74cca6e07261b32e349c9c618114e75003c0e985 Mon Sep 17 00:00:00 2001 From: star-nox Date: Thu, 6 Jun 2024 16:44:19 -0500 Subject: [PATCH 11/18] minor changes --- ai_ta_backend/utils/pub_ingest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py index a4971713..2665c1fd 100644 --- a/ai_ta_backend/utils/pub_ingest.py +++ b/ai_ta_backend/utils/pub_ingest.py @@ -286,9 +286,9 @@ def downloadWileyFulltext(course_name=None, issn=[]): # fetch metadata - # for item in issn: - # metadata_status = getCrossrefMetadata(item) - # print("Metadata status: ", metadata_status) + for item in issn: + metadata_status = getCrossrefMetadata(item) + print("Metadata status: ", metadata_status) # download PDFs based on metadata metadata_csv = "wiley_metadata.csv" From 5f2be3c469c775ed14bfc3ef243c983e964265d2 Mon Sep 17 00:00:00 2001 From: star-nox Date: Tue, 11 Jun 2024 11:05:36 -0500 Subject: [PATCH 12/18] minor download changes --- ai_ta_backend/utils/pub_ingest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py index 
2665c1fd..a4971713 100644 --- a/ai_ta_backend/utils/pub_ingest.py +++ b/ai_ta_backend/utils/pub_ingest.py @@ -286,9 +286,9 @@ def downloadWileyFulltext(course_name=None, issn=[]): # fetch metadata - for item in issn: - metadata_status = getCrossrefMetadata(item) - print("Metadata status: ", metadata_status) + # for item in issn: + # metadata_status = getCrossrefMetadata(item) + # print("Metadata status: ", metadata_status) # download PDFs based on metadata metadata_csv = "wiley_metadata.csv" From 1ed6b1474558f926a77c3b5fa38cf0c11b31aefd Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 1 Jul 2024 10:10:34 -0500 Subject: [PATCH 13/18] minor changes for download --- ai_ta_backend/utils/pub_ingest.py | 45 ++++++++++++++++++------------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py index a4971713..9b8d038e 100644 --- a/ai_ta_backend/utils/pub_ingest.py +++ b/ai_ta_backend/utils/pub_ingest.py @@ -90,10 +90,19 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, next_page_url = None # multi-process all records in current page - with concurrent.futures.ProcessPoolExecutor() as executor: - results = [executor.submit(downloadPDFSpringer, record, directory) for record in data['records']] - for f in concurrent.futures.as_completed(results): - print(f.result()) + # with concurrent.futures.ProcessPoolExecutor() as executor: + # results = [] + # for i in range(0, len(data['records']), 3): + # batch = data['records'][i:i+3] + # batch_results = [executor.submit(downloadPDFSpringer, record, directory) for record in batch] + # results.extend(batch_results) + # for f in concurrent.futures.as_completed(results): + # print(f.result()) + + for i in range(len(data['records'])): + status = downloadPDFSpringer(data['records'][i], directory) + print("Status: ", status) + # update current records count current_records += int(len(data['records'])) @@ -141,21 +150,21 @@ def 
downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, # call ingest - beam_url = "https://41kgx.apps.beam.cloud" - headers = { - "Content-Type": "application/json", - "Authorization": "Basic " + os.getenv('BEAM_AUTH_TOKEN') # type: ignore - } - for data in ingest_data: - payload = json.dumps(data) - response = requests.post(beam_url, headers=headers, data=payload) - if response.status_code == 200: - print("Task status retrieved successfully!") - else: - print(f"Error: {response.status_code}. {response.text}") + # beam_url = "https://41kgx.apps.beam.cloud" + # headers = { + # "Content-Type": "application/json", + # "Authorization": "Basic " + os.getenv('BEAM_AUTH_TOKEN') # type: ignore + # } + # for data in ingest_data: + # payload = json.dumps(data) + # response = requests.post(beam_url, headers=headers, data=payload) + # if response.status_code == 200: + # print("Task status retrieved successfully!") + # else: + # print(f"Error: {response.status_code}. {response.text}") - # Delete files from local directory - shutil.rmtree(directory) + # # Delete files from local directory + # shutil.rmtree(directory) return "success" From 11056b3970dee808111a5d3cde6ca3b8ab9a5dfe Mon Sep 17 00:00:00 2001 From: star-nox Date: Thu, 1 Aug 2024 11:02:20 -0500 Subject: [PATCH 14/18] minor changes --- ai_ta_backend/utils/pub_ingest.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py index 9b8d038e..e771fca2 100644 --- a/ai_ta_backend/utils/pub_ingest.py +++ b/ai_ta_backend/utils/pub_ingest.py @@ -129,7 +129,7 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, doi_link = f"https://doi.org/{doi}" data = { "course_name": course_name, - "group": "springer_open", + "groups": "springer_open", "s3_paths": "courses/" + course_name + "/" + file, # type: ignore "readable_filename": file, "base_url": "", @@ -149,22 +149,36 @@ def 
downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, ingest_df.to_csv(csv_file, mode='a', header=False, index=False) - # call ingest - # beam_url = "https://41kgx.apps.beam.cloud" + # # call ingest + # beam_url = "https://3xn8l.apps.beam.cloud" # headers = { # "Content-Type": "application/json", # "Authorization": "Basic " + os.getenv('BEAM_AUTH_TOKEN') # type: ignore # } - # for data in ingest_data: + + # pubs_data = pd.read_csv(csv_file) + + # for row in pubs_data.iterrows(): + # payload = { + # "course_name": "cropwizard-pro", + # "s3_paths": [row[1]["s3_paths"]], + # "readable_filename": row[1]["readable_filename"], + # "base_url": "", + # "url": row[1]["url"], + # "groups": ["Springer", "CC-BY", "Research Paper"] + # } + # print(payload) # payload = json.dumps(data) # response = requests.post(beam_url, headers=headers, data=payload) + # if response.status_code == 200: # print("Task status retrieved successfully!") # else: # print(f"Error: {response.status_code}. 
{response.text}") - # # Delete files from local directory + # Delete files from local directory # shutil.rmtree(directory) + # os.remove(csv_file) return "success" From f1a9090f2378802ecd32dc584fe5acf0b5f21afc Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 21 Aug 2024 11:47:28 -0400 Subject: [PATCH 15/18] minor changes --- ai_ta_backend/utils/pub_ingest.py | 110 +++++++++++++++++++----------- 1 file changed, 72 insertions(+), 38 deletions(-) diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py index e771fca2..7aa22a58 100644 --- a/ai_ta_backend/utils/pub_ingest.py +++ b/ai_ta_backend/utils/pub_ingest.py @@ -12,10 +12,10 @@ SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY') CC_LICENSES = { - "http://creativecommons.org/licenses/by/4.0/": "CC BY", - "http://creativecommons.org/licenses/by-nc/4.0/": "CC BY-NC", - "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC BY-NC-ND", - "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA" + "https://creativecommons.org/licenses/by/4.0/": "CC BY", + "https://creativecommons.org/licenses/by-nc/4.0/": "CC BY-NC", + "https://creativecommons.org/licenses/by-nc-nd/4.0/": "CC BY-NC-ND", + "https://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA" } OTHER_LICENSES = { @@ -100,10 +100,17 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, # print(f.result()) for i in range(len(data['records'])): - status = downloadPDFSpringer(data['records'][i], directory) - print("Status: ", status) - + article_metadata = downloadPDFSpringer(data['records'][i], directory) + article_metadata['issn'] = issn + # write article metadata to CSV file + metadata_csv = "springer_metadata.csv" + metadata_df = pd.DataFrame([article_metadata]) + if not os.path.exists(metadata_csv): + metadata_df.to_csv(metadata_csv, index=False) + else: + metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False) + # update current records count current_records += 
int(len(data['records'])) @@ -118,35 +125,35 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, except Exception as e: print(e) - print("Course name: ", course_name) + # print("Course name: ", course_name) # prep payload for beam ingest - ingest_data = [] + # ingest_data = [] # upload files to S3 bucket - for file in os.listdir(directory): - doi = file[:-4] - doi = doi.replace("_", "/") - doi_link = f"https://doi.org/{doi}" - data = { - "course_name": course_name, - "groups": "springer_open", - "s3_paths": "courses/" + course_name + "/" + file, # type: ignore - "readable_filename": file, - "base_url": "", - "url": doi_link, - "journal": "", - } - s3_path = "courses/" + course_name + "/" + file # type: ignore - s3_client.upload_file(directory + "/" + file, aws_bucket, s3_path) # type: ignore - ingest_data.append(data) + # for file in os.listdir(directory): + # doi = file[:-4] + # doi = doi.replace("_", "/") + # doi_link = f"https://doi.org/{doi}" + # data = { + # "course_name": course_name, + # "groups": "springer_open", + # "s3_paths": "courses/" + course_name + "/" + file, # type: ignore + # "readable_filename": file, + # "base_url": "", + # "url": doi_link, + # "journal": "", + # } + # s3_path = "courses/" + course_name + "/" + file # type: ignore + # s3_client.upload_file(directory + "/" + file, aws_bucket, s3_path) # type: ignore + # ingest_data.append(data) - # save ingest data to csv - ingest_df = pd.DataFrame(ingest_data) - csv_file = "publications_data.csv" - if not os.path.exists(csv_file): - ingest_df.to_csv(csv_file, index=False) - else: - ingest_df.to_csv(csv_file, mode='a', header=False, index=False) + # # save ingest data to csv + # ingest_df = pd.DataFrame(ingest_data) + # csv_file = "publications_data.csv" + # if not os.path.exists(csv_file): + # ingest_df.to_csv(csv_file, index=False) + # else: + # ingest_df.to_csv(csv_file, mode='a', header=False, index=False) # # call ingest @@ -190,6 +197,7 @@ def 
downloadPDFSpringer(record: dict, directory: str): record: dictionary containing DOI and other metadata directory: local directory to save the files """ + print("in downloadPDFSpringer") headers = {'Accept': 'application/json'} if len(record['url']) < 1: @@ -197,26 +205,42 @@ def downloadPDFSpringer(record: dict, directory: str): # extract URL url = record['url'][0]['value'] + "?api_key=" + str(SPRINGER_API_KEY) + print("URL: ", url) url_response = requests.get(url, headers=headers) if url_response.status_code != 200: return "Error in accessing article link: " + str(url_response.status_code) + " - " + url_response.text url_data = url_response.json() + if 'license' in url_data: + license_url = url_data['license'][0]['URL'] + license = CC_LICENSES.get(license_url, license_url) + print("License: ", license) + else: + license = "unknown" + license_url = "unknown" + # extract PDF link pdf_link = None links = url_data['link'] for link in links: if link['content-type'] == 'application/pdf' and link['intended-application'] == 'text-mining': pdf_link = link['URL'] - #print("PDF Link: ", pdf_link) + print("PDF Link: ", pdf_link) break + if not pdf_link: - return "No PDF link found for DOI: " + record['doi'] + pdf_link = links[0]['URL'] + print("PDF Link: ", pdf_link) + if not pdf_link: + return "No PDF link found for DOI: " + record['doi'] # download PDF - filename = record['doi'].replace("/", "_") - if filename in ['10.1186_2196-5641-1-1', '10.1186_s40538-014-0009-x']: - return "Skipping: " + filename + print("Downloading PDF: ", record['doi']) + if 'doi' in record: + filename = record['doi'].replace("/", "_") + else: + filename = url_data['DOI'].replace("/", "_") + try: response = requests.get(pdf_link) if response.status_code != 200: @@ -226,7 +250,17 @@ def downloadPDFSpringer(record: dict, directory: str): for chunk in response.iter_content(chunk_size=1024): # Download in chunks f.write(chunk) print("Downloaded: ", filename) - return "success" + + # form metadata + 
metadata = { + "doi": record['doi'], + "publisher": record['publisher'], + "issn": record['issn'], + "license": license, + "license_url": license_url, + "metadata": url_data, + } + return metadata except Exception as e: return "Error in downloading PDF: " + str(e) From 5fed7ae80f59689e1bce5e171918f4ffb33a3816 Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 9 Sep 2024 23:07:42 -0500 Subject: [PATCH 16/18] minor changes for download --- ai_ta_backend/utils/pub_ingest.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py index 7aa22a58..699ae1bc 100644 --- a/ai_ta_backend/utils/pub_ingest.py +++ b/ai_ta_backend/utils/pub_ingest.py @@ -12,10 +12,10 @@ SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY') CC_LICENSES = { - "https://creativecommons.org/licenses/by/4.0/": "CC BY", - "https://creativecommons.org/licenses/by-nc/4.0/": "CC BY-NC", - "https://creativecommons.org/licenses/by-nc-nd/4.0/": "CC BY-NC-ND", - "https://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA" + "http://creativecommons.org/licenses/by/4.0/": "CC BY", + "http://creativecommons.org/licenses/by-nc/4.0/": "CC BY-NC", + "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC BY-NC-ND", + "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA" } OTHER_LICENSES = { From acda1a1cfffe5128c465b3c4978864726939ec37 Mon Sep 17 00:00:00 2001 From: star-nox Date: Thu, 12 Sep 2024 14:00:08 -0500 Subject: [PATCH 17/18] minor changes --- ai_ta_backend/utils/pub_ingest.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py index 699ae1bc..cc59f928 100644 --- a/ai_ta_backend/utils/pub_ingest.py +++ b/ai_ta_backend/utils/pub_ingest.py @@ -98,8 +98,12 @@ def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, # results.extend(batch_results) # for f in 
concurrent.futures.as_completed(results): # print(f.result()) - + print("Total records: ", len(data['records'])) + for i in range(len(data['records'])): + print("i: ", i) + print("Processing record: ", data['records'][i]) + print("\n") article_metadata = downloadPDFSpringer(data['records'][i], directory) article_metadata['issn'] = issn @@ -207,8 +211,10 @@ def downloadPDFSpringer(record: dict, directory: str): url = record['url'][0]['value'] + "?api_key=" + str(SPRINGER_API_KEY) print("URL: ", url) url_response = requests.get(url, headers=headers) + print("URL response: ", url_response.status_code) if url_response.status_code != 200: return "Error in accessing article link: " + str(url_response.status_code) + " - " + url_response.text + url_data = url_response.json() if 'license' in url_data: @@ -343,9 +349,9 @@ def downloadWileyFulltext(course_name=None, issn=[]): # fetch metadata - # for item in issn: - # metadata_status = getCrossrefMetadata(item) - # print("Metadata status: ", metadata_status) + for item in issn: + metadata_status = getCrossrefMetadata(item) + print("Metadata status: ", metadata_status) # download PDFs based on metadata metadata_csv = "wiley_metadata.csv" From 183c181e77b9de8bd674dc6428493bafa42bbc3c Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 4 Nov 2024 14:38:52 -0600 Subject: [PATCH 18/18] added nal download script --- ai_ta_backend/utils/nal_data_mining.py | 0 ai_ta_backend/utils/nal_download.py | 506 +++++++++++++++++++++++++ 2 files changed, 506 insertions(+) create mode 100644 ai_ta_backend/utils/nal_data_mining.py create mode 100644 ai_ta_backend/utils/nal_download.py diff --git a/ai_ta_backend/utils/nal_data_mining.py b/ai_ta_backend/utils/nal_data_mining.py new file mode 100644 index 00000000..e69de29b diff --git a/ai_ta_backend/utils/nal_download.py b/ai_ta_backend/utils/nal_download.py new file mode 100644 index 00000000..ca4d128c --- /dev/null +++ b/ai_ta_backend/utils/nal_download.py @@ -0,0 +1,506 @@ +import os +from 
"""Download open-access NAL publications and ingest them into cropwizard-1.5.

Rows are read from the Supabase table ``nal_publications``. Each article is
fetched through a publisher-specific route, uploaded to S3 and posted to the
ingest queue; the outcome of every step is written back to the same table.
"""
import datetime
import os
import time

import boto3
import requests
from dotenv import load_dotenv
from supabase import Client, create_client

load_dotenv()
# Only report whether the key is configured; never write the secret to logs.
print("Supabase URL: ", os.getenv("SUPABASE_URL"))
print("Supabase API key set: ", bool(os.getenv("SUPABASE_API_KEY")))

# Initialize the Supabase client
SUPABASE_CLIENT: Client = create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_API_KEY"))
SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY')
DOWNLOAD_LOG = "download_log.txt"

S3_CLIENT = boto3.client('s3',
                         aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
                         aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'))

AWS_BUCKET = os.getenv('S3_BUCKET_NAME')

# Seconds to wait on any single HTTP call; requests has no default timeout.
REQUEST_TIMEOUT = 60

CC_LICENSES = {
    "http://creativecommons.org/licenses/by/4.0/": "CC BY",
    "http://creativecommons.org/licenses/by-nc/4.0/": "CC BY-NC",
    "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC BY-NC-ND",
    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA"
}

OTHER_LICENSES = {
    "http://onlinelibrary.wiley.com/termsAndConditions#vor": "wiley_tnc",
    "http://onlinelibrary.wiley.com/termsAndConditions#am": "wiley_tnc",
    "http://doi.wiley.com/10.1002/tdm_license_1.1": "wiley_tdm"
}

# Creative Commons licence code (the path segment after "/licenses/") -> label.
_CC_LICENSE_LABELS = {
    "by": "CC BY",
    "by-sa": "CC BY-SA",
    "by-nd": "CC BY-ND",
    "by-nc": "CC BY-NC",
    "by-nc-sa": "CC BY-NC-SA",
    "by-nc-nd": "CC BY-NC-ND",
}


def _update_record(doi, fields):
    """Update the ``nal_publications`` row for *doi*, stamping ``modified_date``.

    Does nothing when *doi* is falsy, so helpers called without a DOI cannot
    issue an update that filters on ``doi_number = None``.
    """
    if not doi:
        return None
    payload = dict(fields)
    payload["modified_date"] = datetime.datetime.now().isoformat()
    return SUPABASE_CLIENT.table("nal_publications").update(payload).eq("doi_number", doi).execute()


def _mark_not_downloadable(doi, note):
    """Record *note* on the row for *doi* and flag it as not downloadable."""
    return _update_record(doi, {"notes": note, "downloadable": False})


def main():
    """Process every pending, downloadable, non-Wiley row of ``nal_publications``.

    Rows are fetched in batches of 1000 and routed by publisher name. DOIs
    handled in this run are remembered, and the loop stops once a batch holds
    nothing new. Without that, a row that is skipped or fails without being
    updated would be returned by the query forever.

    Returns:
        The string "Success" once no unprocessed rows remain.
    """
    seen_dois = set()
    while True:
        # fetch records from SQL
        response = (SUPABASE_CLIENT.table("nal_publications")
                    .select("doi_number, publisher, metadata")
                    .eq("ingested", False)
                    .eq("downloadable", True)
                    .neq("publisher", "Wiley")
                    .limit(1000)
                    .execute())
        data = response.data or []
        print("No. of records: ", len(data))

        pending = [row for row in data if row['doi_number'] not in seen_dois]
        if not pending:
            break

        for record in pending:
            doi = record['doi_number']
            seen_dois.add(doi)
            publisher = record.get('publisher') or ""
            try:
                if 'Springer' in publisher:
                    # route to springer download
                    result = downloadSpringerFulltext(doi=doi)
                elif 'Wiley' in publisher:
                    # The original skipped these rows with `continue` ahead of
                    # the downloadWileyPDF call, so Wiley stays disabled here.
                    print('Wiley')
                    continue
                elif 'Elsevier' in publisher:
                    _mark_not_downloadable(doi, "Elsevier articles not downloadable.")
                    result = "Elsevier articles not downloadable."
                else:
                    # regular file save
                    print("publisher name: ", publisher)
                    result = download_article_from_url(doi, record.get('metadata'))
            except Exception as e:  # batch boundary: log and move on to the next row
                result = {"error": f"Unhandled error for DOI {doi}: {e}"}
            print(result)
    return "Success"


def download_article_from_url(doi, metadata):
    """Download an article from the first link in its metadata and ingest it.

    Args:
        doi: DOI of the article; also the key of its ``nal_publications`` row.
        metadata: dict read from the row's ``metadata`` column. The code uses
            ``link[0]['URL']``, ``license[0]['URL']`` and ``publisher``.

    Returns:
        ``{"success": ...}`` when the file was downloaded and ingested,
        otherwise ``{"error": ...}``. The row is updated on every failure path.
    """
    print("in download_article_from_url: ", doi)
    metadata = metadata or {}

    links = metadata.get('link') or []
    if not links or not links[0].get('URL'):
        print("No link")
        _mark_not_downloadable(doi, "Download link absent.")
        return {"error": "No download link present"}
    print("Link found")
    pdf_link = links[0]['URL']

    licenses = metadata.get('license') or []
    if not licenses or not licenses[0].get('URL'):
        print("No license")
        _mark_not_downloadable(doi, "License absent.")
        return {"error": "License not found."}
    license_label = get_license(licenses[0]['URL'])

    status = download_pdf_in_chunks(url=pdf_link, doi=doi)
    if 'failed' in status:
        print("Error in PDF download: ", status['failed'])
        # Keep the note descriptive rather than a bare status code.
        _mark_not_downloadable(doi, f"PDF download failed: {status['failed']}")
        return {"error": "Error in PDF download."}

    filepath = status['success']
    updated_metadata = {
        "doi": doi,
        "filename": filepath.split("/")[-1],
        "file_path": filepath,
        "publisher": metadata.get('publisher'),
        "license": license_label,
    }
    print("Updated metadata: ", updated_metadata)

    ingest_status = upload_and_ingest(filepath, updated_metadata, doi)
    print(ingest_status)
    if ingest_status != "success":
        return {"error": "Downloaded, but ingest failed."}

    return {"success": "Downloaded and ingested successfully."}


def upload_and_ingest(filepath, metadata, doi):
    """Upload a file to S3 and queue it for ingest into cropwizard-1.5.

    Args:
        filepath: local path of the downloaded file.
        metadata: dict; ``publisher`` and ``license`` (both optional) are added
            to the document groups.
        doi: DOI used for the document URL and to update the Supabase row.

    Returns:
        "success" when the ingest endpoint accepted the request, otherwise
        "failed". The row is marked ``ingested`` only on success.

    Raises:
        KeyError: if the ``BEAM_API_KEY`` environment variable is not set.
    """
    # Read the token first so a missing key fails before anything is uploaded.
    beam_api_key = os.environ['BEAM_API_KEY']

    filename = os.path.basename(filepath)
    s3_path = "courses/cropwizard-1.5/" + filename
    S3_CLIENT.upload_file(filepath, AWS_BUCKET, s3_path)

    # Collapse publisher name variants onto a single group label.
    publisher = metadata.get('publisher') or ""
    if 'Springer' in publisher:
        publisher = "Springer"
    elif 'Wiley' in publisher:
        publisher = "Wiley"

    groups = ["Research Papers", "NAL"]
    if publisher:
        groups.append(publisher)
    if metadata.get('license') and metadata['license'] not in ['Unknown', 'unknown']:
        groups.append(metadata['license'])

    # ingest
    ingest_url = "https://ingest-task-queue-6ee4a59-v12.app.beam.cloud"
    ingest_headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Authorization': f"Bearer {beam_api_key}",
        'Content-Type': 'application/json',
    }
    ingest_payload = {
        "course_name": "cropwizard-1.5",
        "s3_paths": [s3_path],
        "readable_filename": filename,
        "url": f"https://doi.org/{doi}",
        "base_url": "",
        "groups": groups,
    }

    print("FINAL INGEST PAYLOAD: ", ingest_payload)
    ingest_response = requests.post(ingest_url,
                                    headers=ingest_headers,
                                    json=ingest_payload,
                                    timeout=REQUEST_TIMEOUT)

    if not ingest_response.ok:
        note = f"Ingest request failed. Status code: {ingest_response.status_code}"
        print(note)
        # Leave `ingested` False so the article can be retried later.
        _update_record(doi, {"notes": note})
        return "failed"

    _update_record(doi, {"ingested": True})
    return "success"


def download_pdf_in_chunks(url, doi, chunk_size=1024):
    """Stream *url* into ``other_papers/<doi with '/' replaced by '_'>.pdf``.

    Args:
        url: address to download.
        doi: DOI used to build the file name and to update the Supabase row.
        chunk_size: number of bytes written per chunk.

    Returns:
        ``{"success": filepath}`` on HTTP 200. Otherwise ``{"failed": value}``
        where value is the HTTP status code or the raised exception. On a
        non-200 response the row is also marked as not downloadable.
    """
    # create directory to store files
    directory = "other_papers"
    os.makedirs(directory, exist_ok=True)

    try:
        # stream=True downloads in chunks; the context manager releases the
        # connection even if writing the file fails.
        with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as response:
            if response.status_code != 200:
                print(f"Failed to download PDF. Status code: {response.status_code}")
                _mark_not_downloadable(
                    doi, f"Failed to download PDF (anti-bot). Status code: {response.status_code}")
                return {"failed": response.status_code}

            filename = doi.replace("/", "_")
            filepath = directory + "/" + filename + ".pdf"
            with open(filepath, 'wb') as file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # Filter out keep-alive chunks
                        file.write(chunk)
            print(f"PDF successfully downloaded and saved as {filepath}")
            return {"success": filepath}

    except (requests.exceptions.RequestException, OSError) as e:
        print(f"An error occurred: {e}")
        return {"failed": e}


############# SPRINGER DOWNLOAD #############

def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, doi=None, course_name=None):
    """Download and ingest the first Springer Nature open-access hit for a query.

    The first non-empty argument, in the order doi, issn, journal, title,
    subject, selects the query field.

    Args:
        issn: limit to the ISSN number of the journal/book
        subject: limit articles to a specific subject - Chemistry, Physics, etc.
        journal: limit to keywords occurring in the journal title
        title: limit to keywords occurring in the article title
        doi: look up a single article; also the key of the row to update
        course_name: unused; kept for signature compatibility

    Returns:
        "success", "Article not OA.", "Error" or "No query parameters provided".
    """
    print("in downloadSpringerFulltext")
    # create directory to store files
    directory = os.path.join(os.getcwd(), 'springer_papers')
    os.makedirs(directory, exist_ok=True)

    # https, so the API key is not sent in clear text
    api_url = "https://api.springernature.com/openaccess/json"
    headers = {'Accept': 'application/json'}

    # form the query based on the input parameters received
    if doi:
        query_str = "doi:" + doi
    elif issn:
        query_str = "issn:" + issn
    elif journal:
        query_str = 'journal:"' + journal + '"'
    elif title:
        query_str = 'title:"' + title + '"'
    elif subject:
        query_str = "subject:" + subject
    else:
        return "No query parameters provided"

    # Log the query only; the full URL would expose the API key.
    print("Springer query: ", query_str)

    # `params` lets requests percent-encode the query (DOIs contain "/").
    response = requests.get(api_url,
                            headers=headers,
                            params={"q": query_str, "api_key": SPRINGER_API_KEY},
                            timeout=REQUEST_TIMEOUT)

    if response.status_code != 200:
        print("Error in accessing Springer API: ", response.text)
        _mark_not_downloadable(
            doi,
            f"Error in accessing Springer API. Status code: {response.status_code} - {response.text}")
        return "Error"

    data = response.json()
    # check for total number of records
    total_records = int(data['result'][0]['total'])
    records = data.get('records') or []

    if total_records == 0 or not records:
        # update supabase record and exit
        _mark_not_downloadable(doi, "Article is not OA.")
        return "Article not OA."

    # download paper
    download_info = downloadPDFSpringer(records[0], directory)
    if 'error' in download_info:
        _mark_not_downloadable(doi, download_info['error'])
        return "Error"

    # ingest
    print("Download info: ", download_info)
    ingest_status = upload_and_ingest(download_info['file_path'], download_info, doi)
    if ingest_status != "success":
        return "Error"

    return "success"


def downloadPDFSpringer(record: dict, directory: str):
    """Download the PDF for one record of a Springer API response.

    The record's first URL is requested as JSON to obtain the licence and the
    list of links. The text-mining PDF link is preferred, and the first listed
    link is the fallback.

    Args:
        record: dictionary containing DOI and other metadata
        directory: local directory to save the files

    Returns:
        On success a dict with ``doi``, ``publisher``, ``issn``, ``license``,
        ``license_url``, ``filename`` and ``file_path``. On any failure a dict
        ``{"error": message}``, never a bare string, so callers can test
        ``'error' in result`` reliably.
    """
    print("in downloadPDFSpringer")
    headers = {'Accept': 'application/json'}
    doi = record.get('doi')

    urls = record.get('url') or []
    if not urls:
        return {"error": "No download link found for DOI: " + str(doi)}

    # extract URL
    # NOTE(review): the Springer key is appended to whatever host this URL
    # points at. If it is a doi.org resolver link the key leaks to a third
    # party; confirm the key is actually required here.
    url = urls[0]['value'] + "?api_key=" + str(SPRINGER_API_KEY)

    try:
        url_response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        if url_response.status_code != 200:
            return {"error": "Error in accessing article link: " + str(url_response.status_code) + " - " +
                             url_response.text}
        url_data = url_response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        return {"error": "Error in accessing article link: " + str(e)}

    licenses = url_data.get('license') or []
    if licenses and licenses[0].get('URL'):
        license_url = licenses[0]['URL']
        license_label = get_license(license_url)
        print("License: ", license_label)
    else:
        license_label = "unknown"
        license_url = "unknown"

    # The record may lack a DOI; fall back to the one in the article metadata.
    doi = doi or url_data.get('DOI')
    if not doi:
        return {"error": "No DOI found for record."}

    # extract PDF link
    links = url_data.get('link') or []
    if not links:
        return {"error": "No link found for DOI: " + doi}

    pdf_link = None
    for link in links:
        if link.get('content-type') == 'application/pdf' and link.get('intended-application') == 'text-mining':
            pdf_link = link.get('URL')
            break
    if not pdf_link:
        pdf_link = links[0].get('URL')
    if not pdf_link:
        return {"error": "No PDF link found for DOI: " + doi}

    # download PDF
    filename = doi.replace("/", "_")
    file_path = directory + "/" + filename + ".pdf"
    try:
        with requests.get(pdf_link, stream=True, timeout=REQUEST_TIMEOUT) as response:
            if response.status_code != 200:
                return {"error": "Error in downloading PDF: " + str(response.status_code) + " - " + response.text}
            with open(file_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024):  # Download in chunks
                    f.write(chunk)
    except (requests.exceptions.RequestException, OSError) as e:
        return {"error": "Error in downloading PDF: " + str(e)}

    # form metadata
    return {
        "doi": doi,
        "publisher": record.get('publisher'),
        "issn": record.get('issn'),
        "license": license_label,
        "license_url": license_url,
        "filename": filename + ".pdf",
        "file_path": file_path,
    }


def downloadWileyPDF(doi, metadata):
    """Download a PDF from the Wiley TDM API by DOI, then upload and ingest it.

    Args:
        doi: DOI of the article.
        metadata: dict from the row's ``metadata`` column; ``license[0]['URL']``
            and ``publisher`` are read when present.

    Returns:
        ``{"success": ...}`` or ``{"error": ...}``. The row is marked as not
        downloadable only when the download itself fails; an ingest failure
        leaves ``downloadable`` unchanged.
    """
    print("in downloadWileyPDF")
    metadata = metadata or {}

    # create directory to store files
    directory = "wiley_papers"
    os.makedirs(directory, exist_ok=True)

    api_key = os.environ.get("WILEY_TDM_TOKEN")

    # download PDF based on doi
    base_url = "https://api.wiley.com/onlinelibrary/tdm/v1/articles/"
    url = base_url + str(doi)
    print("URL: ", url)

    headers = {
        'Wiley-TDM-Client-Token': api_key,
        'Content-Type': 'application/json'
    }

    filename = str(doi).replace("/", "_") + ".pdf"
    file_path = directory + "/" + filename
    try:
        with requests.get(url, headers=headers, stream=True, timeout=REQUEST_TIMEOUT) as response:
            response.raise_for_status()
            with open(file_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024):  # Download in chunks
                    f.write(chunk)
        print("Downloaded: ", filename)
    except requests.exceptions.HTTPError as e:
        print("Error: ", e)
        status_code = e.response.status_code if e.response is not None else None
        if status_code == 403:
            note = "403 client error (forbidden) in PDF download."
        else:
            note = f"HTTP {status_code} error in PDF download."
        _mark_not_downloadable(doi, note)
        return {"error": note}
    except (requests.exceptions.RequestException, OSError) as e:
        print("Error: ", e)
        note = f"Error in PDF download: {e}"
        _mark_not_downloadable(doi, note)
        return {"error": note}

    # get license
    licenses = metadata.get('license') or []
    license_label = get_license(licenses[0].get('URL')) if licenses else "Unknown"
    print("License: ", license_label)

    # route to upload and ingest
    updated_metadata = {
        "doi": doi,
        "filename": filename,
        "file_path": file_path,
        "publisher": metadata.get('publisher'),
        "license": license_label,
    }
    print("Updated metadata: ", updated_metadata)

    # The download succeeded, so an ingest problem must not be recorded as a
    # "403" download failure or flip `downloadable` to False.
    try:
        ingest_status = upload_and_ingest(file_path, updated_metadata, doi)
    except Exception as e:
        print("Error: ", e)
        _update_record(doi, {"notes": f"Error in upload/ingest: {e}"})
        return {"error": "Error in upload/ingest."}

    if ingest_status != "success":
        return {"error": "Downloaded, but ingest failed."}
    return {"success": "Downloaded and ingested successfully."}


def downloadWileyArticle(doi=None):
    """Fetch Crossref metadata for *doi* and download the article PDF from Wiley.

    NOTE(review): ``Works`` and ``LICENSES`` are referenced below but are not
    defined or imported anywhere in this module, so calling this function
    raises ``NameError`` as written. Import or define them before use.

    Returns:
        "success", or a string describing why the article was not downloaded.
    """
    if not doi:
        return "No DOI provided."

    # create directory to store files
    directory = os.path.join(os.getcwd(), 'wiley_papers')
    os.makedirs(directory, exist_ok=True)

    api_key = os.environ.get("WILEY_TDM_TOKEN")
    metadata = {}

    # get article metadata from Crossref
    works = Works()
    article_data = works.doi(doi)
    if not article_data:
        return "Metadata not found."
    print("Article license: ", article_data.get('license'))

    article_licenses = [item['URL'] for item in article_data.get('license') or []]
    print("Licenses: ", article_licenses)

    # check if the license is open access - variant of CC
    for license_url in article_licenses:
        if license_url in LICENSES:
            print("License found: ", license_url)
            if LICENSES[license_url] == "closed_access":
                return "Article is not open access."
            metadata['license'] = LICENSES[license_url]
            break
    else:
        # No listed licence is known. The collapsed source is ambiguous about
        # which statement its `else` belonged to; loop-level is assumed here.
        return "License not found."

    metadata['doi'] = doi
    metadata['title'] = article_data['title'][0]
    metadata['journal'] = article_data['container-title'][0]
    metadata['publisher'] = article_data['publisher']
    metadata['issn'] = article_data['ISSN'][0]
    metadata['url'] = article_data['URL']

    print("Metadata: ", metadata)

    # download PDF based on doi
    base_url = "https://api.wiley.com/onlinelibrary/tdm/v1/articles/"
    url = base_url + str(doi)

    print("URL: ", url)

    headers = {
        'Wiley-TDM-Client-Token': api_key,
        'Content-Type': 'application/json'
    }

    filename = str(doi).replace("/", "_")
    try:
        with requests.get(url, headers=headers, stream=True, timeout=REQUEST_TIMEOUT) as response:
            if response.status_code != 200:
                # TODO: exponential backoff / retry was intended here.
                print("Error in accessing article link: ", response.text)
                return "Error in accessing article link: " + str(response.status_code) + " - " + response.text

            with open(directory + "/" + filename + ".pdf", "wb") as f:
                for chunk in response.iter_content(chunk_size=1024):  # Download in chunks
                    f.write(chunk)
    except (requests.exceptions.RequestException, OSError) as e:
        return "Error in accessing article link: " + str(e)
    print("Downloaded: ", filename)

    # TODO: upload file to S3 bucket and prep payload for beam ingest

    return "success"


def get_license(url: str) -> str:
    """Map a Creative Commons licence URL to a short label such as "CC BY-NC".

    Only URLs of the form ``.../creativecommons.org/licenses/<code>/...`` are
    recognised, and the ``<code>`` segment must match exactly. A bare
    substring test would label ``by-sa`` as "CC BY" and would treat any URL
    that merely contains the letters "by" as Creative Commons.

    Returns:
        The licence label, or "Unknown" for empty, non-CC or unrecognised URLs.
    """
    if not url:
        return "Unknown"

    marker = "creativecommons.org/licenses/"
    lowered = url.lower()
    start = lowered.find(marker)
    if start == -1:
        return "Unknown"

    code = lowered[start + len(marker):].split("/", 1)[0]
    return _CC_LICENSE_LABELS.get(code, "Unknown")


if __name__ == "__main__":
    main()