diff --git a/ai_ta_backend/database/sql.py b/ai_ta_backend/database/sql.py
index a8287234..da854891 100644
--- a/ai_ta_backend/database/sql.py
+++ b/ai_ta_backend/database/sql.py
@@ -11,7 +11,7 @@ def __init__(self):
     # Create a Supabase client
     self.supabase_client = supabase.create_client(  # type: ignore
         supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY'])
-
+
   def getAllMaterialsForCourse(self, course_name: str):
     return self.supabase_client.table(
         os.environ['SUPABASE_DOCUMENTS_TABLE']).select('course_name, s3_path, readable_filename, url, base_url').eq(
@@ -134,3 +134,5 @@ def getConversation(self, course_name: str, key: str, value: str):
 
   def getDisabledDocGroups(self, course_name: str):
     return self.supabase_client.table("doc_groups").select("name").eq("course_name", course_name).eq("enabled", False).execute()
+  def getCourseDocumentByS3Path(self, course_name: str, s3_path: str):
+    return self.supabase_client.table("documents").select("id, course_name, readable_filename, url, base_url, s3_path, created_at").eq("course_name", course_name).eq("s3_path", s3_path).execute()
\ No newline at end of file
diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py
index 7a746c89..156b5993 100644
--- a/ai_ta_backend/main.py
+++ b/ai_ta_backend/main.py
@@ -40,6 +40,7 @@ from ai_ta_backend.service.retrieval_service import RetrievalService
 from ai_ta_backend.service.sentry_service import SentryService
 from ai_ta_backend.service.workflow_service import WorkflowService
+from ai_ta_backend.utils.pub_ingest import downloadSpringerFulltext, downloadWileyFulltext
 
 app = Flask(__name__)
 CORS(app)
@@ -480,6 +481,56 @@ def run_flow(service: WorkflowService) -> Response:
   return response
 
 
+@app.route('/get-springer-fulltext', methods=['GET'])
+def get_springer_data():
+  course_name: str = request.args.get('course_name', default='', type=str)
+  issn = request.args.get('issn', default='', type=str)
+  subject = request.args.get('subject', default='', type=str)
+  journal = request.args.get('journal', default='', type=str)
+  title = request.args.get('title', default='', type=str)
+  doi = request.args.get('doi', default='', type=str)
+
+  print("In /get-springer-fulltext")
+
+  if (issn == '' and subject == '' and journal == '' and title == '' and doi == '') or course_name == '':
+    # proper web error "400 Bad request"
+    abort(
+        400,
+        description=
+        "Missing required parameters: at least one of 'issn', 'subject', 'journal', 'title' or 'doi', plus 'course_name', must be provided."
+    )
+
+  fulltext = downloadSpringerFulltext(issn, subject, journal, title, doi, course_name)
+
+  response = jsonify(fulltext)
+  response.headers.add('Access-Control-Allow-Origin', '*')
+  return response
+
+
+@app.route('/get-wiley-fulltext', methods=['POST'])
+def get_wiley_data():
+  data = request.get_json()
+  print(data)
+
+  course_name = data.get('course_name', '')
+  issn = data.get('issn', [])
+
+  print("In /get-wiley-fulltext")
+
+  if issn == [] or course_name == '':
+    # proper web error "400 Bad request"
+    abort(
+        400,
+        description=
+        "Missing required parameters: 'issn' (list of journal ISSNs) and 'course_name' must be provided."
+    )
+
+  fulltext = downloadWileyFulltext(course_name, issn)
+
+  response = jsonify(fulltext)
+  response.headers.add('Access-Control-Allow-Origin', '*')
+  return response
+
+
 def configure(binder: Binder) -> None:
   binder.bind(RetrievalService, to=RetrievalService, scope=RequestScope)
   binder.bind(PosthogService, to=PosthogService, scope=SingletonScope)
diff --git a/ai_ta_backend/utils/nal_data_mining.py b/ai_ta_backend/utils/nal_data_mining.py
new file mode 100644
index 00000000..e69de29b
diff --git a/ai_ta_backend/utils/nal_download.py b/ai_ta_backend/utils/nal_download.py
new file mode 100644
index 00000000..ca4d128c
--- /dev/null
+++ b/ai_ta_backend/utils/nal_download.py
@@ -0,0 +1,506 @@
+import os
+from supabase import create_client, Client
+import requests
+import boto3
+from dotenv import load_dotenv
+from crossref.restful import Works
+import datetime
+import time
+
+load_dotenv()
+print("Supabase URL: ", os.getenv("SUPABASE_URL"))
+print("Supabase API key set: ", os.getenv("SUPABASE_API_KEY") is not None)  # do not print the secret itself
+
+# Initialize the Supabase client
+SUPABASE_CLIENT: Client = create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_API_KEY"))
+SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY')
+DOWNLOAD_LOG = "download_log.txt"
+
+S3_CLIENT = boto3.client('s3', aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'))
+
+AWS_BUCKET = os.getenv('S3_BUCKET_NAME')
+
+CC_LICENSES = {
+    "http://creativecommons.org/licenses/by/4.0/": "CC BY",
+    "http://creativecommons.org/licenses/by-nc/4.0/": "CC BY-NC",
+    "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC BY-NC-ND",
+    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA"
+}
+
+OTHER_LICENSES = {
+    "http://onlinelibrary.wiley.com/termsAndConditions#vor": "wiley_tnc",
+    "http://onlinelibrary.wiley.com/termsAndConditions#am": "wiley_tnc",
+    "http://doi.wiley.com/10.1002/tdm_license_1.1": "wiley_tdm"
+}
+
+
+def main():
+    data = [1, 2, 3]
+    # fetch records from SQL
+    while len(data) > 0:
+        response = SUPABASE_CLIENT.table("nal_publications").select("doi_number, publisher, metadata").eq("ingested", False).eq("downloadable", True).neq("publisher", "Wiley").limit(1000).execute()
+        data = response.data
+        print("No. of records: ", len(data))
+        for record in data:
+            if 'Springer' in record['publisher']:
+                # route to springer download
+                result = downloadSpringerFulltext(doi=record['doi_number'])
+
+            elif 'Wiley' in record['publisher']:
+                # route to wiley download (currently skipped; Wiley is handled separately)
+                print('Wiley')
+                continue
+                # unreachable while the `continue` above is in place:
+                # result = downloadWileyPDF(doi=record['doi_number'], metadata=record['metadata'])
+                # time.sleep(10)  # sleep for 10 seconds to avoid rate limiting
+
+            elif 'Elsevier' in record['publisher']:
+                # update supabase
+                update_info = {"notes": "Elsevier articles not downloadable.", "downloadable": False, "modified_date": datetime.datetime.now().isoformat()}
+                response = SUPABASE_CLIENT.table("nal_publications").update(update_info).eq("doi_number", record['doi_number']).execute()
+
+            else:
+                # regular file save
+                print("publisher name: ", record['publisher'])
+                result = download_article_from_url(record['doi_number'], record['metadata'])
+                print(result)
+                #time.sleep(10)
+
+    return "Success"
+
+
+def download_article_from_url(doi, metadata):
+    print("in download_article_from_url: ", doi)
+
+    if 'link' not in metadata:
+        print("No link")
+        # update supabase
+        update_info = {"notes": "Download link absent.", "downloadable": False, "modified_date": datetime.datetime.now().isoformat()}
+        SUPABASE_CLIENT.table("nal_publications").update(update_info).eq("doi_number", doi).execute()
+        return "No download link present"
+    else:
+        # save to local
+        print("Link found")
+        pdf_link = metadata['link'][0]['URL']
+
+    if 'license' not in metadata:
+        print("No license")
+        # update supabase
+        update_info = {"notes": "License absent.", "downloadable": False, "modified_date": datetime.datetime.now().isoformat()}
+        SUPABASE_CLIENT.table("nal_publications").update(update_info).eq("doi_number", doi).execute()
+        return {"error": "License not found."}
+
+    license = get_license(metadata['license'][0]['URL'])
+
+    status = download_pdf_in_chunks(url=pdf_link, doi=doi)
+    if 'failed' in status:
+        # update supabase
+        print("Error in PDF download: ", status['failed'])
+        update_info = {"notes": str(status['failed']), "downloadable": False, "modified_date": datetime.datetime.now().isoformat()}
+        SUPABASE_CLIENT.table("nal_publications").update(update_info).eq("doi_number", doi).execute()
+        return {"error": "Error in PDF download."}
+    else:
+        filepath = status['success']
+
+    updated_metadata = {
+        "doi": doi,
+        "filename": filepath.split("/")[-1],
+        "file_path": filepath,
+        "publisher": metadata['publisher'],
+        "license": license,
+    }
+    print("Updated metadata: ", updated_metadata)
+
+    ingest_status = upload_and_ingest(filepath, updated_metadata, doi)
+    print(ingest_status)
+
+    return {"success": "Downloaded and ingested successfully."}
+
+
+def upload_and_ingest(filepath, metadata, doi):
+    """
+    Uploads a file to S3 and ingests it into cropwizard-1.5.
+    """
+    filename = os.path.basename(filepath)
+
+    s3_path = "courses/cropwizard-1.5/" + filename
+
+    S3_CLIENT.upload_file(filepath, AWS_BUCKET, s3_path)
+
+    publisher = metadata['publisher']
+    if 'Springer' in publisher:
+        publisher = "Springer"
+    elif 'Wiley' in publisher:
+        publisher = "Wiley"
+
+    # ingest
+    ingest_url = "https://ingest-task-queue-6ee4a59-v12.app.beam.cloud"
+    ingest_headers = {
+        'Accept': '*/*',
+        'Accept-Encoding': 'gzip, deflate',
+        'Authorization': f"Bearer {os.environ['BEAM_API_KEY']}",
+        'Content-Type': 'application/json',
+    }
+    doi_url = f"https://doi.org/{doi}"
+    ingest_payload = {
+        "course_name": "cropwizard-1.5",
+        "s3_paths": [s3_path],
+        "readable_filename": filename,
+        "url": doi_url,
+        "base_url": "",
+        "groups": ["Research Papers", "NAL", publisher]
+    }
+
+    if 'license' in metadata and metadata['license'] not in ['Unknown', 'unknown']:
+        ingest_payload['groups'].append(metadata['license'])
+
+    print("FINAL INGEST PAYLOAD: ", ingest_payload)
+    ingest_response = requests.post(ingest_url, headers=ingest_headers, json=ingest_payload)
+
+    # update supabase
+    update_info = {"ingested": True, "modified_date": datetime.datetime.now().isoformat()}
+    response = SUPABASE_CLIENT.table("nal_publications").update(update_info).eq("doi_number", doi).execute()
+    return "success"
+
+
+def download_pdf_in_chunks(url, doi, chunk_size=1024):
+    try:
+        # create directory to store files
+        directory = "other_papers"
+        if not os.path.exists(directory):
+            os.makedirs(directory)
+
+        # Send a GET request to the URL with stream=True to download in chunks
+        response = requests.get(url, stream=True)
+
+        # Check if the request was successful
+        if response.status_code == 200:
+            # Open the file in binary write mode
+            filename = doi.replace("/", "_")
+            filepath = "other_papers/" + filename + ".pdf"
+            with open(filepath, 'wb') as file:
+                # Iterate over the response in chunks and write each to the file
+                for chunk in response.iter_content(chunk_size=chunk_size):
+                    if chunk:  # Filter out keep-alive chunks
+                        file.write(chunk)
+            print(f"PDF successfully downloaded and saved as {filepath}")
+
+            return {"success": filepath}
+
+        else:
+            print(f"Failed to download PDF. Status code: {response.status_code}")
+
+            # update supabase
+            update_info = {"notes": f"Failed to download PDF (anti-bot). Status code: {response.status_code}",
+                           "downloadable": False, "modified_date": datetime.datetime.now().isoformat()}
+            SUPABASE_CLIENT.table("nal_publications").update(update_info).eq("doi_number", doi).execute()
+
+            return {"failed": response.status_code}
+
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred: {e}")
+        return {"failed": e}
+
+
+############# SPRINGER DOWNLOAD #############
+
+def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, doi=None, course_name=None):
+    """
+    This function uses the Springer Nature API to download open-access journal articles.
+    Args:
+        issn: limit to the ISSN of the journal/book
+        subject: limit articles to a specific subject - Chemistry, Physics, etc.
+        journal: limit to keywords occurring in the journal title
+        title: limit to keywords occurring in the article title
+        doi: download a single article by DOI
+        course_name: course the downloaded articles are ingested into
+    The initial API response returns a list of articles with metadata.
+ + """ + print("in downloadSpringerFulltext") + # create directory to store files + directory = os.path.join(os.getcwd(), 'springer_papers') + if not os.path.exists(directory): + os.makedirs(directory) + + # set headers + api_url = "http://api.springernature.com/openaccess/json?q=" + headers = {'Accept': 'application/json'} + + # form the query URL based on the input parameters received + if doi: + query_str = "doi:" + doi + elif issn: + query_str = "issn:" + issn + elif journal: + journal = "%22" + journal.replace(" ", "%20") + "%22" + query_str = "journal:" + journal + elif title: + title = "%22" + title.replace(" ", "%20") + "%22" + query_str = "title:" + title + elif subject: + query_str = "subject:" + subject + else: + return "No query parameters provided" + + main_url = api_url + query_str + "&api_key=" + str(SPRINGER_API_KEY) + print("Full URL: ", main_url) + + response = requests.get(main_url, headers=headers) + + if response.status_code != 200: + print("Error in accessing Springer API: ", response.text) + response = SUPABASE_CLIENT.table("nal_publications").update({"notes": f"Error in accessing Springer API. Status code: {response.text}", "downloadable": False, "modified_date": datetime.datetime.now().isoformat()}).eq("doi_number", doi).execute() + return "Error" + + data = response.json() + # check for total number of records + total_records = int(data['result'][0]['total']) + + if total_records == 0: + # update supabase record and exit + response = SUPABASE_CLIENT.table("nal_publications").update({"notes": "Article is not OA.", "downloadable": False, "modified_date": datetime.datetime.now().isoformat()}).eq("doi_number", doi).execute() + return "Article not OA." + else: + # download paper + download_info = downloadPDFSpringer(data['records'][0], directory) + + if 'error' in download_info: + response = SUPABASE_CLIENT.table("nal_publications").update({"notes": download_info['error'], "downloadable": False, "modified_date": datetime.datetime.now().isoformat()}).eq("doi_number", doi).execute() + else: + # ingest + print("Download info: ", download_info) + ingest_status = upload_and_ingest(download_info['file_path'], download_info, doi) + + return "success" + +def downloadPDFSpringer(record: dict, directory: str): + """ + This function takes a record from the Springer API response and downloads the PDF file. + It is called in a multi-process loop in downloadSpringerFulltext(). 
+    Args:
+        record: dictionary containing DOI and other metadata
+        directory: local directory to save the files
+    """
+    print("in downloadPDFSpringer")
+    headers = {'Accept': 'application/json'}
+
+    if len(record['url']) < 1:
+        return {"error": "No download link found for DOI: " + record['doi']}
+
+    # extract URL
+    url = record['url'][0]['value'] + "?api_key=" + str(SPRINGER_API_KEY)
+
+    url_response = requests.get(url, headers=headers)
+
+    if url_response.status_code != 200:
+        return {"error": "Error in accessing article link: " + str(url_response.status_code) + " - " + url_response.text}
+
+    url_data = url_response.json()
+
+    if 'license' in url_data:
+        license_url = url_data['license'][0]['URL']
+        license = get_license(license_url)
+        print("License: ", license)
+    else:
+        license = "unknown"
+        license_url = "unknown"
+
+    # extract PDF link
+    pdf_link = None
+    if 'link' not in url_data:
+        return {"error": "No link found for DOI: " + record['doi']}
+
+    links = url_data['link']
+    for link in links:
+        if link['content-type'] == 'application/pdf' and link['intended-application'] == 'text-mining':
+            pdf_link = link['URL']
+            break
+
+    if not pdf_link:
+        pdf_link = links[0]['URL']
+
+    if not pdf_link:
+        return {"error": "No PDF link found for DOI: " + record['doi']}
+
+    # download PDF
+    if 'doi' in record:
+        filename = record['doi'].replace("/", "_")
+    else:
+        filename = url_data['DOI'].replace("/", "_")
+
+    try:
+        response = requests.get(pdf_link)
+        if response.status_code != 200:
+            return {"error": "Error in downloading PDF: " + str(response.status_code) + " - " + response.text}
+
+        with open(directory + "/" + filename + ".pdf", "wb") as f:  # Open a file in binary write mode ("wb")
+            for chunk in response.iter_content(chunk_size=1024):  # Download in chunks
+                f.write(chunk)
+
+        # form metadata
+        metadata = {
+            "doi": record['doi'],
+            "publisher": record['publisher'],
+            "issn": record['issn'],
+            "license": license,
+            "license_url": license_url,
+            "filename": filename + ".pdf",
+            "file_path": directory + "/" + filename + ".pdf"
+        }
+        return metadata
+    except Exception as e:
+        return {"error": "Error in downloading PDF: " + str(e)}
+
+
+def downloadWileyPDF(doi, metadata):
+    """
+    This function downloads a PDF file from Wiley based on the DOI.
+ """ + print("in downloadWileyPDF") + try: + # create directory to store files + directory = "wiley_papers" + if not os.path.exists(directory): + os.makedirs(directory) + + api_key = os.environ.get("WILEY_TDM_TOKEN") + + # download PDF based on doi + base_url = "https://api.wiley.com/onlinelibrary/tdm/v1/articles/" + url = base_url + str(doi) + print("URL: ", url) + + headers = { + 'Wiley-TDM-Client-Token': api_key, + 'Content-Type': 'application/json' + } + + response = requests.get(url, headers=headers) + response.raise_for_status() + + filename = str(doi).replace("/", "_") + ".pdf" + with open(directory + "/" + filename, "wb") as f: # Open a file in binary write mode ("wb") + for chunk in response.iter_content(chunk_size=1024): # Download in chunks + f.write(chunk) + print("Downloaded: ", filename) + + # get license + license = get_license(metadata['license'][0]['URL']) + print("License: ", license) + + # route to upload and ingest + updated_metadata = { + "doi": doi, + "filename": filename, + "file_path": directory + "/" + filename, + "publisher": metadata['publisher'], + "license": license, + } + print("Updated metadata: ", updated_metadata) + + # call upload and ingest + ingest_status = upload_and_ingest(updated_metadata['file_path'], updated_metadata, doi) + + return {"success": "Downloaded and ingested successfully."} + except Exception as e: + print("Error: ", e) + # probably a 403 error - update supabase + update_info = {"notes": "403 client error (forbidden) in PDF download.", "downloadable": False, "modified_date": datetime.datetime.now().isoformat()} + response = SUPABASE_CLIENT.table("nal_publications").update(update_info).eq("doi_number", doi).execute() + return {"error": "403 client error (forbidden) in PDF download."} + + +def downloadWileyArticle(doi=None): + """ + This function fetches metadata from Crossref and downloads open access full text articles from Wiley. + """ + # create directory to store files + directory = os.path.join(os.getcwd(), 'wiley_papers') + if not os.path.exists(directory): + os.makedirs(directory) + + api_key = os.environ.get("WILEY_TDM_TOKEN") + metadata = {} + + # get metadata from Crossref + if doi: + # get article metadata + works = Works() + article_data = works.doi(doi) + print("Article license: ", article_data['license']) + + article_licenses = [] + + for item in article_data['license']: + article_licenses.append(item['URL']) + print("Licenses: ", article_licenses) + # check if the license is open access - variant of CC + for license in article_licenses: + if license in LICENSES: + print("License found: ", license) + if LICENSES[license] == "closed_access": + return "Article is not open access." + else: + metadata['license'] = LICENSES[license] + break + else: + return "License not found." 
+
+        metadata['doi'] = doi
+        metadata['title'] = article_data['title'][0]
+        metadata['journal'] = article_data['container-title'][0]
+        metadata['publisher'] = article_data['publisher']
+        metadata['issn'] = article_data['ISSN'][0]
+        metadata['url'] = article_data['URL']
+
+        print("Metadata: ", metadata)
+
+        # download PDF based on doi
+        base_url = "https://api.wiley.com/onlinelibrary/tdm/v1/articles/"
+        url = base_url + str(doi)
+
+        print("URL: ", url)
+
+        headers = {
+            'Wiley-TDM-Client-Token': api_key,
+            'Content-Type': 'application/json'
+        }
+
+        response = requests.get(url, headers=headers)
+        if response.status_code != 200:
+            # exponential backoff logic
+            print("Error in accessing article link, retrying: ", response.text)
+            return "Error in accessing article link: " + str(response.status_code) + " - " + response.text
+
+        filename = str(doi).replace("/", "_")
+        with open(directory + "/" + filename + ".pdf", "wb") as f:  # Open a file in binary write mode ("wb")
+            for chunk in response.iter_content(chunk_size=1024):  # Download in chunks
+                f.write(chunk)
+        print("Downloaded: ", filename)
+
+    # upload file to S3 bucket
+
+    # prep payload for beam ingest
+
+    return "success"
+
+
+def get_license(url: str) -> str:
+    # Define license matches
+    license_mapping = {
+        "by-nc-nd": "CC BY-NC-ND",
+        "by-nc-sa": "CC BY-NC-SA",
+        "by-nc": "CC BY-NC",
+        "by": "CC BY",
+    }
+
+    # Loop through the mapping and check if the URL contains the license string
+    for key, license in license_mapping.items():
+        if key in url:
+            return license
+
+    # Return 'Unknown' if no match is found
+    return "Unknown"
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/ai_ta_backend/utils/pub_ingest.py b/ai_ta_backend/utils/pub_ingest.py
new file mode 100644
index 00000000..cc59f928
--- /dev/null
+++ b/ai_ta_backend/utils/pub_ingest.py
@@ -0,0 +1,541 @@
+import os
+import json
+import time
+import pandas as pd
+import shutil
+import requests
+import supabase
+import concurrent.futures
+from crossref.restful import Works, Journals
+from ai_ta_backend.database import aws, sql
+import backoff
+
+SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY')
+CC_LICENSES = {
+    "http://creativecommons.org/licenses/by/4.0/": "CC BY",
+    "http://creativecommons.org/licenses/by-nc/4.0/": "CC BY-NC",
+    "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC BY-NC-ND",
+    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA"
+}
+
+OTHER_LICENSES = {
+    "http://onlinelibrary.wiley.com/termsAndConditions#vor": "wiley_tnc",
+    "http://onlinelibrary.wiley.com/termsAndConditions#am": "wiley_tnc",
+    "http://doi.wiley.com/10.1002/tdm_license_1.1": "wiley_tdm"
+}
+
+s3_client = aws.AWSStorage()
+aws_bucket = os.getenv('S3_BUCKET_NAME')
+supabase_client = supabase.create_client(  # type: ignore
+    supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY'])
+
+
+def downloadSpringerFulltext(issn=None, subject=None, journal=None, title=None, doi=None, course_name=None):
+    """
+    This function uses the Springer Nature API to download open-access journal articles.
+    Args:
+        issn: limit to the ISSN of the journal/book
+        subject: limit articles to a specific subject - Chemistry, Physics, etc.
+        journal: limit to keywords occurring in the journal title
+        title: limit to keywords occurring in the article title
+        doi: download a single article by DOI
+        course_name: course the downloaded articles are ingested into
+    The initial API response returns a list of articles with metadata.
+ + """ + print("in downloadSpringerFulltext") + # create directory to store files + directory = os.path.join(os.getcwd(), 'springer_papers') + if not os.path.exists(directory): + os.makedirs(directory) + + # set headers + api_url = "http://api.springernature.com/openaccess/json?q=" + headers = {'Accept': 'application/json'} + + # form the query URL based on the input parameters received + if doi: + query_str = "doi:" + doi + elif issn: + query_str = "issn:" + issn + elif journal: + journal = "%22" + journal.replace(" ", "%20") + "%22" + query_str = "journal:" + journal + elif title: + title = "%22" + title.replace(" ", "%20") + "%22" + query_str = "title:" + title + elif subject: + query_str = "subject:" + subject + else: + return "No query parameters provided" + + main_url = api_url + query_str + "&api_key=" + str(SPRINGER_API_KEY) + print("Full URL: ", main_url) + + response = requests.get(main_url, headers=headers) + print("Status: ", response.status_code) + + if response.status_code != 200: + return "Error: " + str(response.status_code) + " - " + response.text + + data = response.json() + # check for total number of records + total_records = int(data['result'][0]['total']) + print("Total records: ", total_records) + current_records = 0 + while current_records < total_records: + # check if nextPage exists + try: + if 'nextPage' in data: + next_page_url = "http://api.springernature.com" + data['nextPage'] + else: + next_page_url = None + + # multi-process all records in current page + # with concurrent.futures.ProcessPoolExecutor() as executor: + # results = [] + # for i in range(0, len(data['records']), 3): + # batch = data['records'][i:i+3] + # batch_results = [executor.submit(downloadPDFSpringer, record, directory) for record in batch] + # results.extend(batch_results) + # for f in concurrent.futures.as_completed(results): + # print(f.result()) + print("Total records: ", len(data['records'])) + + for i in range(len(data['records'])): + print("i: ", i) + print("Processing record: ", data['records'][i]) + print("\n") + article_metadata = downloadPDFSpringer(data['records'][i], directory) + article_metadata['issn'] = issn + + # write article metadata to CSV file + metadata_csv = "springer_metadata.csv" + metadata_df = pd.DataFrame([article_metadata]) + if not os.path.exists(metadata_csv): + metadata_df.to_csv(metadata_csv, index=False) + else: + metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False) + + # update current records count + current_records += int(len(data['records'])) + + # if next page exists, update next page url and call the API again + if next_page_url: + # API key is already present in the URL + response = requests.get(next_page_url, headers=headers) + if response.status_code != 200: + return "Error in next page: " + str(response.status_code) + " - " + response.text + + data = response.json() + except Exception as e: + print(e) + + # print("Course name: ", course_name) + # prep payload for beam ingest + # ingest_data = [] + + # upload files to S3 bucket + # for file in os.listdir(directory): + # doi = file[:-4] + # doi = doi.replace("_", "/") + # doi_link = f"https://doi.org/{doi}" + # data = { + # "course_name": course_name, + # "groups": "springer_open", + # "s3_paths": "courses/" + course_name + "/" + file, # type: ignore + # "readable_filename": file, + # "base_url": "", + # "url": doi_link, + # "journal": "", + # } + # s3_path = "courses/" + course_name + "/" + file # type: ignore + # s3_client.upload_file(directory + "/" + file, aws_bucket, s3_path) 
+    #     ingest_data.append(data)
+
+    # # save ingest data to csv
+    # ingest_df = pd.DataFrame(ingest_data)
+    # csv_file = "publications_data.csv"
+    # if not os.path.exists(csv_file):
+    #     ingest_df.to_csv(csv_file, index=False)
+    # else:
+    #     ingest_df.to_csv(csv_file, mode='a', header=False, index=False)
+
+    # # call ingest
+    # beam_url = "https://3xn8l.apps.beam.cloud"
+    # headers = {
+    #     "Content-Type": "application/json",
+    #     "Authorization": "Basic " + os.getenv('BEAM_AUTH_TOKEN')  # type: ignore
+    # }
+
+    # pubs_data = pd.read_csv(csv_file)
+
+    # for row in pubs_data.iterrows():
+    #     payload = {
+    #         "course_name": "cropwizard-pro",
+    #         "s3_paths": [row[1]["s3_paths"]],
+    #         "readable_filename": row[1]["readable_filename"],
+    #         "base_url": "",
+    #         "url": row[1]["url"],
+    #         "groups": ["Springer", "CC-BY", "Research Paper"]
+    #     }
+    #     print(payload)
+    #     payload = json.dumps(data)
+    #     response = requests.post(beam_url, headers=headers, data=payload)
+
+    #     if response.status_code == 200:
+    #         print("Task status retrieved successfully!")
+    #     else:
+    #         print(f"Error: {response.status_code}. {response.text}")
+
+    # Delete files from local directory
+    # shutil.rmtree(directory)
+    # os.remove(csv_file)
+
+    return "success"
+
+
+def downloadPDFSpringer(record: dict, directory: str):
+    """
+    This function takes a record from the Springer API response and downloads the PDF file.
+    It is called for each record returned by downloadSpringerFulltext().
+    Args:
+        record: dictionary containing DOI and other metadata
+        directory: local directory to save the files
+    """
+    print("in downloadPDFSpringer")
+    headers = {'Accept': 'application/json'}
+
+    if len(record['url']) < 1:
+        return "No download link found for DOI: " + record['doi']
+
+    # extract URL
+    url = record['url'][0]['value'] + "?api_key=" + str(SPRINGER_API_KEY)
+    print("URL: ", url)
+    url_response = requests.get(url, headers=headers)
+    print("URL response: ", url_response.status_code)
+    if url_response.status_code != 200:
+        return "Error in accessing article link: " + str(url_response.status_code) + " - " + url_response.text
+
+    url_data = url_response.json()
+
+    if 'license' in url_data:
+        license_url = url_data['license'][0]['URL']
+        license = CC_LICENSES.get(license_url, license_url)
+        print("License: ", license)
+    else:
+        license = "unknown"
+        license_url = "unknown"
+
+    # extract PDF link
+    pdf_link = None
+    links = url_data['link']
+    for link in links:
+        if link['content-type'] == 'application/pdf' and link['intended-application'] == 'text-mining':
+            pdf_link = link['URL']
+            print("PDF Link: ", pdf_link)
+            break
+
+    if not pdf_link:
+        pdf_link = links[0]['URL']
+        print("PDF Link: ", pdf_link)
+    if not pdf_link:
+        return "No PDF link found for DOI: " + record['doi']
+
+    # download PDF
+    print("Downloading PDF: ", record['doi'])
+    if 'doi' in record:
+        filename = record['doi'].replace("/", "_")
+    else:
+        filename = url_data['DOI'].replace("/", "_")
+
+    try:
+        response = requests.get(pdf_link)
+        if response.status_code != 200:
+            return "Error in downloading PDF: " + str(response.status_code) + " - " + response.text
+
+        with open(directory + "/" + filename + ".pdf", "wb") as f:  # Open a file in binary write mode ("wb")
+            for chunk in response.iter_content(chunk_size=1024):  # Download in chunks
+                f.write(chunk)
+        print("Downloaded: ", filename)
+
+        # form metadata
+        metadata = {
+            "doi": record['doi'],
+            "publisher": record['publisher'],
+            "issn": record['issn'],
+            "license": license,
+            "license_url": license_url,
+            "metadata": url_data,
+        }
+        return metadata
+    except Exception as e:
+        return "Error in downloading PDF: " + str(e)
+
+
+def getCrossrefMetadata(issn: str):
+    """
+    Creates a CSV file with metadata for all articles in the given journal (ISSN).
+    """
+    try:
+        metadata = []
+        # get journal metadata
+        journals = Journals()
+        works = journals.works(issn=issn)
+        count = 0
+        no_license = 0
+        for item in works:
+            try:
+                count += 1
+                article_metadata = {}
+                # check if the license is open access - variant of CC
+                if 'license' not in item:
+                    no_license += 1
+                    continue
+                else:
+                    for license in item['license']:
+                        # check for creative commons license
+                        if license['URL'] in CC_LICENSES:
+                            article_metadata['license'] = CC_LICENSES[license['URL']]
+                            article_metadata['license_url'] = license['URL']
+                            break
+                        elif license['URL'] in OTHER_LICENSES:
+                            article_metadata['license'] = OTHER_LICENSES[license['URL']]
+                            article_metadata['license_url'] = license['URL']
+                        else:
+                            article_metadata['license'] = "unknown"
+                            article_metadata['license_url'] = license['URL']
+
+                article_metadata['doi'] = item['DOI']
+                if 'title' not in item:
+                    article_metadata['title'] = "No title found"
+                else:
+                    article_metadata['title'] = item['title'][0]
+                article_metadata['journal'] = item['container-title'][0]
+                article_metadata['publisher'] = item['publisher']
+                article_metadata['issn'] = item['ISSN'][0]
+                article_metadata['url'] = item['URL']
+                article_metadata['filename'] = item['DOI'].replace("/", "_") + ".pdf"
+                article_metadata['downloaded'] = "no"
+                metadata_csv = "wiley_metadata.csv"
+                metadata_df = pd.DataFrame([article_metadata])
+                if not os.path.exists(metadata_csv):
+                    metadata_df.to_csv(metadata_csv, index=False)
+                else:
+                    metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False)
+                #metadata.append(article_metadata)
+                print("Processed: ", article_metadata['doi'])
+            except Exception as e:
+                print("Error processing article: ", article_metadata.get('doi', 'unknown DOI'), e)
+        print("Total articles: ", count)
+        # metadata_csv = "wiley_metadata.csv"
+        # metadata_df = pd.DataFrame(metadata)
+        # if not os.path.exists(metadata_csv):
+        #     metadata_df.to_csv(metadata_csv, index=False)
+        # else:
+        #     metadata_df.to_csv(metadata_csv, mode='a', header=False, index=False)
+
+        return "success"
+    except Exception as e:
+        return "Error: " + str(e)
+
+
+def downloadWileyFulltext(course_name=None, issn=[]):
+    """
+    This function fetches metadata from Crossref and downloads
+    full-text articles from a given journal from Wiley.
+ """ + # create directory to store files + directory = os.path.join(os.getcwd(), 'wiley_papers') + if not os.path.exists(directory): + os.makedirs(directory) + + + # fetch metadata + for item in issn: + metadata_status = getCrossrefMetadata(item) + print("Metadata status: ", metadata_status) + + # download PDFs based on metadata + metadata_csv = "wiley_metadata.csv" + if os.path.exists(metadata_csv): + metadata_df = pd.read_csv(metadata_csv) + metadata = metadata_df.to_dict(orient='records') + + for item in metadata: + try: + if item['license'] in ['CC BY', 'CC BY-NC', 'CC BY-NC-ND', 'CC BY-NC-SA'] and item['downloaded'] == 'no' and item['publisher'] == 'Wiley': + status = downloadWileyPDF(item['doi']) + print("Download status: ", status) + if status == "success": + item['downloaded'] = 'yes' + time.sleep(5) + except Exception as e: + print(e) + + #time.sleep(10) + + metadata_df = pd.DataFrame(metadata) + metadata_df.to_csv(metadata_csv, index=False) + + return "success" + + + # # prep payload for beam ingest + # ingest_data = [] + + # # upload files to S3 bucket + # for file in os.listdir(directory): + # doi = file[:-4] + # doi = doi.replace("_", "/") + # doi_link = f"https://doi.org/{doi}" + # data = { + # "course_name": course_name, + # "group": "wiley", + # "s3_paths": "courses/" + course_name + "/" + file, # type: ignore + # "readable_filename": file, + # "base_url": "", + # "url": doi_link, + # "journal": "", + # } + # s3_path = "courses/" + course_name + "/" + file # type: ignore + # s3_client.upload_file(directory + "/" + file, aws_bucket, s3_path) # type: ignore + # ingest_data.append(data) + + # # save ingest data to csv + # ingest_df = pd.DataFrame(ingest_data) + # csv_file = "publications_data.csv" + # if not os.path.exists(csv_file): + # ingest_df.to_csv(csv_file, index=False) + # else: + # ingest_df.to_csv(csv_file, mode='a', header=False, index=False) + + + # # call ingest + # beam_url = "https://41kgx.apps.beam.cloud" + # headers = { + # "Content-Type": "application/json", + # "Authorization": "Basic " + os.getenv('BEAM_AUTH_TOKEN') # type: ignore + # } + # for data in ingest_data: + # payload = json.dumps(data) + # response = requests.post(beam_url, headers=headers, data=payload) + # if response.status_code == 200: + # print("Task status retrieved successfully!") + # else: + # print(f"Error: {response.status_code}. {response.text}") + + # Delete files from local directory + #shutil.rmtree(directory) + + + +#@backoff.on_exception(backoff.expo, requests.exceptions.HTTPError, max_tries=1) +def downloadWileyPDF(doi=None): + """ + This function downloads a PDF file from Wiley based on the DOI. 
+ """ + try: + # create directory to store files + directory = os.path.join(os.getcwd(), 'wiley_papers') + if not os.path.exists(directory): + os.makedirs(directory) + + api_key = os.environ.get("WILEY_TDM_TOKEN") + + # download PDF based on doi + base_url = "https://api.wiley.com/onlinelibrary/tdm/v1/articles/" + url = base_url + str(doi) + print("URL: ", url) + + headers = { + 'Wiley-TDM-Client-Token': api_key, + 'Content-Type': 'application/json' + } + + response = requests.get(url, headers=headers) + response.raise_for_status() + + filename = str(doi).replace("/", "_") + ".pdf" + with open(directory + "/" + filename, "wb") as f: # Open a file in binary write mode ("wb") + for chunk in response.iter_content(chunk_size=1024): # Download in chunks + f.write(chunk) + print("Downloaded: ", filename) + + return "success" + except Exception as e: + print("Error: ", e) + return "error" + + +def downloadWileyArticle(doi=None): + """ + This function fetches metadata from Crossref and downloads open access full text articles from Wiley. + """ + # create directory to store files + directory = os.path.join(os.getcwd(), 'wiley_papers') + if not os.path.exists(directory): + os.makedirs(directory) + + api_key = os.environ.get("WILEY_TDM_TOKEN") + metadata = {} + + # get metadata from Crossref + if doi: + # get article metadata + works = Works() + article_data = works.doi(doi) + print("Article license: ", article_data['license']) + + article_licenses = [] + + for item in article_data['license']: + article_licenses.append(item['URL']) + print("Licenses: ", article_licenses) + # check if the license is open access - variant of CC + for license in article_licenses: + if license in LICENSES: + print("License found: ", license) + if LICENSES[license] == "closed_access": + return "Article is not open access." + else: + metadata['license'] = LICENSES[license] + break + else: + return "License not found." + + metadata['doi'] = doi + metadata['title'] = article_data['title'][0] + metadata['journal'] = article_data['container-title'][0] + metadata['publisher'] = article_data['publisher'] + metadata['issn'] = article_data['ISSN'][0] + metadata['url'] = article_data['URL'] + + print("Metadata: ", metadata) + + # download PDF based on doi + base_url = "https://api.wiley.com/onlinelibrary/tdm/v1/articles/" + url = base_url + str(doi) + + print("URL: ", url) + + headers = { + 'Wiley-TDM-Client-Token': api_key, + 'Content-Type': 'application/json' + } + + response = requests.get(url, headers=headers) + if response.status_code != 200: + # exponential backoff logic + print("Error in accessing article link, retrying: ", response.text) + + return "Error in accessing article link: " + str(response.status_code) + " - " + response.text + + filename = str(doi).replace("/", "_") + with open(directory + "/" + filename + ".pdf", "wb") as f: # Open a file in binary write mode ("wb") + for chunk in response.iter_content(chunk_size=1024): # Download in chunks + f.write(chunk) + print("Downloaded: ", filename) + + # upload file to S3 bucket + + # prep payload for beam ingest + + return "success" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 600955d5..1139f506 100644 --- a/requirements.txt +++ b/requirements.txt @@ -40,6 +40,10 @@ supabase==2.0.2 posthog==3.1.0 sentry-sdk==1.39.1 +# Publications +crossrefapi + + # Not currently supporting coursera ingest # cs-dlp @ git+https://github.com/raffaem/cs-dlp.git@0.12.0b0 # previously called coursera-dl