From 306c48ebd597b0beb3ce2037651478aa54ef016e Mon Sep 17 00:00:00 2001
From: star-nox
Date: Thu, 25 Jul 2024 21:48:17 -0500
Subject: [PATCH 1/3] created API endpoint for NAL datamining

---
 ai_ta_backend/main.py                |  18 +++++
 ai_ta_backend/utils/cw_datamining.py | 116 +++++++++++++++++++++++++++
 2 files changed, 134 insertions(+)
 create mode 100644 ai_ta_backend/utils/cw_datamining.py

diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py
index 4c91317d..079adbea 100644
--- a/ai_ta_backend/main.py
+++ b/ai_ta_backend/main.py
@@ -41,6 +41,8 @@ from ai_ta_backend.service.sentry_service import SentryService
 from ai_ta_backend.service.workflow_service import WorkflowService
 
+from ai_ta_backend.utils.cw_datamining import extract_article_metadata
+
 app = Flask(__name__)
 CORS(app)
 executor = Executor(app)
@@ -478,6 +480,22 @@ def run_flow(service: WorkflowService) -> Response:
     response.status_code = 500
   response.headers.add('Access-Control-Allow-Origin', '*')
   return response
+
+@app.route('/nal-data-mining', methods=['GET'])
+def nal_data_mining() -> Response:
+  """
+  Queries NAL website with keywords to extract article data.
+  """
+  search_query: str = request.args.get('search_query', default='', type=str)
+
+  if search_query == '':
+    # proper web error "400 Bad request"
+    abort(400, description=f"Missing required parameter: 'search_query' must be provided. Search query: `{search_query}`")
+
+  result = extract_article_metadata(search_query)
+  response = jsonify(result)
+  response.headers.add('Access-Control-Allow-Origin', '*')
+  return response
 
 
 def configure(binder: Binder) -> None:
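For reference, the route added above takes a single query-string parameter. A minimal way to exercise it, assuming the Flask app is reachable at http://localhost:8000 (the host, port, and example query below are assumptions, not part of the patch):

# Illustrative sketch only -- not part of the patch series.
# Assumes the backend from main.py is running at http://localhost:8000.
import requests

resp = requests.get(
    "http://localhost:8000/nal-data-mining",
    params={"search_query": "cover crops"},  # omitting search_query returns the 400 above
    timeout=600,  # the Selenium scrape is slow, so allow a generous timeout
)
print(resp.status_code, resp.json())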
diff --git a/ai_ta_backend/utils/cw_datamining.py b/ai_ta_backend/utils/cw_datamining.py
new file mode 100644
index 00000000..544b2fdf
--- /dev/null
+++ b/ai_ta_backend/utils/cw_datamining.py
@@ -0,0 +1,116 @@
+import os
+import requests
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from webdriver_manager.chrome import ChromeDriverManager
+import time
+
+def extract_article_metadata(search_str: str) -> str:
+    """
+    Extract article metadata from NAL website.
+    Store the metadata in a SQL database.
+    """
+    print("Extracting article metadata from NAL website...")
+
+    # get list of articles - 1st page
+    search_results = get_search_results(search_str)
+
+    # for each article, go one level deeper to extract DOI
+    search_results = extract_doi(search_results)
+
+    # fetch metadata for each DOI using crossref API
+
+
+
+    return "Article metadata extracted successfully." 
+
+
+def get_search_results(query):
+    # Set up Selenium with Chrome WebDriver
+    chrome_options = Options()
+    chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
+    chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
+    chrome_options.add_argument("--no-sandbox")  # Required for running as root
+    chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
+
+    # Use ChromeDriverManager to automatically manage the driver
+    driver_service = Service(ChromeDriverManager().install())
+    driver = webdriver.Chrome(service=driver_service, options=chrome_options)
+
+    try:
+        # Construct the URL with the query
+        base_url = "https://search.nal.usda.gov/discovery/search"
+        params = f"?query=any,contains,{query}&tab=pubag&search_scope=pubag&vid=01NAL_INST:MAIN&facet=tlevel,include,open_access&offset=0"
+        url = base_url + params
+        print("URL: ", url)
+        # Load the page
+        driver.get(url)
+
+        # Wait for the page to load (you may need to adjust the sleep time)
+        time.sleep(20)
+
+        # Find the search results
+        results = driver.find_elements(By.CLASS_NAME, 'list-item')
+        print("Results: ", len(results))
+
+        # Extract the titles and links
+        search_results = []
+        for result in results:
+            title_element = result.find_element(By.CLASS_NAME, 'item-title')
+            title = title_element.text.strip()
+            link = title_element.find_element(By.TAG_NAME, 'a').get_attribute('href')
+            search_results.append({'title': title, 'link': link})
+
+        return search_results
+    finally:
+        # Close the browser
+        driver.quit()
+
+def extract_doi(article_list: list):
+    """
+    Extract DOI from the article page and append to article_list dictionary.
+    """
+    # Set up Selenium with Chrome WebDriver
+    chrome_options = Options()
+    chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
+    chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
+    chrome_options.add_argument("--no-sandbox")  # Required for running as root
+    chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
+
+    # Use ChromeDriverManager to automatically manage the driver
+    driver_service = Service(ChromeDriverManager().install())
+    driver = webdriver.Chrome(service=driver_service, options=chrome_options)
+
+    for item in article_list:
+        page_link = item['link']
+
+        # Load the page
+        driver.get(page_link)
+
+        # Wait for the page to load (you may need to adjust the sleep time)
+        time.sleep(20)
+
+        # Find the search results
+        results = driver.find_elements(By.ID, 'item-details')
+        #print("Results: ", results)
+
+        # Extract the titles and links
+        for result in results:
+
+            try:
+                doi_link_element = result.find_element(By.XPATH, './/a[contains(@href, "https://doi.org/")]')
+                doi_link = doi_link_element.get_attribute("href")
+            except Exception as e:
+                doi_link = "N/A"
+            item['doi'] = doi_link
+
+    # Close the browser
+    driver.quit()
+
+    return article_list
+
+
+
+    
\ No newline at end of file
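Both helpers above pause with a fixed time.sleep(20) per page, which costs 20 seconds even when the results render sooner. An explicit wait is one possible alternative; the sketch below assumes the same 'list-item' class name and headless Chrome driver as the patch and is not code from the PR:

# Sketch of an explicit wait as an alternative to the fixed time.sleep(20).
# Assumes the driver and the "list-item" result class used in cw_datamining.py.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_results(driver, timeout=30):
    # Blocks until at least one search result is present, or raises TimeoutException.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "list-item"))
    )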
From 6ed0d74320426707dbc11218e082431b14782ceb Mon Sep 17 00:00:00 2001
From: star-nox
Date: Fri, 26 Jul 2024 09:56:31 -0500
Subject: [PATCH 2/3] added chrome binary path for selenium

---
 ai_ta_backend/utils/cw_datamining.py | 50 ++++++++++++++++++++++++++--
 requirements.txt                     |  4 +++
 2 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/ai_ta_backend/utils/cw_datamining.py b/ai_ta_backend/utils/cw_datamining.py
index 544b2fdf..558a9c45 100644
--- a/ai_ta_backend/utils/cw_datamining.py
+++ b/ai_ta_backend/utils/cw_datamining.py
@@ -6,6 +6,7 @@ from selenium.webdriver.chrome.options import Options
 from webdriver_manager.chrome import ChromeDriverManager
 import time
+import crossref_commons
 
 def extract_article_metadata(search_str: str) -> str:
     """
@@ -21,20 +22,52 @@ def extract_article_metadata(search_str: str) -> str:
     search_results = extract_doi(search_results)
 
     # fetch metadata for each DOI using crossref API
-
-
-
+    for article in search_results:
+        doi = article['doi']
+        if doi != "N/A":
+            metadata = get_article_metadata_from_crossref(doi)
+            article['doi_number'] = doi
+            article['publisher'] = metadata['publisher']
+
+            if 'license' in metadata:
+                print("license: ", metadata['license'])
+
+                for ele in metadata['license']:
+                    if ele['content-version'] == 'tdm':
+                        article['license'] = ele['URL']
+                        break
+                # if license is still empty, go for vor
+                if 'license' not in article:
+                    for ele in metadata['license']:
+                        if ele['content-version'] == 'vor':
+                            article['license'] = ele['URL']
+                            break
+                    else:
+                        article['license'] = "N/A"
+
+            else:
+                article['license'] = "N/A"
+
+            article['metadata'] = metadata
+
+            print("Article: ", article)
+
+    print("Full list of articles: ", search_results)
     return "Article metadata extracted successfully."
 
 
 def get_search_results(query):
     # Set up Selenium with Chrome WebDriver
+    chrome_binary_path = "C:/Program Files/Google/Chrome/Application/chrome.exe"
     chrome_options = Options()
+    chrome_options.binary_location = chrome_binary_path
+
     chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
     chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
     chrome_options.add_argument("--no-sandbox")  # Required for running as root
     chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
+
     # Use ChromeDriverManager to automatically manage the driver
     driver_service = Service(ChromeDriverManager().install())
     driver = webdriver.Chrome(service=driver_service, options=chrome_options)
@@ -111,6 +144,17 @@ def extract_doi(article_list: list):
     return article_list
 
 
+
+def get_article_metadata_from_crossref(doi: str):
+    """
+    Get article metadata from Crossref API.
+    """
+    # Get metadata from Crossref
+    metadata = crossref_commons.retrieval.get_publication_as_json(doi)
+    print("Metadata: ", metadata)
+
+    return metadata
+
     
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index d228546a..74434757 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -67,3 +67,7 @@ sentry-sdk==1.39.1
 # unstructured.pytesseract==0.3.12
 # unstructured-inference==0.7.11 # this is the real large one :(
 # unstructured[xlsx,image,pptx]==0.10.29 # causes huge ~5.3 GB of installs. Probbably from onnx: https://github.com/Unstructured-IO/unstructured/blob/ad14321016533dc03c1782f6ebea00bc9c804846/requirements/extra-pdf-image.in#L4
+
+selenium
+webdriver_manager
+crossref_commons
\ No newline at end of file
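The license handling added in this patch prefers a 'tdm' (text-and-data-mining) license URL from the Crossref record and falls back to 'vor' (version of record), defaulting to "N/A". Written as a standalone helper, the same preference order looks roughly like this (a sketch based on the Crossref metadata shape used above; pick_license_url is a hypothetical name, not a function in the PR):

# Sketch of the tdm-then-vor license preference used in the patch above.
# Crossref license entries carry a "URL" and a "content-version" field.
def pick_license_url(metadata: dict) -> str:
    licenses = metadata.get("license", [])
    for preferred in ("tdm", "vor"):
        for entry in licenses:
            if entry.get("content-version") == preferred:
                return entry["URL"]
    return "N/A"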
From a1abcc08088500d27eedef6ed1b45d32c7f527e8 Mon Sep 17 00:00:00 2001
From: star-nox
Date: Wed, 4 Sep 2024 10:18:17 -0500
Subject: [PATCH 3/3] updated data mining script

---
 ai_ta_backend/utils/cw_datamining.py | 252 ++++++++++++++++-----------
 1 file changed, 152 insertions(+), 100 deletions(-)

diff --git a/ai_ta_backend/utils/cw_datamining.py b/ai_ta_backend/utils/cw_datamining.py
index 558a9c45..a67bb11a 100644
--- a/ai_ta_backend/utils/cw_datamining.py
+++ b/ai_ta_backend/utils/cw_datamining.py
@@ -1,12 +1,26 @@
-import os 
+import os
 import requests
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service as ChromeService
+
 from webdriver_manager.chrome import ChromeDriverManager
 import time
-import crossref_commons
+import json
+import pandas as pd
+import crossref_commons.retrieval
+import shutil
+import supabase
+from supabase import create_client, Client
+
+# Initialize the Supabase client
+url = os.environ.get("SUPABASE_URL")
+key = os.environ.get("SUPABASE_API_KEY")
+SUPABASE_CLIENT: Client = create_client(url, key)
+
+LOG = "log.txt"
 
 def extract_article_metadata(search_str: str) -> str:
     """
@@ -14,135 +28,173 @@ def extract_article_metadata(search_str: str) -> str:
     Store the metadata in a SQL database.
     """
     print("Extracting article metadata from NAL website...")
-    
+    start_time = time.time()
     # get list of articles - 1st page
     search_results = get_search_results(search_str)
+    search_time = time.time()
+    print("Time taken to search results: ", search_time - start_time)
 
     # for each article, go one level deeper to extract DOI
-    search_results = extract_doi(search_results)
-
-    # fetch metadata for each DOI using crossref API
-    for article in search_results:
-        doi = article['doi']
-        if doi != "N/A":
-            metadata = get_article_metadata_from_crossref(doi)
-            article['doi_number'] = doi
-            article['publisher'] = metadata['publisher']
-
-            if 'license' in metadata:
-                print("license: ", metadata['license'])
-
-                for ele in metadata['license']:
-                    if ele['content-version'] == 'tdm':
-                        article['license'] = ele['URL']
-                        break
-                # if license is still empty, go for vor
-                if 'license' not in article:
-                    for ele in metadata['license']:
-                        if ele['content-version'] == 'vor':
-                            article['license'] = ele['URL']
-                            break
-                    else:
-                        article['license'] = "N/A"
+    search_results = extract_doi(search_results, SUPABASE_CLIENT)
+    doi_time = time.time()
+    print("Time taken to extract DOI and upload metadata: ", doi_time - search_time)
 
-            else:
-                article['license'] = "N/A"
-
-            article['metadata'] = metadata
-
-            print("Article: ", article)
-
-    print("Full list of articles: ", search_results)
+
     return "Article metadata extracted successfully." 
-    
 def get_search_results(query):
-    # Set up Selenium with Chrome WebDriver
-    chrome_binary_path = "C:/Program Files/Google/Chrome/Application/chrome.exe"
     chrome_options = Options()
-    chrome_options.binary_location = chrome_binary_path
-    
     chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
     chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
     chrome_options.add_argument("--no-sandbox")  # Required for running as root
     chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
+    chrome_options.add_argument("--log-level=3")
+    chrome_path = shutil.which("google-chrome")
+    print(f"Chrome binary path: {chrome_path}")
+    # path = "C:/Users/dabho/.wdm/drivers/chromedriver/win64/127.0.6533.72/chromedriver-win32/chromedriver.exe"
+    # driver_service = Service(path)
+    #chrome_options.binary_location ="C:/Users/dabho/.wdm/drivers/chromedriver/win64/127.0.6533.72/chromedriver-win32/chromedriver.exe"
+
+    #chrome_options.binary_location = "/usr/bin/google-chrome"
+    chrome_options.binary_location = "/opt/google/chrome/google-chrome"
 
-    # Use ChromeDriverManager to automatically manage the driver
-    driver_service = Service(ChromeDriverManager().install())
-    driver = webdriver.Chrome(service=driver_service, options=chrome_options)
-
-    try:
-        # Construct the URL with the query
-        base_url = "https://search.nal.usda.gov/discovery/search"
-        params = f"?query=any,contains,{query}&tab=pubag&search_scope=pubag&vid=01NAL_INST:MAIN&facet=tlevel,include,open_access&offset=0"
-        url = base_url + params
-        print("URL: ", url)
-        # Load the page
-        driver.get(url)
-
-        # Wait for the page to load (you may need to adjust the sleep time)
-        time.sleep(20)
-
-        # Find the search results
-        results = driver.find_elements(By.CLASS_NAME, 'list-item')
-        print("Results: ", len(results))
-
-        # Extract the titles and links
-        search_results = []
-        for result in results:
-            title_element = result.find_element(By.CLASS_NAME, 'item-title')
-            title = title_element.text.strip()
-            link = title_element.find_element(By.TAG_NAME, 'a').get_attribute('href')
-            search_results.append({'title': title, 'link': link})
-
-        return search_results
-    finally:
-        # Close the browser
-        driver.quit()
+    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
+
+
+    #driver = webdriver.Chrome(service=driver_service, options=chrome_options)
+
+    count = 0
+    search_results = []
+    sleep_time = 0
+    while count < 1000:
+        try:
+            # Construct the URL with the query
+            base_url = "https://search.nal.usda.gov/discovery/search"
+            params = f"?query=any,contains,{query}&tab=pubag&search_scope=pubag&vid=01NAL_INST:MAIN&offset={count}"
+            url = base_url + params
+            print("URL: ", url)
+            # Load the page
+            driver.get(url)
+
+            # Wait for the page to load (you may need to adjust the sleep time)
+            time.sleep(sleep_time)
+
+            # Find the search results
+            results = driver.find_elements(By.CLASS_NAME, 'list-item')
+            while len(results) == 0 and sleep_time < 30:
+                sleep_time += 1
+                print("Sleeping for ", sleep_time, " seconds")
+                time.sleep(sleep_time)
+                results = driver.find_elements(By.CLASS_NAME, 'list-item')
+
+            if len(results) == 0:
+                print("No results found after count: ", count)
+                break
+
+            # Extract the titles and links
+            for result in results:
+                title_element = result.find_element(By.CLASS_NAME, 'item-title')
+                title = title_element.text.strip()
+                link = title_element.find_element(By.TAG_NAME, 'a').get_attribute('href')
+                search_results.append({'title': title, 'link': link})
+
+
+        except Exception as e:
+            print(e)
+
+        count += 10
+
+    driver.quit()
+    return search_results
+
+
-def extract_doi(article_list: list):
-    """
-    Extract DOI from the article page and append to article_list dictionary. 
-    """
-    # Set up Selenium with Chrome WebDriver
+def extract_doi(main_results, supabase_client):
     chrome_options = Options()
     chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
     chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
     chrome_options.add_argument("--no-sandbox")  # Required for running as root
     chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
 
-    # Use ChromeDriverManager to automatically manage the driver
-    driver_service = Service(ChromeDriverManager().install())
-    driver = webdriver.Chrome(service=driver_service, options=chrome_options)
+    #path = "C:/Users/dabho/.wdm/drivers/chromedriver/win64/127.0.6533.72/chromedriver-win32/chromedriver.exe"
+    #driver_service = Service(path)
+    chrome_options.binary_location = "/usr/bin/google-chrome"
+    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
 
-    for item in article_list:
-        page_link = item['link']
+    sleep_time = 0
+    for item in main_results:
+        link = item['link']
+        try:
+            start_time = time.time()
 
-        # Load the page
-        driver.get(page_link)
+            # Load the page
+            driver.get(link)
 
-        # Wait for the page to load (you may need to adjust the sleep time)
-        time.sleep(20)
+            # Wait for the page to load (adjust sleep time if needed)
+            time.sleep(sleep_time)
 
-        # Find the search results
-        results = driver.find_elements(By.ID, 'item-details')
-        #print("Results: ", results)
+            # Find the search results
+            results = driver.find_elements(By.ID, 'item-details')
+            while not results and sleep_time < 30:
+                sleep_time += 5
+                print("Sleeping for ", sleep_time, " seconds")
+                time.sleep(sleep_time)
+                results = driver.find_elements(By.ID, 'item-details')
+
+            if not results:
+                item['doi'] = "N/A"
+                continue
 
-        # Extract the titles and links
-        for result in results:
+            # Extract the DOI link
+            for result in results:
+                try:
+                    doi_link_element = result.find_element(By.XPATH, './/a[contains(@href, "https://doi.org/")]')
+                    doi_link = doi_link_element.get_attribute("href")
+                except Exception:
+                    doi_link = "N/A"
+                item['doi'] = doi_link
+
+            # Extract DOI from the link
+            try:
+                doi = doi_link.split("https://doi.org/")[1]
+                print("DOI:", doi)
+            except Exception:
+                continue
+
+            # Get metadata of the article
+            item_metadata = get_article_metadata_from_crossref(doi)
+            item['doi_number'] = doi
+            item['publisher'] = item_metadata.get('publisher', 'N/A')
+            item['metadata'] = item_metadata
+
+            if 'license' in item_metadata:
+                # Look for TDM license
+                for ele in item_metadata['license']:
+                    if ele['content-version'] == 'tdm':
+                        item['license'] = ele['URL']
+                        break
 
-            try:
-                doi_link_element = result.find_element(By.XPATH, './/a[contains(@href, "https://doi.org/")]')
-                doi_link = doi_link_element.get_attribute("href")
-            except Exception as e:
-                doi_link = "N/A"
-            item['doi'] = doi_link
+                # If no TDM license, look for VOR license
+                if 'license' not in item:
+                    for ele in item_metadata['license']:
+                        if ele['content-version'] == 'vor':
+                            item['license'] = ele['URL']
+                            break
+
+            # Upload to SQL
+            response = supabase_client.table("nal_publications").insert(item).execute()
+            #print(response)
+            end_time = time.time()
+            print("Time taken to process 1 article: ", end_time - start_time)
+
+        except Exception as e:
+            print(e)
+
     # Close the browser
     driver.quit()
 
-    return article_list
+    return "success"
 
 
 def get_article_metadata_from_crossref(doi: str):
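Because the crawl can run for many minutes, the /nal-data-mining route from the first patch holds the HTTP request open for the whole scrape. main.py already creates a Flask-Executor instance (executor = Executor(app)), so one option is to submit the job and return immediately; the variant below is an illustrative sketch under that assumption, not code from the patches:

# Illustrative sketch: run the NAL scrape in the background with the existing
# flask_executor.Executor instead of blocking the GET request.
@app.route('/nal-data-mining', methods=['GET'])
def nal_data_mining() -> Response:
  search_query: str = request.args.get('search_query', default='', type=str)
  if search_query == '':
    abort(400, description="Missing required parameter: 'search_query' must be provided.")

  # executor.submit() schedules extract_article_metadata without blocking this request.
  executor.submit(extract_article_metadata, search_query)

  response = jsonify({"status": "started", "search_query": search_query})
  response.headers.add('Access-Control-Allow-Origin', '*')
  return response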