diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py
index fecfd3be..f763fceb 100644
--- a/ai_ta_backend/main.py
+++ b/ai_ta_backend/main.py
@@ -41,6 +41,8 @@
 from ai_ta_backend.service.sentry_service import SentryService
 from ai_ta_backend.service.workflow_service import WorkflowService
 
+from ai_ta_backend.utils.cw_datamining import extract_article_metadata
+
 app = Flask(__name__)
 CORS(app)
 executor = Executor(app)
@@ -512,6 +514,24 @@ def run_flow(service: WorkflowService) -> Response:
     response.status_code = 500
     response.headers.add('Access-Control-Allow-Origin', '*')
     return response
+
+@app.route('/nal-data-mining', methods=['GET'])
+def nal_data_mining() -> Response:
+  """
+  Queries NAL website with keywords to extract article data.
+
+  Requires a non-blank `search_query` URL parameter; aborts with HTTP 400 otherwise.
+  """
+  search_query: str = request.args.get('search_query', default='', type=str).strip()
+
+  if search_query == '':
+    # proper web error "400 Bad request"
+    abort(400, description=f"Missing required parameter: 'search_query' must be provided. Search query: `{search_query}`")
+
+  result = extract_article_metadata(search_query)
+  response = jsonify(result)
+  response.headers.add('Access-Control-Allow-Origin', '*')
+  return response
 
 
 def configure(binder: Binder) -> None:
diff --git a/ai_ta_backend/utils/cw_datamining.py b/ai_ta_backend/utils/cw_datamining.py
new file mode 100644
index 00000000..a67bb11a
--- /dev/null
+++ b/ai_ta_backend/utils/cw_datamining.py
@@ -0,0 +1,242 @@
+import os
+import requests
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service as ChromeService
+
+from webdriver_manager.chrome import ChromeDriverManager
+import time
+import json
+import pandas as pd
+import crossref_commons.retrieval
+import shutil
+import supabase
+from supabase import create_client, Client
+from urllib.parse import quote
+
+# Initialize the Supabase client
+url = os.environ.get("SUPABASE_URL")
+key = os.environ.get("SUPABASE_API_KEY")
+SUPABASE_CLIENT: Client = create_client(url, key)
+
+LOG = "log.txt"
+
+# Chrome locations tried, in order, when `google-chrome` is not on PATH.
+CHROME_FALLBACK_PATHS = ("/opt/google/chrome/google-chrome", "/usr/bin/google-chrome")
+
+
+def extract_article_metadata(search_str: str) -> str:
+    """
+    Extract article metadata from NAL website.
+    Store the metadata in a SQL database.
+
+    Searches NAL for `search_str`, then resolves each hit's DOI, fetches its
+    Crossref metadata and uploads the record. Returns a fixed status message.
+    """
+    print("Extracting article metadata from NAL website...")
+    start_time = time.time()
+    # get list of articles, one results page at a time
+    search_results = get_search_results(search_str)
+    search_time = time.time()
+    print("Time taken to search results: ", search_time - start_time)
+
+    # for each article, go one level deeper to extract DOI
+    extract_doi(search_results, SUPABASE_CLIENT)
+    doi_time = time.time()
+    print("Time taken to extract DOI and upload metadata: ", doi_time - search_time)
+
+    return "Article metadata extracted successfully."
+
+
+def _create_headless_driver():
+    """
+    Build the headless Chrome WebDriver used by both scraping passes.
+    """
+    chrome_options = Options()
+    chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
+    chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
+    chrome_options.add_argument("--no-sandbox")  # Required for running as root
+    chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
+    chrome_options.add_argument("--log-level=3")
+
+    # Pin the binary only when one is actually found; otherwise Selenium is
+    # left to locate Chrome itself instead of failing on a hard-coded path.
+    chrome_path = shutil.which("google-chrome")
+    if chrome_path is None:
+        chrome_path = next((path for path in CHROME_FALLBACK_PATHS if os.path.exists(path)), None)
+    print(f"Chrome binary path: {chrome_path}")
+    if chrome_path:
+        chrome_options.binary_location = chrome_path
+
+    # The options have to be handed to the driver, otherwise none of the
+    # flags above (including --headless) take effect.
+    return webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)
+
+
+def _pick_license_url(licenses):
+    """
+    Return the TDM license URL if there is one, else the VOR one, else None.
+    """
+    for wanted_version in ('tdm', 'vor'):
+        for entry in licenses:
+            if entry.get('content-version') == wanted_version and entry.get('URL'):
+                return entry['URL']
+    return None
+
+
+def get_search_results(query):
+    """
+    Collect title/link pairs for `query` from the NAL PubAg search.
+
+    Requests result pages at offsets 0, 10, ... 990 and stops at the first
+    page on which no results show up. Returns a list of dicts with the keys
+    'title' and 'link'.
+    """
+    driver = _create_headless_driver()
+
+    count = 0
+    search_results = []
+    sleep_time = 0
+    base_url = "https://search.nal.usda.gov/discovery/search"
+    # The query comes from the HTTP request, so percent-encode it before it
+    # is embedded in the URL (spaces, '&', '#' or ',' would corrupt it).
+    encoded_query = quote(query, safe="")
+
+    try:
+        while count < 1000:
+            try:
+                # Construct the URL with the query
+                params = f"?query=any,contains,{encoded_query}&tab=pubag&search_scope=pubag&vid=01NAL_INST:MAIN&offset={count}"
+                url = base_url + params
+                print("URL: ", url)
+                # Load the page
+                driver.get(url)
+
+                # Wait for the page to load (you may need to adjust the sleep time)
+                time.sleep(sleep_time)
+
+                # Find the search results, waiting longer while the page is still empty
+                results = driver.find_elements(By.CLASS_NAME, 'list-item')
+                while not results and sleep_time < 30:
+                    sleep_time += 1
+                    print("Sleeping for ", sleep_time, " seconds")
+                    time.sleep(sleep_time)
+                    results = driver.find_elements(By.CLASS_NAME, 'list-item')
+
+                if not results:
+                    print("No results found after count: ", count)
+                    break
+
+                # Extract the titles and links
+                for result in results:
+                    title_element = result.find_element(By.CLASS_NAME, 'item-title')
+                    title = title_element.text.strip()
+                    link = title_element.find_element(By.TAG_NAME, 'a').get_attribute('href')
+                    search_results.append({'title': title, 'link': link})
+
+            except Exception as e:
+                # Best effort: log the failing page and move on to the next offset.
+                print(e)
+
+            count += 10
+    finally:
+        # Always release the browser, even on an unexpected error.
+        driver.quit()
+
+    return search_results
+
+
+def extract_doi(main_results, supabase_client):
+    """
+    Resolve the DOI of every search hit and upload its metadata.
+
+    Each dict in `main_results` is updated in place with 'doi' and, when a
+    DOI link is found, 'doi_number', 'publisher', 'metadata' and (if
+    available) 'license'; it is then inserted into the "nal_publications"
+    table through `supabase_client`. Returns the string "success".
+    """
+    driver = _create_headless_driver()
+
+    sleep_time = 0
+    try:
+        for item in main_results:
+            try:
+                link = item['link']
+                start_time = time.time()
+
+                # Load the page
+                driver.get(link)
+
+                # Wait for the page to load (adjust sleep time if needed)
+                time.sleep(sleep_time)
+
+                # Find the article details, waiting longer while the page is still empty
+                results = driver.find_elements(By.ID, 'item-details')
+                while not results and sleep_time < 30:
+                    sleep_time += 5
+                    print("Sleeping for ", sleep_time, " seconds")
+                    time.sleep(sleep_time)
+                    results = driver.find_elements(By.ID, 'item-details')
+
+                if not results:
+                    item['doi'] = "N/A"
+                    continue
+
+                # Extract the DOI link from the first details element that has one
+                doi_link = "N/A"
+                for result in results:
+                    try:
+                        doi_link_element = result.find_element(By.XPATH, './/a[contains(@href, "https://doi.org/")]')
+                    except Exception:
+                        continue
+                    doi_link = doi_link_element.get_attribute("href") or "N/A"
+                    break
+                item['doi'] = doi_link
+
+                # Extract DOI from the link; articles without one are not uploaded
+                if "https://doi.org/" not in doi_link:
+                    continue
+                doi = doi_link.split("https://doi.org/", 1)[1]
+                print("DOI:", doi)
+
+                # Get metadata of the article
+                item_metadata = get_article_metadata_from_crossref(doi)
+                item['doi_number'] = doi
+                item['publisher'] = item_metadata.get('publisher', 'N/A')
+                item['metadata'] = item_metadata
+
+                # Prefer the TDM license and fall back to the VOR one
+                license_url = _pick_license_url(item_metadata.get('license', []))
+                if license_url:
+                    item['license'] = license_url
+
+                # Upload to SQL
+                supabase_client.table("nal_publications").insert(item).execute()
+
+                end_time = time.time()
+                print("Time taken to process 1 article: ", end_time - start_time)
+
+            except Exception as e:
+                # Best effort: log the failing article and continue with the next one.
+                print(e)
+    finally:
+        # Close the browser, even on an unexpected error
+        driver.quit()
+
+    return "success"
+
+
+def get_article_metadata_from_crossref(doi: str):
+    """
+    Get article metadata from Crossref API.
+
+    Returns the result of the crossref_commons lookup for `doi`; errors
+    raised by that lookup propagate to the caller.
+    """
+    # Get metadata from Crossref
+    metadata = crossref_commons.retrieval.get_publication_as_json(doi)
+    print("Metadata: ", metadata)
+
+    return metadata
diff --git a/requirements.txt b/requirements.txt
index 8d5dd65b..4d06035a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -68,3 +68,7 @@ sentry-sdk==1.39.1
 # unstructured.pytesseract==0.3.12
 # unstructured-inference==0.7.11 # this is the real large one :(
 # unstructured[xlsx,image,pptx]==0.10.29 # causes huge ~5.3 GB of installs. Probbably from onnx: https://github.com/Unstructured-IO/unstructured/blob/ad14321016533dc03c1782f6ebea00bc9c804846/requirements/extra-pdf-image.in#L4
+
+selenium
+webdriver_manager
+crossref_commons
\ No newline at end of file