From 306c48ebd597b0beb3ce2037651478aa54ef016e Mon Sep 17 00:00:00 2001
From: star-nox
Date: Thu, 25 Jul 2024 21:48:17 -0500
Subject: [PATCH 1/3] created API endpoint for NAL datamining

---
 ai_ta_backend/main.py                |  18 +++++
 ai_ta_backend/utils/cw_datamining.py | 116 +++++++++++++++++++++++++++
 2 files changed, 134 insertions(+)
 create mode 100644 ai_ta_backend/utils/cw_datamining.py

diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py
index 4c91317d..079adbea 100644
--- a/ai_ta_backend/main.py
+++ b/ai_ta_backend/main.py
@@ -41,6 +41,8 @@ from ai_ta_backend.service.sentry_service import SentryService
 from ai_ta_backend.service.workflow_service import WorkflowService
 
+from ai_ta_backend.utils.cw_datamining import extract_article_metadata
+
 app = Flask(__name__)
 CORS(app)
 executor = Executor(app)
@@ -478,6 +480,22 @@ def run_flow(service: WorkflowService) -> Response:
     response.status_code = 500
   response.headers.add('Access-Control-Allow-Origin', '*')
   return response
+
+@app.route('/nal-data-mining', methods=['GET'])
+def nal_data_mining() -> Response:
+  """
+  Queries NAL website with keywords to extract article data.
+  """
+  search_query: str = request.args.get('search_query', default='', type=str)
+
+  if search_query == '':
+    # proper web error "400 Bad request"
+    abort(400, description=f"Missing required parameter: 'search_query' must be provided. Search query: `{search_query}`")
+
+  result = extract_article_metadata(search_query)
+  response = jsonify(result)
+  response.headers.add('Access-Control-Allow-Origin', '*')
+  return response
 
 
 def configure(binder: Binder) -> None:
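For reference, the route added above takes a single query-string parameter. A minimal way to exercise it, assuming the Flask app is reachable at http://localhost:8000 (the host, port, and example query below are assumptions, not part of the patch):

# Illustrative sketch only -- not part of the patch series.
# Assumes the backend from main.py is running at http://localhost:8000.
import requests

resp = requests.get(
    "http://localhost:8000/nal-data-mining",
    params={"search_query": "cover crops"},  # omitting search_query returns the 400 above
    timeout=600,  # the Selenium scrape is slow, so allow a generous timeout
)
print(resp.status_code, resp.json())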
diff --git a/ai_ta_backend/utils/cw_datamining.py b/ai_ta_backend/utils/cw_datamining.py
new file mode 100644
index 00000000..544b2fdf
--- /dev/null
+++ b/ai_ta_backend/utils/cw_datamining.py
@@ -0,0 +1,116 @@
+import os
+import requests
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from webdriver_manager.chrome import ChromeDriverManager
+import time
+
+def extract_article_metadata(search_str: str) -> str:
+    """
+    Extract article metadata from NAL website.
+    Store the metadata in a SQL database.
+    """
+    print("Extracting article metadata from NAL website...")
+
+    # get list of articles - 1st page
+    search_results = get_search_results(search_str)
+
+    # for each article, go one level deeper to extract DOI
+    search_results = extract_doi(search_results)
+
+    # fetch metadata for each DOI using crossref API
+
+
+
+    return "Article metadata extracted successfully." 
+
+
+def get_search_results(query):
+    # Set up Selenium with Chrome WebDriver
+    chrome_options = Options()
+    chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
+    chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
+    chrome_options.add_argument("--no-sandbox")  # Required for running as root
+    chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
+
+    # Use ChromeDriverManager to automatically manage the driver
+    driver_service = Service(ChromeDriverManager().install())
+    driver = webdriver.Chrome(service=driver_service, options=chrome_options)
+
+    try:
+        # Construct the URL with the query
+        base_url = "https://search.nal.usda.gov/discovery/search"
+        params = f"?query=any,contains,{query}&tab=pubag&search_scope=pubag&vid=01NAL_INST:MAIN&facet=tlevel,include,open_access&offset=0"
+        url = base_url + params
+        print("URL: ", url)
+        # Load the page
+        driver.get(url)
+
+        # Wait for the page to load (you may need to adjust the sleep time)
+        time.sleep(20)
+
+        # Find the search results
+        results = driver.find_elements(By.CLASS_NAME, 'list-item')
+        print("Results: ", len(results))
+
+        # Extract the titles and links
+        search_results = []
+        for result in results:
+            title_element = result.find_element(By.CLASS_NAME, 'item-title')
+            title = title_element.text.strip()
+            link = title_element.find_element(By.TAG_NAME, 'a').get_attribute('href')
+            search_results.append({'title': title, 'link': link})
+
+        return search_results
+    finally:
+        # Close the browser
+        driver.quit()
+
+def extract_doi(article_list: list):
+    """
+    Extract DOI from the article page and append to article_list dictionary.
+    """
+    # Set up Selenium with Chrome WebDriver
+    chrome_options = Options()
+    chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
+    chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
+    chrome_options.add_argument("--no-sandbox")  # Required for running as root
+    chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
+
+    # Use ChromeDriverManager to automatically manage the driver
+    driver_service = Service(ChromeDriverManager().install())
+    driver = webdriver.Chrome(service=driver_service, options=chrome_options)
+
+    for item in article_list:
+        page_link = item['link']
+
+        # Load the page
+        driver.get(page_link)
+
+        # Wait for the page to load (you may need to adjust the sleep time)
+        time.sleep(20)
+
+        # Find the search results
+        results = driver.find_elements(By.ID, 'item-details')
+        #print("Results: ", results)
+
+        # Extract the titles and links
+        for result in results:
+
+            try:
+                doi_link_element = result.find_element(By.XPATH, './/a[contains(@href, "https://doi.org/")]')
+                doi_link = doi_link_element.get_attribute("href")
+            except Exception as e:
+                doi_link = "N/A"
+            item['doi'] = doi_link
+
+    # Close the browser
+    driver.quit()
+
+    return article_list
+
+
+
+    
\ No newline at end of file
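Both helpers above pause with a fixed time.sleep(20) per page, which costs 20 seconds even when the results render sooner. An explicit wait is one possible alternative; the sketch below assumes the same 'list-item' class name and headless Chrome driver as the patch and is not code from the PR:

# Sketch of an explicit wait as an alternative to the fixed time.sleep(20).
# Assumes the driver and the "list-item" result class used in cw_datamining.py.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_results(driver, timeout=30):
    # Blocks until at least one search result is present, or raises TimeoutException.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "list-item"))
    )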
From 6ed0d74320426707dbc11218e082431b14782ceb Mon Sep 17 00:00:00 2001
From: star-nox
Date: Fri, 26 Jul 2024 09:56:31 -0500
Subject: [PATCH 2/3] added chrome binary path for selenium

---
 ai_ta_backend/utils/cw_datamining.py | 50 ++++++++++++++++++++++++++--
 requirements.txt                     |  4 +++
 2 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/ai_ta_backend/utils/cw_datamining.py b/ai_ta_backend/utils/cw_datamining.py
index 544b2fdf..558a9c45 100644
--- a/ai_ta_backend/utils/cw_datamining.py
+++ b/ai_ta_backend/utils/cw_datamining.py
@@ -6,6 +6,7 @@ from selenium.webdriver.chrome.options import Options
 from webdriver_manager.chrome import ChromeDriverManager
 import time
+import crossref_commons
 
 def extract_article_metadata(search_str: str) -> str:
     """
@@ -21,20 +22,52 @@ def extract_article_metadata(search_str: str) -> str:
     search_results = extract_doi(search_results)
 
     # fetch metadata for each DOI using crossref API
-
-
-
+    for article in search_results:
+        doi = article['doi']
+        if doi != "N/A":
+            metadata = get_article_metadata_from_crossref(doi)
+            article['doi_number'] = doi
+            article['publisher'] = metadata['publisher']
+
+            if 'license' in metadata:
+                print("license: ", metadata['license'])
+
+                for ele in metadata['license']:
+                    if ele['content-version'] == 'tdm':
+                        article['license'] = ele['URL']
+                        break
+                # if license is still empty, go for vor
+                if 'license' not in article:
+                    for ele in metadata['license']:
+                        if ele['content-version'] == 'vor':
+                            article['license'] = ele['URL']
+                            break
+                    else:
+                        article['license'] = "N/A"
+
+            else:
+                article['license'] = "N/A"
+
+            article['metadata'] = metadata
+
+            print("Article: ", article)
+
+    print("Full list of articles: ", search_results)
     return "Article metadata extracted successfully."
 
 
 def get_search_results(query):
     # Set up Selenium with Chrome WebDriver
+    chrome_binary_path = "C:/Program Files/Google/Chrome/Application/chrome.exe"
     chrome_options = Options()
+    chrome_options.binary_location = chrome_binary_path
+
     chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
     chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
     chrome_options.add_argument("--no-sandbox")  # Required for running as root
     chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
+
     # Use ChromeDriverManager to automatically manage the driver
     driver_service = Service(ChromeDriverManager().install())
     driver = webdriver.Chrome(service=driver_service, options=chrome_options)
@@ -111,6 +144,17 @@ def extract_doi(article_list: list):
     return article_list
 
 
+
+def get_article_metadata_from_crossref(doi: str):
+    """
+    Get article metadata from Crossref API.
+    """
+    # Get metadata from Crossref
+    metadata = crossref_commons.retrieval.get_publication_as_json(doi)
+    print("Metadata: ", metadata)
+
+    return metadata
+
     
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index d228546a..74434757 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -67,3 +67,7 @@ sentry-sdk==1.39.1
 # unstructured.pytesseract==0.3.12
 # unstructured-inference==0.7.11 # this is the real large one :(
 # unstructured[xlsx,image,pptx]==0.10.29 # causes huge ~5.3 GB of installs. Probbably from onnx: https://github.com/Unstructured-IO/unstructured/blob/ad14321016533dc03c1782f6ebea00bc9c804846/requirements/extra-pdf-image.in#L4
+
+selenium
+webdriver_manager
+crossref_commons
\ No newline at end of file
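The license handling added in this patch prefers a 'tdm' (text-and-data-mining) license URL from the Crossref record and falls back to 'vor' (version of record), defaulting to "N/A". Written as a standalone helper, the same preference order looks roughly like this (a sketch based on the Crossref metadata shape used above; pick_license_url is a hypothetical name, not a function in the PR):

# Sketch of the tdm-then-vor license preference used in the patch above.
# Crossref license entries carry a "URL" and a "content-version" field.
def pick_license_url(metadata: dict) -> str:
    licenses = metadata.get("license", [])
    for preferred in ("tdm", "vor"):
        for entry in licenses:
            if entry.get("content-version") == preferred:
                return entry["URL"]
    return "N/A"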
From a1abcc08088500d27eedef6ed1b45d32c7f527e8 Mon Sep 17 00:00:00 2001
From: star-nox
Date: Wed, 4 Sep 2024 10:18:17 -0500
Subject: [PATCH 3/3] updated data mining script

---
 ai_ta_backend/utils/cw_datamining.py | 252 ++++++++++++++++-----------
 1 file changed, 152 insertions(+), 100 deletions(-)

diff --git a/ai_ta_backend/utils/cw_datamining.py b/ai_ta_backend/utils/cw_datamining.py
index 558a9c45..a67bb11a 100644
--- a/ai_ta_backend/utils/cw_datamining.py
+++ b/ai_ta_backend/utils/cw_datamining.py
@@ -1,12 +1,26 @@
-import os 
+import os
 import requests
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service as ChromeService
+
 from webdriver_manager.chrome import ChromeDriverManager
 import time
-import crossref_commons
+import json
+import pandas as pd
+import crossref_commons.retrieval
+import shutil
+import supabase
+from supabase import create_client, Client
+
+# Initialize the Supabase client
+url = os.environ.get("SUPABASE_URL")
+key = os.environ.get("SUPABASE_API_KEY")
+SUPABASE_CLIENT: Client = create_client(url, key)
+
+LOG = "log.txt"
 
 def extract_article_metadata(search_str: str) -> str:
     """
@@ -14,135 +28,173 @@ def extract_article_metadata(search_str: str) -> str:
     Store the metadata in a SQL database.
     """
     print("Extracting article metadata from NAL website...")
-    
+    start_time = time.time()
     # get list of articles - 1st page
     search_results = get_search_results(search_str)
+    search_time = time.time()
+    print("Time taken to search results: ", search_time - start_time)
 
     # for each article, go one level deeper to extract DOI
-    search_results = extract_doi(search_results)
-
-    # fetch metadata for each DOI using crossref API
-    for article in search_results:
-        doi = article['doi']
-        if doi != "N/A":
-            metadata = get_article_metadata_from_crossref(doi)
-            article['doi_number'] = doi
-            article['publisher'] = metadata['publisher']
-
-            if 'license' in metadata:
-                print("license: ", metadata['license'])
-
-                for ele in metadata['license']:
-                    if ele['content-version'] == 'tdm':
-                        article['license'] = ele['URL']
-                        break
-                # if license is still empty, go for vor
-                if 'license' not in article:
-                    for ele in metadata['license']:
-                        if ele['content-version'] == 'vor':
-                            article['license'] = ele['URL']
-                            break
-                    else:
-                        article['license'] = "N/A"
+    search_results = extract_doi(search_results, SUPABASE_CLIENT)
+    doi_time = time.time()
+    print("Time taken to extract DOI and upload metadata: ", doi_time - search_time)
 
-            else:
-                article['license'] = "N/A"
-
-            article['metadata'] = metadata
-
-            print("Article: ", article)
-
-    print("Full list of articles: ", search_results)
+
     return "Article metadata extracted successfully." 
-    
 def get_search_results(query):
-    # Set up Selenium with Chrome WebDriver
-    chrome_binary_path = "C:/Program Files/Google/Chrome/Application/chrome.exe"
     chrome_options = Options()
-    chrome_options.binary_location = chrome_binary_path
-    
     chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
     chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
     chrome_options.add_argument("--no-sandbox")  # Required for running as root
     chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
+    chrome_options.add_argument("--log-level=3")
+    chrome_path = shutil.which("google-chrome")
+    print(f"Chrome binary path: {chrome_path}")
+    # path = "C:/Users/dabho/.wdm/drivers/chromedriver/win64/127.0.6533.72/chromedriver-win32/chromedriver.exe"
+    # driver_service = Service(path)
+    #chrome_options.binary_location ="C:/Users/dabho/.wdm/drivers/chromedriver/win64/127.0.6533.72/chromedriver-win32/chromedriver.exe"
+
+    #chrome_options.binary_location = "/usr/bin/google-chrome"
+    chrome_options.binary_location = "/opt/google/chrome/google-chrome"
 
-    # Use ChromeDriverManager to automatically manage the driver
-    driver_service = Service(ChromeDriverManager().install())
-    driver = webdriver.Chrome(service=driver_service, options=chrome_options)
-
-    try:
-        # Construct the URL with the query
-        base_url = "https://search.nal.usda.gov/discovery/search"
-        params = f"?query=any,contains,{query}&tab=pubag&search_scope=pubag&vid=01NAL_INST:MAIN&facet=tlevel,include,open_access&offset=0"
-        url = base_url + params
-        print("URL: ", url)
-        # Load the page
-        driver.get(url)
-
-        # Wait for the page to load (you may need to adjust the sleep time)
-        time.sleep(20)
-
-        # Find the search results
-        results = driver.find_elements(By.CLASS_NAME, 'list-item')
-        print("Results: ", len(results))
-
-        # Extract the titles and links
-        search_results = []
-        for result in results:
-            title_element = result.find_element(By.CLASS_NAME, 'item-title')
-            title = title_element.text.strip()
-            link = title_element.find_element(By.TAG_NAME, 'a').get_attribute('href')
-            search_results.append({'title': title, 'link': link})
-
-        return search_results
-    finally:
-        # Close the browser
-        driver.quit()
+    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
+
+
+    #driver = webdriver.Chrome(service=driver_service, options=chrome_options)
+
+    count = 0
+    search_results = []
+    sleep_time = 0
+    while count < 1000:
+        try:
+            # Construct the URL with the query
+            base_url = "https://search.nal.usda.gov/discovery/search"
+            params = f"?query=any,contains,{query}&tab=pubag&search_scope=pubag&vid=01NAL_INST:MAIN&offset={count}"
+            url = base_url + params
+            print("URL: ", url)
+            # Load the page
+            driver.get(url)
+
+            # Wait for the page to load (you may need to adjust the sleep time)
+            time.sleep(sleep_time)
+
+            # Find the search results
+            results = driver.find_elements(By.CLASS_NAME, 'list-item')
+            while len(results) == 0 and sleep_time < 30:
+                sleep_time += 1
+                print("Sleeping for ", sleep_time, " seconds")
+                time.sleep(sleep_time)
+                results = driver.find_elements(By.CLASS_NAME, 'list-item')
+
+            if len(results) == 0:
+                print("No results found after count: ", count)
+                break
+
+            # Extract the titles and links
+            for result in results:
+                title_element = result.find_element(By.CLASS_NAME, 'item-title')
+                title = title_element.text.strip()
+                link = title_element.find_element(By.TAG_NAME, 'a').get_attribute('href')
+                search_results.append({'title': title, 'link': link})
+
+
+        except Exception as e:
+            print(e)
+
+        count += 10
+
+    driver.quit()
+    return search_results
+
+
-def extract_doi(article_list: list):
-    """
-    Extract DOI from the article page and append to article_list dictionary. 
-    """
-    # Set up Selenium with Chrome WebDriver
+def extract_doi(main_results, supabase_client):
     chrome_options = Options()
     chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
     chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
     chrome_options.add_argument("--no-sandbox")  # Required for running as root
     chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
 
-    # Use ChromeDriverManager to automatically manage the driver
-    driver_service = Service(ChromeDriverManager().install())
-    driver = webdriver.Chrome(service=driver_service, options=chrome_options)
+    #path = "C:/Users/dabho/.wdm/drivers/chromedriver/win64/127.0.6533.72/chromedriver-win32/chromedriver.exe"
+    #driver_service = Service(path)
+    chrome_options.binary_location = "/usr/bin/google-chrome"
+    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
 
-    for item in article_list:
-        page_link = item['link']
+    sleep_time = 0
+    for item in main_results:
+        link = item['link']
+        try:
+            start_time = time.time()
 
-        # Load the page
-        driver.get(page_link)
+            # Load the page
+            driver.get(link)
 
-        # Wait for the page to load (you may need to adjust the sleep time)
-        time.sleep(20)
+            # Wait for the page to load (adjust sleep time if needed)
+            time.sleep(sleep_time)
 
-        # Find the search results
-        results = driver.find_elements(By.ID, 'item-details')
-        #print("Results: ", results)
+            # Find the search results
+            results = driver.find_elements(By.ID, 'item-details')
+            while not results and sleep_time < 30:
+                sleep_time += 5
+                print("Sleeping for ", sleep_time, " seconds")
+                time.sleep(sleep_time)
+                results = driver.find_elements(By.ID, 'item-details')
+
+            if not results:
+                item['doi'] = "N/A"
+                continue
 
-        # Extract the titles and links
-        for result in results:
+            # Extract the DOI link
+            for result in results:
+                try:
+                    doi_link_element = result.find_element(By.XPATH, './/a[contains(@href, "https://doi.org/")]')
+                    doi_link = doi_link_element.get_attribute("href")
+                except Exception:
+                    doi_link = "N/A"
+                item['doi'] = doi_link
+
+            # Extract DOI from the link
+            try:
+                doi = doi_link.split("https://doi.org/")[1]
+                print("DOI:", doi)
+            except Exception:
+                continue
+
+            # Get metadata of the article
+            item_metadata = get_article_metadata_from_crossref(doi)
+            item['doi_number'] = doi
+            item['publisher'] = item_metadata.get('publisher', 'N/A')
+            item['metadata'] = item_metadata
+
+            if 'license' in item_metadata:
+                # Look for TDM license
+                for ele in item_metadata['license']:
+                    if ele['content-version'] == 'tdm':
+                        item['license'] = ele['URL']
+                        break
 
-            try:
-                doi_link_element = result.find_element(By.XPATH, './/a[contains(@href, "https://doi.org/")]')
-                doi_link = doi_link_element.get_attribute("href")
-            except Exception as e:
-                doi_link = "N/A"
-            item['doi'] = doi_link
+                # If no TDM license, look for VOR license
+                if 'license' not in item:
+                    for ele in item_metadata['license']:
+                        if ele['content-version'] == 'vor':
+                            item['license'] = ele['URL']
+                            break
+
+            # Upload to SQL
+            response = supabase_client.table("nal_publications").insert(item).execute()
+            #print(response)
+            end_time = time.time()
+            print("Time taken to process 1 article: ", end_time - start_time)
+
+        except Exception as e:
+            print(e)
+
     # Close the browser
     driver.quit()
 
-    return article_list
+    return "success"
 
 
 def get_article_metadata_from_crossref(doi: str):
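Because the crawl can run for many minutes, the /nal-data-mining route from the first patch holds the HTTP request open for the whole scrape. main.py already creates a Flask-Executor instance (executor = Executor(app)), so one option is to submit the job and return immediately; the variant below is an illustrative sketch under that assumption, not code from the patches:

# Illustrative sketch: run the NAL scrape in the background with the existing
# flask_executor.Executor instead of blocking the GET request.
@app.route('/nal-data-mining', methods=['GET'])
def nal_data_mining() -> Response:
  search_query: str = request.args.get('search_query', default='', type=str)
  if search_query == '':
    abort(400, description="Missing required parameter: 'search_query' must be provided.")

  # executor.submit() schedules extract_article_metadata without blocking this request.
  executor.submit(extract_article_metadata, search_query)

  response = jsonify({"status": "started", "search_query": search_query})
  response.headers.add('Access-Control-Allow-Origin', '*')
  return response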