
Commit

Main merge (#35)
* FT Studio FY25 Q1 Changes (#94)

* Inline configs, JSON dataset import (#16)

* Inline configs, JSON dataset import

* fix broken tests

* DSE-40672: Fix prompt page issue (#17)

* DSE-40737: Support JSON LINES in import data (#18)

* Fixed bug for adapter_id not present (#21)

* Experiments same (#19)

* Code for evaluation and artifact in the same run

* temporary removing ticketing agents

* Adding back the steps

* Replacing "&" and "/" with "-"

* Dbdump (#22)

* Added module for db export import along with tests

* Couple of bugfixes

* Bugfix for return type not set in import database

* Alembic integration (#20)

* Added alembic migrations

* Added Logic for on button click. 3 steps

* Changed to alembic upgrade head

* Corrected Migration code to work with existing dev branch as well

* Moved upgrade st function inside check amp func, as things don't work as expected

* Removed double check

* Pyamp (#24)

* Added uv packaging

* Checking for uv compatibility

* Compatibility with uv for all the process

* Bug fix in initialize project defaults

* Added activation of venv before running jobs

* Trying to change the directory

* Trying to add directory installation for composability

* Modifying start script

* changed app launch script

* Updated Python dependency to 3.11

* Downgraded to python 3.10

* moved to python 3.10

* Added autopep8 to the toml file

* Bug fix for initialize defaults (#25)

* DSE-41286: Upgrade studio UI (#26)

* bug fixes

* Added nav packages

* Pyamp bug v2 (#27)

* bug fixes

* Working Jobs and cml script path change

* top-nav-repo

* Added is_embedded to true

* Making embed_application to be true (#28)

* Changed requirements and main logic to add in top nav bar

* gpu label id fix (#29)

* Upgrading huggingface hub (#31)

* Cloudera runtimes update

* Bugfix for install dependencies

---------

Co-authored-by: Jason Everett <[email protected]>
Co-authored-by: mihirj <[email protected]>
Co-authored-by: Abhishek Ranjan <[email protected]>

* Inline configs, JSON dataset import (#16)

* Inline configs, JSON dataset import

* fix broken tests

* DSE-40737: Support JSON LINES in import data (#18)

* Experiments same (#19)

* Code for evaluation and artifact in the same run

* temporary removing ticketing agents

* Adding back the steps

* Replacing "&" and "/" with "-"

* Alembic integration (#20)

* Added alembic migrations

* Added Logic for on button click. 3 steps

* Changed to alembic upgrade head

* Corrected Migration code to work with existing dev branch as well

* Moved upgrade st function inside check amp func, as things don't work as expected

* Removed double check

* Pyamp (#24)

* Added uv packaging

* Checking for uv compatibility

* Compatibility with uv for all the process

* Bug fix in initialize project defaults

* Added activation of venv before running jobs

* Trying to change the directory

* Trying to add directory installation for composability

* Modifying start script

* changed app launch script

* Updated Python dependency to 3.11

* Downgraded to python 3.10

* moved to python 3.10

* Added autopep8 to the toml file

* Top nav comp (#32)

* bug fixes

* Added nav packages

* top-nav-repo

* Added is_embedded to true

* Changed requirements and main logic to add in top nav bar

* Cloudera runtimes update

* no gpu

* Ran formatting

---------

Co-authored-by: Abhishek Ranjan <[email protected]>
Co-authored-by: Jason Everett <[email protected]>
Co-authored-by: mihirj <[email protected]>
4 people authored and GitHub Enterprise committed Feb 6, 2025
1 parent 9ff5bb0 commit 77915f9
Showing 15 changed files with 3,959 additions and 86 deletions.
2 changes: 1 addition & 1 deletion examples/ticketing-agent-app/ticketing-agent-launch.py
@@ -4,4 +4,4 @@
if os.getenv("IS_COMPOSABLE", "") != "":
os.chdir("/home/cdsw/fine-tuning-studio")

!uv run -m streamlit run examples/ticketing-agent-app/ticketing-agent-app.py --server.port $CDSW_APP_PORT --server.address 127.0.0.1
!uv run -m streamlit run examples/ticketing-agent-app/ticketing-agent-app.py --server.port $CDSW_APP_PORT --server.address 127.0.0.1
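For reference, the launch pattern this file uses — change into the studio checkout when running as a composable AMP, then start Streamlit through uv so the uv-managed environment is picked up — is the same one the Pyamp (#24) commits roll out across the repo. A minimal sketch of that pattern as plain Python follows; the `!` prefix in the original is CML/IPython shell syntax, and the subprocess call and port fallback below are assumptions, not the app's actual code.

import os
import subprocess

# When running as a composable AMP, the studio lives in a subdirectory of
# the parent project, so the launcher changes into it first.
if os.getenv("IS_COMPOSABLE", "") != "":
    os.chdir("/home/cdsw/fine-tuning-studio")

# Start the Streamlit app through uv so it runs inside the uv-managed venv.
subprocess.run(
    [
        "uv", "run", "-m", "streamlit", "run",
        "examples/ticketing-agent-app/ticketing-agent-app.py",
        "--server.port", os.environ.get("CDSW_APP_PORT", "8080"),
        "--server.address", "127.0.0.1",
    ],
    check=True,
)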
7 changes: 3 additions & 4 deletions ft/consts.py
@@ -1,3 +1,4 @@
import os
HF_LOGO = "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.png"
DEFAULT_FTS_GRPC_PORT = "50051"
AXOLOTL_DATASET_FORMAT_CONFIGS_FOLDER_PATH = "ft/config/axolotl/dataset_formats"
@@ -35,12 +36,10 @@
"""


import os

if os.getenv("IS_COMPOSABLE", "") != "":
CML_MODEL_PREDICT_SCRIPT_FILEPATH = "fine-tuning-studio/ft/scripts/cml_model_predict_script.py"
CML_MODEL_PREDICT_SCRIPT_FILEPATH = "fine-tuning-studio/ft/scripts/cml_model_predict_script.py"
else:
CML_MODEL_PREDICT_SCRIPT_FILEPATH = "ft/scripts/cml_model_predict_script.py"
CML_MODEL_PREDICT_SCRIPT_FILEPATH = "ft/scripts/cml_model_predict_script.py"
"""
Filepath for the main predict functionality and generation loop of a
deployed model+adapter as a CML Model.
1 change: 1 addition & 0 deletions ft/databse_ops.py
@@ -2,6 +2,7 @@
from ft.api import *
from ft.db.db_import_export import DatabaseJsonConverter


def export_database(request: ExportDatabaseRequest,
dao: FineTuningStudioDao = None) -> ExportDatabaseResponse:
db_converter = DatabaseJsonConverter()
3 changes: 2 additions & 1 deletion ft/db/db_import_export.py
@@ -5,6 +5,7 @@
import re
from ft.db.dao import get_sqlite_db_location


class DatabaseJsonConverter:
"""
A utility class for converting SQLite databases to JSON and vice versa.
@@ -80,7 +81,7 @@ def export_to_json(self, output_path=None) -> str:
}

# # Write to JSON file
if output_path != None:
if output_path is not None:
with open(output_path, 'w') as f:
json.dump(database_dict, f, indent=2)

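The behavioral fix in this hunk is the None comparison: `output_path != None` becomes `output_path is not None`, the PEP 8 identity check (flake8 reports the old form as E711). A minimal sketch of the idiom in isolation — the function below is illustrative, not the studio's actual export code:

import json

def write_json_if_requested(database_dict, output_path=None):
    payload = json.dumps(database_dict, indent=2)
    # PEP 8: compare against None with `is` / `is not`, never `==` / `!=`,
    # since equality operators can be overridden by a class's __eq__/__ne__.
    if output_path is not None:
        with open(output_path, "w") as f:
            f.write(payload)
    return payload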
1 change: 0 additions & 1 deletion ft/eval/mlflow_evaluator.py
@@ -10,7 +10,6 @@ def __init__(self) -> None:

@staticmethod
def evaluate_model(model_info, eval_df, experiment_id, run_id, eval_target_column_name=EVAL_OUTPUT_COLUM):

with mlflow.start_run(experiment_id=experiment_id, run_id=run_id):
results = mlflow.evaluate(
model_info.model_uri,
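This hunk is the "evaluation and artifact in the same run" change from Experiments same (#19): by passing an existing run_id to mlflow.start_run, the evaluator resumes the training run, so mlflow.evaluate logs its metrics and artifacts there instead of into a new run. A hedged sketch of the pattern — the data, targets, and model_type values below are illustrative assumptions, not the studio's actual arguments:

import mlflow

def evaluate_in_same_run(model_uri, eval_df, experiment_id, run_id):
    # Reusing run_id resumes the existing MLflow run, so evaluation output
    # lands alongside the training artifacts of that run.
    with mlflow.start_run(experiment_id=experiment_id, run_id=run_id):
        return mlflow.evaluate(
            model_uri,                  # URI of the logged model to evaluate
            data=eval_df,               # pandas DataFrame of eval examples
            targets="expected_output",  # assumed name of the target column
            model_type="text",          # assumed task type
        )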
57 changes: 28 additions & 29 deletions ft/scripts/accel_fine_tune_base_script.py
@@ -1,27 +1,10 @@
import os
if os.getenv("IS_COMPOSABLE", "") != "":
os.chdir("/home/cdsw/fine-tuning-studio")
from ft.venv_utils import activate_venv
activate_venv(".venv")
from accelerate.utils.constants import ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION
from accelerate.utils import (
PrepareForLaunch,
check_cuda_p2p_ib_support,
is_torch_version,
patch_environment,
from ft.training.utils import (
map_dataset_with_prompt_template,
sample_and_split_dataset,
get_model_parameters,
configure_tokenizer_padding
)
from pathlib import Path
from peft import prepare_model_for_kbit_training
import torch
import argparse
import sys
import json
import os
import datasets
from accelerate import Accelerator, notebook_launcher
from ft.utils import attempt_hf_login
from ft.client import FineTuningStudioClient
from ft.api import *
from ft.datasets import load_dataset_into_memory
from ft.consts import (
TRAINING_DEFAULT_TRAIN_TEST_SPLIT,
TRAINING_DEFAULT_DATASET_FRACTION,
@@ -30,13 +13,29 @@
DEFAULT_LORA_CONFIG,
DEFAULT_TRAINING_ARGUMENTS
)
from ft.datasets import load_dataset_into_memory
from ft.training.utils import (
map_dataset_with_prompt_template,
sample_and_split_dataset,
get_model_parameters,
configure_tokenizer_padding
from ft.api import *
from ft.client import FineTuningStudioClient
from ft.utils import attempt_hf_login
from accelerate import Accelerator, notebook_launcher
import datasets
import json
import sys
import argparse
import torch
from peft import prepare_model_for_kbit_training
from pathlib import Path
from accelerate.utils import (
PrepareForLaunch,
check_cuda_p2p_ib_support,
is_torch_version,
patch_environment,
)
from accelerate.utils.constants import ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION
import os
if os.getenv("IS_COMPOSABLE", "") != "":
os.chdir("/home/cdsw/fine-tuning-studio")
from ft.venv_utils import activate_venv
activate_venv(".venv")

# TODO: Make all FTS configs/settings loading come from an imported module
# so scripts like this focus on fine-tuning loop only
20 changes: 9 additions & 11 deletions ft/scripts/mlflow_evaluation_base_script.py
@@ -1,19 +1,17 @@
from ft.consts import EVAL_DATASET_DEFAULT_FRACTION, USER_DEFINED_IDENTIFIER, DEFAULT_BNB_CONFIG, DEFAULT_GENERATIONAL_CONFIG
import json
from copy import deepcopy
import pandas as pd
from ft.client import FineTuningStudioClient
import ast
import argparse
from ft.eval.mlflow_driver import driver
import os
if os.getenv("IS_COMPOSABLE", "") != "":
os.chdir("/home/cdsw/fine-tuning-studio")
os.chdir("/home/cdsw/fine-tuning-studio")
from ft.venv_utils import activate_venv
activate_venv(".venv")

from ft.eval.mlflow_driver import driver
import argparse
import os
import ast
from ft.client import FineTuningStudioClient
import pandas as pd
from copy import deepcopy
import json
from ft.consts import EVAL_DATASET_DEFAULT_FRACTION, USER_DEFINED_IDENTIFIER, DEFAULT_BNB_CONFIG, DEFAULT_GENERATIONAL_CONFIG


# Parse arguments from environment variable
arg_string = os.environ.get('JOB_ARGUMENTS', '')
8 changes: 5 additions & 3 deletions ft/upgrade/restarter.py
@@ -2,6 +2,7 @@
import cmlapi
import streamlit as st


def restart_application_function():
cml = cmlapi.default_client()
project_id = os.getenv("CDSW_PROJECT_ID")
@@ -20,7 +21,8 @@ def restart_application_function():
else:
st.error(f"Application {app.name} is not running.")
return False



def is_gpu_present():
cml = cmlapi.default_client()
project_id = os.getenv("CDSW_PROJECT_ID")
@@ -31,7 +33,7 @@ def is_gpu_present():
if app.status == "APPLICATION_RUNNING":
return app.nvidia_gpu
return 0


def update_app_with_gpu():
cml = cmlapi.default_client()
@@ -43,7 +45,7 @@ def update_app_with_gpu():
if app.status == "APPLICATION_RUNNING":
try:
body = {"nvidia_gpu": 1}
#body = {"gpu": 1}
# body = {"gpu": 1}
cml.update_application(body, project_id, app.id)
cml.restart_application(project_id, app.id)
st.success(f"Application {app.name} Updated with a GPU successfully!")
50 changes: 26 additions & 24 deletions ft/upgrade/upgrader.py
@@ -5,19 +5,20 @@
import ensurepip
from time import sleep


def run_git_pull():
"""
Execute git pull to update the repository.
Returns:
- True if successful, False otherwise
"""
try:
# Fetch the latest changes and pull
result = subprocess.run(
['git', 'pull'],
capture_output=True,
text=True,
['git', 'pull'],
capture_output=True,
text=True,
check=True
)
st.success("Git pull successful!")
@@ -30,14 +31,13 @@ def run_git_pull():
return False



def is_package_installed(package_name):
"""
Check if a Python package is installed.
Args:
package_name (str): Name of the package to check
Returns:
bool: True if package is installed, False otherwise
"""
@@ -47,73 +47,75 @@ def is_package_installed(package_name):
except ImportError:
return False


def install_package(package_name):
"""
Install a Python package using pip.
Args:
package_name (str): Name of the package to install
Returns:
bool: True if installation successful, False otherwise
"""
try:
# Ensure pip is available
ensurepip.bootstrap()

# Install the package
result = subprocess.run(
[sys.executable, '-m', 'pip', 'install', package_name],
capture_output=True,
text=True,
[sys.executable, '-m', 'pip', 'install', package_name],
capture_output=True,
text=True,
check=True
)

st.success(f"{package_name} installed successfully!")
st.text(result.stdout)
return True
except (subprocess.CalledProcessError, Exception) as e:
st.error(f"Failed to install {package_name}: {str(e)}")
return False


def run_alembic_upgrade():
"""
Run Alembic database migrations after checking and installing if necessary.
Returns:
- True if successful, False otherwise
"""
try:
# Check if Alembic is installed
if not is_package_installed('alembic'): # Check if a particular version is needed here
st.warning("Alembic not found. Attempting to install...")

# Try to install Alembic
install_result = install_package('alembic')

if not install_result:
st.error("Could not install Alembic. Upgrade process cannot continue.")
return False

# Verify installation after attempted install
if not is_package_installed('alembic'):
st.error("Alembic installation failed unexpectedly.")
return False

# Run Alembic upgrade
result = subprocess.run(
['alembic', 'upgrade', 'head'],
capture_output=True,
text=True,
['alembic', 'upgrade', 'head'],
capture_output=True,
text=True,
check=True
)

st.success("Alembic database upgrade successful!")
st.text(result.stdout)
print("Alembic upgrade successful! Sleeping for 10 seconds.")
sleep(10)
return True

except subprocess.CalledProcessError as e:
st.error(f"Alembic upgrade failed: {e.stderr}")
return False
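Together with ft/upgrade/restarter.py above, this module implements the upgrade path described by the Alembic integration (#20) commits ("Added Logic for on button click. 3 steps"): pull the latest code, run the Alembic migration, then restart the application. A sketch of how the three steps might be wired to a Streamlit button — the button label and the exact wiring are assumptions; only the three function names come from this diff:

import streamlit as st

from ft.upgrade.restarter import restart_application_function
from ft.upgrade.upgrader import run_alembic_upgrade, run_git_pull

if st.button("Upgrade Fine Tuning Studio"):  # hypothetical button label
    # Step 1: git pull, Step 2: alembic upgrade head, Step 3: restart the app.
    if run_git_pull() and run_alembic_upgrade():
        restart_application_function()
    else:
        st.error("Upgrade aborted; see the messages above.")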
12 changes: 6 additions & 6 deletions ft/venv_utils.py
@@ -1,10 +1,11 @@
import sys
import os


def activate_venv(venv_path):
"""
Activate a Python virtual environment programmatically
Args:
venv_path: Path to the virtual environment directory
"""
@@ -13,25 +14,24 @@ def activate_venv(venv_path):
activate_script = os.path.join(venv_path, "Scripts", "activate.bat")
else: # Linux/Mac
activate_script = os.path.join(venv_path, "bin", "activate")

if not os.path.exists(activate_script):
raise FileNotFoundError(f"Virtual environment activation script not found at {activate_script}")

# Add virtual environment's site-packages to sys.path
site_packages = os.path.join(
venv_path,
venv_path,
"Lib" if sys.platform == "win32" else "lib",
f"python{sys.version_info.major}.{sys.version_info.minor}",
"site-packages"
)

if site_packages not in sys.path:
sys.path.insert(0, site_packages)

# Set environment variables
os.environ["VIRTUAL_ENV"] = venv_path
os.environ["PATH"] = os.pathsep.join([
os.path.join(venv_path, "Scripts" if sys.platform == "win32" else "bin"),
os.environ.get("PATH", "")
])

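activate_venv is the hook the Pyamp (#24) scripts call at start-up so the uv-created .venv is on sys.path before any heavy imports. The usage sketch below mirrors the preamble visible in accel_fine_tune_base_script.py and mlflow_evaluation_base_script.py in this commit; indentation is lost in the diff view, so gating the activation on IS_COMPOSABLE is an assumption here.

import os

if os.getenv("IS_COMPOSABLE", "") != "":
    # Composable AMPs run from the parent project, so move into the
    # studio checkout before resolving relative paths.
    os.chdir("/home/cdsw/fine-tuning-studio")
    from ft.venv_utils import activate_venv
    # Put the uv-managed .venv's site-packages at the front of sys.path so
    # later imports (torch, accelerate, peft, ...) resolve inside it.
    activate_venv(".venv")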
2 changes: 1 addition & 1 deletion pgs/database.py
@@ -3,7 +3,7 @@
from ft.consts import IconPaths, DIVIDER_COLOR
from pgs.streamlit_utils import get_fine_tuning_studio_client
fts = get_fine_tuning_studio_client()
import json


def create_header():
with st.container(border=True):