diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..be2df74 --- /dev/null +++ b/.env.example @@ -0,0 +1,19 @@ +DEFAULT_ALGORITHM_DESCRIPTION=/path/to/PCA_v1.0.0.json + +PREFECT_API_URL=http://prefect:4200/api +FLOW_NAME="Parent flow/launch_parent_flow" +TIMEZONE="US/Pacific" +PREFECT_TAGS='["latent-space-explorer"]' + +# MLEx Content Registry API +CONTENT_API_URL="http://content-api:8000/api/v0/models" + +TILED_API_KEY= + +READ_DIR=/path/to/read/data +WRITE_DIR=/path/to/write/results + +# Slurm jobs +PARTITIONS='["p1", "p2"]' +RESERVATIONS='["r1", "r2"]' +MAX_TIME="1:00:00" diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..a9f89c5 --- /dev/null +++ b/.flake8 @@ -0,0 +1,7 @@ +[flake8] +# 127 is width of the Github code viewer, +# black default is 88 so this will only warn about comments >127 +max-line-length = 127 +# Ignore errors due to incompatibility with black +#https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html +extend-ignore = E203,E701 diff --git a/.gitignore b/.gitignore index 496e1ee..96318fb 100644 --- a/.gitignore +++ b/.gitignore @@ -5,14 +5,13 @@ __pycache__/ test.py # output dir +results/ data/output/ data/upload/ data/.file_manager_vars.pkl data/mlexchange_store/ .DS_Store -src/dash_component_editor.py - # C extensions *.so diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..83d178f --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,34 @@ +default_language_version: + python: python3 +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-ast + - id: check-case-conflict + - id: check-merge-conflict + - id: check-symlinks + - id: check-yaml + - id: debug-statements + - repo: https://github.com/gitguardian/ggshield + rev: v1.25.0 + hooks: + - id: ggshield + language_version: python3 + stages: [commit] + # Using this mirror lets us use mypyc-compiled black, which is about 2x faster + - repo: https://github.com/psf/black-pre-commit-mirror + rev: 24.2.0 + hooks: + - id: black + - repo: https://github.com/pycqa/flake8 + rev: 7.0.0 + hooks: + - id: flake8 + - repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort + args: ["--profile", "black"] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 383494d..4f3a7e6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,7 @@ version: "3" services: + front-end: restart: "unless-stopped" container_name: "latentxp" @@ -9,18 +10,27 @@ services: dockerfile: "docker/Dockerfile" mem_limit: 2g environment: - DATA_DIR: "${PWD}/data/" - # USER: "$USER" + READ_DIR: "${READ_DIR}" + WRITE_DIR: "${WRITE_DIR}" + PREFECT_TAGS: "${PREFECT_TAGS}" + PREFECT_API_URL: '${PREFECT_API_URL}' + CONTENT_API_URL: '${CONTENT_API_URL}' + TILED_API_KEY: '${TILED_API_KEY}' + FLOW_NAME: '${FLOW_NAME}' + TIMEZONE: "${TIMEZONE}" + PARTITIONS: "${PARTITIONS}" + RESERVATIONS: "${RESERVATIONS}" + MAX_TIME: "${MAX_TIME}" volumes: - - ./data:/app/work/data - - ./src:/app/work/src + - $READ_DIR:/app/work/data + - $WRITE_DIR:/app/work/mlex_store + # - ./src:/app/work/src + - ../mlex_file_manager/file_manager:/app/work/src/file_manager ports: - - "8070:8070" + - "127.0.0.1:8070:8070" networks: - - computing_api_default + mlex_mle_net: networks: - computing_api_default: + mlex_mle_net: external: true - -# env file: set up pwd \ No newline at end of file diff --git a/docker/Dockerfile 
b/docker/Dockerfile index d57f623..87d8d55 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,61 +1,17 @@ -# FROM python:3.9 -# LABEL maintainer="THE MLEXCHANGE TEAM" - -# RUN ls -# COPY docker/requirements.txt requirements.txt - -# RUN apt-get update && apt-get install -y \ -# build-essential \ -# wget \ -# python3-pip\ -# ffmpeg\ -# libsm6\ -# libxext6 - -# RUN pip3 install --upgrade pip &&\ -# pip3 install --timeout=2000 -r requirements.txt\ -# pip install git+https://github.com/taxe10/mlex_file_manager - -# RUN git clone https://github.com/mlexchange/mlex_dash_component_editor - -# # EXPOSE 8000 - -# WORKDIR /app/work -# ENV HOME /app/work -# COPY src src -# # ENV PYTHONUNBUFFERED=1 -# RUN mv /mlex_dash_component_editor/src/dash_component_editor.py /app/work/src/dash_component_editor.py - -# CMD ["bash"] -# #CMD python3 src/frontend.py -# CMD sleep 3600 - FROM python:3.9 LABEL maintainer="THE MLEXCHANGE TEAM" RUN ls COPY docker/requirements.txt requirements.txt -RUN apt-get update && apt-get install -y \ - build-essential \ - wget \ - python3-pip\ - ffmpeg\ - libsm6\ - libxext6 - RUN pip3 install --upgrade pip &&\ - pip3 install --timeout=2000 -r requirements.txt\ - pip install git+https://github.com/taxe10/mlex_file_manager + pip3 install -r requirements.txt\ + pip install git+https://github.com/mlexchange/mlex_file_manager\ + pip install git+https://github.com/mlexchange/mlex_dash_component_editor -RUN git clone https://github.com/mlexchange/mlex_dash_component_editor WORKDIR /app/work ENV HOME /app/work COPY src src -RUN mv /mlex_dash_component_editor/src/dash_component_editor.py /app/work/src/dash_component_editor.py CMD ["bash"] -#CMD sleep 3600 CMD python3 src/frontend.py - - diff --git a/docker/requirements.txt b/docker/requirements.txt index 13c2d7d..1fda0c8 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -1,5 +1,4 @@ dash==2.9.3 -# dash_component_editor==0.0.7 dash-core-components==2.0.0 dash-bootstrap-components==1.0.2 dash-html-components==2.0.0 @@ -11,4 +10,7 @@ requests==2.26.0 pyarrow==11.0.0 diskcache==5.6.3 pandas -numpy \ No newline at end of file +numpy +Pillow +python-dotenv +prefect-client==2.14.21 diff --git a/src/app_layout.py b/src/app_layout.py index 5664c1b..96cb47c 100644 --- a/src/app_layout.py +++ b/src/app_layout.py @@ -1,252 +1,355 @@ -from dash import Dash, html, dcc +import os + import dash_bootstrap_components as dbc -from dash_iconify import DashIconify -from dash.long_callback import DiskcacheLongCallbackManager -import plotly.graph_objects as go import dash_uploader as du import diskcache -import pathlib -import os +import plotly.graph_objects as go +from dash import Dash, dcc, html +from dash.long_callback import DiskcacheLongCallbackManager +from dash_iconify import DashIconify +from dotenv import load_dotenv +from file_manager.main import FileManager import templates -from file_manager.main import FileManager -### GLOBAL VARIABLES -ALGORITHM_DATABASE = {"PCA": "PCA", "UMAP": "UMAP",} -CLUSTER_ALGORITHM_DATABASE = {"KMeans": "KMeans", "DBSCAN": "DBSCAN", "HDBSCAN": "HDBSCAN"} +load_dotenv(".env", override=True) + +# GLOBAL VARIABLES +ALGORITHM_DATABASE = { + "PCA": "PCA", + "UMAP": "UMAP", +} +CLUSTER_ALGORITHM_DATABASE = { + "KMeans": "KMeans", + "DBSCAN": "DBSCAN", + "HDBSCAN": "HDBSCAN", +} DATA_OPTION = [ - {"label": "Synthetic Shapes", "value": "data/example_shapes/Demoshapes.npz"}, - {"label": "Latent representations from encoder-decoder model", "value": "data/example_latentrepresentation/f_vectors.parquet"} + 
{ + "label": "Synthetic Shapes", + "value": f"{os.getcwd()}/data/example_shapes/Demoshapes.npz", + }, + { + "label": "Latent representations from encoder-decoder model", + "value": f"{os.getcwd()}/data/example_latentrepresentation/f_vectors.parquet", + }, ] -DOCKER_DATA = pathlib.Path.home() / 'data' #/app/work/data -UPLOAD_FOLDER_ROOT = DOCKER_DATA / 'upload' #/app/work/data/upload - -# DATA_CLINIC_OPTION = +READ_DIR = "data" +UPLOAD_FOLDER_ROOT = "data/upload" +TILED_API_KEY = os.getenv("TILED_API_KEY", None) -#### SETUP DASH APP #### +# SETUP DASH APP cache = diskcache.Cache("./cache") long_callback_manager = DiskcacheLongCallbackManager(cache) external_stylesheets = [dbc.themes.BOOTSTRAP, "../assets/segmentation-style.css"] -app = Dash(__name__, - external_stylesheets=external_stylesheets, - suppress_callback_exceptions=True, - long_callback_manager=long_callback_manager) +app = Dash( + __name__, + external_stylesheets=external_stylesheets, + suppress_callback_exceptions=True, + long_callback_manager=long_callback_manager, +) server = app.server -dash_file_explorer = FileManager(DOCKER_DATA, - UPLOAD_FOLDER_ROOT, - open_explorer=False) +dash_file_explorer = FileManager( + READ_DIR, UPLOAD_FOLDER_ROOT, open_explorer=False, api_key=TILED_API_KEY +) dash_file_explorer.init_callbacks(app) du.configure_upload(app, UPLOAD_FOLDER_ROOT, use_upload_id=False) -#### BEGIN DASH CODE #### +# BEGIN DASH CODE header = templates.header() -# right panel: uploader, scatter plot, individual image plot +# right panel: file manager, scatter plot, individual image plot +scatter_control_panel = html.Div( + [ + dbc.Card( + style={"width": "100%"}, + children=[ + dbc.CardHeader("Scatter Plot Control Panel"), + dbc.CardBody( + [ + dbc.Label("Scatter Colors", className="mr-3"), + dcc.RadioItems( + id="scatter-color", + options=[ + {"label": "cluster", "value": "cluster"}, + {"label": "label", "value": "label"}, + ], + value="cluster", + style={"min-width": "250px"}, + className="mb-2", + ), + dbc.Label("Select cluster", className="mr-3"), + dcc.Dropdown( + id="cluster-dropdown", + value=-1, + style={"min-width": "250px"}, + className="mb-2", + ), + dbc.Label("Select label", className="mr-3"), + dcc.Dropdown( + id="label-dropdown", + value=-2, + style={"min-width": "250px"}, + ), + ] + ), + ], + ), + dcc.Interval( + id="interval-component", + interval=3000, # in milliseconds + max_intervals=-1, # keep triggering indefinitely, None + n_intervals=0, + ), + ] +) + +heatmap_control_panel = html.Div( + [ + dbc.Card( + style={"width": "100%"}, + children=[ + dbc.CardHeader("Heatmap Control Panel"), + dbc.CardBody( + [ + dbc.Label( + [ + "Select a Group of Points using ", + html.Span( + html.I(DashIconify(icon="lucide:lasso")), + className="icon", + ), + " or ", + html.Span( + html.I(DashIconify(icon="lucide:box-select")), + className="icon", + ), + " Tools :", + ], + className="mb-3", + ), + dbc.Label( + id="stats-div", + children=[ + "Number of images selected: 0", + html.Br(), + "Clusters represented: N/A", + html.Br(), + "Labels represented: N/A", + ], + ), + dbc.Label("Display Image Options", className="mr-3"), + dcc.RadioItems( + id="mean-std-toggle", + options=[ + {"label": "Mean", "value": "mean"}, + {"label": "Standard Deviation", "value": "sigma"}, + ], + value="mean", + style={"min-width": "250px"}, + className="mb-2", + ), + ] + ), + ], + ) + ] +) + image_panel = [ dbc.Card( id="image-card", children=[ dbc.CardHeader( - [ - dbc.Label('Upload your own zipped dataset', className='mr-2'), + [ + 
dbc.Label("Upload your own zipped dataset", className="mr-2"), dash_file_explorer.file_explorer, - dbc.Label('Or select Data Clinic modal', className='mr-2'), + dbc.Label("Or select Data Clinic modal", className="mr-2"), dcc.Dropdown( - id='feature-vector-model-list', + id="feature-vector-model-list", clearable=False, - style={'margin-bottom': '1rem'} + style={"margin-bottom": "1rem"}, ), - dbc.Label('Or try Example Dataset', className='mr-2'), + dbc.Label("Or try Example Dataset", className="mr-2"), dcc.Dropdown( - id='dataset-selection', + id="example-dataset-selection", options=DATA_OPTION, - #value = DATA_OPTION[0]['value'], clearable=False, - style={'margin-bottom': '1rem'} + style={"margin-bottom": "1rem"}, ), ] ), dbc.CardBody( - dcc.Graph( - id="scatter", - figure=go.Figure(go.Scattergl(mode='markers')), - ) + [ + dbc.Row( + [ + dbc.Col( + dcc.Graph( + id="scatter", + figure=go.Figure( + go.Scattergl(mode="markers"), + layout=go.Layout( + autosize=True, + margin=go.layout.Margin( + l=20, + r=20, + b=20, + t=20, + pad=0, + ), + ), + ), + ), + width=6, + ), + dbc.Col( + dcc.Graph( + id="heatmap", + figure=go.Figure( + go.Heatmap(), + layout=go.Layout( + autosize=True, + margin=go.layout.Margin( + l=20, + r=20, + b=20, + t=20, + pad=0, + ), + ), + ), + ), + width=6, + ), + ] + ), + ] ), dbc.CardFooter( - dcc.Graph( - id="heatmap", - figure=go.Figure(go.Heatmap()) - ) - ) - ] + [ + dbc.Row( + [ + dbc.Col( + scatter_control_panel, + width=6, + ), + dbc.Col( + heatmap_control_panel, + width=6, + ), + ] + ) + ] + ), + ], ) ] # left panel: choose algorithms, submit job, choose scatter plot attributes, and statistics... -algo_panel = html.Div( - [dbc.Card( - id="algo-card", - style={"width": "100%"}, - children=[ - dbc.Collapse(children=[ - dbc.CardHeader("Select Dimension Reduction Algorithms"), - dbc.CardBody( +algo_panel = dbc.AccordionItem( + [ + dbc.CardBody( + [ + dbc.Label("Algorithm", className="mr-2"), + dcc.Dropdown( + id="algo-dropdown", + options=[ + {"label": entry, "value": entry} + for entry in ALGORITHM_DATABASE + ], + style={"min-width": "250px"}, + value="PCA", + ), + html.Div(id="additional-model-params"), + html.Hr(), + html.Div( [ - dbc.Label("Algorithm", className='mr-2'), - dcc.Dropdown(id="algo-dropdown", - options=[{"label": entry, "value": entry} for entry in ALGORITHM_DATABASE], - style={'min-width': '250px'}, - value='PCA', - ), - html.Div(id='additional-model-params'), - html.Hr(), - html.Div( - [ - dbc.Button( - "Submit", - color="secondary", - id="run-algo", - outline=True, - size="lg", - className="m-1", - style={'width':'50%'} - ), - ], - className='row', - style={'align-items': 'center', 'justify-content': 'center'} - ), - html.Div(id='invisible-apply-div') + dbc.Label("Name your job", className="mr-2"), + dcc.Input( + id="job-name", + placeholder="test0", + style={ + "width": "100%", + "margin-bottom": "1rem", + }, + ), ] - ) - ], - id="model-collapse", - is_open=True, - style = {'margin-bottom': '0rem'} - ) - ] - ) - ] + ), + html.Div( + [ + dbc.Button( + "Submit", + color="secondary", + id="run-algo", + outline=True, + size="lg", + className="m-1", + style={"width": "50%"}, + ), + ], + className="row", + style={ + "align-items": "center", + "justify-content": "center", + }, + ), + html.Hr(), + html.Div( + [ + dbc.Label("Select a job..."), + dcc.Dropdown(id="job-selector"), + ] + ), + html.Div(id="invisible-apply-div"), + ] + ), + ], + title="Select Dimension Reduction Algorithms", ) -cluster_algo_panel = html.Div( +cluster_algo_panel = dbc.AccordionItem( 
[ - dbc.Card( - id="cluster-algo-card", - style={"width": "100%"}, - children=[ - dbc.Collapse(children=[ - dbc.CardHeader("Select Clustering Algorithms"), - dbc.CardBody([ - dbc.Label("Algorithm", className='mr-2'), - dcc.Dropdown(id="cluster-algo-dropdown", - options=[{"label": entry, "value": entry} for entry in CLUSTER_ALGORITHM_DATABASE], - style={'min-width': '250px'}, - value='DBSCAN', - ), - html.Div(id='additional-cluster-params'), - html.Hr(), - html.Div( - [ - dbc.Button( - "Apply", - color="secondary", - id="run-cluster-algo", - outline=True, - size="lg", - className="m-1", - style={'width':'50%'} - ), - ], - className='row', - style={'align-items': 'center', 'justify-content': 'center'} - ), - html.Div(id='invisible-submit-div') - ] - - ) + dbc.CardBody( + [ + dbc.Label("Algorithm", className="mr-2"), + dcc.Dropdown( + id="cluster-algo-dropdown", + options=[ + {"label": entry, "value": entry} + for entry in CLUSTER_ALGORITHM_DATABASE + ], + style={"min-width": "250px"}, + value="DBSCAN", + ), + html.Div(id="additional-cluster-params"), + html.Hr(), + html.Div( + [ + dbc.Button( + "Apply", + color="secondary", + id="run-cluster-algo", + outline=True, + size="lg", + className="m-1", + style={"width": "50%"}, + ), ], - id="cluster-model-collapse", - is_open=True, - style = {'margin-bottom': '0rem'} - ) + className="row", + style={ + "align-items": "center", + "justify-content": "center", + }, + ), + html.Div(id="invisible-submit-div"), ] - ) - ] -) - -scatter_control_panel = html.Div( - [dbc.Card( - style={"width": "100%"}, - children=[ - dbc.CardHeader("Scatter Plot Control Panel"), - dbc.CardBody([ - dbc.Label('Scatter Colors', className='mr-3'), - dcc.RadioItems(id='scatter-color', - options=[ - {'label': 'cluster', 'value': 'cluster'}, - {'label': 'label', 'value': 'label'} - ], - value = 'cluster', - style={'min-width': '250px'}, - className='mb-2'), - dbc.Label("Select cluster", className='mr-3'), - dcc.Dropdown(id='cluster-dropdown', - value=-1, - style={'min-width': '250px'}, - className='mb-2'), - dbc.Label("Select label", className='mr-3'), - dcc.Dropdown(id='label-dropdown', - value=-2, - style={'min-width': '250px'}, - ) - ]) - ] - ), - dcc.Interval( - id='interval-component', - interval=3000, # in milliseconds - max_intervals=-1, # keep triggering indefinitely, None - n_intervals=0, - ), - ] + ), + ], + title="Select Clustering Algorithms", ) -heatmap_control_panel = html.Div( - [dbc.Card( - style={"width": "100%"}, - children=[ - dbc.CardHeader("Heatmap Control Panel"), - dbc.CardBody([ - dbc.Label([ - 'Select a Group of Points using ', - html.Span(html.I(DashIconify(icon="lucide:lasso")), className='icon'), - ' or ', - html.Span(html.I(DashIconify(icon="lucide:box-select")), className='icon'), - ' Tools :' - ], - className='mb-3'), - dbc.Label(id='stats-div', children=[ - 'Number of images selected: 0', - html.Br(), - 'Clusters represented: N/A', - html.Br(), - 'Labels represented: N/A', - ]), - dbc.Label('Display Image Options', className='mr-3'), - dcc.RadioItems(id='mean-std-toggle', - options=[ - {'label': 'Mean', 'value': 'mean'}, - {'label': 'Standard Deviation', 'value': 'sigma'} - ], - value = 'mean', - style={'min-width': '250px'}, - className='mb-2'), - ]) - ] - )] -) # add alert pop up window modal = html.Div( @@ -262,8 +365,17 @@ ] ) - -control_panel = [algo_panel, cluster_algo_panel, scatter_control_panel, heatmap_control_panel, modal] +control_panel = dbc.Accordion( + [ + algo_panel, + cluster_algo_panel + ], + style={ + 'position': 'sticky', + 'top': 
'10%', + 'width': '100%' + } + ) # metadata @@ -272,35 +384,38 @@ id="no-display", children=[ # Store for user created contents - dcc.Store(id='image-length', data=0), - dcc.Store(id='user-upload-data-dir', data=None), - dcc.Store(id='dataset-options', data=DATA_OPTION), - dcc.Store(id='run-counter', data=0), - dcc.Store(id='experiment-id', data=None), + dcc.Store(id="image-length", data=0), + dcc.Store(id="user-upload-data-dir", data=None), + dcc.Store(id="dataset-options", data=DATA_OPTION), + dcc.Store(id="run-counter", data=0), + dcc.Store(id="experiment-id", data=None), # data_label_schema, latent vectors, clusters - dcc.Store(id='input_data', data=None), - dcc.Store(id='input_labels', data=None), - dcc.Store(id='label_schema', data=None), - dcc.Store(id='model_id', data=None), - dcc.Store(id='latent_vectors', data=None), - dcc.Store(id='clusters', data=None), + dcc.Store(id="input_labels", data=None), + dcc.Store(id="label_schema", data=None), + dcc.Store(id="model_id", data=None), + dcc.Store(id="latent_vectors", data=None), + dcc.Store(id="clusters", data=None), ], ) ] -##### DEFINE LAYOUT #### +# DEFINE LAYOUT app.layout = html.Div( [ - header, + header, dbc.Container( - children = [ - dbc.Row([ dbc.Col(control_panel, width=4), - dbc.Col(image_panel, width=7) - ]), + children=[ + dbc.Row( + [ + dbc.Col(control_panel, width=4, style={'display': 'flex', 'margin-top': '1em'}), + dbc.Col(image_panel, width=8) + ] + ), + dbc.Row(dbc.Col(modal)), dbc.Row(dbc.Col(meta)), - ] + ], + fluid=True, ), - modal - ] -) \ No newline at end of file + ], +) diff --git a/src/assets/sample_models.json b/src/assets/sample_models.json new file mode 100644 index 0000000..9d601b9 --- /dev/null +++ b/src/assets/sample_models.json @@ -0,0 +1,192 @@ +[ + { + "content_id": "uid1", + "content_type": "model", + "name": "PCA", + "public": true, + "version": "1.0.0", + "type": "unsupervised", + "owner": "mlexchange team", + "service_type": "frontend", + "docker_image_uri": "ghcr.io/runboj/mlex_dimension_reduction_pca:main", + "conda_env_name": "mlex_dimension_reduction_pca", + "reference": "PCA algorithm", + "application": [ + "dimension reduction" + ], + "description": "PCA-based dimension reduction", + "gui_parameters": [ + { + "type": "dropdown", + "name": "ncomp-dropdown-menu", + "title": "Number of Components", + "value": 2, + "options": [ + { + "label": "2", + "value": 2 + }, + { + "label": "3", + "value": 3 + } + ], + "param_key": "n_components", + "comp_group": "all" + } + ], + "cmd": [ + "python pca_run.py" + ], + "kwargs": {}, + "compute_resources": { + "num_processors": 1, + "num_gpus": 0 + } + }, + { + "content_id": "uid2", + "content_type": "model", + "name": "UMAP", + "public": true, + "version": "1.0.0", + "type": "unsupervised", + "owner": "mlexchange team", + "service_type": "frontend", + "docker_image_uri": "ghcr.io/runboj/mlex_dimension_reduction_umap:main", + "conda_env_name": "mlex_dimension_reduction_umap", + "reference": "UMAP algorithm", + "application": [ + "dimension reduction" + ], + "description": "UMAP algotihtm for dimension reduction", + "gui_parameters": [ + { + "type": "dropdown", + "name": "ncomp-dropdown-menu-2", + "title": "Number of Components", + "value": 2, + "options": [ + { + "label": "2", + "value": 2 + }, + { + "label": "3", + "value": 3 + } + ], + "param_key": "n_components", + "comp_group": "all" + }, + { + "type": "dropdown", + "name": "mindist-dropdown-menu", + "title": "Min distance between points", + "value": 0.1, + "options": [ + { + "label": 0.1, + "value": 0.1 
+ }, + { + "label": 0.2, + "value": 0.2 + }, + { + "label": 0.3, + "value": 0.3 + }, + { + "label": 0.4, + "value": 0.4 + }, + { + "label": 0.5, + "value": 0.5 + }, + { + "label": 0.6, + "value": 0.6 + }, + { + "label": 0.7, + "value": 0.7 + }, + { + "label": 0.8, + "value": 0.8 + }, + { + "label": 0.9, + "value": 0.9 + }, + { + "label": 1.0, + "value": 1.0 + } + ], + "param_key": "min_dist", + "comp_group": "all" + }, + { + "type": "dropdown", + "name": "nneighbor-dropdown-menu", + "title": "Number of Nearest Neighbors", + "value": 15, + "options": [ + { + "label": 5, + "value": 5 + }, + { + "label": 10, + "value": 10 + }, + { + "label": 15, + "value": 15 + }, + { + "label": 20, + "value": 20 + }, + { + "label": 25, + "value": 25 + }, + { + "label": 30, + "value": 30 + }, + { + "label": 35, + "value": 35 + }, + { + "label": 40, + "value": 40 + }, + { + "label": 45, + "value": 45 + }, + { + "label": 50, + "value": 50 + } + ], + "param_key": "n_neighbors", + "comp_group": "all" + } + ], + "cmd": [ + "python umap_run.py" + ], + "kwargs": {}, + "compute_resources": { + "num_processors": 1, + "num_gpus": 0 + } + } +] diff --git a/src/assets/segmentation-style.css b/src/assets/segmentation-style.css index 4cb18ba..8e76b71 100644 --- a/src/assets/segmentation-style.css +++ b/src/assets/segmentation-style.css @@ -32,3 +32,8 @@ label { margin: 0; border-style: solid; } + +.accordion-button { + font-size: large; + font-weight: bold; +} diff --git a/src/frontend.py b/src/frontend.py index 5018236..b448d97 100755 --- a/src/frontend.py +++ b/src/frontend.py @@ -1,101 +1,203 @@ -import dash -from dash import html, Input, Output, State -from dash.exceptions import PreventUpdate -import plotly.graph_objects as go -import pandas as pd -import numpy as np -from sklearn.cluster import MiniBatchKMeans, DBSCAN, HDBSCAN -import pathlib +import copy import json -import uuid -import requests import os -import requests +import uuid +from datetime import datetime +import numpy as np +import pandas as pd +import plotly.graph_objects as go +import pytz +import requests +from dash import Input, Output, State, html +from dash.exceptions import PreventUpdate +from dotenv import load_dotenv from file_manager.data_project import DataProject +from sklearn.cluster import DBSCAN, HDBSCAN, MiniBatchKMeans -from app_layout import app, DOCKER_DATA, UPLOAD_FOLDER_ROOT -from latentxp_utils import kmeans_kwargs, dbscan_kwargs, hdbscan_kwargs, hex_to_rgba, generate_scatter_data, remove_key_from_dict_list, get_content, get_trained_models_list +from app_layout import app from dash_component_editor import JSONParameterEditor +from latentxp_utils import ( + dbscan_kwargs, + generate_scatter_data, + hdbscan_kwargs, + hex_to_rgba, + kmeans_kwargs, + remove_key_from_dict_list, +) +from utils_prefect import ( + get_children_flow_run_ids, + get_flow_runs_by_name, + schedule_prefect_flow, +) + +load_dotenv(".env") +# GLOBAL PARAMS +READ_DIR = os.getenv("READ_DIR", "data") +WRITE_DIR = os.getenv("WRITE_DIR", "mlex_store") -#### GLOBAL PARAMS #### -DATA_DIR = str(os.environ['DATA_DIR']) -OUTPUT_DIR = pathlib.Path('data/output') -USER = 'admin' #'mlexchange-team' # move to env file +MODEL_DIR = "data/models" UPLOAD_FOLDER_ROOT = "data/upload" +PREFECT_TAGS = json.loads(os.getenv("PREFECT_TAGS", '["latent-space-explorer"]')) +TIMEZONE = os.getenv("TIMEZONE", "US/Pacific") +FLOW_NAME = os.getenv("FLOW_NAME", "") +FLOW_TYPE = "conda" + +CONTENT_API_URL = os.getenv("CONTENT_API_URL", "http://localhost:8000/api/v0/models") 
+DEFAULT_ALGORITHM_DESCRIPTION = os.getenv("DEFAULT_ALGORITHM_DESCRIPTION") + +PARTITIONS = os.getenv("PARTITIONS", None) +RESERVATIONS = os.getenv("RESERVATIONS", None) +MAX_TIME = os.getenv("MAX_TIME", "1:00:00") + +if FLOW_TYPE == "podman": + TRAIN_PARAMS_EXAMPLE = { + "flow_type": "podman", + "params_list": [ + { + "image_name": "ghcr.io/runboj/mlex_dimension_reduction_pca", + "image_tag": "main", + "command": 'python -c \\"import time; time.sleep(30)\\"', + "params": { + "io_parameters": {"uid_save": "uid0001", "uid_retrieve": None} + }, + "volumes": [ + f"{READ_DIR}:/app/work/data", + f"{WRITE_DIR}:/app/work/mlex_store", + ], + } + ], + } + +elif FLOW_TYPE == "conda": + TRAIN_PARAMS_EXAMPLE = { + "flow_type": "conda", + "params_list": [ + { + "conda_env_name": "mlex_dimension_reduction_pca", + "params": { + "io_parameters": {"uid_save": "uid0001", "uid_retrieve": None} + }, + } + ], + } + +else: + TRAIN_PARAMS_EXAMPLE = { + "flow_type": "slurm", + "params_list": [ + { + "job_name": "latent_space_explorer", + "num_nodes": 1, + "partitions": PARTITIONS, + "reservations": RESERVATIONS, + "max_time": MAX_TIME, + "conda_env_name": "mlex_dimension_reduction_pca", + "params": { + "io_parameters": {"uid_save": "uid0001", "uid_retrieve": None} + }, + } + ], + } + + @app.callback( - Output('additional-model-params', 'children'), - Output('model_id', 'data'), - Input('algo-dropdown', 'value') + Output("additional-model-params", "children"), + Output("model_id", "data"), + Input("algo-dropdown", "value"), ) def show_dimension_reduction_gui_layouts(selected_algo): - ''' + """ This callback display dropdown menu in the frontend for different dimension reduction algos Args: selected_algo: Selected dimension reduction algorithm Returns: item_list: dropdown menu html code model_uid: selected algo's uid - ''' - data = requests.get('http://content-api:8000/api/v0/models').json() # all model - - if selected_algo == 'PCA': - conditions = {'name': 'PCA'} - elif selected_algo == 'UMAP': - conditions = {'name': 'UMAP'} - - model = [d for d in data if all((k in d and d[k] == v) for k, v in conditions.items())] # filter pca or umap - model_uid = model[0]['content_id'] - new_model = remove_key_from_dict_list(model[0]["gui_parameters"], 'comp_group') - - item_list = JSONParameterEditor(_id={'type': str(uuid.uuid4())}, - json_blob=new_model, + """ + try: + data = requests.get(CONTENT_API_URL).json() # all model + except Exception as e: + print(f"Cannot access content api: {e}", flush=True) + with open("src/assets/sample_models.json", "r") as f: + data = json.load(f) + + if selected_algo == "PCA": + conditions = {"name": "PCA"} + elif selected_algo == "UMAP": + conditions = {"name": "UMAP"} + + model = [ + d for d in data if all((k in d and d[k] == v) for k, v in conditions.items()) + ] # filter pca or umap + model_uid = model[0]["content_id"] + new_model = remove_key_from_dict_list(model[0]["gui_parameters"], "comp_group") + + item_list = JSONParameterEditor( + _id={"type": str(uuid.uuid4())}, + json_blob=new_model, ) item_list.init_callbacks(app) - + return item_list, model_uid + @app.callback( - Output('additional-cluster-params', 'children'), - Input('cluster-algo-dropdown', 'value'), + Output("additional-cluster-params", "children"), + Input("cluster-algo-dropdown", "value"), ) def show_clustering_gui_layouts(selected_algo): - ''' + """ This callback display drop down menu in the fronend for different clustering algos Args: selected_algo: selected clustering algorithm Returns: item_list: dropdown menu html 
code - ''' - if selected_algo == 'KMeans': + """ + if selected_algo == "KMeans": kwargs = kmeans_kwargs - elif selected_algo == 'DBSCAN': + elif selected_algo == "DBSCAN": kwargs = dbscan_kwargs - elif selected_algo == 'HDBSCAN': + elif selected_algo == "HDBSCAN": kwargs = hdbscan_kwargs - - item_list = JSONParameterEditor(_id={'type': str(uuid.uuid4())}, - json_blob=kwargs["gui_parameters"]) + + item_list = JSONParameterEditor( + _id={"type": str(uuid.uuid4())}, json_blob=kwargs["gui_parameters"] + ) item_list.init_callbacks(app) return item_list + @app.callback( - Output('input_data', 'data'), - Output('input_labels', 'data'), - Output('label_schema', 'data'), - Output('label-dropdown', 'options'), - Output('user-upload-data-dir', 'data'), - Input('dataset-selection', 'value'), # Example dataset - Input({'base_id': 'file-manager', 'name': 'docker-file-paths'},'data'), # FM - Input('feature-vector-model-list', 'value'), # data clinic + Output("job-selector", "options"), + Input("interval-component", "n_intervals"), +) +def update_job_selector(n_intervals): + # TODO: Split train/inference and add data project name + jobs = get_flow_runs_by_name(tags=PREFECT_TAGS) + return jobs + + +@app.callback( + [ + Output("input_labels", "data"), + Output("label_schema", "data"), + Output("label-dropdown", "options"), + ], + [ + Input("example-dataset-selection", "value"), # example dataset + Input( + {"base_id": "file-manager", "name": "data-project-dict"}, "data" + ), # FM dataset + ], ) -def update_data_n_label_schema(selected_dataset, upload_file_paths, data_clinic_file_path): - ''' +def update_data_n_label_schema(selected_example_dataset, data_project_dict): + """ This callback updates the selected dataset from the provided example datasets, as well as labels, and label schema Args: - dataset-selection: selected dataset from the provided example datasets, not the one that user uploaded + example-dataset-selection: selected dataset from the provided example datasets, not the one that user uploaded upload_file_pahts: Data project info, the user uploaded zip file using FileManager, list Returns: input_data: input image data, numpy.ndarray @@ -103,93 +205,75 @@ def update_data_n_label_schema(selected_dataset, upload_file_paths, data_clinic_ label_schema: the text of each unique label label_dropdown: label dropdown options user_upload_data_dir: dir name for the user uploaded zip file - ''' - # FM - data_project = DataProject() - data_project.init_from_dict(upload_file_paths) - data_set = data_project.data # list of len 1920, each element is a local_dataset.LocalDataset object - - data = None + """ labels = None label_schema = {} + + # check if user is using user uploaded zip file or example dataset or data clinic file + # priority level: FileManage > DataClinic > Example Datasets + + data_project = DataProject.from_dict(data_project_dict) options = [] - user_upload_data_dir = None - - # FM options - if len(data_set) > 0: - data = [] - for i in range(len(data_set)): #if dataset too large, dash will exit with code 247, 137 - image, uri = data_project.data[i].read_data(export='pillow') - data.append(np.array(image)) - data = np.array(data) - print(data.shape) - labels = np.full((data.shape[0],), -1) - user_upload_data_dir = os.path.dirname(upload_file_paths[0]['uri']) + if len(data_project.datasets) > 0: + labels = np.full((len(data_project.datasets),), -1) # Example dataset option 1 - elif selected_dataset == "data/example_shapes/Demoshapes.npz": - data = np.load("/app/work/" + selected_dataset)['arr_0'] 
+ elif selected_example_dataset == "data/example_shapes/Demoshapes.npz": labels = np.load("/app/work/data/example_shapes/DemoLabels.npy") f = open("/app/work/data/example_shapes/label_schema.json") label_schema = json.load(f) # Example dataset option 2 - elif selected_dataset == "data/example_latentrepresentation/f_vectors.parquet": - df = pd.read_parquet("/app/work/" + selected_dataset) - data = df.values - labels = np.full((df.shape[0],), -1) - # DataClinic options - elif data_clinic_file_path is not None: - df = pd.read_parquet(data_clinic_file_path) - data = df.values + elif ( + selected_example_dataset + == "data/example_latentrepresentation/f_vectors.parquet" + ): + df = pd.read_parquet("/app/work/" + selected_example_dataset) labels = np.full((df.shape[0],), -1) - if label_schema: - options = [{'label': f'Label {label}', 'value': label} for label in label_schema] - options.insert(0, {'label': 'Unlabeled', 'value': -1}) - options.insert(0, {'label': 'All', 'value': -2}) + if label_schema: + options = [ + {"label": f"Label {label}", "value": label} for label in label_schema + ] + options.insert(0, {"label": "Unlabeled", "value": -1}) + options.insert(0, {"label": "All", "value": -2}) - return data, labels, label_schema, options, user_upload_data_dir + return labels, label_schema, options -def job_content_dict(content): - job_content = {# 'mlex_app': content['name'], - 'mlex_app': 'dimension reduction demo', - 'service_type': content['service_type'], - 'working_directory': DATA_DIR, - 'job_kwargs': {'uri': content['uri'], - 'cmd': content['cmd'][0]} - } - if 'map' in content: - job_content['job_kwargs']['map'] = content['map'] - - return job_content @app.callback( [ - # flag the read variable - Output('experiment-id', 'data'), # reset scatter plot control panel - Output('scatter-color', 'value'), - Output('cluster-dropdown', 'value'), - Output('label-dropdown', 'value'), + Output("scatter-color", "value"), + Output("cluster-dropdown", "value"), + Output("label-dropdown", "value"), # reset heatmap - Output('heatmap', 'figure', allow_duplicate=True), + Output("heatmap", "figure", allow_duplicate=True), # reset interval value to - Output('interval-component', 'max_intervals'), + Output("interval-component", "max_intervals"), ], - Input('run-algo', 'n_clicks'), + Input("run-algo", "n_clicks"), [ - State('dataset-selection', 'value'), - State('user-upload-data-dir', 'data'), - State('feature-vector-model-list', 'value'), - State('input_data', 'data'), - State('model_id', 'data'), - State('algo-dropdown', 'value'), - State('additional-model-params', 'children'), + State("job-name", "value"), # job_name + State("example-dataset-selection", "value"), # 2 example dataset + State("feature-vector-model-list", "value"), # DataClinic + State("model_id", "data"), + State("algo-dropdown", "value"), + State("additional-model-params", "children"), + State( + {"base_id": "file-manager", "name": "data-project-dict"}, "data" + ), # DataProject for FM ], - prevent_initial_call=True + prevent_initial_call=True, ) -def submit_dimension_reduction_job(submit_n_clicks, - selected_dataset, user_upload_data_dir, data_clinic_file_path, - input_data, model_id, selected_algo, children): +def submit_dimension_reduction_job( + submit_n_clicks, + job_name, + selected_example_dataset, + data_clinic_file_path, + model_id, + selected_algo, + children, + data_project_dict, +): """ This callback is triggered every time the Submit button is hit: - compute latent vectors, which will be saved in data/output/experiment_id @@ 
-197,85 +281,163 @@ def submit_dimension_reduction_job, - reset heatmap to no image Args: submit_n_clicks: num of clicks for the submit button - selected_dataset: selected example dataset + selected_example_dataset: selected example dataset user_upload_data_dir: user uploaded dataset model_id: uid of selected dimension reduction algo selected_algo: selected dimension reduction algo children: div for algo's parameters Returns: - experiment-id: uuid for current run cluster-dropdown: options for cluster dropdown scatter-color: default scatter-color value cluster-dropdown: default cluster-dropdown value heatmap: empty heatmap figure interval: set interval component to trigger to find the latent_vectors.npy file (-1) """ - if not submit_n_clicks or not input_data: + if not submit_n_clicks: + raise PreventUpdate + if ( + not selected_example_dataset + and not data_project_dict + and not data_clinic_file_path + ): raise PreventUpdate + job_params = copy.deepcopy(TRAIN_PARAMS_EXAMPLE) input_params = {} if children: - for child in children['props']['children']: - key = child["props"]["children"][1]["props"]["id"]["param_key"] + for child in children["props"]["children"]: + key = child["props"]["children"][1]["props"]["id"]["param_key"] value = child["props"]["children"][1]["props"]["value"] input_params[key] = value - print("Dimension reduction algo params: ", input_params) - model_content = get_content(model_id) - print(model_content) - job_content = job_content_dict(model_content) - job_content['job_kwargs']['kwargs'] = {} - job_content['job_kwargs']['kwargs']['parameters'] = input_params - #TODO: other kwargs - - compute_dict = {'user_uid': USER, - 'host_list': ['mlsandbox.als.lbl.gov', 'local.als.lbl.gov', 'vaughan.als.lbl.gov'], - 'requirements': {'num_processors': 2, - 'num_gpus': 0, - 'num_nodes': 2}, - } - compute_dict['job_list'] = [job_content] - compute_dict['dependencies'] = {'0':[]} - compute_dict['requirements']['num_nodes'] = 1 - - # create user directory to store users data/experiments - experiment_id = str(uuid.uuid4()) # create unique id for experiment - output_path = OUTPUT_DIR / experiment_id - output_path.mkdir(parents=True, exist_ok=True) + print("Dimension reduction algo params: ", input_params, flush=True) # check if user is using user uploaded zip file or example dataset or data clinic file - if user_upload_data_dir is not None: - selected_dataset = user_upload_data_dir - elif data_clinic_file_path is not None: - selected_dataset = data_clinic_file_path - + data_project = DataProject.from_dict(data_project_dict) + if len(data_project.datasets) > 0: + print("FM", flush=True) + data_project = DataProject.from_dict(data_project_dict) + io_parameters = { + "data_uris": [dataset.uri for dataset in data_project.datasets], + "data_tiled_api_key": data_project.api_key, + "data_type": data_project.data_type, + "root_uri": data_project.root_uri, + } + + else: + print("selected_example_dataset: " + selected_example_dataset, flush=True) + io_parameters = { + "data_uris": [selected_example_dataset], + "data_tiled_api_key": None, + "data_type": "file", + "root_uri": None, + } + + # Autoencoder + if data_clinic_file_path is not None: + auto_io_params = io_parameters.copy() + auto_io_params["model_dir"] = data_clinic_file_path + "/last.ckpt" + auto_params = ( + { + "io_parameters": auto_io_params, + "target_width": 64, + "target_height": 64, + "batch_size": 32, + }, + ) + # TODO: Use content registry to retrieve the model parameters + if FLOW_TYPE == "podman":
+ autoencoder_params = { + "image_name": "ghcr.io/mlexchange/mlex_pytorch_autoencoders:main", + "image_tag": "main", + "command": "python src/predict_model.py", + "params": auto_params, + "volumes": [ + f"{READ_DIR}:/app/work/data", + f"{WRITE_DIR}:/app/work/mlex_store", + ], + } + elif FLOW_TYPE == "conda": + autoencoder_params = { + "conda_env_name": "pytorch_autoencoders", + "params": auto_params, + "python_file_name": "mlex_pytorch_autoencoders/src/predict_model.py", + } + else: + autoencoder_params = { + "job_name": "latent_space_explorer", + "num_nodes": 1, + "partitions": PARTITIONS, + "reservations": RESERVATIONS, + "max_time": MAX_TIME, + "conda_env_name": "pytorch_autoencoders", + "params": auto_params, + } + job_params["params_list"].insert(0, autoencoder_params) + + # prefect + current_time = datetime.now(pytz.timezone(TIMEZONE)).strftime("%Y/%m/%d %H:%M:%S") + if not job_name: + job_name = "test0" + job_name += " " + str(current_time) + # TODO: Hash root_uri + data_uris + project_name = "fake_name" + print(PREFECT_TAGS, flush=True) + # check which dimension reduction algo, then compose command - if selected_algo == 'PCA': - cmd_list = ["python pca_run.py", selected_dataset, str(output_path)] - elif selected_algo == 'UMAP': - cmd_list = ["python umap_run.py", selected_dataset, str(output_path)] - - docker_cmd = " ".join(cmd_list) - #print(docker_cmd) - docker_cmd = docker_cmd + ' \'' + json.dumps(input_params) + '\'' - #print(docker_cmd) - job_content['job_kwargs']['cmd'] = docker_cmd - - response = requests.post('http://job-service:8080/api/v0/workflows', json=compute_dict) - print("respnse: ", response) - # job_response = get_job(user=None, mlex_app=job_content['mlex_app']) - - - return experiment_id, 'cluster', -1, -2, go.Figure(go.Heatmap()), -1 + if selected_algo == "PCA": + if FLOW_TYPE == "podman": + job_params["params_list"][-1]["command"] = "python pca_run.py" + else: + job_params["params_list"][-1][ + "python_file_name" + ] = "mlex_dimension_reduction_pca/pca_run.py" + elif selected_algo == "UMAP": + if FLOW_TYPE == "podman": + job_params["params_list"][-1]["command"] = "python umap_run.py" + else: + job_params["params_list"][-1][ + "python_file_name" + ] = "mlex_dimension_reduction_umap/umap_run.py" + + job_params["params_list"][-1]["params"]["io_parameters"] = io_parameters + job_params["params_list"][-1]["params"]["io_parameters"][ + "output_dir" + ] = f"{os.getcwd()}/mlex_store" + job_params["params_list"][-1]["params"]["io_parameters"]["uid_save"] = "" + job_params["params_list"][-1]["params"]["io_parameters"]["uid_retrieve"] = None + job_params["params_list"][-1]["params"]["model_parameters"] = input_params + print(job_params) + print(TRAIN_PARAMS_EXAMPLE, flush=True) + + # run prefect job, job_uid is the new experiment id -> uid_save in the pca_example.yaml file + job_uid = schedule_prefect_flow( + FLOW_NAME, + parameters=job_params, + flow_run_name=f"{job_name} {current_time}", + tags=PREFECT_TAGS + ["train", project_name], + ) + job_message = f"Job has been succesfully submitted with uid: {job_uid}." 
+ print(job_message, flush=True) + + fig = go.Figure( + go.Heatmap(), + layout=go.Layout( + autosize=True, + margin=go.layout.Margin(l=20, r=20, b=20, t=20, pad=0), + ), + ) + return "cluster", -1, -2, fig, -1 + @app.callback( - [ - Output('latent_vectors', 'data'), - Output('interval-component', 'max_intervals', allow_duplicate=True), + [ + Output("latent_vectors", "data"), + Output("interval-component", "max_intervals", allow_duplicate=True), ], - Input('interval-component', 'n_intervals'), - State('experiment-id', 'data'), - State('interval-component', 'max_intervals'), - prevent_initial_call=True + Input("interval-component", "n_intervals"), + State("job-selector", "value"), + State("interval-component", "max_intervals"), + prevent_initial_call=True, ) def read_latent_vectors(n_intervals, experiment_id, max_intervals): """ @@ -293,32 +455,34 @@ def read_latent_vectors(n_intervals, experiment_id, max_intervals): if experiment_id is None or n_intervals == 0 or max_intervals == 0: raise PreventUpdate - #read the latent vectors from the output dir - output_path = OUTPUT_DIR / experiment_id - npz_files = list(output_path.glob('*.npy')) - if len(npz_files) > 0 : - lv_filepath = npz_files[0] # latent vector file path - latent_vectors = np.load(str(lv_filepath)) - print("latent vector", latent_vectors.shape) - return latent_vectors, 0 - else: - return None, -1 - + children_flows = get_children_flow_run_ids(experiment_id) + if len(children_flows) > 0: + # read the latent vectors from the output dir + output_path = f"mlex_store/{children_flows[-1]}/latent_vectors.npy" + print(output_path, flush=True) + if os.path.exists(output_path): + latent_vectors = np.load(output_path) + print("latent vector", latent_vectors.shape) + return latent_vectors, 0 + return None, -1 + + @app.callback( [ - Output('clusters', 'data'), - Output('cluster-dropdown', 'options'), + Output("clusters", "data"), + Output("cluster-dropdown", "options"), ], - Input('run-cluster-algo', 'n_clicks'), + Input("run-cluster-algo", "n_clicks"), [ - State('latent_vectors', 'data'), - State('cluster-algo-dropdown', 'value'), - State('additional-cluster-params', 'children'), - State('experiment-id', 'data'), - ] + State("latent_vectors", "data"), + State("cluster-algo-dropdown", "value"), + State("additional-cluster-params", "children"), + State("job-selector", "value"), + ], ) -def apply_clustering(apply_n_clicks, - latent_vectors, selected_algo, children, experiment_id): +def apply_clustering( + apply_n_clicks, latent_vectors, selected_algo, children, experiment_id +): """ This callback is triggered by click the 'Apply' button at the clustering panel: - apply cluster @@ -332,58 +496,76 @@ def apply_clustering(apply_n_clicks, Returns: clusters: clustering result for each data point """ - ## TODO: pop up a widow to ask user to first run diemnsion reduction then apply + # TODO: pop up a widow to ask user to first run diemnsion reduction then apply if apply_n_clicks == 0 or experiment_id is None: raise PreventUpdate latent_vectors = np.array(latent_vectors) input_params = {} if children: - for child in children['props']['children']: - key = child["props"]["children"][1]["props"]["id"]["param_key"] + for child in children["props"]["children"]: + key = child["props"]["children"][1]["props"]["id"]["param_key"] value = child["props"]["children"][1]["props"]["value"] input_params[key] = value - print("Clustering params:", input_params) - + print("Clustering params:", input_params, flush=True) + if selected_algo == "KMeans": - obj = 
MiniBatchKMeans(n_clusters=input_params['n_clusters']) + obj = MiniBatchKMeans(n_clusters=input_params["n_clusters"]) elif selected_algo == "DBSCAN": - obj = DBSCAN(eps=input_params['eps'], min_samples=input_params['min_samples']) + obj = DBSCAN(eps=input_params["eps"], min_samples=input_params["min_samples"]) elif selected_algo == "HDBSCAN": - obj = HDBSCAN(min_cluster_size=input_params['min_cluster_size']) + obj = HDBSCAN(min_cluster_size=input_params["min_cluster_size"]) clusters, options = None, None if obj: - clusters = obj.fit_predict(latent_vectors) - output_path = OUTPUT_DIR / experiment_id - np.save(output_path/'clusters.npy', clusters) - unique_clusters = np.unique(clusters) - options = [{'label': f'Cluster {cluster}', 'value': cluster} for cluster in unique_clusters if cluster != -1] - options.insert(0, {'label': 'All', 'value': -1}) + children_flows = get_children_flow_run_ids(experiment_id) + if len(children_flows) > 0: + clusters = obj.fit_predict(latent_vectors) + output_path = f"mlex_store/{children_flows[0]}" + np.save(f"{output_path}/clusters.npy", clusters) + unique_clusters = np.unique(clusters) + options = [ + {"label": f"Cluster {cluster}", "value": cluster} + for cluster in unique_clusters + if cluster != -1 + ] + options.insert(0, {"label": "All", "value": -1}) + + print("clusters", clusters, flush=True) return clusters, options + @app.callback( - Output('scatter', 'figure'), + Output("scatter", "figure"), [ - Input('latent_vectors', 'data'), - Input('cluster-dropdown', 'value'), - Input('label-dropdown', 'value'), - Input('scatter-color', 'value'), - Input('clusters', 'data'), #move clusters to the input + Input("latent_vectors", "data"), + Input("cluster-dropdown", "value"), + Input("label-dropdown", "value"), + Input("scatter-color", "value"), + Input("clusters", "data"), # move clusters to the input ], [ - State('scatter', 'figure'), - State('scatter', 'selectedData'), - State('additional-model-params', 'children'), - - State('input_labels', 'data'), - State('label_schema', 'data'), - ] + State("scatter", "figure"), + State("scatter", "selectedData"), + State("additional-model-params", "children"), + State("input_labels", "data"), + State("label_schema", "data"), + ], ) -def update_scatter_plot(latent_vectors, selected_cluster, selected_label, scatter_color, clusters, - current_figure, selected_data, children, labels, label_names): - ''' +def update_scatter_plot( + latent_vectors, + selected_cluster, + selected_label, + scatter_color, + clusters, + current_figure, + selected_data, + children, + labels, + label_names, +): + """ This callback update the scater plot Args: latent_vectors: data from dimension reduction algos @@ -398,47 +580,59 @@ def update_scatter_plot(latent_vectors, selected_cluster, selected_label, scatte label_names: same as label_schema defined earlier Returns: fig: updated scatter figure - ''' + """ if latent_vectors is None or children is None: raise PreventUpdate latent_vectors = np.array(latent_vectors) + print("latent vector shape:", latent_vectors.shape, flush=True) - n_components = children['props']['children'][0]["props"]["children"][1]["props"]["value"] + n_components = children["props"]["children"][0]["props"]["children"][1]["props"][ + "value" + ] - # if selected_data is not None and len(selected_data.get('points', [])) > 0: - # selected_indices = [point['customdata'][0] for point in selected_data['points']] - if selected_data is not None and len(selected_data.get('points', [])) > 0: + if selected_data is not None and 
len(selected_data.get("points", [])) > 0: selected_indices = [] - for point in selected_data['points']: - if 'customdata' in point and len(point['customdata']): - selected_indices.append(point['customdata'][0]) + for point in selected_data["points"]: + if "customdata" in point and len(point["customdata"]): + selected_indices.append(point["customdata"][0]) print("selected indices: ", selected_indices) else: selected_indices = None - - if not clusters: # when clusters is None, i.e., after submit dimension reduction but before apply clustering + + if ( + not clusters + ): # when clusters is None, i.e., after submit dimension reduction but before apply clustering clusters = [-1 for i in range(latent_vectors.shape[0])] cluster_names = {a: a for a in np.unique(clusters).astype(int)} - - scatter_data = generate_scatter_data(latent_vectors, - n_components, - selected_cluster, - clusters, - cluster_names, - selected_label, - labels, - label_names, - scatter_color) + + scatter_data = generate_scatter_data( + latent_vectors, + n_components, + selected_cluster, + clusters, + cluster_names, + selected_label, + labels, + label_names, + scatter_color, + ) fig = go.Figure(scatter_data) - fig.update_layout(legend=dict(tracegroupgap=20)) + fig.update_layout( + margin=go.layout.Margin(l=20, r=20, b=20, t=20, pad=0), + legend=dict(tracegroupgap=20), + ) - if current_figure and 'xaxis' in current_figure['layout'] and 'yaxis' in current_figure[ - 'layout'] and 'autorange' in current_figure['layout']['xaxis'] and current_figure['layout']['xaxis'][ - 'autorange'] is False: + if ( + current_figure + and "xaxis" in current_figure["layout"] + and "yaxis" in current_figure["layout"] + and "autorange" in current_figure["layout"]["xaxis"] + and current_figure["layout"]["xaxis"]["autorange"] is False + ): # Update the axis range with current figure's values if available and if autorange is False - fig.update_xaxes(range=current_figure['layout']['xaxis']['range']) - fig.update_yaxes(range=current_figure['layout']['yaxis']['range']) + fig.update_xaxes(range=current_figure["layout"]["xaxis"]["range"]) + fig.update_yaxes(range=current_figure["layout"]["yaxis"]["range"]) else: # If it's the initial figure or autorange is True, set autorange to True to fit all points in view fig.update_xaxes(autorange=True) @@ -448,96 +642,172 @@ def update_scatter_plot(latent_vectors, selected_cluster, selected_label, scatte # Use the selected indices to highlight the selected points in the updated figure for trace in fig.data: if trace.marker.color is not None: - trace.marker.color = [hex_to_rgba('grey', 0.3) if i not in selected_indices else 'red' for i in - range(len(trace.marker.color))] + trace.marker.color = [ + hex_to_rgba("grey", 0.3) if i not in selected_indices else "red" + for i in range(len(trace.marker.color)) + ] return fig + @app.callback( - Output('heatmap', 'figure', allow_duplicate=True), + Output("heatmap", "figure", allow_duplicate=True), [ - Input('scatter', 'clickData'), - Input('scatter', 'selectedData'), - Input('mean-std-toggle', 'value'), + Input("scatter", "clickData"), + Input("scatter", "selectedData"), + Input("mean-std-toggle", "value"), ], - State('input_data', 'data'), - prevent_initial_call=True + [ + State("example-dataset-selection", "value"), # example dataset + State( + {"base_id": "file-manager", "name": "data-project-dict"}, "data" + ), # DataProject for FM + ], + prevent_initial_call=True, ) -def update_heatmap(click_data, selected_data, display_option, input_data): - ''' +def update_heatmap( + 
click_data, + selected_data, + display_option, + selected_example_dataset, + data_project_dict, +): + """ This callback update the heatmap Args: click_data: clicked data on scatter figure selected_data: lasso or rect selected data points on scatter figure display_option: option to display mean or std - input_data: input image data Returns: fig: updated heatmap - ''' - if input_data is None: + """ + if not selected_example_dataset and not data_project_dict: raise PreventUpdate - - images = np.array(input_data) - if selected_data is not None and len(selected_data['points']) > 0: - selected_indices = [point['customdata'][0] for point in selected_data['points']] # Access customdata for the original indices - selected_images = images[selected_indices] - if display_option == 'mean': + + # user select a group of points + if selected_data is not None and len(selected_data["points"]) > 0: + selected_indices = [ + point["customdata"][0] for point in selected_data["points"] + ] # Access customdata for the original indices + print("selected_indices", selected_indices) + + # FileManager + # print("upload_file_paths") # if not selected, its an empty list not None + selected_images = [] + + data_project = DataProject.from_dict(data_project_dict) + if len(data_project.datasets) > 0: + print("FM file") + selected_images, _ = data_project.read_datasets( + selected_indices, export="pillow" + ) + # Example dataset + elif "data/example_shapes/Demoshapes.npz" in selected_example_dataset: + print("Demoshapes.npz") + selected_images = np.load(selected_example_dataset)["arr_0"][ + selected_indices + ] + print(selected_images.shape) + elif ( + "data/example_latentrepresentation/f_vectors.parquet" + in selected_example_dataset + ): + print("f_vectors.parquet") + df = pd.read_parquet(selected_example_dataset) + selected_images = df.iloc[selected_indices].values + selected_images = np.array(selected_images) + + print("selected_images shape:", selected_images.shape) + + # display options + if display_option == "mean": heatmap_data = go.Heatmap(z=np.mean(selected_images, axis=0)) - elif display_option == 'sigma': + elif display_option == "sigma": heatmap_data = go.Heatmap(z=np.std(selected_images, axis=0)) - elif click_data is not None and len(click_data['points']) > 0: - selected_index = click_data['points'][0]['customdata'][0] # click_data['points'][0]['pointIndex'] - heatmap_data = go.Heatmap(z=images[selected_index]) + + elif click_data is not None and len(click_data["points"]) > 0: + selected_index = click_data["points"][0]["customdata"][0] + # FileManager + data_project = DataProject.from_dict(data_project_dict) + if len(data_project.datasets) > 0: + selected_images, _ = data_project.read_datasets( + [selected_index], export="pillow" + ) + # Example dataset + elif selected_example_dataset == "data/example_shapes/Demoshapes.npz": + clicked_image = np.load("/app/work/" + selected_example_dataset)["arr_0"][ + selected_index + ] + elif ( + selected_example_dataset + == "data/example_latentrepresentation/f_vectors.parquet" + ): + df = pd.read_parquet("/app/work/" + selected_example_dataset) + clicked_image = df.iloc[selected_index].values + clicked_image = np.array(clicked_image) + + heatmap_data = go.Heatmap(z=clicked_image) + else: heatmap_data = go.Heatmap() # only update heat map when the input data is 2d images, do not update for input latent vectors - if heatmap_data['z'] is None or len(np.shape(heatmap_data['z'])) < 2: + if heatmap_data["z"] is None or len(np.shape(heatmap_data["z"])) < 2: raise PreventUpdate - 
+ # Determine the aspect ratio based on the shape of the heatmap_data's z-values aspect_x = 1 aspect_y = 1 - if heatmap_data['z'] is not None: - if heatmap_data['z'].size > 0: - aspect_y, aspect_x = np.shape(heatmap_data['z']) + if heatmap_data["z"] is not None: + if heatmap_data["z"].size > 0: + print(np.shape(heatmap_data["z"])) + aspect_y, aspect_x = np.shape(heatmap_data["z"])[-2:] return go.Figure( data=heatmap_data, layout=dict( autosize=True, - yaxis=dict(scaleanchor="x", scaleratio=aspect_y / aspect_x), - ) + margin=go.layout.Margin(l=20, r=20, b=20, t=20, pad=0), + yaxis=dict( + scaleanchor="x", scaleratio=aspect_y / aspect_x, autorange="reversed" + ), + ), ) + @app.callback( - Output('stats-div', 'children'), - Input('scatter', 'selectedData'), + Output("stats-div", "children"), + Input("scatter", "selectedData"), [ - State('clusters', 'data'), - State('input_labels', 'data'), - State('label_schema', 'data'), - ] + State("clusters", "data"), + State("input_labels", "data"), + State("label_schema", "data"), + ], ) def update_statistics(selected_data, clusters, assigned_labels, label_names): - ''' + """ This callback update the statistics panel Args: selected_data: lasso or rect selected data points on scatter figure clusters: clusters for latent vectors assigned_labels: labels for each latent vector - label_names: same as label schema + label_names: same as label schema Returns: [num_images, clusters, labels]: statistics - ''' - - clusters = np.array(clusters) + """ assigned_labels = np.array(assigned_labels) - - if selected_data is not None and len(selected_data['points']) > 0: - selected_indices = [point['customdata'][0] for point in - selected_data['points']] # Access customdata for the original indices + print("assigned_labels", assigned_labels, flush=True) + + if ( + selected_data is not None + and len(selected_data["points"]) > 0 + and (assigned_labels != [-1]).all() + ): + selected_indices = [ + point["customdata"][0] for point in selected_data["points"] + ] # Access customdata for the original indices selected_clusters = [] - if clusters: + if clusters is not None: + clusters = np.array(clusters) selected_clusters = clusters[selected_indices] selected_labels = assigned_labels[selected_indices] @@ -548,7 +818,9 @@ def update_statistics(selected_data, clusters, assigned_labels, label_names): # Format the clusters and labels as comma-separated strings clusters_str = ", ".join(str(cluster) for cluster in unique_clusters) label_int_to_str_map = {val: key for key, val in label_names.items()} - labels_str = ", ".join(str(label_int_to_str_map[label]) for label in unique_labels if label >= 0) + labels_str = ", ".join( + str(label_int_to_str_map[label]) for label in unique_labels if label >= 0 + ) else: num_images = 0 clusters_str = "N/A" @@ -562,64 +834,32 @@ def update_statistics(selected_data, clusters, assigned_labels, label_names): f"Labels represented: {labels_str}", ] -@app.callback( - [Output("modal", "is_open"), Output("modal-body", "children")], - [ - Input('run-algo', 'n_clicks'), - Input('run-cluster-algo', 'n_clicks'), - ], - [ - State("modal", "is_open"), - State('input_data', 'data'), - ] -) -def toggle_modal(n_submit, n_apply, - is_open, input_data): - ''' - This callback pop up a winder to remind user to follow this flow: - select dataset -> Submit dimension reduction job -> Apply clustering - Args: - n_submit (int): Number of clicks on the 'Submit' button. - n_apply (int): Number of clicks on the 'Apply' button. 
-        is_open (bool): Current state of the modal window (open/closed).
-        input_data (list): User selected data
-    Returns:
-        is_open (bool): New state of the modal window.
-        modal_body_text (str): Text to be displayed in the modal body.
-    '''
-
-    if n_submit and input_data is None:
-        return True, "Please select an example dataset or upload your own zipped dataset."
-    elif n_apply and input_data is None:
-        return True, "Please select an example dataset or upload your own zipped dataset."
-    elif n_apply and n_submit is None:
-        return True, "Please select a dimension reduction algorithm and click 'Submit' button before clustering."
-
-    return False, "No alert."
-
 @app.callback(
-    Output('feature-vector-model-list', 'options'),
-    Input('interval-component', 'n_intervals'),
+    Output("feature-vector-model-list", "options"),
+    Input("interval-component", "n_intervals"),
 )
-def update_trained_model_list(interval):
-    '''
-    This callback updates the list of trained models
+def update_feature_vector_model_list(n_intervals):
+    """
+    This callback updates the feature vector model list
     Args:
-        tab_value: Tab option
-        prob_refresh_n_clicks: Button to refresh the list of probability-based trained models
-        similarity_refresh_n_clicks: Button to refresh the list of similarity-based trained models
+        n_intervals: interval component
     Returns:
-        prob_model_list: List of trained models in mlcoach
-        similarity_model_list: List of trained models in data clinic and mlcoach
-    '''
-    data_clinic_models = get_trained_models_list(USER, 'data_clinic')
-    ml_coach_models = get_trained_models_list(USER, 'mlcoach')
-    feature_vector_models = data_clinic_models + ml_coach_models
-    #print(feature_vector_models)
-
-    return feature_vector_models
+        options: feature vector model list
+    """
+    # TODO: Connect to data clinic
+    # TODO: Check if inference has already taken place in this dataset
+    folder_names = [
+        os.path.join(dirpath, dir)
+        for dirpath, dirs, _ in os.walk(MODEL_DIR)
+        for dir in dirs
+    ]
+    return folder_names


-if __name__ == '__main__':
-    app.run_server(debug=True, host='0.0.0.0', port=8070, )
+if __name__ == "__main__":
+    app.run_server(
+        debug=True,
+        host="0.0.0.0",
+        port=8070,
+    )
diff --git a/src/latentxp_utils.py b/src/latentxp_utils.py
index 977823b..76be66f 100755
--- a/src/latentxp_utils.py
+++ b/src/latentxp_utils.py
@@ -4,6 +4,7 @@
 from copy import deepcopy
 import requests
 import os
+from PIL import Image

 kmeans_kwargs = {"gui_parameters": [{"type": "dropdown", "name": "ncluster-dropdown-menu", "title": "Number of clusters", "param_key": "n_clusters", "options": [{"label": i, "value": i} for i in range(1, 21)],
@@ -353,4 +354,39 @@ def get_trained_models_list(user, app):
             trained_models.append({'label': app+': '+model['job_kwargs']['kwargs']['job_type'], 'value': out_path+filename})
     trained_models.reverse()
-    return trained_models
\ No newline at end of file
+    return trained_models
+
+
+def load_images_from_directory(directory_path, indices):
+    image_data = []
+    for filename in os.listdir(directory_path):
+        if filename.endswith(".png") or filename.endswith(".jpg"):
+            file_path = os.path.join(directory_path, filename)
+            try:
+                img = Image.open(file_path)
+                img_array = np.array(img)
+                image_data.append(img_array)
+            except Exception as e:
+                print(f"Error processing {file_path}: {e}")
+
+    image_data = np.array(image_data)
+    return image_data
+
+def load_images_by_indices(directory_path, indices):
+    image_data = []
+    filenames = [filename for filename in sorted(os.listdir(directory_path)) if filename.lower().endswith(('.png', '.jpg'))]
+    for index in indices:
+        if index in range(len(filenames)):
+            filename = filenames[index]
+            file_path = os.path.join(directory_path, filename)
+            try:
+                img = Image.open(file_path)
+                img_array = np.array(img)
+                image_data.append(img_array)
+            except Exception as e:
+                print(f"Error processing {file_path}: {e}")
+
+    image_data = np.array(image_data)
+    return image_data
+
+
diff --git a/src/utils_prefect.py b/src/utils_prefect.py
new file mode 100644
index 0000000..2b6612a
--- /dev/null
+++ b/src/utils_prefect.py
@@ -0,0 +1,105 @@
+import asyncio
+from typing import Optional
+
+from prefect import get_client
+from prefect.client.schemas.filters import (
+    FlowRunFilter,
+    FlowRunFilterName,
+    FlowRunFilterParentFlowRunId,
+    FlowRunFilterTags,
+)
+
+
+async def _schedule(
+    deployment_name: str,
+    flow_run_name: str,
+    parameters: Optional[dict] = None,
+    tags: Optional[list] = [],
+):
+    async with get_client() as client:
+        deployment = await client.read_deployment_by_name(deployment_name)
+        assert (
+            deployment
+        ), f"No deployment found in config for deployment_name {deployment_name}"
+        flow_run = await client.create_flow_run_from_deployment(
+            deployment.id,
+            parameters=parameters,
+            name=flow_run_name,
+            tags=tags,
+        )
+        return flow_run.id
+
+
+def schedule_prefect_flow(
+    deployment_name: str,
+    parameters: Optional[dict] = None,
+    flow_run_name: Optional[str] = None,
+    tags: Optional[list] = [],
+):
+    if not flow_run_name:
+        model_name = parameters["model_name"]
+        flow_run_name = f"{deployment_name}: {model_name}"
+    flow_run_id = asyncio.run(
+        _schedule(deployment_name, flow_run_name, parameters, tags)
+    )
+    return flow_run_id
+
+
+async def _get_name(flow_run_id):
+    async with get_client() as client:
+        flow_run = await client.read_flow_run(flow_run_id)
+        if flow_run.state.is_final():
+            if flow_run.state.is_completed():
+                return flow_run.name
+        return None
+
+
+def get_flow_run_name(flow_run_id):
+    """Retrieves the name of the flow with the given id."""
+    return asyncio.run(_get_name(flow_run_id))
+
+
+async def _flow_run_query(
+    tags=None, flow_run_name=None, parent_flow_run_id=None, sort="START_TIME_DESC"
+):
+    flow_run_filter_parent_flow_run_id = (
+        FlowRunFilterParentFlowRunId(any_=[parent_flow_run_id])
+        if parent_flow_run_id
+        else None
+    )
+    async with get_client() as client:
+        flow_runs = await client.read_flow_runs(
+            flow_run_filter=FlowRunFilter(
+                name=FlowRunFilterName(like_=flow_run_name),
+                parent_flow_run_id=flow_run_filter_parent_flow_run_id,
+                tags=FlowRunFilterTags(all_=tags),
+            ),
+            sort=sort,
+        )
+        return flow_runs
+
+
+def get_flow_runs_by_name(flow_run_name=None, tags=None):
+    flow_runs_by_name = []
+    flow_runs = asyncio.run(_flow_run_query(tags, flow_run_name=flow_run_name))
+    for flow_run in flow_runs:
+        if flow_run.state_name in {"Failed", "Crashed"}:
+            flow_name = f"❌ {flow_run.name}"
+        elif flow_run.state_name == "Completed":
+            flow_name = f"✅ {flow_run.name}"
+        elif flow_run.state_name == "Cancelled":
+            flow_name = f"🚫 {flow_run.name}"
+        else:
+            flow_name = f"🕑 {flow_run.name}"
+        flow_runs_by_name.append({"label": flow_name, "value": str(flow_run.id)})
+    return flow_runs_by_name
+
+
+def get_children_flow_run_ids(parent_flow_run_id, sort="START_TIME_ASC"):
+    children_flow_runs = asyncio.run(
+        _flow_run_query(parent_flow_run_id=parent_flow_run_id, sort=sort)
+    )
+    children_flow_run_ids = [
+        str(children_flow_run.id) for children_flow_run in children_flow_runs
+    ]
+    return children_flow_run_ids
\ No newline at end of file
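
A minimal sketch of how the Dash side might drive the helpers added in src/utils_prefect.py. The deployment name, parameter dict, and tag below are illustrative placeholders rather than values defined in this patch, and the import path assumes src/ is on the Python path as in the rest of the app.

# Sketch: scheduling and inspecting Prefect runs from the front end (placeholder values).
from utils_prefect import (
    get_flow_run_name,
    get_flow_runs_by_name,
    schedule_prefect_flow,
)

# Schedule a run of an already-registered deployment (hypothetical name).
flow_run_id = schedule_prefect_flow(
    deployment_name="parent-flow/launch",  # placeholder deployment
    parameters={"model_name": "PCA"},  # "model_name" is reused to build the run name
    tags=["my-project"],  # placeholder tag
)

# Name of the run once it has completed successfully, otherwise None.
run_name = get_flow_run_name(flow_run_id)

# Dropdown-ready options (status emoji + run name as label, run id as value),
# newest runs first, filtered by the same tag.
options = get_flow_runs_by_name(tags=["my-project"])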