Skip to content
This repository has been archived by the owner on Dec 19, 2024. It is now read-only.

Commit

Permalink
KFP Visualizer and SummaryWriter Fix (#119)
Browse files Browse the repository at this point in the history
  • Loading branch information
Saurav-D authored Nov 5, 2020
1 parent a20a227 commit 166ce8c
Show file tree
Hide file tree
Showing 14 changed files with 195 additions and 57 deletions.
19 changes: 14 additions & 5 deletions datasetinsights/commands/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,17 +57,24 @@
),
)
@click.option(
"--kfp-metrics-dir",
"--kfp-log-dir",
type=click.Path(file_okay=False, writable=True),
default=const.DEFAULT_KFP_METRICS_DIR,
help="Path to the directory where Kubeflow Metrics files are stored.",
default=const.DEFAULT_KFP_LOG_DIR,
help="Path to the directory where Kubeflow ui metadata file and "
"metrics are stored.",
)
@click.option(
"--kfp-metrics-filename",
type=click.STRING,
default=const.DEFAULT_KFP_METRICS_FILENAME,
help="Kubeflow Metrics filename.",
)
@click.option(
"--kfp-ui-metadata-filename",
type=click.STRING,
default=const.DEFAULT_KFP_UI_METADATA_FILENAME,
help="Kubeflow UI Metadata JSON filename (for tensorboard).",
)
@click.option(
"--no-cuda",
is_flag=True,
Expand All @@ -83,8 +90,9 @@ def cli(
test_data,
tb_log_dir,
workers,
kfp_metrics_dir,
kfp_log_dir,
kfp_metrics_filename,
kfp_ui_metadata_filename,
no_cuda,
):
ctx = click.get_current_context()
Expand All @@ -97,8 +105,9 @@ def cli(
checkpoint_file=checkpoint_file,
tb_log_dir=tb_log_dir,
workers=workers,
kfp_metrics_dir=kfp_metrics_dir,
kfp_log_dir=kfp_log_dir,
kfp_metrics_filename=kfp_metrics_filename,
kfp_ui_metadata_filename=kfp_ui_metadata_filename,
no_cuda=no_cuda,
)

Expand Down
17 changes: 17 additions & 0 deletions datasetinsights/commands/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,19 @@
"to a local directory."
),
)
@click.option(
"--kfp-log-dir",
type=click.Path(file_okay=False, writable=True),
default=const.DEFAULT_KFP_LOG_DIR,
help="Path to the directory where Kubeflow ui metadata file and "
"metrics are stored.",
)
@click.option(
"--kfp-ui-metadata-filename",
type=click.STRING,
default=const.DEFAULT_KFP_UI_METADATA_FILENAME,
help="Kubeflow UI Metadata JSON filename (for tensorboard).",
)
@click.option(
"-p",
"--checkpoint-dir",
Expand Down Expand Up @@ -98,6 +111,8 @@ def cli(
val_data,
checkpoint_file,
tb_log_dir,
kfp_log_dir,
kfp_ui_metadata_filename,
checkpoint_dir,
workers,
no_cuda,
Expand All @@ -114,6 +129,8 @@ def cli(
config=config,
checkpoint_file=checkpoint_file,
tb_log_dir=tb_log_dir,
kfp_log_dir=kfp_log_dir,
kfp_ui_metadata_filename=kfp_ui_metadata_filename,
checkpoint_dir=checkpoint_dir,
no_cuda=no_cuda,
no_val=no_val,
Expand Down
5 changes: 2 additions & 3 deletions datasetinsights/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,9 @@
PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))

# Default for tensorboard logs, checkpoints and metrics
DEFAULT_KFP_METRICS_DIR = os.path.join(
PROJECT_ROOT, "metrics", TIMESTAMP_SUFFIX
)
DEFAULT_KFP_LOG_DIR = os.path.join(PROJECT_ROOT, "kfp", TIMESTAMP_SUFFIX)
DEFAULT_KFP_METRICS_FILENAME = "mlpipeline-metrics.json"
DEFAULT_KFP_UI_METADATA_FILENAME = "mlpipeline-ui-metadata.json"

DEFAULT_TENSORBOARD_LOG_DIR = os.path.join(
PROJECT_ROOT, "runs", TIMESTAMP_SUFFIX
Expand Down
11 changes: 5 additions & 6 deletions datasetinsights/estimators/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ def create_estimator(
tb_log_dir=None,
no_cuda=None,
checkpoint_dir=None,
kfp_metrics_dir=const.DEFAULT_KFP_METRICS_DIR,
kfp_log_dir=const.DEFAULT_KFP_LOG_DIR,
kfp_metrics_filename=const.DEFAULT_KFP_METRICS_FILENAME,
kfp_ui_metadata_filename=const.DEFAULT_KFP_UI_METADATA_FILENAME,
no_val=None,
**kwargs,
):
Expand All @@ -35,10 +36,10 @@ def create_estimator(

kfp_writer = KubeflowPipelineWriter(
tb_log_dir=tb_log_dir,
filename=kfp_metrics_filename,
filepath=kfp_metrics_dir,
kfp_log_dir=kfp_log_dir,
kfp_metrics_filename=kfp_metrics_filename,
kfp_ui_metadata_filename=kfp_ui_metadata_filename,
)
kfp_writer.create_tb_visualization_json()
checkpointer = EstimatorCheckpoint(
estimator_name=name, checkpoint_dir=checkpoint_dir, distributed=False,
)
Expand All @@ -50,8 +51,6 @@ def create_estimator(
logdir=tb_log_dir,
no_cuda=no_cuda,
no_val=no_val,
kfp_metrics_dir=kfp_metrics_dir,
kfp_metrics_filename=kfp_metrics_filename,
**kwargs,
)

Expand Down
10 changes: 8 additions & 2 deletions datasetinsights/estimators/faster_rcnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@
import torch.distributed as dist
import torchvision
from codetiming import Timer
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms
from tqdm import tqdm

import datasetinsights.constants as const
from datasetinsights.datasets import Dataset
from datasetinsights.evaluation_metrics.base import EvaluationMetric
from datasetinsights.io.bbox import BBox2D
from datasetinsights.io.summarywriter import get_summary_writer
from datasetinsights.io.transforms import Compose
from datasetinsights.torch_distributed import get_world_size, is_master

Expand Down Expand Up @@ -79,7 +79,9 @@ def __init__(
self._init_distributed_mode()
self.no_cuda = no_cuda
self._init_device()
self.writer = SummaryWriter(logdir)

summary_writer = get_summary_writer()
self.writer = summary_writer(logdir)

self.kfp_writer = kfp_writer
checkpointer.distributed = self.distributed
Expand Down Expand Up @@ -306,6 +308,7 @@ def train_one_epoch(
f"(total training examples: {examples_seen}) is "
f"{intermediate_loss}"
)

self.writer.add_scalar(
"training/intermediate_loss",
intermediate_loss,
Expand All @@ -316,6 +319,7 @@ def train_one_epoch(
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()

self.writer.add_scalar("training/loss", loss_metric.compute(), epoch)
self.writer.add_scalar(
"training/lr", optimizer.param_groups[0]["lr"], epoch
Expand Down Expand Up @@ -419,6 +423,7 @@ def evaluate_per_epoch(
self.log_metric_val(label_mappings, epoch)
val_loss = loss_metric.compute()
logger.info(f"validation loss is {val_loss}")

self.writer.add_scalar("val/loss", val_loss, epoch)

torch.set_num_threads(n_threads)
Expand Down Expand Up @@ -447,6 +452,7 @@ def log_metric_val(self, label_mappings, epoch):
label_mappings.get(id, str(id)): value
for id, value in result.items()
}

self.writer.add_scalars(
f"val/{metric_name}-per-class", label_results, epoch
)
Expand Down
64 changes: 40 additions & 24 deletions datasetinsights/io/kfp_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,26 @@
import logging
import os

from datasetinsights.constants import DEFAULT_TENSORBOARD_LOG_DIR
from datasetinsights.constants import (
DEFAULT_KFP_LOG_DIR,
DEFAULT_KFP_METRICS_FILENAME,
DEFAULT_KFP_UI_METADATA_FILENAME,
DEFAULT_TENSORBOARD_LOG_DIR,
)

logger = logging.getLogger(__name__)


class KubeflowPipelineWriter(object):
"""
Serializes metrics dictionary genereated during model training/evaluation to
JSON and store in a file.
KFP Writer for serializing metrics dictionary genereated during model
training/evaluation toJSON and store in a file and create KFP dashboard
visualizer JSON file for tensorboard.
Args:
filename (str): Name of the file to which the writer will save metrics
filepath (str): Path where the file will be stored
kfp_log_dir (str): Path where all files related to KFP will be stored
tb_log_dir (str): Path where tensorobard logs are saved
Attributes:
filename (str): Name of the file to which the writer will save metrics
Expand All @@ -26,19 +33,24 @@ class KubeflowPipelineWriter(object):
def __init__(
self,
tb_log_dir=DEFAULT_TENSORBOARD_LOG_DIR,
filename="mlpipeline-metrics.json",
filepath="/",
kfp_log_dir=DEFAULT_KFP_LOG_DIR,
kfp_metrics_filename=DEFAULT_KFP_METRICS_FILENAME,
kfp_ui_metadata_filename=DEFAULT_KFP_UI_METADATA_FILENAME,
):
"""
Creates KubeflowPipelineWriter that will write out metrics to the output
file
"""

self.filename = filename
self.filepath = filepath
self.kfp_metrics_filename = kfp_metrics_filename
self.kfp_log_dir = kfp_log_dir
self.data_dict = {}
self.data = {"metrics": []}
self.tb_log_dir = tb_log_dir

if not os.path.exists(self.kfp_log_dir):
os.makedirs(self.kfp_log_dir)

self.create_tb_visualization_json(tb_log_dir, kfp_ui_metadata_filename)

def add_metric(self, name, val):
"""
Expand Down Expand Up @@ -68,23 +80,27 @@ def write_metric(self):
self.data["metrics"].append(
{"name": key, "numberValue": val, "format": "RAW"}
)
if not os.path.exists(self.filepath):
os.makedirs(self.filepath)
with open(os.path.join(self.filepath, self.filename), "w") as f:
with open(
os.path.join(self.kfp_log_dir, self.kfp_metrics_filename), "w"
) as f:
json.dump(self.data, f)

logger.debug(
f"Metrics file {self.filename} saved at path:" f" {self.filepath}"
f"Metrics file {self.kfp_metrics_filename} saved at path:"
f" {self.kfp_log_dir}"
)

def create_tb_visualization_json(self):
try:
metadata = {
"outputs": [{"type": "tensorboard", "source": self.tb_log_dir}]
}
with open("/mlpipeline-ui-metadata.json", "w") as f:
json.dump(metadata, f)

# when we don't have write permission
except IOError:
logger.info("Can not create Tensorboard Visualization JSON file.")
def create_tb_visualization_json(
self, tb_log_dir, kfp_ui_metadata_filename
):

metadata = {"outputs": [{"type": "tensorboard", "source": tb_log_dir}]}
with open(
os.path.join(self.kfp_log_dir, kfp_ui_metadata_filename), "w"
) as f:
json.dump(metadata, f)

logger.debug(
f"KFP UI Metadata JSON file {kfp_ui_metadata_filename} "
f"saved at path: {self.kfp_log_dir}"
)
59 changes: 59 additions & 0 deletions datasetinsights/io/summarywriter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from torch.utils.tensorboard import SummaryWriter

from datasetinsights.torch_distributed import is_master


class DummySummaryWriter:
"""A fake summary writer that writes nothing to the disk. This writer is
used when the process is not master process so that no data is written
which can prevent overwriting real data. This writer mimics the
SummaryWriter module in pytorch library. To see more about pytorch
tensorbaord summary writer visit:
https://github.com/pytorch/pytorch/blob/master/torch/utils/tensorboard/writer.py#L150
"""

def __init__(self, log_dir, *args, **kwargs):
self.logdir = log_dir

def add_event(self, *args, **kwargs):
return

def add_summary(self, *args, **kwargs):
return

def add_graph(self, *args, **kwargs):
return

def add_scalar(self, *args, **kwargs):
return

def add_scalars(self, *args, **kwargs):
return

def add_histogram(self, *args, **kwargs):
return

def add_histogram_raw(self, *args, **kwargs):
return

def add_figure(self, *args, **kwargs):
return

def flush(self):
return

def close(self):
return


def get_summary_writer():
"""
Returns summary writer for tensorboard according to the process (master/
non master)
"""
if is_master():
writer = SummaryWriter
else:
writer = DummySummaryWriter

return writer
9 changes: 7 additions & 2 deletions kubeflow/compiled/evaluate_the_model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
generateName: evaluate-the-model-
annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.0.1, pipelines.kubeflow.org/pipeline_compilation_time: '2020-10-23T14:11:46.915351',
annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.0.1, pipelines.kubeflow.org/pipeline_compilation_time: '2020-11-04T17:16:55.837308',
pipelines.kubeflow.org/pipeline_spec: '{"description": "Evaluate the model", "inputs":
[{"default": "unitytechnologies/datasetinsights:latest", "name": "docker", "optional":
true, "type": "String"}, {"default": "https://storage.googleapis.com/datasetinsights/data/groceries/v3.zip",
Expand Down Expand Up @@ -43,7 +43,8 @@ spec:
- name: evaluate
container:
args: ['--config={{inputs.parameters.config}}', '--checkpoint-file={{inputs.parameters.checkpoint_file}}',
--test-data=/data, '--tb-log-dir={{inputs.parameters.tb_log_dir}}']
--test-data=/data, '--tb-log-dir={{inputs.parameters.tb_log_dir}}', --kfp-log-dir=/kfp_logs,
--kfp-ui-metadata-filename=kfp_ui_metadata.json, --kfp-metrics-filename=kfp_metrics.json]
command: [datasetinsights, evaluate]
env:
- {name: GOOGLE_APPLICATION_CREDENTIALS, value: /secret/gcp-credentials/user-gcp-sa.json}
Expand All @@ -62,6 +63,10 @@ spec:
- {name: docker}
- {name: pvc-name}
- {name: tb_log_dir}
outputs:
artifacts:
- {name: mlpipeline-metrics, path: /kfp_logs/kfp_metrics.json}
- {name: mlpipeline-ui-metadata, path: /kfp_logs/kfp_ui_metadata.json}
nodeSelector: {cloud.google.com/gke-accelerator: nvidia-tesla-v100}
volumes:
- name: gcp-credentials-user-gcp-sa
Expand Down
Loading

0 comments on commit 166ce8c

Please sign in to comment.