Add MLPerf logging #831

Merged
merged 63 commits into dev from hanlin/mlperf on May 3, 2022
Changes from 1 commit
Commits (63)
c52f47b
draft mlperf logger
hanlint Mar 18, 2022
1d75c3a
add to callbacks module
hanlint Mar 21, 2022
caab2e9
add mlperf logging callback
hanlint Mar 23, 2022
8f2fee6
add submission directory structure
hanlint Mar 25, 2022
69bb806
add mlperf to setup
hanlint Mar 25, 2022
0813bb6
fix duplicate logging
hanlint Mar 25, 2022
43c74cd
Merge branch 'dev' into hanlin/mlperf
hanlint Mar 25, 2022
d2153d2
Apply suggestions from code review
hanlint Mar 28, 2022
8bbd7cd
Merge branch 'dev' into hanlin/mlperf
hanlint Apr 18, 2022
e010476
update with current_metrics
hanlint Apr 18, 2022
9d588f7
fix setup
hanlint Apr 19, 2022
bee409f
fix docstrings
hanlint Apr 19, 2022
f70406b
add hparams object
hanlint Apr 19, 2022
ba8652f
fix error
hanlint Apr 19, 2022
7ac866b
skip callback in asset test
hanlint Apr 19, 2022
03758b1
Merge branch 'dev' into hanlin/mlperf
hanlint Apr 19, 2022
689d84c
cleanup
hanlint Apr 19, 2022
f02eef3
try removing world_size
hanlint Apr 19, 2022
6491e8c
restore world_size
hanlint Apr 19, 2022
5fe7957
Merge branch 'dev' into hanlin/mlperf
hanlint Apr 19, 2022
b1b6004
Merge branch 'dev' into hanlin/mlperf
hanlint Apr 19, 2022
465f76f
Merge branch 'hanlin/mlperf' of github.com:mosaicml/composer into han…
hanlint Apr 19, 2022
d80e39d
trying removing mlperf tag
hanlint Apr 19, 2022
99bb2ab
cleanup
hanlint Apr 19, 2022
50ae088
Merge branch 'dev' into hanlin/mlperf
ravi-mosaicml Apr 19, 2022
73931a8
please jenkins help
hanlint Apr 19, 2022
2139ae0
one more time
hanlint Apr 19, 2022
6696fdb
never say timeout
hanlint Apr 19, 2022
7ff3392
Merge branch 'hanlin/mlperf' of github.com:mosaicml/composer into han…
hanlint Apr 19, 2022
e03c14e
Merge branch 'dev' into hanlin/mlperf
hanlint Apr 19, 2022
fed0d3f
Merge branch 'dev' into hanlin/mlperf
hanlint Apr 20, 2022
09ed9e5
remove world_size again
hanlint Apr 20, 2022
95c26fc
Merge branch 'hanlin/mlperf' of github.com:mosaicml/composer into han…
hanlint Apr 20, 2022
f2d3c51
Merge branch 'dev' into hanlin/mlperf
ravi-mosaicml Apr 20, 2022
7034f13
remove logging pip
hanlint Apr 21, 2022
67c0640
Merge branch 'hanlin/mlperf' of github.com:mosaicml/composer into han…
hanlint Apr 21, 2022
24e7439
Merge branch 'dev' into hanlin/mlperf
hanlint Apr 21, 2022
e497ce9
Merge branch 'hanlin/mlperf' of github.com:mosaicml/composer into han…
hanlint Apr 21, 2022
2862776
address comments
hanlint Apr 23, 2022
e0a13a4
implement cache clear
hanlint Apr 23, 2022
aae087b
Merge branch 'dev' into hanlin/mlperf
hanlint Apr 25, 2022
5585ce8
fix doctest
hanlint Apr 26, 2022
a524a65
Merge branch 'hanlin/mlperf' of github.com:mosaicml/composer into han…
hanlint Apr 26, 2022
a553110
Merge branch 'dev' into hanlin/mlperf
hanlint Apr 27, 2022
6a2c637
Update composer/callbacks/mlperf.py
hanlint Apr 29, 2022
8f4ea2f
address comments
hanlint Apr 29, 2022
a80a208
Merge branch 'hanlin/mlperf' of github.com:mosaicml/composer into han…
hanlint Apr 29, 2022
cc4d9be
restore dataloaders to state
hanlint May 3, 2022
a431577
cleanup
hanlint May 3, 2022
4cbd163
move items to init
hanlint May 3, 2022
b7fd11e
Merge branch 'dev' into hanlin/mlperf
hanlint May 3, 2022
cdeac03
fix pyright
hanlint May 3, 2022
212b089
clean up tests
hanlint May 3, 2022
13066df
use code block because cannot automate testcode
hanlint May 3, 2022
f8c9732
Apply suggestions from code review
hanlint May 3, 2022
b689f86
address comments
hanlint May 3, 2022
1705a61
Merge branch 'dev' into hanlin/mlperf
hanlint May 3, 2022
051035b
cleanup
hanlint May 3, 2022
afe5313
type ignore until logging pypi is done
hanlint May 3, 2022
ec2a578
Merge branch 'dev' into hanlin/mlperf
hanlint May 3, 2022
2480a2b
cleanup
hanlint May 3, 2022
0136d3d
Merge branch 'hanlin/mlperf' of github.com:mosaicml/composer into han…
hanlint May 3, 2022
621d12e
cleanup
hanlint May 3, 2022
update with current_metrics
hanlint committed Apr 18, 2022
commit e010476171d153f899e8ac61d5d31ace76331959
39 changes: 28 additions & 11 deletions composer/callbacks/mlperf.py
@@ -4,24 +4,27 @@
import platform
import sys
import warnings
-from typing import Dict, Optional
+from typing import Any, Dict, Optional, Sized

-import cpuinfo
-import psutil
import torch

import composer
-from composer import Callback, State
+from composer.core import State
+from composer.core.callback import Callback
from composer.loggers import Logger
from composer.utils import dist

try:
+    import cpuinfo
+    import psutil
    from mlperf_logging import mllog
    from mlperf_logging.mllog import constants

    mlperf_available = True
except ImportError:
    mlperf_available = False

# this callback only supports the following options:
BENCHMARKS = ("resnet")
DIVISIONS = ("open")
STATUS = ("onprem", "cloud", "preview")
@@ -133,9 +136,10 @@ def __init__(
            constants.SUBMISSION_STATUS: status,
        })

+        self.success = False
+
    def _create_submission_folders(self, root_folder: str, system_name: str, benchmark: str):
-        if not os.path.isdir(root_folder):
-            raise FileNotFoundError(f"{root_folder} not found.")
+        os.makedirs(root_folder, exist_ok=True)

        results_folder = os.path.join(root_folder, 'results')
        log_folder = os.path.join(root_folder, 'results', system_name)
@@ -151,10 +155,21 @@ def _log_dict(self, data: Dict[str, Any]):
        for key, value in data.items():
            self.mllogger.event(key=key, value=value)

+    def _get_accuracy(self, state: State):
+        if 'Accuracy' not in state.current_metrics['eval']:
+            raise ValueError('Accuracy must be a validation metric.')
+        return state.current_metrics['eval']['Accuracy']
+
    def fit_start(self, state: State, logger: Logger) -> None:
        if rank_zero():
            if state.train_dataloader.batch_size is None:
                raise ValueError("Batch size is required to be set for dataloader.")
+            if len(state.evaluators) > 1:
+                raise ValueError("Only one evaluator is supported for the MLPerfCallback.")
+            if not isinstance(state.train_dataloader.dataset, Sized):
+                raise ValueError("Train dataset must have __len__ property")
+            if not isinstance(state.evaluators[0].dataloader.dataloader.dataset, Sized):
+                raise ValueError("Eval dataset must have __len__ property")

            self._log_dict({
                constants.SEED: state.seed,
@@ -189,17 +204,18 @@ def eval_start(self, state: State, logger: Logger) -> None:

    def eval_end(self, state: State, logger: Logger) -> None:
        if rank_zero():
-            accuracy = 0.99  # TODO: retrieve accuracy from metrics
+            accuracy = self._get_accuracy(state)

            self.mllogger.event(key=constants.EVAL_STOP, metadata={'epoch_num': state.timer.epoch.value})
            self.mllogger.event(key=constants.EVAL_ACCURACY,
                                value=accuracy,
                                metadata={'epoch_num': state.timer.epoch.value})
            self.mllogger.event(key=constants.BLOCK_STOP, metadata={'first_epoch_num': state.timer.epoch.value})

-            if accuracy > self.target:
+            if accuracy > self.target and not self.success:
                self.mllogger.event(key=constants.RUN_STOP, metadata={"status": "success"})
                self.mllogger.logger.removeHandler(self._file_handler)
+                self.success = True  # only log once


def get_system_description(
@@ -255,9 +271,10 @@ def get_system_description(
"accelerator_interconnect_topology": "",
"cooling": "",
"hw_notes": "",
"framework": f"PyTorch v{torch.__version__} and MosaicML composer v{composer.__version__}",
"framework":
f"PyTorch v{torch.__version__} and MosaicML composer v{composer.__version__}", # type: ignore (third-party missing stub)
"other_software_stack": {
"cuda_version": torch.version.cuda if is_cuda else "",
"cuda_version": torch.version.cuda if is_cuda else "", # type: ignore (third-party missing stub)
"composer_version": composer.__version__,
"python_version": sys.version,
},
@@ -281,4 +298,4 @@ def get_system_description(
    # default to system name as "[world_size]x[device_name]"
    # e.g. 8xNVIDIA_A100_80GB
    system_desc['system_name'] = system_name
-    return system_desc
\ No newline at end of file
+    return system_desc
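
For orientation, here is a minimal sketch of attaching the callback to a Trainer, mirroring the config fixture in the tests below. Only the MLPerfCallback(root_folder, index) arguments are taken from this diff; the rest of the setup is illustrative, not the callback's documented API.

from torch.utils.data import DataLoader

from composer import Trainer
from composer.callbacks import MLPerfCallback
from tests.common import RandomClassificationDataset, SimpleModel

# The callback writes MLPerf submission logs under root_folder;
# index distinguishes the repeated runs of one submission.
callback = MLPerfCallback(root_folder='./submission', index=0)

trainer = Trainer(
    model=SimpleModel(),
    train_dataloader=DataLoader(dataset=RandomClassificationDataset(), shuffle=True),
    eval_dataloader=DataLoader(dataset=RandomClassificationDataset(), shuffle=False),
    max_duration='3ep',
    callbacks=[callback],
)
trainer.fit()

Note that eval_end reads state.current_metrics['eval']['Accuracy'], so the run must report an Accuracy validation metric for the callback to find.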
171 changes: 138 additions & 33 deletions tests/callbacks/test_mlperf_callback.py
@@ -1,18 +1,23 @@
import logging
from unittest.mock import Mock

import numpy as np
import pytest
from torch.utils.data import DataLoader

-from composer import Trainer
+from composer import State, Trainer
from composer.callbacks import MLPerfCallback
-from tests.common import RandomClassificationDataset, SimpleModel
+from composer.utils import dist
+from tests.common import RandomClassificationDataset, SimpleModel, world_size

+logging.basicConfig(filename="/Users/hanlintang/composer/package_checker.log", level=logging.INFO)
+logging.getLogger().addHandler(logging.StreamHandler())
+formatter = logging.Formatter("%(levelname)s - %(message)s")
+logging.getLogger().handlers[0].setFormatter(formatter)
+logging.getLogger().handlers[1].setFormatter(formatter)

+def rank_zero() -> bool:
+    return dist.get_global_rank() == 0


@pytest.fixture(autouse=True)
def importor_skip_mlperf_logging():
    pytest.importorskip("mlperf_logging")


@pytest.fixture
@@ -30,35 +35,135 @@ def config():
            dataset=RandomClassificationDataset(),
            shuffle=False,
        ),
-        'max_duration': '2ep',
+        'max_duration': '3ep',
        'deterministic_mode': True,  # testing equivalence
-        'loggers': [],  # no progress bar
+        'progress_bar': False,  # no progress bar
+        'log_to_console': False,
+        'loggers': [],
        'callbacks': []
    }


-@pytest.mark.filterwarnings("ignore::DeprecationWarning",)
-def test_mlperf_callback(config, tmpdir):
-    pytest.importorskip("mlperf_logging")
+class MockMLLogger:
+    """Mocks the MLPerf Logger interface."""
+
+    def __init__(self) -> None:
+        self.logs = []
+        self.logger = Mock()
+
+    def event(self, key, metadata, value=None):
+        self.logs.append({'key': key, 'value': value, 'metadata': metadata})
+
+
+@world_size(1, 2)
+class TestMLPerfCallbackEvents:
+
+    @pytest.fixture
+    def mlperf_callback(self, monkeypatch, tmpdir) -> MLPerfCallback:
+        """Returns a callback with the MockMLLogger patched."""
+        callback = MLPerfCallback(tmpdir, 0)
+        monkeypatch.setattr(callback, 'mllogger', MockMLLogger())
+        return callback
+
+    @pytest.fixture
+    def mock_state(self):
+        """Mocks a state at epoch 1 with Accuracy 0.99."""
+        current_metrics = {'eval': {'Accuracy': 0.99}}
+
+        state = Mock()
+        state.current_metrics = current_metrics
+        state.timer.epoch.value = 1
+
+        return state
+
+    def test_eval_start(self, mlperf_callback, mock_state, world_size):
+        mlperf_callback.eval_start(mock_state, Mock())
+
+        if not rank_zero():
+            assert mlperf_callback.mllogger.logs == []
+            return
+
+        assert mlperf_callback.mllogger.logs == [{'key': 'eval_start', 'value': None, 'metadata': {'epoch_num': 1}}]
+
+    def test_eval_end(self, mlperf_callback, mock_state, world_size):
+        mlperf_callback.eval_end(mock_state, Mock())
+
+        if not rank_zero():
+            assert mlperf_callback.success == False
+            assert mlperf_callback.mllogger.logs == []
+            return
+
+        assert mlperf_callback.success == True
+        assert mlperf_callback.mllogger.logs[-1] == {
+            'key': 'run_stop',
+            'value': None,
+            'metadata': {
+                'status': 'success'
+            }
+        }


+@world_size(1, 2)
+class TestWithMLPerfChecker:
+    """Ensures that the logs created by the MLPerfCallback pass the official package checker."""
+
+    @pytest.mark.timeout(15)
+    def test_mlperf_callback_passes(self, config, tmpdir, monkeypatch, world_size):
+
+        def mock_accuracy(self, state: State):
+            if state.timer.epoch >= 2:
+                return 0.99
+            else:
+                return 0.01
+
+        monkeypatch.setattr(MLPerfCallback, '_get_accuracy', mock_accuracy)
+
+        self.generate_submission(tmpdir, config)
+
+        if rank_zero():
+            self.run_mlperf_checker(tmpdir, monkeypatch)
+
+    @pytest.mark.timeout(15)
+    def test_mlperf_callback_fails(self, config, tmpdir, monkeypatch, world_size):
+
+        def mock_accuracy(self, state: State):
+            return 0.01
+
+        monkeypatch.setattr(MLPerfCallback, '_get_accuracy', mock_accuracy)
+
+        self.generate_submission(tmpdir, config)
+        with pytest.raises(ValueError, match='MLPerf checker failed'):
+            self.run_mlperf_checker(tmpdir, monkeypatch)
+
+    def generate_submission(self, directory, config):
+        """Generates submission files by training the benchmark n=5 times."""
+
+        for run in range(5):
+            mlperf_callback = MLPerfCallback(root_folder=directory, index=run)
+            config['callbacks'] = [mlperf_callback]
+            config['seed'] = np.random.randint(low=2048)  # mlperf seeds are released near submission deadline
+            trainer = Trainer(**config)
+            trainer.fit()
+
+    def run_mlperf_checker(self, directory, monkeypatch):
+        """Runs the MLPerf package checker and fails on any errors."""
+
+        # monkeypatch the logging so that logging.error raises Exception
+        def fail_on_error(msg, *args, **kwargs):
+            print(msg.format(*args))
+            raise ValueError('MLPerf checker failed, see logs.')
+
+        monkeypatch.setattr(logging, "error", fail_on_error)
+
+        from mlperf_logging.package_checker.package_checker import check_training_package

-    for run in range(5):
-        mlperf_callback = MLPerfCallback(root_folder=tmpdir, num_result=run)
-        config['callbacks'] = [mlperf_callback]
-        config['seed'] = np.random.randint(2e5)  # mlperf seeds are released near submission deadline
-        trainer = Trainer(**config)
-        trainer.fit()
-
-    # run result checker
-    from mlperf_logging.package_checker.package_checker import check_training_package
-
-    check_training_package(
-        folder=tmpdir,
-        usage="training",
-        ruleset="1.1.0",
-        werror=True,
-        quiet=False,
-        rcp_bypass=False,
-        rcp_bert_train_samples=False,
-        log_output="package_checker.log",
-    )
+        check_training_package(
+            folder=directory,
+            usage="training",
+            ruleset="1.1.0",
+            werror=True,
+            quiet=False,
+            rcp_bypass=False,
+            rcp_bert_train_samples=False,
+            log_output="package_checker.log",
+        )
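
For reference, the verification that run_mlperf_checker performs can also be run standalone against a finished submission folder. A sketch reusing the check_training_package arguments shown above; the folder path is a placeholder, and the package also documents a CLI entry point (roughly python3 -m mlperf_logging.package_checker FOLDER training 1.1.0, per the mlperf_logging README):

from mlperf_logging.package_checker.package_checker import check_training_package

# Point the official checker at the root_folder that MLPerfCallback populated.
check_training_package(
    folder='./submission',  # placeholder: the root_folder passed to MLPerfCallback
    usage='training',
    ruleset='1.1.0',        # MLPerf Training v1.1 rules
    werror=True,            # escalate rule warnings to errors
    quiet=False,
    rcp_bypass=False,
    rcp_bert_train_samples=False,
    log_output='package_checker.log',
)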