Remove the Scale Schedule Algorithm (#854)
The algorithm form of scale schedule has been deprecated; the same functionality is available as an argument to the trainer.

This PR removes the `ScaleSchedule` algorithm class; scale schedule must now be specified via the trainer init args.
It also restores the scale schedule method card from the 0.3.1 release, updated to reflect the
non-algorithm-class usage.
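
For reference, a minimal before/after sketch (the `...` stands in for the usual trainer arguments, as in the method card example):

```python
from composer import Trainer

# Before (deprecated algorithm class, removed in this PR):
#
#   from composer.algorithms import ScaleSchedule
#   trainer = Trainer(..., algorithms=[ScaleSchedule(ratio=0.5)])

# After: pass the ratio directly to the trainer init.
trainer = Trainer(
    ...,  # model, dataloaders, optimizers, schedulers, etc.
    scale_schedule_ratio=0.5,
)
```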

Closes #434.
ravi-mosaicml authored May 3, 2022
1 parent 5b6a090 commit a950de6
Showing 39 changed files with 126 additions and 166 deletions.
8 changes: 2 additions & 6 deletions composer/algorithms/__init__.py
@@ -53,17 +53,15 @@ def apply(self, state: State, event: Event, logger: Logger):
ColOutHparams, CutMixHparams, CutOutHparams, EMAHparams, FactorizeHparams,
GhostBatchNormHparams, LabelSmoothingHparams, LayerFreezingHparams,
MixUpHparams, NoOpModelHparams, ProgressiveResizingHparams, RandAugmentHparams,
SAMHparams, ScaleScheduleHparams, SelectiveBackpropHparams,
SeqLengthWarmupHparams, SqueezeExciteHparams, StochasticDepthHparams,
SWAHparams)
SAMHparams, SelectiveBackpropHparams, SeqLengthWarmupHparams,
SqueezeExciteHparams, StochasticDepthHparams, SWAHparams)
from composer.algorithms.label_smoothing import LabelSmoothing
from composer.algorithms.layer_freezing import LayerFreezing
from composer.algorithms.mixup import MixUp
from composer.algorithms.no_op_model import NoOpModel
from composer.algorithms.progressive_resizing import ProgressiveResizing
from composer.algorithms.randaugment import RandAugment, RandAugmentTransform
from composer.algorithms.sam import SAM
from composer.algorithms.scale_schedule import ScaleSchedule
from composer.algorithms.selective_backprop import SelectiveBackprop
from composer.algorithms.seq_length_warmup import SeqLengthWarmup
from composer.algorithms.squeeze_excite import SqueezeExcite, SqueezeExcite2d, SqueezeExciteConv2d
@@ -99,7 +97,6 @@ def apply(self, state: State, event: Event, logger: Logger):
"RandAugment",
"RandAugmentTransform",
"SAM",
"ScaleSchedule",
"SelectiveBackprop",
"SeqLengthWarmup",
"SqueezeExcite",
@@ -128,7 +125,6 @@ def apply(self, state: State, event: Event, logger: Logger):
"ProgressiveResizingHparams",
"RandAugmentHparams",
"SAMHparams",
"ScaleScheduleHparams",
"SelectiveBackpropHparams",
"SeqLengthWarmupHparams",
"SqueezeExciteHparams",
6 changes: 2 additions & 4 deletions composer/algorithms/algorithm_registry.py
@@ -7,9 +7,8 @@
ColOutHparams, CutMixHparams, CutOutHparams, EMAHparams, FactorizeHparams,
GhostBatchNormHparams, LabelSmoothingHparams, LayerFreezingHparams,
MixUpHparams, NoOpModelHparams, ProgressiveResizingHparams, RandAugmentHparams,
SAMHparams, ScaleScheduleHparams, SelectiveBackpropHparams,
SeqLengthWarmupHparams, SqueezeExciteHparams, StochasticDepthHparams,
SWAHparams)
SAMHparams, SelectiveBackpropHparams, SeqLengthWarmupHparams,
SqueezeExciteHparams, StochasticDepthHparams, SWAHparams)
from composer.core.algorithm import Algorithm

registry: Dict[str, Type[AlgorithmHparams]] = {
@@ -27,7 +26,6 @@
'swa': SWAHparams,
'no_op_model': NoOpModelHparams,
'mixup': MixUpHparams,
'scale_schedule': ScaleScheduleHparams,
'stochastic_depth': StochasticDepthHparams,
'colout': ColOutHparams,
'progressive_resizing': ProgressiveResizingHparams,
11 changes: 0 additions & 11 deletions composer/algorithms/hparams.py
@@ -25,7 +25,6 @@
from composer.algorithms.progressive_resizing import ProgressiveResizing
from composer.algorithms.randaugment import RandAugment
from composer.algorithms.sam import SAM
from composer.algorithms.scale_schedule import ScaleSchedule
from composer.algorithms.selective_backprop import SelectiveBackprop
from composer.algorithms.seq_length_warmup import SeqLengthWarmup
from composer.algorithms.squeeze_excite import SqueezeExcite
@@ -307,16 +306,6 @@ def initialize_object(self) -> SAM:
return SAM(**asdict(self))


@dataclass
class ScaleScheduleHparams(AlgorithmHparams):
"""See :class:`ScaleSchedule`"""

ratio: float = hp.optional('Ratio to scale the schedule.', default=1.0)

def initialize_object(self) -> "ScaleSchedule":
return ScaleSchedule(**asdict(self))


@dataclass
class SelectiveBackpropHparams(AlgorithmHparams):
"""See :class:`SelectiveBackprop`"""
3 changes: 0 additions & 3 deletions composer/algorithms/scale_schedule/README.md

This file was deleted.

9 changes: 0 additions & 9 deletions composer/algorithms/scale_schedule/__init__.py

This file was deleted.

13 changes: 0 additions & 13 deletions composer/algorithms/scale_schedule/metadata.json

This file was deleted.

46 changes: 0 additions & 46 deletions composer/algorithms/scale_schedule/scale_schedule.py

This file was deleted.

7 changes: 0 additions & 7 deletions composer/trainer/trainer.py
@@ -82,7 +82,6 @@
from torchmetrics import Metric, MetricCollection

import composer
from composer.algorithms import ScaleSchedule
from composer.callbacks import CheckpointSaver
from composer.core import Algorithm, Callback, DataSpec, Engine, Evaluator, Event, Precision, State, Time, Timestamp
from composer.core.evaluator import evaluate_periodically
@@ -630,12 +629,6 @@ def __init__(
# self._use_grad_scaling() will raise a RuntimeError if grad scaling is not available when it is required
warnings.filterwarnings(action="ignore", message="torch.cuda.amp.GradScaler")

# ScaleSchedule is a deprecated algorithm, but if it is used, updated SSR with its ratio.
# TODO(#434): Remove this completely.
for algorithm in ensure_tuple(algorithms):
if isinstance(algorithm, ScaleSchedule):
scale_schedule_ratio = algorithm.ratio

if isinstance(max_duration, str):
max_duration = Time.from_timestring(max_duration)
elif isinstance(max_duration, int):
1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.25.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.29.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.33.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.36.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.43.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.45.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.50.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.55.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.56.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.57.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.64.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.67.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.71.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.73.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.75.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.78.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.79.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.80.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.82.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.86.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.89.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.91.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/0.93.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/1.00.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/1.25.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/1.50.yaml

This file was deleted.

1 change: 0 additions & 1 deletion composer/yamls/algorithms/scale_schedule/2.00.yaml

This file was deleted.

1 change: 0 additions & 1 deletion docs/source/method_cards/scale_schedule.md

This file was deleted.

115 changes: 115 additions & 0 deletions docs/source/method_cards/scale_schedule.md
@@ -0,0 +1,115 @@
# ⚖️ Scale Schedule

![scale_schedule.png](https://storage.googleapis.com/docs.mosaicml.com/images/methods/scale_schedule.png)

Tags: `Best Practice`, `Speedup`

## TL;DR

Scale Schedule changes the number of training steps by a dilation factor, dilating the learning rate schedule
accordingly. Doing so varies the training budget, making it possible to explore tradeoffs between cost (measured in
time or money) and the quality of the final model.

## Attribution

The number of training steps to perform is an important hyperparameter to tune when developing a model. This technique
appears implicitly throughout the deep learning literature. One example of a systematic study of this approach is the
*scan-SGD* technique in
[How Important is Importance Sampling for Deep Budgeted Training](https://openreview.net/forum?id=TqQ0oOzJlai) by
Eric Arazo, Diego Ortego, Paul Albert, Noel O'Connor, and Kevin McGuinness, posted to OpenReview in 2020.

## Hyperparameters

- `ratio` - The ratio of the scaled learning rate schedule to the full learning rate schedule. For example, a ratio
of 0.8 would train for 80% as many steps as the original schedule.

## Example Effects

Changing the length of training will affect the final accuracy of the model. For example, training ResNet-50 on
ImageNet for the standard schedule in the `composer` library leads to final validation accuracy of 76.6%, while
using scale schedule with a ratio of 0.5 leads to final validation accuracy of 75.6%. Training for longer can lead
to diminishing returns or even overfitting and worse validation accuracy. In general, the cost of training is
proportional to the length of training when using scale schedule (assuming all other techniques, such as progressive
resizing, have their schedules scaled accordingly).

```{note}
The warmup periods of schedulers are not scaled by the scale schedule ratio.
```

## Implementation Details

Scale schedule is implemented as part of the {class}`~.Trainer` via the `scale_schedule_ratio` argument.
The trainer will scale the ``max_duration`` by the ``scale_schedule_ratio``, and also adjust non-warmup milestones
for the learning rate schedulers.
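
For instance, here is a sketch with hypothetical durations (the `...` stands in for the usual trainer arguments): with a ratio of 0.5, a 40-epoch run trains for 20 epochs and the milestones move to 15 and 20 epochs, while the 5-epoch warmup is left unchanged.

```python
from composer import Trainer
from composer.optim.scheduler import MultiStepWithWarmupScheduler

trainer = Trainer(
    ...,  # model, dataloaders, optimizer, etc.
    max_duration="40ep",  # trained as 40 * 0.5 = 20 epochs
    schedulers=MultiStepWithWarmupScheduler(
        t_warmup="5ep",               # warmup duration is not scaled
        milestones=["30ep", "40ep"],  # scaled to 15ep and 20ep
    ),
    scale_schedule_ratio=0.5,
)
```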

Scale schedule supports all Composer Schedulers:

```{eval-rst}
.. currentmodule:: composer.optim.scheduler
.. autosummary::
    :nosignatures:

    StepScheduler
    MultiStepScheduler
    MultiStepWithWarmupScheduler
    ConstantScheduler
    LinearScheduler
    LinearWithWarmupScheduler
    ExponentialScheduler
    CosineAnnealingScheduler
    CosineAnnealingWithWarmupScheduler
    CosineAnnealingWarmRestartsScheduler
    PolynomialScheduler
```

```{eval-rst}
.. seealso:: See the :ref:`Scheduling Guide <Composer Schedulers>` for more information about Composer Schedulers.
```

Scale schedule also supports the following PyTorch Schedulers:
* {class}`~torch.optim.lr_scheduler.StepLR`
* {class}`~torch.optim.lr_scheduler.MultiStepLR`
* {class}`~torch.optim.lr_scheduler.ExponentialLR`
* {class}`~torch.optim.lr_scheduler.CosineAnnealingLR`
* {class}`~torch.optim.lr_scheduler.CosineAnnealingWarmRestarts`


For example, the code below will scale the training time by half
(to 10 epochs) and also scale the learning rate schedule.

```{eval-rst}
.. testcode::

    from composer import Trainer
    from composer.optim.scheduler import MultiStepScheduler

    trainer = Trainer(
        ...,
        max_duration="20ep",
        schedulers=MultiStepScheduler(milestones=["10ep", "16ep"]),
        scale_schedule_ratio=0.5,
    )

    # or equivalently, with default SSR=1.0:
    trainer = Trainer(
        ...,
        max_duration="10ep",
        schedulers=MultiStepScheduler(milestones=["5ep", "8ep"])
    )
```

For additional details on using the scale schedule ratio, see the {ref}`Scale Schedule Ratio <Scale Schedule Ratio>`
section in the schedulers guide.

## Suggested Hyperparameters

The default scale schedule ratio is 1.0. For a standard maximum number of epochs (which will differ depending on the
task), scaling down the learning rate schedule will generally lead to a monotonic decrease in accuracy. Increasing the
scale schedule ratio will often improve accuracy up to a plateau, although this comes with longer training time and
added cost.

## Composability

As a general rule, scale schedule can be applied in conjunction with any method. If other methods also perform actions
according to a schedule, it is important to modify their schedules to coincide with the altered number of epochs.
4 changes: 4 additions & 0 deletions docs/source/trainer/schedulers.rst
@@ -26,6 +26,8 @@ For PyTorch schedulers, we step every epoch by default. To instead step every ba
Our experiments have shown better accuracy using stepwise schedulers, so
it is the recommended setting in most cases.

.. _Composer Schedulers:

Composer Schedulers
-------------------

@@ -81,6 +83,8 @@ Below are the supported schedulers found at :mod:`composer.optim.scheduler`.
an optimizer directly. The trainer will handle binding the optimizer when
it compiles the scheduler later.

.. _Scale Schedule Ratio:

Scale Schedule Ratio
--------------------
