# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
# If you would like to run the code linked in this tutorial, please make sure
# to use the current `PyPI` release. If you cloned the source repository, this
# is obtained as follows:
#
# ```bash
# git checkout -b basic_tutorial v0.11
# ```
#
# This gives you a local branch `basic_tutorial`, in which you can play around
# with the code.

import logging
from pathlib import Path

from sagemaker.pytorch import PyTorch

from syne_tune.config_space import randint, uniform, loguniform
from syne_tune.backend import SageMakerBackend
from syne_tune.backend.sagemaker_backend.sagemaker_utils import (
    get_execution_role,
    default_sagemaker_session,
)
from syne_tune.optimizer.schedulers import HyperbandScheduler
from syne_tune import Tuner, StoppingCriterion
from syne_tune.util import repository_root_path


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)

    random_seed = 31415927
    n_workers = 8
    max_wallclock_time = 3 * 3600  # Run for 3 hours
    max_resource_level = 81  # Maximum number of training epochs

    # Here, we specify the training script we want to tune
    # - `mode` and `metric` must match what is reported in the training script
    # - Metrics need to be reported after each epoch, `resource_attr` must match
    #   what is reported in the training script
    entry_point = str(Path(__file__).parent / "traincode_report_withcheckpointing.py")
    mode = "max"
    metric = "accuracy"
    resource_attr = "epoch"
    max_resource_attr = "epochs"
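
    # The entry point script reports `accuracy` and `epoch` back to Syne Tune
    # after every epoch. A minimal sketch of that reporting loop (hedged:
    # `train_one_epoch` is a hypothetical helper, and the actual
    # `traincode_report_withcheckpointing.py` also handles checkpointing):
    #
    #     from syne_tune import Reporter
    #
    #     report = Reporter()
    #     for epoch in range(1, num_epochs + 1):  # `num_epochs` from the `epochs` argument
    #         accuracy = train_one_epoch(...)  # hypothetical training step
    #         report(epoch=epoch, accuracy=accuracy)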

    # Search space (or configuration space)
    # For each tunable parameter, need to define type, range, and encoding
    # (linear, logarithmic)
    config_space = {
        "n_units_1": randint(4, 1024),
        "n_units_2": randint(4, 1024),
        "batch_size": randint(8, 128),
        "dropout_1": uniform(0, 0.99),
        "dropout_2": uniform(0, 0.99),
        "learning_rate": loguniform(1e-6, 1),
        "weight_decay": loguniform(1e-8, 1),
    }
    # Additional fixed parameters
    config_space.update(
        {
            max_resource_attr: max_resource_level,
            "dataset_path": "./",
        }
    )
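
    # For illustration: each trial receives a single concrete configuration,
    # i.e. a dict with one value drawn from every domain above, plus the fixed
    # entries. A hedged sketch, assuming the `sample()` method of Syne Tune
    # domains (not used below):
    #
    #     example_config = {
    #         name: value.sample() if hasattr(value, "sample") else value
    #         for name, value in config_space.items()
    #     }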

    # SageMaker back-end: Responsible for scheduling trials
    # Each trial is run as a separate SageMaker training job. This is useful for
    # expensive workloads, where all resources of an instance (or several ones)
    # are used for training. On the other hand, training job start-up overhead
    # is incurred for every trial.
    # [1]
    trial_backend = SageMakerBackend(
        # We tune a PyTorch framework estimator from SageMaker
        sm_estimator=PyTorch(
            entry_point=entry_point,
            instance_type="ml.m4.xlarge",
            instance_count=1,
            role=get_execution_role(),
            dependencies=[str(repository_root_path() / "benchmarking")],
            max_run=int(1.05 * max_wallclock_time),
            framework_version="1.7.1",
            py_version="py3",
            disable_profiler=True,
            sagemaker_session=default_sagemaker_session(),
        ),
        metrics_names=[metric],
    )
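
    # With `n_workers=8`, up to 8 of these training jobs run in parallel, each
    # on its own `ml.m4.xlarge` instance.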

    # Scheduler:
    # 'HyperbandScheduler' runs asynchronous successive halving, or Hyperband.
    # It starts a trial whenever a worker is free.
    # We configure this scheduler with Bayesian optimization: configurations
    # for new trials are selected by optimizing an acquisition function based
    # on a Gaussian process surrogate model. The latter models learning curves
    # f(x, r), x the configuration, r the number of epochs done, not just final
    # values f(x).
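    # With `grace_period=1`, `reduction_factor=3`, and at most 81 epochs, trials
    # are compared at the rung levels 1, 3, 9, 27, and 81 epochs; since
    # `type="stopping"` below, underperforming trials are stopped at these rungs.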
    searcher = "bayesopt"
    search_options = {
        "num_init_random": n_workers + 2,
        "gp_resource_kernel": "exp-decay-sum",  # GP surrogate model
    }
    scheduler = HyperbandScheduler(
        config_space,
        type="stopping",
        searcher=searcher,
        search_options=search_options,
        grace_period=1,
        reduction_factor=3,
        resource_attr=resource_attr,
        max_resource_attr=max_resource_attr,
        mode=mode,
        metric=metric,
        random_seed=random_seed,
    )

    # The experiment is stopped after `max_wallclock_time` seconds
    stop_criterion = StoppingCriterion(max_wallclock_time=max_wallclock_time)

    # Everything comes together in the tuner
    tuner = Tuner(
        trial_backend=trial_backend,
        scheduler=scheduler,
        stop_criterion=stop_criterion,
        n_workers=n_workers,
    )

    tuner.run()
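
    # Once tuning has finished, results can be inspected. A minimal sketch,
    # assuming the `load_experiment` helper from `syne_tune.experiments`:
    #
    #     from syne_tune.experiments import load_experiment
    #
    #     tuning_experiment = load_experiment(tuner.name)
    #     print(tuning_experiment.best_config())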