# GeneralConfig.yaml
training:
# Float specifying the discount factor of the Markov Decision process.
gamma: 0.99
# The default learning rate.
lr: 0.001
# Training batch size, if applicable.
train_batch_size: 50000
# Arguments passed into the policy model. See models/catalog.py for a full list of the available model options.
# TODO: Provide ModelConfig objects instead of dicts.
model:
fcnet_hiddens: [256, 256]
fcnet_activation: swish
vf_share_layers: false
free_log_std: true
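# A couple of further, commonly used model options (commented out here; the values are illustrative only,
# see models/catalog.py for the full list and the defaults):
# use_lstm: false
# max_seq_len: 20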
# Arguments to pass to the policy optimizer.
optimizer:
type: adam
# eps: 1e-08
# PPO-specific config
# lr_schedule – Learning rate schedule. In the format of [[timestep, lr-value], [timestep, lr-value], …] Intermediary timesteps will be assigned to interpolated learning rate values. A schedule should normally start from timestep 0.
lr_schedule: null
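# Example of a schedule in the format described above (commented out; the timesteps/values are illustrative, not tuned):
# lr_schedule: [[0, 0.001], [5000000, 0.0001]]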
# use_critic – Whether to use a critic as a baseline (otherwise, no value baseline is used; required for using GAE).
use_critic: true
# use_gae – If true, use the Generalized Advantage Estimator (GAE) with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
use_gae: true
# lambda – The GAE (lambda) parameter.
lambda_: 0.95
# kl_coeff – Initial coefficient for KL divergence.
kl_coeff: 0.2
# sgd_minibatch_size – Total SGD batch size across all devices for SGD. This defines the minibatch size within each epoch.
sgd_minibatch_size: 9000
# num_sgd_iter – Number of SGD iterations in each outer loop (i.e., number of epochs to execute per train batch).
num_sgd_iter: 6
# shuffle_sequences – Whether to shuffle sequences in the batch when training (recommended).
shuffle_sequences: true
# vf_loss_coeff – Coefficient of the value function loss. IMPORTANT: you must tune this if you set vf_share_layers=True inside your model’s config.
vf_loss_coeff: 0.5
# entropy_coeff – Coefficient of the entropy regularizer.
entropy_coeff: 0.01
# entropy_coeff_schedule – Decay schedule for the entropy regularizer.
entropy_coeff_schedule: null
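# Example (commented out; assumes the same [[timestep, value], ...] format as lr_schedule above, values are illustrative):
# entropy_coeff_schedule: [[0, 0.01], [10000000, 0.0]]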
# clip_param – PPO clip parameter.
clip_param: 0.3
# vf_clip_param – Clip param for the value function. Note that this is sensitive to the scale of the rewards. If your expected V is large, increase this.
vf_clip_param: 1000
# grad_clip – If specified, clip the global norm of gradients by this amount.
grad_clip: 100
# kl_target – Target value for KL divergence.
kl_target: 0.01
# Max number of inflight requests to each sampling worker.
# See the FaultTolerantActorManager class for more details. Tuning these values is important when running experiments
# with large sample batches, where there is the risk that the object store may fill up, causing spilling of objects to disk.
# This can cause any asynchronous requests to become very slow, making your experiment run slowly as well.
# You can inspect the object store during your experiment via a call to ray memory on your head node, and by using the Ray dashboard.
# If you’re seeing that the object store is filling up, turn down the number of remote requests in flight,
# or enable compression of timesteps in your experiment (e.g., via compress_observations below).
max_requests_in_flight_per_sampler_worker: 0
# Whether to enable the TrainerRunner and RLTrainer for training.
# This API uses ray.train to run the training loop, which allows for more flexible distributed training.
_enable_rl_trainer_api: null
environment:
# Algorithm's environment specifier.
env: cassie-v0
# Arguments dict passed to the environment creator as an EnvContext object.
# EnvContext is a dict plus the properties: num_rollout_workers, worker_index, vector_index, and remote.
env_config: {}
# Observation space for the policies of this algorithm.
#observation_space: null
# Action space for the policies of this algorithm.
#action_space: null
# A callable taking the last train results, the base env and the env context as args and returning a new task to set the env to.
# The env must be a TaskSettableEnv sub-class for this to work. See examples/curriculum_learning.py for an example.
env_task_fn: null
# If True, try to render the environment on the local worker or on worker 1 (if num_rollout_workers > 0).
# For vectorized envs, this usually means that only the first sub-environment will be rendered.
# In order for this to work, your env will have to implement the render() method which either:
# a) handles window generation and rendering itself (returning True)
# b) returns a numpy uint8 image of shape [height x width x 3 (RGB)]
render_env: null
# Whether to clip rewards during Policy’s postprocessing.
# None (default): Clip for Atari only (r=sign(r)).
# True: r=sign(r): Fixed rewards -1.0, 1.0, or 0.0.
# False: Never clip.
# [float value]: Clip at -value and + value.
# Tuple[value1, value2]: Clip at value1 and value2.
clip_rewards: null
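# Examples of the float and tuple forms described above (commented out; bounds are illustrative only):
# clip_rewards: 10.0          # clip at -10.0 and +10.0
# clip_rewards: [-5.0, 5.0]   # clip at -5.0 and +5.0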
# If True, RLlib will learn entirely inside a normalized action space (0.0 centered with small stddev; only affecting Box components).
# We will unsquash actions (and clip, just in case) to the bounds of the env’s action space before sending actions back to the env.
normalize_actions: false
# If True, RLlib will clip actions according to the env’s bounds before sending them back to the env.
# TODO: (sven) This option should be deprecated and always be False.
clip_actions: false
# If True, disable the environment pre-checking module.
disable_env_checking: true
# This config can be used to explicitly specify whether the env is an Atari env or not.
# If not specified, RLlib will try to auto-detect this during config validation.
is_atari: false
# Whether to auto-wrap old gym environments (using the pre 0.24 gym APIs, e.g. reset() returning single obs and no info dict).
# If True, RLlib will automatically wrap the given gym env class with the gym-provided compatibility wrapper (gym.wrappers.EnvCompatibility).
# If False, RLlib will produce a descriptive error on which steps to perform to upgrade to gymnasium (or to switch this flag to True).
auto_wrap_old_gym_envs: false
framework:
# tf: TensorFlow (static-graph); tf2: TensorFlow 2.x (eager or traced, if eager_tracing=True); torch: PyTorch
framework: tf2
# Enable tracing in eager mode. This greatly improves performance (speedup ~2x),
# but makes it slightly harder to debug since Python code won’t be evaluated after the initial eager pass.
# Only possible if framework=tf2.
eager_tracing: true
# Maximum number of tf.function re-traces before a runtime error is raised.
# This is to prevent unnoticed retraces of methods inside the _eager_traced Policy,
# which could slow down execution by a factor of 4, without the user noticing what the root cause for this slowdown could be.
# Only necessary for framework=tf2. Set to None to ignore the re-trace count and never throw an error.
eager_max_retraces: null
# Configures TF for single-process operation by default.
tf_session_args:
gpu_options:
allow_growth: true
allow_soft_placement: true
# Override the following tf session args on the local worker.
local_tf_session_args:
intra_op_parallelism_threads: null
inter_op_parallelism_threads: null
rollouts:
# Number of rollout worker actors to create for parallel sampling.
# Setting this to 0 will force rollouts to be done in the local worker (driver process
# or the Algorithm’s actor when using Tune).
num_rollout_workers: 20
# Number of environments to evaluate vector-wise per worker.
# This enables model inference batching, which can improve performance for inference-bottlenecked workloads.
num_envs_per_worker: 1
# The SampleCollector class to be used to collect and retrieve environment-, model-, and sampler data.
# Override the SampleCollector base class to implement your own collection/buffering/retrieval logic.
sample_collector: null
# When num_rollout_workers > 0, the driver (local_worker; worker-idx=0) does not need an environment.
# This is because it doesn’t have to sample (done by remote_workers; worker_indices > 0)
# nor evaluate (done by evaluation workers; see below).
create_env_on_local_worker: true
# Use a background thread for sampling (slightly off-policy, usually not advisable to turn on unless your env specifically requires it).
sample_async: false
# Use the connector-based environment runner, so that all preprocessing of obs and postprocessing of actions is done in agent and action connectors.
enable_connectors: null
# Divide episodes into fragments of this many steps each during rollouts.
# Trajectories of this size are collected from rollout workers and combined into a larger batch of train_batch_size for learning.
# For example, given rollout_fragment_length=100 and train_batch_size=1000:
# 1. RLlib collects 10 fragments of 100 steps each from rollout workers.
# 2. These fragments are concatenated and we perform an epoch of SGD.
# When using multiple envs per worker, the fragment size is multiplied by num_envs_per_worker.
# This is since we are collecting steps from multiple envs in parallel.
# For example, if num_envs_per_worker=5, then rollout workers will return experiences in chunks of 5*100 = 500 steps.
# The dataflow here can vary per algorithm.
# For example, PPO further divides the train batch into minibatches for multi-epoch SGD.
# Set to “auto” to have RLlib compute an exact rollout_fragment_length to match the given batch size.
rollout_fragment_length: auto
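# With the values in this file (train_batch_size=50000, num_rollout_workers=20, num_envs_per_worker=1),
# "auto" should work out to roughly 50000 / (20 * 1) = 2500 steps per fragment.
# Equivalent explicit setting (commented out):
# rollout_fragment_length: 2500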
# How to build per-Sampler (RolloutWorker) batches, which are then usually concat’d to form the train batch.
# Note that “steps” below can mean different things (either env- or agent-steps) and depends on the count_steps_by setting,
# adjustable via AlgorithmConfig.multi_agent(count_steps_by=..):
# 1) “truncate_episodes”: Each call to sample() will return a batch of at most rollout_fragment_length * num_envs_per_worker in size.
# The batch will be exactly rollout_fragment_length * num_envs in size if postprocessing does not change batch sizes.
# Episodes may be truncated in order to meet this size requirement.
# This mode guarantees evenly sized batches, but increases variance as the future return must now be estimated at truncation boundaries.
# 2) “complete_episodes”: Each call to sample() will return a batch of at least rollout_fragment_length * num_envs_per_worker in size.
# Episodes will not be truncated, but multiple episodes may be packed within one batch to meet the (minimum) batch size.
# Note that when num_envs_per_worker > 1, episode steps will be buffered until the episode completes,
# and hence batches may contain significant amounts of off-policy data.
batch_mode: truncate_episodes
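# Alternative mode described above (commented out): sample whole episodes instead of fixed-size fragments.
# batch_mode: complete_episodes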
# If using num_envs_per_worker > 1, whether to create those new envs in remote processes instead of in the same worker.
# This adds overheads, but can make sense if your envs take a long time to step/reset (e.g., for StarCraft).
# Use this cautiously; overheads are significant.
remote_worker_envs: null
# Timeout that remote workers wait when polling environments. 0 (continue when at least one env is ready) is a reasonable default,
# but the optimal value can be obtained by measuring your environment step/reset and model inference perf.
remote_env_batch_wait_ms: null
# Whether to validate that each created remote worker is healthy after its construction process.
validate_workers_after_construction: null
# Whether to attempt to continue training if a worker crashes. The number of currently healthy workers is reported as the “num_healthy_workers” metric.
ignore_worker_failures: null
# Whether - upon a worker failure - RLlib will try to recreate the lost worker as an identical copy of the failed one.
# The new worker will only differ from the failed one in its self.recreated_worker=True property value.
# It will have the same worker_index as the original one. If True, the ignore_worker_failures setting will be ignored.
recreate_failed_workers: true
# If True and any sub-environment (within a vectorized env) throws any error during env stepping,
# the Sampler will try to restart the faulty sub-environment. This is done without disturbing the other (still intact) sub-environment
# and without the RolloutWorker crashing.
restart_failed_sub_environments: true
# The number of consecutive times a rollout worker (or evaluation worker) failure is tolerated before finally crashing the Algorithm.
# Only useful if either ignore_worker_failures or recreate_failed_workers is True.
# Note that for restart_failed_sub_environments and sub-environment failures, the worker itself is NOT affected and won’t throw any errors
# as the flawed sub-environment is silently restarted under the hood.
num_consecutive_worker_failures_tolerance: 5
# Whether to use “rllib” or “deepmind” preprocessors by default. Set to None for using no preprocessor.
# In this case, the model will have to handle possibly complex observations from the environment.
# preprocessor_pref: rllib
# Element-wise observation filter, either “NoFilter” or “MeanStdFilter”.
observation_filter: MeanStdFilter
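# Alternative (commented out): disable observation filtering entirely.
# observation_filter: NoFilter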
# Whether to synchronize the statistics of remote filters.
#synchronize_filter: True
# Whether to LZ4 compress individual observations in the SampleBatches collected during rollouts.
compress_observations: true
# Explicitly tells the rollout worker to enable TF eager execution. This is useful for example when framework is “torch”,
# but a TF2 policy needs to be restored for evaluation or league-based purposes.
# enable_tf1_exec_eagerly: null
# If specified, perf stats are in EMAs. This is the coeff of how much new data points contribute to the averages.
# Default is None, which uses simple global average instead.
# The EMA update rule is: updated = (1 - ema_coef) * old + ema_coef * new
# sampler_perf_stats_ema_coef: null
# Max amount of time we should spend waiting for health probe calls to finish.
# Health pings are very cheap, so the default is 1 minute.
# worker_health_probe_timeout_s: null
# Max amount of time we should wait to restore states on recovered worker actors. Default is 30 mins.
# worker_restore_timeout_s: null
evaluation:
# Evaluate every evaluation_interval training iterations.
# The evaluation stats will be reported under the "evaluation" metric key.
# Note that for Ape-X, metrics are already only reported for the lowest-epsilon workers (least random workers).
# Set to None (or 0) for no evaluation.
evaluation_interval: 2
# Duration for which to run evaluation each evaluation_interval.
# The unit for the duration can be set via evaluation_duration_unit to either "episodes" (default) or "timesteps".
# If using multiple evaluation workers (evaluation_num_workers > 1), the load to run will be split amongst these.
# If the value is "auto":
# - For evaluation_parallel_to_training=True: Will run as many episodes/timesteps as fit into the (parallel) training step.
# - For evaluation_parallel_to_training=False: Error.
evaluation_duration: 10
# evaluation_duration_unit: "episodes"
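# Example (commented out; the duration value is illustrative): evaluate for a fixed number of timesteps instead of episodes.
# evaluation_duration: 4000
# evaluation_duration_unit: timesteps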
# The timeout (in seconds) for the ray.get call to the remote evaluation worker(s) sample() method.
# After this time, the user will receive a warning and instructions on how to fix the issue.
# This could be either to make sure the episode ends, increasing the timeout, or switching to evaluation_duration_unit=timesteps.
# evaluation_sample_timeout_s: null
# Whether to run evaluation in parallel to an Algorithm.train() call using threading.
# Default=False.
# E.g. evaluation_interval=2 -> For every other training iteration, the Algorithm.train() and Algorithm.evaluate() calls run in parallel.
# Note: This is experimental.
# Possible pitfalls could be race conditions for weight synching at the beginning of the evaluation loop.
# evaluation_parallel_to_training: False
# Typical usage is to pass extra args to evaluation env creator and to disable exploration by computing deterministic actions.
# IMPORTANT NOTE: Policy gradient algorithms are able to find the optimal policy, even if this is a stochastic one.
# Setting "explore=False" here will result in the evaluation workers not using this optimal policy!
# evaluation_config:
# explore: False
# extra_args: {}
# Specify how to evaluate the current policy, along with any optional config parameters.
# This only has an effect when reading offline experiences ("input" is not "sampler").
# Available keys: {ope_method_name: {"type": ope_type, ...}} where ope_method_name is a user-defined string to save the OPE results under,
# and ope_type can be any subclass of OffPolicyEstimator, e.g. ray.rllib.offline.estimators.is::ImportanceSampling or your own custom subclass,
# or the full class path to the subclass.
# You can also add additional config arguments to be passed to the OffPolicyEstimator in the dict,
# e.g. {"qreg_dr": {"type": DoublyRobust, "q_model_type": "qreg", "k": 5}}
# off_policy_estimation_methods:
# ope_method_name: null
# type: null
# extra_config: {}
# Whether to use SampleBatch.split_by_episode() to split the input batch to episodes before estimating the ope metrics.
# In the case of bandits, you should set this to False to see improvements in OPE evaluation speed;
# it is ok not to split by episode there, since each record is already one timestep.
# The default is True.
ope_split_batch_by_episode: null
# Number of parallel workers to use for evaluation.
# Note that this is set to zero by default, which means evaluation will be run
# in the algorithm process (only if evaluation_interval is not None).
# If you increase this, it will increase the Ray resource usage of the algorithm
# since evaluation workers are created separately from rollout workers (used to sample data for training).
#evaluation_num_workers: 0
# Customize the evaluation method.
# This must be a function of signature (algo: Algorithm, eval_workers: WorkerSet) -> metrics: dict.
# See the Algorithm.evaluate() method to see the default implementation.
# The Algorithm guarantees all eval workers have the latest policy state before this function is called.
# custom_evaluation_function: null
# Make sure the latest available evaluation results are always attached to a step result dict.
# This may be useful if Tune or some other meta controller needs access to evaluation metrics all the time.
# always_attach_evaluation_results: null
# If True, use an AsyncRequestsManager for the evaluation workers and use this manager
# to send sample() requests to the evaluation workers. This way, the Algorithm becomes
# more robust against long running episodes and/or failing (and restarting) workers.
# enable_async_evaluation: null
checkpointing:
# Whether to include (tf or torch) native model files in the individual Policy or Algorithm checkpoints.
# These files can be used to restore the NN models without requiring RLlib.
# These files are generated using the tf- or torch- built-in saving utility methods on the actual models.
export_native_model_files: true
# Whether to add only the trainable Policies to the Algorithm checkpoint.
# This is determined by the is_trainable_policy callable of the local worker.
# These Policies are added to the sub-directory "policies/" in the Algorithm checkpoint.
# checkpoint_trainable_policies_only: null
debugging:
# Callable that creates a ray.tune.Logger object. If unspecified, a default logger is created.
# logger_creator: null
# Define logger-specific configuration to be used inside Logger.
# Default value None allows overwriting with nested dicts.
# logger_config: null
# Set the ray.rllib.* log level for the agent process and its workers.
# Should be one of DEBUG, INFO, WARN, or ERROR.
# The DEBUG level will also periodically print out summaries of relevant internal dataflow (this is also printed out once at startup at the INFO level).
# When using the rllib train command, you can also use the -v and -vv flags as shorthand for INFO and DEBUG.
# log_level: null
# Log system resource metrics to results.
# This requires psutil to be installed for sys stats, and gputil for GPU metrics.
log_sys_usage: false
# Use fake (infinite speed) sampler. For testing only.
fake_sampler: null
# This argument, in conjunction with worker_index, sets the random seed of each worker, so that identically configured trials will have identical results. This makes experiments reproducible.
seed: 1234
# Use a custom RolloutWorker type for unit testing purpose.
#worker_cls: null
# callbacks:
# # Callbacks class, whose methods will be run during various phases of training and environment sample collection.
# # See the DefaultCallbacks class and examples/custom_metrics_and_callbacks.py for more usage information.
# callbacks_class: null
resources:
# Number of GPUs to allocate to the algorithm process.
# Note that not all algorithms can take advantage of GPUs.
# Support for multi-GPU is currently only available for tf-[PPO/IMPALA/DQN/PG].
# This can be fractional (e.g., 0.3 GPUs).
num_gpus: 1
# Set to True for debugging (multi-)GPU functionality on a CPU machine.
# GPU towers will be simulated by graphs located on CPUs in this case.
# Use num_gpus to test for different numbers of fake GPUs.
_fake_gpus: null
# Number of CPUs to allocate per worker.
num_cpus_per_worker: 1
# Number of GPUs to allocate per worker. This can be fractional.
# This is usually needed only if your env itself requires a GPU (i.e., it is a GPU-intensive video game),
# or model inference is unusually expensive.
num_gpus_per_worker: 0.05
# Number of workers used for training. A value of 0 means training will take place on a local worker on head node CPUs or 1 GPU
# (determined by num_gpus_per_trainer_worker).
# For multi-gpu training, set number of workers greater than 1 and set num_gpus_per_trainer_worker accordingly
# (e.g. 4 GPUs total, and model needs 2 GPUs: num_trainer_workers = 2 and num_gpus_per_trainer_worker = 2)
num_trainer_workers: 0
# Number of CPUs allocated per trainer worker.
# Only necessary for custom processing pipelines inside each RLTrainer that require multiple CPU cores.
# Ignored if num_trainer_workers = 0.
num_cpus_per_trainer_worker: null
# Number of GPUs allocated per trainer worker.
# If num_trainer_workers = 0, any value greater than 0 will run the training on a single GPU on the head node,
# while a value of 0 will run the training on head node CPU cores.
num_gpus_per_trainer_worker: null
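# Example (commented out): the 4-GPU case described in the comment above, i.e. 2 trainer workers with 2 GPUs each.
# num_trainer_workers: 2
# num_gpus_per_trainer_worker: 2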
# Any custom Ray resources to allocate per worker.
custom_resources_per_worker: null
# Number of CPUs to allocate for the algorithm.
# Note: this only takes effect when running in Tune. Otherwise, the algorithm runs in the main program (driver).
num_cpus_for_local_worker: null
# The strategy for the placement group factory returned by Algorithm.default_resource_request().
# A PlacementGroup defines which devices (resources) should always be co-located on the same node.
# For example, an Algorithm with 2 rollout workers, running with num_gpus=1 will request a placement group with the bundles:
# [{“gpu”: 1, “cpu”: 1}, {“cpu”: 1}, {“cpu”: 1}],
# where the first bundle is for the driver and the other 2 bundles are for the two workers.
# These bundles can now be “placed” on the same or different nodes depending on the value of placement_strategy:
# “PACK”: Packs bundles into as few nodes as possible.
# “SPREAD”: Places bundles across distinct nodes as evenly as possible.
# “STRICT_PACK”: Packs bundles into one node. The group is not allowed to span multiple nodes.
# “STRICT_SPREAD”: Packs bundles across distinct nodes.
placement_strategy: SPREAD
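# With the values in this file (num_gpus=1, num_cpus_per_worker=1, num_gpus_per_worker=0.05, num_rollout_workers=20),
# the requested bundles should look roughly like [{"cpu": 1, "gpu": 1}] for the driver plus 20 x [{"cpu": 1, "gpu": 0.05}]
# for the rollout workers, spread across distinct nodes. This is an illustration of the pattern above, not an exact resource dump.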