Skip to content

Commit 0a79a93

Browse files
committed
Merge origin/main into fsdp_template
2 parents 8d95dcb + e82c6da commit 0a79a93

File tree

17 files changed

+215
-130
lines changed

17 files changed

+215
-130
lines changed

templates/configs/_global.yaml

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,14 @@ hydra:
1919
dir: ${paths.work_dir}
2020
launcher:
2121
submitit_folder: ${hydra.sweep.dir}/submitit_logs/%j
22-
nodes: ${compute.nodes}
23-
gpus_per_node: null
24-
tasks_per_node: ${oc.select:compute.tasks_per_node, ${compute.gpus_per_node}}
25-
cpus_per_task: ${compute.cpus_per_task}
22+
nodes: ${oc.select:compute.nodes,null}
23+
tasks_per_node: ${oc.select:compute.tasks_per_node, 1}
24+
cpus_per_task: ${oc.select:compute.cpus_per_task, 4}
2625
mem_gb: ${compute.mem_gb}
2726
timeout_min: ${compute.timeout_min}
28-
gres: ${compute.gres}
29-
partition: ${oc.select:compute.slurm.partition,null}
30-
qos: ${oc.select:compute.slurm.qos,null}
27+
gres: ${oc.select:compute.gres, null}
28+
partition: ${oc.select:compute.slurm.partition, null}
29+
qos: ${oc.select:compute.slurm.qos, null}
3130
account: ${user.slurm.account}
3231
max_num_timeout: 2
3332
additional_parameters: ${oc.select:compute.slurm.additional_parameters, ${oc.select:user.slurm.additional_parameters, {}}}

templates/configs/compute/bon_echo/a100_1x.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@ cluster: bon_echo
22
nodes: 1
33
gpu_type: a100
44
gpus_per_node: 1
5-
time_limit: "8:00:00"
6-
timeout_min: 480
7-
work_root: /scratch/ssd004/scratch/${oc.env:USER}
8-
mem_gb: 80
9-
cpus_per_task: 16
105
gres: gpu:${.gpu_type}:${.gpus_per_node}
6+
tasks_per_node: ${.gpus_per_node}
7+
cpus_per_task: 16
8+
mem_gb: 80
9+
work_root: /scratch/ssd004/scratch/${oc.env:USER}
10+
timeout_min: 60
1111
slurm:
1212
partition: a100
13-
gpus_per_node: null
13+
additional_parameters: {}

templates/configs/compute/bon_echo/a100_4x.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@ cluster: bon_echo
22
nodes: 1
33
gpu_type: a100
44
gpus_per_node: 4
5-
time_limit: "2:00:00"
6-
timeout_min: 120
7-
work_root: /scratch/ssd004/scratch/${oc.env:USER}
8-
mem_gb: 320
9-
cpus_per_task: 8
105
gres: gpu:${.gpu_type}:${.gpus_per_node}
6+
tasks_per_node: ${.gpus_per_node}
7+
cpus_per_task: 16
8+
mem_gb: 320
9+
work_root: /scratch/ssd004/scratch/${oc.env:USER}
10+
timeout_min: 60
1111
slurm:
1212
partition: a100
13-
gpus_per_node: null
13+
additional_parameters: {}

templates/configs/compute/bon_echo/a40_1x.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@ cluster: bon_echo
22
nodes: 1
33
gpu_type: a40
44
gpus_per_node: 1
5-
time_limit: "8:00:00"
6-
timeout_min: 480
7-
work_root: /scratch/ssd004/scratch/${oc.env:USER}
8-
mem_gb: 16
9-
cpus_per_task: 16
105
gres: gpu:${.gpu_type}:${.gpus_per_node}
6+
tasks_per_node: ${.gpus_per_node}
7+
cpus_per_task: 8
8+
mem_gb: 40
9+
work_root: /scratch/ssd004/scratch/${oc.env:USER}
10+
timeout_min: 60
1111
slurm:
1212
partition: a40
13-
gpus_per_node: null
13+
additional_parameters: {}

templates/configs/compute/bon_echo/a40_2x.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@ cluster: bon_echo
22
nodes: 1
33
gpu_type: a40
44
gpus_per_node: 2
5-
time_limit: "2:00:00"
6-
timeout_min: 120
7-
work_root: /scratch/ssd004/scratch/${oc.env:USER}
8-
mem_gb: 64
9-
cpus_per_task: 8
105
gres: gpu:${.gpu_type}:${.gpus_per_node}
6+
tasks_per_node: ${.gpus_per_node}
7+
cpus_per_task: 8
8+
mem_gb: 80
9+
work_root: /scratch/ssd004/scratch/${oc.env:USER}
10+
timeout_min: 60
1111
slurm:
1212
partition: a40
13-
gpus_per_node: null
13+
additional_parameters: {}
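
templates/configs/compute/bon_echo/a40_4x.yaml [file path missing from the page capture; inferred from the hunk's cluster/gpu_type/gpus_per_node values — confirm]

Lines changed: 8 additions & 2 deletions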
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,13 @@
1+
cluster: bon_echo
12
nodes: 1
3+
gpu_type: a40
24
gpus_per_node: 4
3-
cpus_per_task: 32
4-
mem_gb: 64
5+
gres: gpu:${.gpu_type}:${.gpus_per_node}
6+
tasks_per_node: ${.gpus_per_node}
7+
cpus_per_task: 8
8+
mem_gb: 160
9+
work_root: /scratch/ssd004/scratch/${oc.env:USER}
510
timeout_min: 60
611
slurm:
712
partition: a40
13+
additional_parameters: {}
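
templates/configs/compute/bon_echo/cpu_1x.yaml [file path missing from the page capture; inferred from the hunk's cluster value and gpus_per_node: 0 — confirm]

Lines changed: 3 additions & 4 deletions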
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
11
cluster: bon_echo
22
nodes: 1
33
gpus_per_node: 0
4+
gres: null
45
cpus_per_task: 2
56
mem_gb: 8
67
work_root: /scratch/ssd004/scratch/${oc.env:USER}
7-
time_limit: "0:15:00"
8-
timeout_min: 15
9-
gres: null
8+
timeout_min: 60
109
slurm:
11-
gpus_per_node: null
10+
additional_parameters: {}

templates/configs/compute/killarney/cpu_1x.yaml

Lines changed: 0 additions & 10 deletions
This file was deleted.

templates/configs/compute/killarney/h100_1x.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@ cluster: killarney
22
nodes: 1
33
gpu_type: h100
44
gpus_per_node: 1
5-
time_limit: "1:00:00"
6-
timeout_min: 60
7-
work_root: /scratch/${oc.env:USER}
8-
mem_gb: 256
9-
cpus_per_task: 24
105
gres: gpu:${.gpu_type}:${.gpus_per_node}
6+
tasks_per_node: ${.gpus_per_node}
7+
cpus_per_task: 6
8+
mem_gb: 240
9+
work_root: /scratch/${oc.env:USER}
10+
timeout_min: 60
1111
slurm:
12-
gpus_per_node: null
12+
additional_parameters: {}
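
templates/configs/compute/killarney/h100_2x.yaml [file path missing from the page capture; inferred from the hunk's cluster/gpu_type/gpus_per_node values — confirm]

Lines changed: 12 additions & 0 deletions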
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
cluster: killarney
2+
nodes: 1
3+
gpu_type: h100
4+
gpus_per_node: 2
5+
gres: gpu:${.gpu_type}:${.gpus_per_node}
6+
tasks_per_node: ${.gpus_per_node}
7+
cpus_per_task: 6
8+
mem_gb: 480
9+
work_root: /scratch/${oc.env:USER}
10+
timeout_min: 60
11+
slurm:
12+
additional_parameters: {}

0 commit comments

Comments
 (0)