Add instance_role and dbd_location in slurm_files
jvilarru committed Aug 28, 2024
1 parent f153fd9 commit 1151d81
Showing 7 changed files with 83 additions and 15 deletions.
2 changes: 1 addition & 1 deletion scripts/conf.py
@@ -299,7 +299,7 @@ def install_slurm_conf(lkp: util.Lookup) -> None:
 def install_slurmdbd_conf(lkp: util.Lookup) -> None:
     """install slurmdbd.conf"""
     conf_options = {
-        "control_host": lkp.control_host,
+        "control_host": lkp.dbd_host,
         "slurmlog": dirs.log,
         "state_save": slurmdirs.state,
         "db_name": "slurm_acct_db",
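The net effect in slurmdbd.conf is that DbdHost now comes from lkp.dbd_host, so a separate dbd VM renders its own address while a co-located setup still falls back to the controller host. A minimal sketch of the rendering step, assuming a str.format-style template (the template text and helper are illustrative, not this repo's actual code; DbdHost, LogFile, and StorageLoc are standard slurmdbd.conf keys):

    # Hypothetical template, filled from the conf_options dict shown above.
    SLURMDBD_TEMPLATE = """\
    DbdHost={control_host}
    LogFile={slurmlog}/slurmdbd.log
    StorageLoc={db_name}
    """

    def render_slurmdbd_conf(conf_options: dict) -> str:
        # Each {key} placeholder matches a key of conf_options.
        return SLURMDBD_TEMPLATE.format(**conf_options)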
49 changes: 41 additions & 8 deletions scripts/setup.py
@@ -143,6 +143,9 @@ def run_custom_scripts():
     if lkp.instance_role == "controller":
         # controller has all scripts, but only runs controller.d
         custom_dirs = [custom_dir / "controller.d"]
+    elif lkp.instance_role == "dbd":
+        # dbd setup with dbd.d
+        custom_dirs = [custom_dir / "dbd.d"]
     elif lkp.instance_role == "compute":
         # compute setup with compute.d and nodeset.d
         custom_dirs = [custom_dir / "compute.d", custom_dir / "nodeset.d"]
@@ -165,6 +168,8 @@ def run_custom_scripts():
     for script in custom_scripts:
         if "/controller.d/" in str(script):
             timeout = lkp.cfg.get("controller_startup_scripts_timeout", 300)
+        elif "/dbd.d/" in str(script):
+            timeout = lkp.cfg.get("dbd_startup_scripts_timeout", 300)
         elif "/compute.d/" in str(script) or "/nodeset.d/" in str(script):
             timeout = lkp.cfg.get("compute_startup_scripts_timeout", 300)
         elif "/login.d/" in str(script):
@@ -331,14 +336,37 @@ def configure_dirs():
     scripts_log.symlink_to(dirs.log)


+def setup_dbd(args):
+    """run dbd node setup"""
+    log.info("Setting up dbd")
+    install_custom_scripts()
+    install_slurmdbd_conf(lkp)
+    setup_network_storage(log)
+    setup_sudoers()
+    if not cfg.cloudsql_secret:
+        configure_mysql()
+    run("systemctl restart munge")
+    run("systemctl enable slurmdbd", timeout=30)
+    run("systemctl restart slurmdbd", timeout=30)
+    run("systemctl enable --now slurmcmd.timer", timeout=30)
+
+    run_custom_scripts()
+
+    log.info("Check status of cluster services")
+    run("systemctl status munge", timeout=30)
+    run("systemctl status slurmdbd", timeout=30)
+
+    log.info("Done setting up dbd")
+
 def setup_controller(args):
     """Run controller setup"""
     log.info("Setting up controller")
     util.chown_slurm(dirs.scripts / "config.yaml", mode=0o600)
     install_custom_scripts()

     install_slurm_conf(lkp)
-    install_slurmdbd_conf(lkp)
+    if not lkp.dbd_separate:
+        install_slurmdbd_conf(lkp)

     gen_cloud_conf(lkp)
     gen_cloud_gres_conf(lkp)
@@ -348,8 +376,10 @@ def setup_controller(args):
     install_topology_conf(lkp)
     install_jobsubmit_lua(lkp)

+    #if slurmdbd
     setup_jwt_key()
     setup_munge_key()
+
     setup_sudoers()

     if cfg.controller_secondary_disk:

     run_custom_scripts()

-    if not cfg.cloudsql_secret:
-        configure_mysql()
-
-    run("systemctl enable slurmdbd", timeout=30)
-    run("systemctl restart slurmdbd", timeout=30)
+    if not lkp.dbd_separate and not cfg.cloudsql_secret:
+        configure_mysql()

-    # Wait for slurmdbd to come up
-    time.sleep(5)
+    if not lkp.dbd_separate:
+        run("systemctl enable slurmdbd", timeout=30)
+        run("systemctl restart slurmdbd", timeout=30)
+        # Wait for slurmdbd to come up
+        time.sleep(5)

     sacctmgr = f"{slurmdirs.prefix}/bin/sacctmgr -i"
     result = run(
@@ -391,7 +422,8 @@ def setup_controller(args):

     log.info("Check status of cluster services")
     run("systemctl status munge", timeout=30)
-    run("systemctl status slurmdbd", timeout=30)
+    if not lkp.dbd_separate:
+        run("systemctl status slurmdbd", timeout=30)
     run("systemctl status slurmctld", timeout=30)
     run("systemctl status slurmrestd", timeout=30)

@@ -482,6 +514,7 @@ def main(args):
     setup = dict.get(
         {
             "controller": setup_controller,
+            "dbd": setup_dbd,
             "compute": setup_compute,
             "login": setup_login,
         },
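setup_dbd only status-checks the local systemd units. An end-to-end check would be to ask slurmdbd for data via sacctmgr, which talks to the daemon directly; a sketch under that assumption (the sacctmgr flags are standard Slurm, the helper itself is hypothetical and not part of this change):

    import subprocess

    def dbd_is_healthy() -> bool:
        # sacctmgr queries slurmdbd, so a zero exit code means the daemon
        # is up and its storage backend is reachable. -n suppresses headers.
        result = subprocess.run(
            ["sacctmgr", "-n", "show", "cluster"],
            capture_output=True,
            text=True,
            timeout=30,
        )
        return result.returncode == 0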
15 changes: 15 additions & 0 deletions scripts/util.py
@@ -406,6 +406,7 @@ def install_custom_scripts(check_hash=False):
"login": ["login"],
"compute": compute_tokens,
"controller": ["controller", "prolog", "epilog"],
"dbd": ["dbd"],
},
lkp.instance_role,
[],
@@ -1592,6 +1593,20 @@ def control_addr(self):
     @property
     def control_host(self):
         return self.cfg.slurm_control_host

+    @property
+    def dbd_separate(self):
+        return self.cfg.dbd_location.separate
+
+    @property
+    def dbd_host(self):
+        if self.cfg.dbd_location.separate:
+            return self.cfg.dbd_location.dbd_addr
+        return self.cfg.slurm_control_host
+
+    @property
+    def dbd_host_addr(self):
+        return host_lookup(self.dbd_host)
+
     @cached_property
     def control_host_addr(self):
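A standalone illustration of how the new dbd_host property resolves, with stand-in types and made-up values (DbdLocation mirrors cfg.dbd_location; none of this is repo code):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class DbdLocation:
        separate: bool
        dbd_addr: Optional[str]

    def dbd_host(loc: DbdLocation, slurm_control_host: str) -> Optional[str]:
        # Explicit dbd address when running separately, else the controller.
        return loc.dbd_addr if loc.separate else slurm_control_host

    assert dbd_host(DbdLocation(True, "10.0.0.5"), "ctl0") == "10.0.0.5"
    assert dbd_host(DbdLocation(False, None), "ctl0") == "ctl0"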
6 changes: 3 additions & 3 deletions terraform/slurm_cluster/modules/_slurm_instance/variables.tf
@@ -138,13 +138,13 @@ variable "labels" {
 #########

 variable "slurm_instance_role" {
-  description = "Slurm instance type. Must be one of: controller; login; compute."
+  description = "Slurm instance type. Must be one of: controller; login; dbd; compute."
   type        = string
   default     = null

   validation {
-    condition     = contains(["controller", "login", "compute"], lower(var.slurm_instance_role))
-    error_message = "Must be one of: controller; login; compute."
+    condition     = contains(["controller", "login", "dbd", "compute"], lower(var.slurm_instance_role))
+    error_message = "Must be one of: controller; login; dbd; compute."
   }
 }

1 change: 1 addition & 0 deletions terraform/slurm_cluster/modules/slurm_files/main.tf
@@ -70,6 +70,7 @@ locals {
     nodeset      = local.nodeset
     nodeset_dyn  = local.nodeset_dyn
     nodeset_tpu  = local.nodeset_tpu
+    dbd_location = var.dbd_location

     # hybrid
     hybrid = var.enable_hybrid
19 changes: 19 additions & 0 deletions terraform/slurm_cluster/modules/slurm_files/variables.tf
@@ -465,3 +465,22 @@ variable "endpoint_versions" {
     compute = null
   }
 }
+
+variable "dbd_location" {
+  type = object({
+    separate = bool
+    dbd_addr = string
+  })
+  default = {
+    separate = false
+    dbd_addr = null
+  }
+  description = <<EOD
+Use this variable to specify the location of your dbd.
+If separate is false, the dbd is installed on the same machine as the controller.
+If separate is true, dbd_addr specifies its address; if dbd_addr is null, a new
+VM is created to host the dbd. This is useful when multiple clusters are to be
+connected together via Slurm multicluster.
+EOD
+}
@@ -340,15 +340,15 @@ variable "additional_disks" {

variable "slurm_instance_role" {
type = string
description = "Slurm instance type. Must be one of: controller; login; compute; or null."
description = "Slurm instance type. Must be one of: controller; dbd; login; compute; or null."
default = null

validation {
condition = (
var.slurm_instance_role == null
? true
: contains(["controller", "login", "compute"], lower(var.slurm_instance_role)))
error_message = "Must be one of: controller; login; compute; or null."
: contains(["controller", "dbd", "login", "compute"], lower(var.slurm_instance_role)))
error_message = "Must be one of: controller; dbd; login; compute; or null."
}
}

