diff --git a/scripts/conf.py b/scripts/conf.py
index 0dd81d29..524a031f 100755
--- a/scripts/conf.py
+++ b/scripts/conf.py
@@ -299,7 +299,7 @@ def install_slurm_conf(lkp: util.Lookup) -> None:
 def install_slurmdbd_conf(lkp: util.Lookup) -> None:
     """install slurmdbd.conf"""
     conf_options = {
-        "control_host": lkp.control_host,
+        "control_host": lkp.dbd_host,
         "slurmlog": dirs.log,
         "state_save": slurmdirs.state,
         "db_name": "slurm_acct_db",
diff --git a/scripts/setup.py b/scripts/setup.py
index 92d14bc0..80f706ee 100755
--- a/scripts/setup.py
+++ b/scripts/setup.py
@@ -143,6 +143,9 @@ def run_custom_scripts():
     if lkp.instance_role == "controller":
         # controller has all scripts, but only runs controller.d
         custom_dirs = [custom_dir / "controller.d"]
+    elif lkp.instance_role == "dbd":
+        # dbd setup with dbd.d
+        custom_dirs = [custom_dir / "dbd.d"]
     elif lkp.instance_role == "compute":
         # compute setup with compute.d and nodeset.d
         custom_dirs = [custom_dir / "compute.d", custom_dir / "nodeset.d"]
@@ -165,6 +168,8 @@ def run_custom_scripts():
         for script in custom_scripts:
             if "/controller.d/" in str(script):
                 timeout = lkp.cfg.get("controller_startup_scripts_timeout", 300)
+            elif "/dbd.d/" in str(script):
+                timeout = lkp.cfg.get("dbd_startup_scripts_timeout", 300)
             elif "/compute.d/" in str(script) or "/nodeset.d/" in str(script):
                 timeout = lkp.cfg.get("compute_startup_scripts_timeout", 300)
             elif "/login.d/" in str(script):
@@ -331,6 +336,28 @@ def configure_dirs():
     scripts_log.symlink_to(dirs.log)
 
 
+def setup_dbd(args):
+    """Run dbd node setup"""
+    log.info("Setting up dbd")
+    install_custom_scripts()
+    install_slurmdbd_conf(lkp)
+    setup_network_storage(log)
+    setup_sudoers()
+    if not cfg.cloudsql_secret:
+        configure_mysql()
+    run("systemctl restart munge")
+    run("systemctl enable slurmdbd", timeout=30)
+    run("systemctl restart slurmdbd", timeout=30)
+    run("systemctl enable --now slurmcmd.timer", timeout=30)
+
+    run_custom_scripts()
+
+    log.info("Check status of cluster services")
+    run("systemctl status munge", timeout=30)
+    run("systemctl status slurmdbd", timeout=30)
+
+    log.info("Done setting up dbd")
+
 def setup_controller(args):
     """Run controller setup"""
     log.info("Setting up controller")
@@ -338,7 +365,8 @@ def setup_controller(args):
     install_custom_scripts()
 
     install_slurm_conf(lkp)
-    install_slurmdbd_conf(lkp)
+    if not lkp.dbd_separate:
+        install_slurmdbd_conf(lkp)
     gen_cloud_conf(lkp)
     gen_cloud_gres_conf(lkp)
@@ -348,8 +376,10 @@ def setup_controller(args):
     install_topology_conf(lkp)
     install_jobsubmit_lua(lkp)
 
+    # if slurmdbd
     setup_jwt_key()
     setup_munge_key()
+
     setup_sudoers()
 
     if cfg.controller_secondary_disk:
@@ -358,14 +388,15 @@ def setup_controller(args):
 
     run_custom_scripts()
 
-    if not cfg.cloudsql_secret:
-        configure_mysql()
-    run("systemctl enable slurmdbd", timeout=30)
-    run("systemctl restart slurmdbd", timeout=30)
+    if not lkp.dbd_separate and not cfg.cloudsql_secret:
+        configure_mysql()
 
-    # Wait for slurmdbd to come up
-    time.sleep(5)
+    if not lkp.dbd_separate:
+        run("systemctl enable slurmdbd", timeout=30)
+        run("systemctl restart slurmdbd", timeout=30)
+        # Wait for slurmdbd to come up
+        time.sleep(5)
 
     sacctmgr = f"{slurmdirs.prefix}/bin/sacctmgr -i"
     result = run(
@@ -391,7 +422,8 @@ def setup_controller(args):
 
     log.info("Check status of cluster services")
     run("systemctl status munge", timeout=30)
-    run("systemctl status slurmdbd", timeout=30)
+    if not lkp.dbd_separate:
+        run("systemctl status slurmdbd", timeout=30)
     run("systemctl status slurmctld", timeout=30)
     run("systemctl status slurmrestd", timeout=30)
 
@@ -482,6 +514,7 @@ def main(args):
     setup = dict.get(
         {
             "controller": setup_controller,
+            "dbd": setup_dbd,
             "compute": setup_compute,
             "login": setup_login,
         },
diff --git a/scripts/util.py b/scripts/util.py
index e2d9c710..3dca20c9 100755
--- a/scripts/util.py
+++ b/scripts/util.py
@@ -406,6 +406,7 @@ def install_custom_scripts(check_hash=False):
             "login": ["login"],
             "compute": compute_tokens,
             "controller": ["controller", "prolog", "epilog"],
+            "dbd": ["dbd"],
         },
         lkp.instance_role,
         [],
@@ -1592,6 +1593,20 @@ def control_addr(self):
     @property
     def control_host(self):
         return self.cfg.slurm_control_host
+
+    @property
+    def dbd_separate(self):
+        return self.cfg.dbd_location.separate
+
+    @property
+    def dbd_host(self):
+        if self.cfg.dbd_location.separate:
+            return self.cfg.dbd_location.dbd_addr
+        return self.cfg.slurm_control_host
+
+    @property
+    def dbd_host_addr(self):
+        return host_lookup(self.dbd_host)
 
     @cached_property
     def control_host_addr(self):
diff --git a/terraform/slurm_cluster/modules/_slurm_instance/variables.tf b/terraform/slurm_cluster/modules/_slurm_instance/variables.tf
index 3adb24f1..06c9feec 100644
--- a/terraform/slurm_cluster/modules/_slurm_instance/variables.tf
+++ b/terraform/slurm_cluster/modules/_slurm_instance/variables.tf
@@ -138,13 +138,13 @@ variable "labels" {
 #########
 
 variable "slurm_instance_role" {
-  description = "Slurm instance type. Must be one of: controller; login; compute."
+  description = "Slurm instance type. Must be one of: controller; login; dbd; compute."
   type        = string
   default     = null
 
   validation {
-    condition     = contains(["controller", "login", "compute"], lower(var.slurm_instance_role))
-    error_message = "Must be one of: controller; login; compute."
+    condition     = contains(["controller", "login", "dbd", "compute"], lower(var.slurm_instance_role))
+    error_message = "Must be one of: controller; login; dbd; compute."
   }
 }
diff --git a/terraform/slurm_cluster/modules/slurm_files/main.tf b/terraform/slurm_cluster/modules/slurm_files/main.tf
index 896c17dc..d10ef3ff 100644
--- a/terraform/slurm_cluster/modules/slurm_files/main.tf
+++ b/terraform/slurm_cluster/modules/slurm_files/main.tf
@@ -70,6 +70,7 @@ locals {
     nodeset      = local.nodeset
     nodeset_dyn  = local.nodeset_dyn
     nodeset_tpu  = local.nodeset_tpu
+    dbd_location = var.dbd_location
 
     # hybrid
     hybrid = var.enable_hybrid
diff --git a/terraform/slurm_cluster/modules/slurm_files/variables.tf b/terraform/slurm_cluster/modules/slurm_files/variables.tf
index 1cffbb30..1e28aa6d 100644
--- a/terraform/slurm_cluster/modules/slurm_files/variables.tf
+++ b/terraform/slurm_cluster/modules/slurm_files/variables.tf
@@ -465,3 +465,22 @@ variable "endpoint_versions" {
     compute = null
   }
 }
+
+variable "dbd_location" {
+  type = object({
+    separate = bool
+    dbd_addr = string
+  })
+  default = {
+    separate = false
+    dbd_addr = null
+  }
+  description = <