From d042ffa3a92c991e9266c294c7b6bc72e9cc27ca Mon Sep 17 00:00:00 2001 From: lbluque Date: Fri, 31 Jan 2025 09:36:53 -0800 Subject: [PATCH 1/2] add git hash changes announcement --- src/fairchem/core/_cli_hydra.py | 36 +++++++++++++++++--------- src/fairchem/core/components/runner.py | 2 +- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/fairchem/core/_cli_hydra.py b/src/fairchem/core/_cli_hydra.py index cb08714829..61e92ad80e 100644 --- a/src/fairchem/core/_cli_hydra.py +++ b/src/fairchem/core/_cli_hydra.py @@ -50,6 +50,7 @@ class SchedulerConfig: mode: SchedulerType = SchedulerType.LOCAL ranks_per_node: int = 1 num_nodes: int = 1 + num_jobs: int = 1 slurm: dict = field( default_factory=lambda: { "mem_gb": 80, # slurm mem in GB @@ -83,7 +84,7 @@ def checkpoint_dir(self) -> str: class Submitit(Checkpointable): - def __call__(self, dict_config: DictConfig) -> None: + def __call__(self, dict_config: DictConfig, **run_kwargs) -> None: self.config = dict_config job_config: JobConfig = OmegaConf.to_object(dict_config.job) # TODO: setup_imports is not needed if we stop instantiating models with Registry. @@ -94,7 +95,7 @@ def __call__(self, dict_config: DictConfig) -> None: runner: Runner = hydra.utils.instantiate(dict_config.runner) runner.job_config = job_config runner.load_state() - runner.run() + runner.run(**run_kwargs) distutils.cleanup() def _init_logger(self) -> None: @@ -151,10 +152,6 @@ def get_hydra_config_from_yaml( return hydra.compose(config_name=config_name, overrides=overrides_args) -def runner_wrapper(config: DictConfig): - Submitit()(config) - - def main( args: argparse.Namespace | None = None, override_args: list[str] | None = None ): @@ -188,10 +185,25 @@ def main( slurm_qos=scheduler_cfg.slurm.qos, slurm_account=scheduler_cfg.slurm.account, ) - job = executor.submit(runner_wrapper, cfg) - logging.info( - f"Submitted job id: {job_cfg.timestamp_id}, slurm id: {job.job_id}, logs: {job_cfg.log_dir}" - ) + if scheduler_cfg.num_jobs == 1: + job = executor.submit(Submitit(), cfg) + logging.info( + f"Submitted job id: {job_cfg.timestamp_id}, slurm id: {job.job_id}, logs: {job_cfg.log_dir}" + ) + elif scheduler_cfg.num_jobs > 1: + executor.update_parameters(slurm_array_parallelism=scheduler_cfg.num_jobs) + + jobs = [] + with executor.batch(): + for job_number in range(scheduler_cfg.num_jobs): + job = executor.submit( + Submitit(), + cfg, + job_number=job_number, + num_jobs=scheduler_cfg.num_jobs, + ) + jobs.append(job) + logging.info(f"Submitted {len(jobs)} jobs: {jobs[0].job_id.split("_")[0]}") else: from torch.distributed.launcher.api import LaunchConfig, elastic_launch @@ -204,8 +216,8 @@ def main( rdzv_backend="c10d", max_restarts=0, ) - elastic_launch(launch_config, runner_wrapper)(cfg) + elastic_launch(launch_config, Submitit())(cfg) else: logging.info("Running in local mode without elastic launch") distutils.setup_env_local() - runner_wrapper(cfg) + Submitit()(cfg) diff --git a/src/fairchem/core/components/runner.py b/src/fairchem/core/components/runner.py index 6f5bdfefb8..fb20246813 100644 --- a/src/fairchem/core/components/runner.py +++ b/src/fairchem/core/components/runner.py @@ -25,7 +25,7 @@ def job_config(self, cfg: DictConfig): self._job_config = cfg @abstractmethod - def run(self) -> Any: + def run(self, **kwargs) -> Any: raise NotImplementedError @abstractmethod From 980ef9b3886183644c8919c112b48f2892fe29c1 Mon Sep 17 00:00:00 2001 From: lbluque Date: Wed, 12 Feb 2025 07:57:43 +1300 Subject: [PATCH 2/2] fix f string --- src/fairchem/core/_cli_hydra.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fairchem/core/_cli_hydra.py b/src/fairchem/core/_cli_hydra.py index cdf36836a6..5ccdbd5176 100644 --- a/src/fairchem/core/_cli_hydra.py +++ b/src/fairchem/core/_cli_hydra.py @@ -268,7 +268,7 @@ def main( num_jobs=scheduler_cfg.num_jobs, ) jobs.append(job) - logging.info(f"Submitted {len(jobs)} jobs: {jobs[0].job_id.split("_")[0]}") + logging.info(f"Submitted {len(jobs)} jobs: {jobs[0].job_id.split('_')[0]}") else: from torch.distributed.launcher.api import LaunchConfig, elastic_launch