Skip to content

Commit

Permalink
multigpu training
Browse files (browse the repository at this point in the history)
  • Loading branch information
svandenhaute committed Jul 20, 2024
1 parent d4f4bcc commit 32f903e
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 6 deletions.
2 changes: 1 addition & 1 deletion examples/submit_lumi.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ do
#!/bin/bash
#
#SBATCH -p small
#SBATCH --account=project_465000847
#SBATCH --account=project_465001125
#SBATCH --time=01:00:00
#SBATCH --nodes=1
#SBATCH --job-name=$name
Expand Down
14 changes: 9 additions & 5 deletions psiflow/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,13 +262,17 @@ def wq_resources(self):
if self.use_threadpool:
return {}
resource_specification = {}
resource_specification["cores"] = self.cores_per_worker
resource_specification["disk"] = 1000 # some random nontrivial amount?
memory = 2000 * self.cores_per_worker # similarly rather random
if self.gpu:
nworkers = int(self.cores_available / self.cores_per_worker)
resource_specification["gpus"] = nworkers # one per GPU
else:
nworkers = 1
resource_specification["gpus"] = nworkers # one per GPU
resource_specification["cores"] = self.cores_available
resource_specification["disk"] = 1000 * nworkers # some random nontrivial amount?
memory = 1000 * self.cores_available # similarly rather random
resource_specification["memory"] = int(memory)
resource_specification["running_time_min"] = self.max_training_time
if self.gpu:
resource_specification["gpus"] = 1
return resource_specification


Expand Down

0 comments on commit 32f903e

Please sign in to comment.