Skip to content

Commit 5029c1c

Browse files
committed
cluster status
1 parent e3f4c9c commit 5029c1c

File tree

1 file changed

+21
-5
lines changed

1 file changed

+21
-5
lines changed

src/DistributedEnvironments.jl

+21-5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module DistributedEnvironments
22

3-
export @initcluster, @eachmachine, @everywhere
3+
export @initcluster, @eachmachine, @everywhere, cluster_status
44

55
using Distributed, Pkg, MacroTools
66

@@ -61,7 +61,7 @@ function _initcluster(nodes; status=false, sync=true, worker_procs=:auto)
6161
end
6262

6363
# Check status of machines
64-
$(status) && status(cluster)
64+
# The machine-list status check is `cluster_status!(::Vector{String})`;
# `cluster_status` only has a zero-argument method, so calling it with `cluster` fails.
$(status) && cluster_status!(cluster)
6565

6666
# Sync and instantiate (does precompilation)
6767
if $(sync)
@@ -112,12 +112,15 @@ macro eachmachine(expr)
112112
end
113113

114114
# Build the expression used by `@eachmachine`: `machinepids` holds one process id per
# distinct `Distributed.get_bind_addr` value among `procs()`, and the returned quote
# runs `expr` through `@everywhere` on exactly those process ids.
# NOTE(review): `machinepids` is computed when this function runs, not when the returned
# quote is evaluated — confirm that is intended for clusters that change afterwards.
function _eachmachine(expr)
115-
machinepids = unique(id -> Distributed.get_bind_addr(id), procs())
115+
machinepids = get_unique_machine_ids()
116116
quote
117117
@everywhere $machinepids $expr
118118
end
119119
end
120120

121+
"""
    get_unique_machine_ids()

Return the process ids from `procs()` with duplicates by bind address removed: for each
distinct `Distributed.get_bind_addr` value, only the first process id is kept.
"""
get_unique_machine_ids() = unique(Distributed.get_bind_addr, procs())
122+
"""
    get_unique_machine_ips()

Return the distinct `Distributed.get_bind_addr` values of all processes in `procs()`,
in order of first appearance.
"""
get_unique_machine_ips() = unique(map(Distributed.get_bind_addr, procs()))
123+
121124
function sync_env(cluster)
122125
proj_path = dirname(Pkg.project().path)
123126
deps = Pkg.dependencies()
@@ -157,8 +160,13 @@ function scp(path, target)
157160
run(`ssh -q -t $(target) rm -rf $(path)`) # Delete old
158161
run(`scp -r -q $(path) $(target):$(path)`) # Copy
159162
end
160-
161-
function status(cluster::Vector{String})
163+
164+
"""
165+
cluster_status!(cluster)
166+
167+
Run a status check on each machine in the list and remove any machines not reachable.
168+
"""
169+
function cluster_status!(cluster::Vector{String})
162170
calc_cpu = "awk '{u=\$2+\$4; t=\$2+\$4+\$5; if (NR==1){u1=u; t1=t;} else print (\$2+\$4-u1) * 100 / (t-t1) \"%\"; }' <(grep 'cpu ' /proc/stat) <(sleep 1;grep 'cpu ' /proc/stat)"
163171

164172
connection_error = []
@@ -181,5 +189,13 @@ function status(cluster::Vector{String})
181189
end
182190
end
183191

192+
"""
193+
cluster_status()
194+
195+
Run a status check on each machine in the running cluster.
196+
197+
Prints current users, current cpu utilization and current julia version.
198+
"""
199+
function cluster_status()
    # Check every distinct bind address among the processes of the running cluster.
    machine_ips = get_unique_machine_ips()
    return cluster_status!(machine_ips)
end
184200

185201
end

0 commit comments

Comments
 (0)