1
1
module DistributedEnvironments
2
2
3
- export @initcluster , @eachmachine , @everywhere
3
+ export @initcluster , @eachmachine , @everywhere , cluster_status
4
4
5
5
using Distributed, Pkg, MacroTools
6
6
@@ -61,7 +61,7 @@ function _initcluster(nodes; status=false, sync=true, worker_procs=:auto)
61
61
end
62
62
63
63
# Check status of machines
64
- $ (status) && status (cluster)
64
+ $ (status) && cluster_status (cluster)
65
65
66
66
# Sync and instantiate (does precompilation)
67
67
if $ (sync)
@@ -112,12 +112,15 @@ macro eachmachine(expr)
112
112
end
113
113
114
114
"""
    _eachmachine(expr)

Build the expression behind `@eachmachine`: a quoted `@everywhere` call that
targets the process ids returned by `get_unique_machine_ids()` and runs `expr`
on them.
"""
function _eachmachine(expr)
    # The pid list is computed now and spliced into the returned expression as a value.
    pids = get_unique_machine_ids()
    return quote
        @everywhere $pids $expr
    end
end
120
120
121
"""
    get_unique_machine_ids()

Return the process ids from `procs()`, keeping only the first id seen for each
distinct bind address.
"""
function get_unique_machine_ids()
    # `unique(f, itr)` keeps the first element of `itr` for every distinct `f(x)`.
    return unique(Distributed.get_bind_addr, procs())
end
122
"""
    get_unique_machine_ips()

Return the distinct bind addresses of all processes in `procs()`.
"""
function get_unique_machine_ips()
    addrs = map(Distributed.get_bind_addr, procs())
    return unique(addrs)
end
123
+
121
124
function sync_env (cluster)
122
125
proj_path = dirname (Pkg. project (). path)
123
126
deps = Pkg. dependencies ()
@@ -157,8 +160,13 @@ function scp(path, target)
157
160
run (` ssh -q -t $(target) rm -rf $(path) ` ) # Delete old
158
161
run (` scp -r -q $(path) $(target) :$(path) ` ) # Copy
159
162
end
160
-
161
- function status (cluster:: Vector{String} )
163
+
164
+ """
165
+ cluster_status!(cluster)
166
+
167
+ Run a status check on each machine in the list and remove any machines not reachable.
168
+ """
169
+ function cluster_status! (cluster:: Vector{String} )
162
170
calc_cpu = " awk '{u=\$ 2+\$ 4; t=\$ 2+\$ 4+\$ 5; if (NR==1){u1=u; t1=t;} else print (\$ 2+\$ 4-u1) * 100 / (t-t1) \" %\" ; }' <(grep 'cpu ' /proc/stat) <(sleep 1;grep 'cpu ' /proc/stat)"
163
171
164
172
connection_error = []
@@ -181,5 +189,13 @@ function status(cluster::Vector{String})
181
189
end
182
190
end
183
191
192
+ """
193
+ cluster_status()
194
+
195
+ Run a status check on each machine in the running cluster.
196
+
197
+ Prints current users, current cpu utilization and current julia version.
198
+ """
199
function cluster_status()
    # Check the machines of the running cluster, identified by their bind addresses.
    return cluster_status!(get_unique_machine_ips())
end

"""
    cluster_status(cluster)

Run the status check on an explicit list of machines.

Forwards to `cluster_status!`, which is documented to remove unreachable
machines from the list, so `cluster` may be modified in place.
"""
function cluster_status(cluster::Vector{String})
    # NOTE(review): `_initcluster` calls `cluster_status(cluster)`; this method keeps
    # that call working. Calling `cluster_status!` there directly would be clearer.
    return cluster_status!(cluster)
end
184
200
185
201
end
0 commit comments