Skip to content

Commit 34226f7

Browse files
committed
Merge branch 'release/0.1.10'
2 parents c77af13 + 7f61459 commit 34226f7

34 files changed

+1219
-536
lines changed

Diff for: deephyper/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
VERSION = (0, 1, 9)
1+
VERSION = (0, 1, 10)
22

33
__version__ = ".".join(map(str, VERSION))
44

Diff for: deephyper/benchmark/nas/nasbench101/__init__.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@
99
1010
sh download_only108.sh
1111
12-
An example usage with the regularized evolution search is::
12+
An example usage with the regularized evolution search if you used ``download_only108.sh``::
1313
14-
python -m deephyper.search.nas.regevo --problem deephyper.benchmark.nas.nasbench101.problem.Problem --evaluator threadPool --run deephyper.benchmark.nas.nasbench101.run.run --max-evals 10000
14+
python -m deephyper.search.nas.regevo --problem deephyper.benchmark.nas.nasbench101.problem.Problem --evaluator threadPool --run deephyper.benchmark.nas.nasbench101.run_only108.run --max-evals 1000
15+
16+
Or, if you used ``download_full.sh``::
17+
18+
python -m deephyper.search.nas.regevo --problem deephyper.benchmark.nas.nasbench101.problem.Problem --evaluator threadPool --run deephyper.benchmark.nas.nasbench101.run_full.run --max-evals 1000
1519
"""

Diff for: deephyper/benchmark/nas/nasbench101/run.py renamed to deephyper/benchmark/nas/nasbench101/run_full.py

-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
HERE = os.path.dirname(os.path.abspath(__file__))
66

77
# # Use nasbench_full.tfrecord for full dataset (run download command above).
8-
# data_file = os.path.join(HERE, "nasbench_only108.tfrecord")
98
data_file = os.path.join(HERE, "nasbench_full.tfrecord")
109
nasbench = api.NASBench(data_file)
1110

Diff for: deephyper/benchmark/nas/nasbench101/run_only108.py

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import os
2+
from deephyper.benchmark.nas.nasbench101.util import create_search_space, evaluate_ops
3+
from nasbench import api
4+
5+
HERE = os.path.dirname(os.path.abspath(__file__))
6+
7+
# Uses nasbench_only108.tfrecord; for the full dataset (nasbench_full.tfrecord) use run_full.py instead.
8+
data_file = os.path.join(HERE, "nasbench_only108.tfrecord")
9+
nasbench = api.NASBench(data_file)
10+
11+
def run(config):
12+
13+
ss = create_search_space()
14+
15+
ops = config["arch_seq"]
16+
val_acc = evaluate_ops(ss, ops, nasbench)
17+
18+
return val_acc

Diff for: deephyper/evaluator/_balsam.py

+25-12
Original file line numberDiff line numberDiff line change
@@ -30,28 +30,41 @@ class BalsamEvaluator(Evaluator):
3030
used as the key for caching evaluations. Multiple inputs that map to the same
3131
hashable key will only be evaluated once. If ``None``, then cache_key defaults
3232
to a lossless (identity) encoding of the input dict.
33+
num_nodes_per_eval (int): number of nodes requested for each evaluation task (passed as the task's ``num_nodes`` resource).
3334
"""
3435

35-
def __init__(self, run_function, cache_key=None, **kwargs):
36+
def __init__(self, run_function, cache_key=None, num_nodes_master=1, num_nodes_per_eval=1, num_ranks_per_node=1, num_evals_per_node=1, num_threads_per_rank=64, **kwargs):
3637
super().__init__(run_function, cache_key)
3738
self.id_key_map = {}
39+
40+
# Attributes related to scaling policy
41+
self.num_nodes_master = num_nodes_master
42+
self.num_nodes_per_eval = num_nodes_per_eval
43+
self.num_ranks_per_node = num_ranks_per_node
44+
self.num_evals_per_node = num_evals_per_node
45+
self.num_threads_per_rank = num_threads_per_rank
46+
3847
# reserve 1 DeepHyper worker for searcher process
3948
if LAUNCHER_NODES == 1:
4049
# --job-mode=serial edge case where 2 ranks (Master, Worker) are placed on the node
41-
self.num_workers = self.WORKERS_PER_NODE - 1
50+
self.num_workers = self.num_evals_per_node - 1
4251
# 1 node case for --job-mode=mpi will result in search process occupying
4352
# entirety of the only node ---> no evaluator workers (also should have DEEPHYPER_WORKERS_PER_NODE=1)
4453
else:
4554
if JOB_MODE == "serial":
4655
# MPI ensemble Master rank0 occupies entirety of first node
47-
self.num_workers = (LAUNCHER_NODES - 1) * self.WORKERS_PER_NODE - 1
56+
assert self.num_nodes_master == 1, f"num_nodes_master=={self.num_nodes_master} when it should be 1 because job-mode is 'serial'."
57+
self.num_workers = (LAUNCHER_NODES - 1) * self.num_evals_per_node - self.num_nodes_master
4858
if JOB_MODE == "mpi":
4959
# all nodes free, but restricted to 1 job=worker per node
50-
assert self.WORKERS_PER_NODE == 1
51-
self.num_workers = LAUNCHER_NODES * self.WORKERS_PER_NODE - 1
60+
self.num_workers = LAUNCHER_NODES - self.num_nodes_master
61+
self.num_workers //= self.num_nodes_per_eval
62+
assert self.num_workers > 0, f"The number of workers is {self.num_workers} when it shoud be > 0."
63+
5264
logger.info("Balsam Evaluator instantiated")
5365
logger.debug(f"LAUNCHER_NODES = {LAUNCHER_NODES}")
54-
logger.debug(f"WORKERS_PER_NODE = {self.WORKERS_PER_NODE}")
66+
logger.debug(f"WORKERS_PER_NODE = {self.num_evals_per_node}")
67+
logger.debug(f"NUM_NODES_PER_EVAL = {self.num_nodes_per_eval}")
5568
logger.debug(f"Total number of workers: {self.num_workers}")
5669
logger.info(f"Backend runs will use Python: {self.PYTHON_EXE}")
5770
self._init_app()
@@ -109,13 +122,13 @@ def _eval_exec(self, x):
109122
def _create_balsam_task(self, x):
110123
args = f"'{self.encode(x)}'"
111124
envs = f"KERAS_BACKEND={self.KERAS_BACKEND}"
112-
# envs = ":".join(f'KERAS_BACKEND={self.KERAS_BACKEND} OMP_NUM_THREADS=62 KMP_BLOCKTIME=0 KMP_AFFINITY=\"granularity=fine,compact,1,0\"'.split())
113125
resources = {
114-
"num_nodes": 1,
115-
"ranks_per_node": 1,
116-
"threads_per_rank": 64,
117-
"node_packing_count": self.WORKERS_PER_NODE,
126+
"num_nodes": self.num_nodes_per_eval,
127+
"ranks_per_node": self.num_ranks_per_node,
128+
"threads_per_rank": self.num_threads_per_rank,
129+
"node_packing_count": self.num_evals_per_node
118130
}
131+
119132
for key in resources:
120133
if key in x:
121134
resources[key] = x[key]
@@ -135,5 +148,5 @@ def _on_done(job):
135148

136149
@staticmethod
137150
def _on_fail(job):
138-
logger.info(f"Task {job.cute_id} failed; setting objective as float_max")
151+
logger.info(f"Task {job.cute_id} failed; setting objective as float_min")
139152
return Evaluator.FAIL_RETURN_VALUE

Diff for: deephyper/evaluator/_mpiWorkerPool.py

+38-30
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
from deephyper.evaluator.evaluate import Evaluator
77

88
logger = logging.getLogger(__name__)
9-
WaitResult = namedtuple('WaitResult', ['active', 'done', 'failed', 'cancelled'])
9+
WaitResult = namedtuple("WaitResult", ["active", "done", "failed", "cancelled"])
1010

1111

12-
class MPIFuture():
12+
class MPIFuture:
1313
"""MPIFuture is a class meant to track a pending evaluation.
1414
It records whether it was posted to a worker, the associated
1515
MPI request, the tag, and the command that was sent."""
@@ -28,7 +28,7 @@ def posted(self):
2828
def post(self, comm, worker, tag):
2929
"""Posts the request to a particular worker,
3030
with a particular tag."""
31-
if(self.posted):
31+
if self.posted:
3232
raise ValueError("Request already posted")
3333
comm.send(self._cmd, dest=worker, tag=tag)
3434
self._worker = worker
@@ -57,7 +57,7 @@ def _set_result(self, value):
5757
def test(self):
5858
"""Tests if the request has completed."""
5959
completed, result = MPI.Request.test(self._request)
60-
if(completed):
60+
if completed:
6161
self._set_result(result)
6262
return completed
6363

@@ -66,7 +66,7 @@ def waitany(futures):
6666
"""Waits for any of the provided futures to complete
6767
and sets the result of the one that completed."""
6868
status = MPI.Status()
69-
requests = [ f._request for f in futures]
69+
requests = [f._request for f in futures]
7070
idx, result = MPI.Request.waitany(requests, status=status)
7171
f = futures[idx]
7272
f._set_result(result)
@@ -76,10 +76,11 @@ def waitany(futures):
7676
def waitall(futures):
7777
"""Waits for all the provided futures to complete and
7878
sets their result."""
79-
results = MPI.Request.waitall([ f._request for f in futures ])
79+
results = MPI.Request.waitall([f._request for f in futures])
8080
for r, f in zip(results, futures):
8181
f._set_result(r)
8282

83+
8384
class MPIWorkerPool(Evaluator):
8485
"""Evaluator using a pool of MPI workers.
8586
@@ -91,21 +92,33 @@ class MPIWorkerPool(Evaluator):
9192
If ``None``, then cache_key defaults to a lossless (identity)
9293
encoding of the input dict.
9394
"""
94-
def __init__(self, run_function, cache_key=None, comm=None, **kwargs):
95+
96+
def __init__(
97+
self,
98+
run_function,
99+
cache_key=None,
100+
comm=None,
101+
num_nodes_master=1,
102+
num_nodes_per_eval=1,
103+
num_ranks_per_node=1,
104+
num_evals_per_node=1,
105+
num_threads_per_rank=64,
106+
**kwargs
107+
):
95108
"""Constructor."""
96109
super().__init__(run_function, cache_key)
97-
if(comm is None):
110+
if comm is None:
98111
self.comm = MPI.COMM_WORLD
99112
else:
100113
self.comm = comm
101-
self.num_workers = self.comm.Get_size()-1
114+
self.num_workers = self.comm.Get_size() - 1
102115
self.avail_workers = []
103-
for tag in range(0, self.WORKERS_PER_NODE):
116+
for tag in range(0, num_ranks_per_node):
104117
for rank in range(0, self.num_workers):
105-
self.avail_workers.append((rank+1, tag+1))
118+
self.avail_workers.append((rank + 1, tag + 1))
106119
funcName = self._run_function.__name__
107120
moduleName = self._run_function.__module__
108-
self.appName = '.'.join((moduleName, funcName))
121+
self.appName = ".".join((moduleName, funcName))
109122

110123
def _try_posting(self, unposted):
111124
"""This function takes a list of MPIFuture instances that aren't
@@ -115,7 +128,7 @@ def _try_posting(self, unposted):
115128
now_posted = []
116129
now_unposted = []
117130
for f in unposted:
118-
if(len(self.avail_workers) > 0):
131+
if len(self.avail_workers) > 0:
119132
worker, tag = self.avail_workers.pop()
120133
f.post(self.comm, worker, tag)
121134
now_posted.append(f)
@@ -128,29 +141,29 @@ def _eval_exec(self, x):
128141
with the provided point x as argument. Returns an instance
129142
of MPIFuture. If possible, this future will have been posted."""
130143
assert isinstance(x, dict)
131-
cmd = {'cmd': 'exec', 'args': [x] }
144+
cmd = {"cmd": "exec", "args": [x]}
132145
future = MPIFuture(cmd)
133-
if(len(self.avail_workers) > 0):
146+
if len(self.avail_workers) > 0:
134147
worker, tag = self.avail_workers.pop()
135148
future.post(self.comm, worker, tag)
136149
return future
137150

138-
def wait(self, futures, timeout=None, return_when='ANY_COMPLETED'):
151+
def wait(self, futures, timeout=None, return_when="ANY_COMPLETED"):
139152
"""Waits for a set of futures to complete. If return_when == ANY_COMPLETED,
140153
this function will return as soon as at least one of the futures has completed.
141154
Otherwise it will wait for all the futures to have completed."""
142155
# TODO: for now the timeout is not taken into account and
143156
# the failed and cancelled lists will always be empty.
144-
done, failed, cancelled, active = [],[],[],[]
157+
done, failed, cancelled, active = [], [], [], []
145158
posted = [f for f in futures if f.posted]
146159
unposted = [f for f in futures if not f.posted]
147160

148-
if(len(posted) == 0):
161+
if len(posted) == 0:
149162
newly_posted, unposted = self._try_posting(unposted)
150163
posted.extend(newly_posted)
151164

152-
if(return_when == 'ALL_COMPLETED'):
153-
while(len(posted) > 0 or len(unposted) > 0):
165+
if return_when == "ALL_COMPLETED":
166+
while len(posted) > 0 or len(unposted) > 0:
154167
MPIFuture.waitall(posted)
155168
for f in posted:
156169
self.avail_workers.append((f.worker, f.tag))
@@ -167,18 +180,18 @@ def wait(self, futures, timeout=None, return_when='ANY_COMPLETED'):
167180
one_completed = True
168181
done.append(f)
169182
# one request completed, try posting a new request
170-
if(len(unposted) > 0):
183+
if len(unposted) > 0:
171184
p = unposted.pop(0)
172185
p.post(self.comm, worker=f.worker, tag=f.tag)
173186
active.append(p)
174187
else:
175188
self.avail_workers.append((f.worker, f.tag))
176189
else:
177190
active.append(f)
178-
if not one_completed: # we need to call waitany
191+
if not one_completed: # we need to call waitany
179192
f = MPIFuture.waitany(posted)
180193
done.append(f)
181-
if(len(unposted) > 0):
194+
if len(unposted) > 0:
182195
p = unposted.pop(0)
183196
p.post(self.comm, worker=f.worker, tag=f.tag)
184197
active.append(p)
@@ -187,18 +200,13 @@ def wait(self, futures, timeout=None, return_when='ANY_COMPLETED'):
187200
for f in unposted:
188201
active.append(f)
189202

190-
return WaitResult(
191-
active=active,
192-
done=done,
193-
failed=failed,
194-
cancelled=cancelled
195-
)
203+
return WaitResult(active=active, done=done, failed=failed, cancelled=cancelled)
196204

197205
def shutdown_workers(self):
198206
"""Shuts down all the MPIWorker instances."""
199207
req = []
200208
for k in range(1, self.comm.Get_size()):
201-
r = self.comm.isend({'cmd': 'exit'}, dest=k, tag=0)
209+
r = self.comm.isend({"cmd": "exit"}, dest=k, tag=0)
202210
req.append(r)
203211
MPI.Request.waitall(req)
204212

Diff for: deephyper/evaluator/_processPool.py

+16-17
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
from deephyper.evaluator.evaluate import Evaluator
88

99
logger = logging.getLogger(__name__)
10-
WaitResult = namedtuple('WaitResult', ['active', 'done', 'failed', 'cancelled'])
10+
WaitResult = namedtuple("WaitResult", ["active", "done", "failed", "cancelled"])
11+
1112

1213
class ProcessPoolEvaluator(Evaluator):
1314
"""Evaluator using ProcessPoolExecutor.
@@ -18,27 +19,30 @@ class ProcessPoolEvaluator(Evaluator):
1819
run_function (func): takes one parameter of type dict and returns a scalar value.
1920
cache_key (func): takes one parameter of type dict and returns a hashable type, used as the key for caching evaluations. Multiple inputs that map to the same hashable key will only be evaluated once. If ``None``, then cache_key defaults to a lossless (identity) encoding of the input dict.
2021
"""
22+
2123
def __init__(self, run_function, cache_key=None, **kwargs):
2224
super().__init__(run_function, cache_key)
23-
self.num_workers = self.WORKERS_PER_NODE
24-
self.executor = ProcessPoolExecutor(
25-
max_workers = self.num_workers
25+
self.num_workers = 1
26+
self.executor = ProcessPoolExecutor(max_workers=self.num_workers)
27+
logger.info(
28+
f"ProcessPool Evaluator will execute {self._run_function.__name__}() from module {self._run_function.__module__}"
2629
)
27-
logger.info(f"ProcessPool Evaluator will execute {self._run_function.__name__}() from module {self._run_function.__module__}")
2830

2931
def _eval_exec(self, x):
3032
assert isinstance(x, dict)
3133
future = self.executor.submit(self._run_function, x)
3234
return future
3335

34-
def wait(self, futures, timeout=None, return_when='ANY_COMPLETED'):
35-
return_when=return_when.replace('ANY','FIRST')
36+
def wait(self, futures, timeout=None, return_when="ANY_COMPLETED"):
37+
return_when = return_when.replace("ANY", "FIRST")
3638
results = _futures_wait(futures, timeout=timeout, return_when=return_when)
37-
done, failed, cancelled = [],[],[]
39+
done, failed, cancelled = [], [], []
3840
active = list(results.not_done)
39-
if len(active) > 0 and return_when=='ALL_COMPLETED':
40-
raise TimeoutError(f'{timeout} sec timeout expired while '
41-
f'waiting on {len(futures)} tasks until {return_when}')
41+
if len(active) > 0 and return_when == "ALL_COMPLETED":
42+
raise TimeoutError(
43+
f"{timeout} sec timeout expired while "
44+
f"waiting on {len(futures)} tasks until {return_when}"
45+
)
4246
for res in results.done:
4347
try:
4448
res.result(timeout=0)
@@ -51,9 +55,4 @@ def wait(self, futures, timeout=None, return_when='ANY_COMPLETED'):
5155
failed.append(res)
5256
else:
5357
done.append(res)
54-
return WaitResult(
55-
active=active,
56-
done=done,
57-
failed=failed,
58-
cancelled=cancelled
59-
)
58+
return WaitResult(active=active, done=done, failed=failed, cancelled=cancelled)

0 commit comments

Comments
 (0)