Skip to content

Commit af1e87e

Browse files
authored
Merge pull request #421 from leondavi/exp-summary
New ExperimentSummary System
2 parents 08be41c + 6f83e20 commit af1e87e

File tree

12 files changed

+930
-43
lines changed

12 files changed

+930
-43
lines changed
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"connectionsMap":
3+
{
4+
"r1":["mainServer", "c1", "s1", "r2"],
5+
"r2":["r3"],
6+
"r3":["r4"],
7+
"r4":["r1"]
8+
}
9+
}
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
{
2+
"nerlnetSettings": {
3+
"frequency": "100",
4+
"batchSize": "25"
5+
},
6+
"mainServer": {
7+
"port": "8080",
8+
"args": ""
9+
},
10+
"apiServer": {
11+
"port": "8081",
12+
"args": ""
13+
},
14+
"devices": [
15+
{
16+
"name": "NerlControllerG",
17+
"ipv4": "172.31.91.176",
18+
"entities": "mainServer,apiServer,r1"
19+
},
20+
{
21+
"name": "NerlControllerD",
22+
"ipv4": "172.31.93.88",
23+
"entities": "s1,r2,r3"
24+
},
25+
{
26+
"name": "Nerl-Powerful",
27+
"ipv4": "172.31.44.250",
28+
"entities": "c1,r4"
29+
}
30+
],
31+
"routers": [
32+
{
33+
"name": "r1",
34+
"port": "8090",
35+
"policy": "0"
36+
},
37+
{
38+
"name": "r2",
39+
"port": "8091",
40+
"policy": "0"
41+
},
42+
{
43+
"name": "r3",
44+
"port": "8092",
45+
"policy": "0"
46+
},
47+
{
48+
"name": "r4",
49+
"port": "8093",
50+
"policy": "0"
51+
}
52+
],
53+
"sources": [
54+
{
55+
"name": "s1",
56+
"port": "8086",
57+
"frequency": "25",
58+
"policy": "0",
59+
"epochs": "4",
60+
"type": "0"
61+
}
62+
],
63+
"clients": [
64+
{
65+
"name": "c1",
66+
"port": "8082",
67+
"workers": "w1"
68+
}
69+
],
70+
"workers": [
71+
{
72+
"name": "w1",
73+
"model_sha": "9c5f1261068be7be96487a2cae282aa22e8c1cb482a5bf8d557bc8e1e2b6fef0"
74+
}
75+
],
76+
"model_sha": {
77+
"9c5f1261068be7be96487a2cae282aa22e8c1cb482a5bf8d557bc8e1e2b6fef0": {
78+
"modelType": "0",
79+
"_doc_modelType": " nn:0 | approximation:1 | classification:2 | forecasting:3 | image_classification:4 | text_classification:5 | text_generation:6 | auto_association:7 | autoencoder:8 | ae_classifier:9 |",
80+
"modelArgs": "",
81+
"_doc_modelArgs": "Extra arguments to model",
82+
"layersSizes": "28x28x1k5x5x1x6p0s1t1,28x28x6k2x2p0s2,14x14x6k4x4x6x12p0s1t0,1,32,10",
83+
"_doc_layersSizes": "List of postive integers [L0, L1, ..., LN]",
84+
"layerTypesList": "2,4,2,9,3,5",
85+
"_doc_LayerTypes": " Default:0 | Scaling:1 | Conv:2 | Perceptron:3 | Pooling:4 | Probabilistic:5 | LSTM:6 | Reccurrent:7 | Unscaling:8 | Flatten:9 | Bounding:10 |",
86+
"layers_functions": "6,2,6,1,6,4",
87+
"_doc_layers_functions_activation": " Threshold:1 | Sign:2 | Logistic:3 | Tanh:4 | Linear:5 | ReLU:6 | eLU:7 | SeLU:8 | Soft-plus:9 | Soft-sign:10 | Hard-sigmoid:11 |",
88+
"_doc_layer_functions_pooling": " none:1 | Max:2 | Avg:3 |",
89+
"_doc_layer_functions_probabilistic": " Binary:1 | Logistic:2 | Competitive:3 | Softmax:4 |",
90+
"_doc_layer_functions_scaler": " none:1 | MinMax:2 | MeanStd:3 | STD:4 | Log:5 |",
91+
"lossMethod": "2",
92+
"lossArgs": "",
93+
"_doc_lossMethod": " SSE:1 | MSE:2 | NSE:3 | MinkowskiE:4 | WSE:5 | CEE:6 |",
94+
"lr": "0.01",
95+
"_doc_lr": "Positve float",
96+
"epochs": "1",
97+
"_doc_epochs": "Positve Integer",
98+
"optimizer": "5",
99+
"_doc_optimizer": " GD:0 | CGD:1 | SGD:2 | QuasiNeuton:3 | LVM:4 | ADAM:5 |",
100+
"optimizerArgs": "none",
101+
"_doc_optimizerArgs": "String",
102+
"infraType": "0",
103+
"_doc_infraType": " opennn:0 | wolfengine:1 |",
104+
"distributedSystemType": "0",
105+
"_doc_distributedSystemType": " none:0 | FedClientAvg:1 | FedServerAvg:2 | FedClientWeightedAvgClassification:3 | FedServerWeightedAvgClassification:4 | FedClientAE:5 | FedServerAE:6 | tiles:7 |",
106+
"distributedSystemArgs": "none",
107+
"_doc_distributedSystemArgs": "String",
108+
"distributedSystemToken": "none",
109+
"_doc_distributedSystemToken": "Token that associates distributed group of workers and parameter-server"
110+
}
111+
}
112+
}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
{
2+
"experimentName": "mnist_rr",
3+
"experimentType": "classification",
4+
"batchSize": 25,
5+
"csvFilePath": "/tmp/nerlnet/data/NerlnetData-master/nerlnet/mnist_norm/mnist_train_255_norm.csv",
6+
"numOfFeatures": "784",
7+
"numOfLabels": "10",
8+
"headersNames": "0,1,2,3,4,5,6,7,8,9",
9+
"Phases":
10+
[
11+
{
12+
"phaseName": "training_phase1",
13+
"phaseType": "training",
14+
"sourcePieces":
15+
[
16+
{
17+
"sourceName": "s1",
18+
"startingSample": "0",
19+
"numOfBatches": "400",
20+
"workers": "w1",
21+
"nerltensorType": "float"
22+
}
23+
]
24+
},
25+
{
26+
"phaseName": "training_phase2",
27+
"phaseType": "training",
28+
"sourcePieces":
29+
[
30+
{
31+
"sourceName": "s1",
32+
"startingSample": "20000",
33+
"numOfBatches": "400",
34+
"workers": "w1",
35+
"nerltensorType": "float"
36+
}
37+
]
38+
},
39+
{
40+
"phaseName": "prediction_phase",
41+
"phaseType": "prediction",
42+
"sourcePieces":
43+
[
44+
{
45+
"sourceName": "s1",
46+
"startingSample": "50000",
47+
"numOfBatches": "400",
48+
"workers": "w1",
49+
"nerltensorType": "float"
50+
}
51+
]
52+
}
53+
]
54+
}
55+
56+

src_erl/NerlnetApp/src/Client/clientStatem.erl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ idle(cast, In = {training}, State = #client_statem_state{myName = _MyName, etsRe
196196
cast_message_to_workers(EtsRef, MessageToCast),
197197
ets:update_element(EtsRef, all_workers_done, {?DATA_IDX, false}),
198198
stats:performance_stats_reset(PerformanceStatsEts),
199+
stats:communication_stats_reset(ClientStatsEts),
199200
stats:tic(ClientStatsEts, time_train_total),
200201
stats:reset_query_cpu_util_cores(),
201202
{next_state, waitforWorkers, State#client_statem_state{waitforWorkers = clientWorkersFunctions:get_workers_names(EtsRef), nextState = training}};
@@ -208,6 +209,7 @@ idle(cast, In = {predict}, State = #client_statem_state{etsRef = EtsRef}) ->
208209
MessageToCast = {predict},
209210
cast_message_to_workers(EtsRef, MessageToCast),
210211
stats:performance_stats_reset(PerformanceStatsEts),
212+
stats:communication_stats_reset(ClientStatsEts),
211213
stats:tic(ClientStatsEts, time_predict_total),
212214
stats:reset_query_cpu_util_cores(),
213215
{next_state, waitforWorkers, State#client_statem_state{waitforWorkers = clientWorkersFunctions:get_workers_names(EtsRef),nextState = predict}};

src_erl/NerlnetApp/src/Stats/stats.erl

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
% performance stats
1818
-export([generate_performance_stats_ets/0]).
1919
-export([start_os_mon/0]).
20-
-export([performance_stats_reset/1]).
20+
-export([performance_stats_reset/1, communication_stats_reset/1]).
2121
% perofmance stats getters/setters
2222
-export([get_time_train_active/1, increment_time_train_active/2]).
2323
-export([get_time_train_total/1, increment_time_train_total/2]).
@@ -237,6 +237,19 @@ get_bad_messages(StatsEts) ->
237237
increment_bad_messages(StatsEts) ->
238238
ets:update_counter(StatsEts, ?STATS_ATOM_BAD_MSG, 1).
239239

240+
communication_stats_reset(ComStatsEts) ->
241+
% Reset all communication stats to zero
242+
ets:update_element(ComStatsEts, ?STATS_ATOM_MSG_RECV, {?STATS_KEYVAL_VAL_IDX, 0}),
243+
ets:update_element(ComStatsEts, ?STATS_ATOM_MSG_SENT, {?STATS_KEYVAL_VAL_IDX, 0}),
244+
ets:update_element(ComStatsEts, ?STATS_ATOM_MSG_DROP, {?STATS_KEYVAL_VAL_IDX, 0}),
245+
ets:update_element(ComStatsEts, ?STATS_ATOM_BYTES_RECV, {?STATS_KEYVAL_VAL_IDX, 0}),
246+
ets:update_element(ComStatsEts, ?STATS_ATOM_BYTES_SENT, {?STATS_KEYVAL_VAL_IDX, 0}),
247+
ets:update_element(ComStatsEts, ?STATS_ATOM_BAD_MSG, {?STATS_KEYVAL_VAL_IDX, 0}),
248+
ets:update_element(ComStatsEts, ?STATS_ATOM_BATCHES_RECEIVED, {?STATS_KEYVAL_VAL_IDX, 0}), % related with client only
249+
ets:update_element(ComStatsEts, ?STATS_ATOM_BATCHES_DROPPED, {?STATS_KEYVAL_VAL_IDX, 0}), % related with client only
250+
ets:update_element(ComStatsEts, ?STATS_ATOM_BATCHES_SENT, {?STATS_KEYVAL_VAL_IDX, 0}), % related with source only
251+
ets:update_element(ComStatsEts, ?STATS_ATOM_ACTUAL_FREQUENCY, {?STATS_KEYVAL_VAL_IDX, 0}), % related with source only
252+
ok.
240253

241254
performance_stats_reset(PerfStatsEts) ->
242255
% Reset all performance stats to zero

src_erl/NerlnetApp/src/Stats/stats.hrl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
-define(STATS_ATOM_BATCHES_RECEIVED, batches_received).
88
-define(STATS_ATOM_BATCHES_DROPPED, batches_dropped).
99
-define(STATS_ATOM_BATCHES_SENT, batches_sent).
10+
-define(STATS_ATOM_ACTUAL_FREQUENCY, actual_frequency).
1011

1112
-define(STATS_KEYVAL_KEY_IDX, 1).
1213
-define(STATS_KEYVAL_VAL_IDX, 2).

src_py/apiServer/apiServer.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,107 @@ def experiment_phase_is_valid(self):
215215
current_exp_flow = globe.experiment_focused_on
216216
return current_exp_flow.current_exp_phase_index < len(current_exp_flow.exp_phase_list)
217217

218+
def run_all_experiment_phases(self):
219+
"""
220+
Runs all experiment phases sequentially from the experiment JSON file.
221+
222+
This function iterates through all phases defined in the experiment flow JSON,
223+
executing each phase in order (training phases followed by prediction phases).
224+
It collects statistics for each completed phase and returns them as a list.
225+
226+
The function will:
227+
1. Print information about the current phase being executed (name and type)
228+
2. Execute each phase using run_current_experiment_phase()
229+
3. Generate and collect Stats objects for each completed phase
230+
4. Move to the next phase using next_experiment_phase()
231+
5. Continue until all phases are completed
232+
233+
Returns:
234+
list: A list of Stats objects, one for each successfully completed phase.
235+
Each Stats object contains performance metrics, communication statistics,
236+
and other phase-specific data.
237+
238+
Raises:
239+
AssertionError: If no valid experiment is currently focused or if required
240+
setup (initialization, send_jsons_to_devices) is not completed.
241+
242+
Example:
243+
# After initialization and sending JSONs to devices
244+
api_server.send_jsons_to_devices()
245+
all_stats = api_server.run_all_experiment_phases()
246+
247+
# Process results
248+
for i, stats in enumerate(all_stats):
249+
print(f"Phase {i+1}: {stats.get_name()} ({stats.get_phase()})")
250+
if stats.get_phase() == "training":
251+
loss_data = stats.get_loss_ts()
252+
elif stats.get_phase() == "prediction":
253+
confusion_matrices = stats.get_confusion_matrices()
254+
255+
Note:
256+
- Requires that initialization() and send_jsons_to_devices() have been called first
257+
- All NerlNet devices must be running and accessible
258+
- The experiment JSON must contain valid phase definitions
259+
"""
260+
# Ensure we have a valid experiment focused
261+
if self.current_exp is None:
262+
raise AssertionError("No experiment is currently focused. Call initialization() and experiment_focused_on() first.")
263+
264+
# Ensure JSONs have been sent to devices
265+
send_jsons_event = self.apiserver_event_sync.get_event_status(EventSync.SEND_JSONS)
266+
if send_jsons_event != EventSync.DONE:
267+
raise AssertionError("JSONs must be sent to devices first. Call send_jsons_to_devices() before running phases.")
268+
269+
all_phases_stats = []
270+
271+
# Get the initial phase information
272+
current_exp_flow = self.current_exp
273+
total_phases = len(current_exp_flow.exp_phase_list)
274+
275+
if total_phases == 0:
276+
LOG_WARNING("No experiment phases found in the experiment flow")
277+
return all_phases_stats
278+
279+
LOG_INFO(f"Starting to run all {total_phases} experiment phases for experiment: {current_exp_flow.get_exp_name()}")
280+
281+
# Run phases sequentially
282+
phase_count = 1
283+
while True:
284+
if not self.next_expertiment_phase_exist:
285+
LOG_WARNING("No valid experiment phase available to run")
286+
break
287+
288+
current_phase = current_exp_flow.get_current_experiment_phase()
289+
phase_name = current_phase.get_name()
290+
phase_type = current_phase.get_phase_type()
291+
292+
LOG_INFO(f"Running phase {phase_count}/{total_phases}: '{phase_name}' (Type: {phase_type})")
293+
294+
try:
295+
# Run the current phase
296+
self.run_current_experiment_phase()
297+
298+
# Generate stats for the completed phase
299+
phase_stats = current_exp_flow.generate_stats(current_phase)
300+
all_phases_stats.append(phase_stats)
301+
302+
LOG_INFO(f"Completed phase {phase_count}/{total_phases}: '{phase_name}' ({phase_type})")
303+
304+
except Exception as e:
305+
LOG_ERROR(f"Error running phase {phase_count}/{total_phases} '{phase_name}': {str(e)}")
306+
# Continue with next phase instead of stopping completely
307+
308+
# Move to next phase
309+
next_phase_type = self.next_experiment_phase()
310+
if next_phase_type is None:
311+
LOG_INFO("All experiment phases completed successfully")
312+
break
313+
314+
phase_count += 1
315+
316+
LOG_INFO(f"Finished running all experiment phases. Total phases executed: {len(all_phases_stats)}")
317+
return all_phases_stats
318+
218319
def list_datasets(self):
219320
with open(HF_DATA_REPO_PATHS_JSON) as file:
220321
repo_ids = json.load(file)

src_py/apiServer/apiServerHelp.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
-experiment_phase_is_valid() returns True if there are more experiment phases to run
4242
-run_current_experiment_phase() runs the current experiment phase
4343
-next_experiment_phase() moves to the next experiment phase and returns the phase type
44+
-run_all_experiment_phases() runs all experiment phases sequentially and returns a list of Stats objects for each phase
4445
4546
======== Retrieving statistics ======
4647
-get_experiment_flow(experiment_name).generate_stats() returns statistics object (E.g., assigned to StatsInst) class for the current experiment phase

0 commit comments

Comments
 (0)