Commit 82316bf

Merge pull request #406 from leondavi/perf_stats
Performance stats ETS - initialize work
2 parents: 9e93b49 + aa47f86

22 files changed: +584 -22 lines

.github/workflows/pr.yml

Lines changed: 6 additions & 0 deletions
@@ -40,3 +40,9 @@ jobs:
         run: |
           ./tests/NerlnetFullFlowTest.sh
         timeout-minutes: 20
+      - name: Post FullFlow test
+        id: fullflowpost
+        if: steps.fullflow.outcome == 'success'
+        run: |
+          ./tests/NerlnetFullFlowTestPost.sh
+        timeout-minutes: 5
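Note: the new Post FullFlow step is gated on the outcome of the existing FullFlow test step (id: fullflow), so ./tests/NerlnetFullFlowTestPost.sh runs only after a successful full-flow test, with its own 5-minute timeout.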

NerlnetRun.sh

Lines changed: 2 additions & 0 deletions
@@ -59,6 +59,8 @@ function init()
     if [ $is_rasp -gt "0" ]; then
         export LD_PRELOAD=/usr/lib/arm-linux-gnueabihf/libatomic.so.1.2.0
     fi
+
+    pkill beam.smp
 }

 function status()
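The added pkill beam.smp in init() terminates any beam.smp (Erlang VM) processes left over from a previous run, presumably so that stale Nerlnet instances do not keep ports occupied when a new run starts.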
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+{
+    "connectionsMap":
+    {
+        "r1":["mainServer", "c1", "s1"]
+    }
+}
Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
+{
+    "nerlnetSettings": {
+        "frequency": "200",
+        "batchSize": "10"
+    },
+    "mainServer": {
+        "port": "8081",
+        "args": ""
+    },
+    "apiServer": {
+        "port": "8082",
+        "args": ""
+    },
+    "devices": [
+        {
+            "name": "pc1",
+            "ipv4": "127.0.0.1",
+            "entities": "c1,r1,s1,apiServer,mainServer"
+        }
+    ],
+    "routers": [
+        {
+            "name": "r1",
+            "port": "8086",
+            "policy": "0"
+        }
+    ],
+    "sources": [
+        {
+            "name": "s1",
+            "port": "8085",
+            "frequency": "200",
+            "policy": "0",
+            "epochs": "1",
+            "type": "0"
+        }
+    ],
+    "clients": [
+        {
+            "name": "c1",
+            "port": "8083",
+            "workers": "w1,w2"
+        }
+    ],
+    "workers": [
+        {
+            "name": "w1",
+            "model_sha": "d8df752e0a2e8f01de8f66e9cec941cdbc65d144ecf90ab7713e69d65e7e82aa"
+        },
+        {
+            "name": "w2",
+            "model_sha": "d8df752e0a2e8f01de8f66e9cec941cdbc65d144ecf90ab7713e69d65e7e82aa"
+        }
+    ],
+    "model_sha": {
+        "d8df752e0a2e8f01de8f66e9cec941cdbc65d144ecf90ab7713e69d65e7e82aa": {
+            "modelType": "0",
+            "_doc_modelType": " nn:0 | approximation:1 | classification:2 | forecasting:3 | image-classification:4 | text-classification:5 | text-generation:6 | auto-association:7 | autoencoder:8 | ae-classifier:9 |",
+            "modelArgs": "",
+            "layersSizes": "5,16,8,3",
+            "_doc_layersSizes": "List of postive integers [L0, L1, ..., LN]",
+            "layerTypesList": "1,3,3,3",
+            "_doc_LayerTypes": " Default:0 | Scaling:1 | CNN:2 | Perceptron:3 | Pooling:4 | Probabilistic:5 | LSTM:6 | Reccurrent:7 | Unscaling:8 |",
+            "layers_functions": "1,7,7,11",
+            "_doc_layers_functions_activation": " Threshold:1 | Sign:2 | Logistic:3 | Tanh:4 | Linear:5 | ReLU:6 | eLU:7 | SeLU:8 | Soft-plus:9 | Soft-sign:10 | Hard-sigmoid:11 |",
+            "_doc_layer_functions_pooling": " none:1 | Max:2 | Avg:3 |",
+            "_doc_layer_functions_probabilistic": " Binary:1 | Logistic:2 | Competitive:3 | Softmax:4 |",
+            "_doc_layer_functions_scaler": " none:1 | MinMax:2 | MeanStd:3 | STD:4 | Log:5 |",
+            "lossMethod": "2",
+            "_doc_lossMethod": " SSE:1 | MSE:2 | NSE:3 | MinkowskiE:4 | WSE:5 | CEE:6 |",
+            "lossArgs": "",
+            "_doc_lossArgs": "reg=L2, reg=L1, reg=NoRegularization (can be also empty)",
+            "lr": "0.001",
+            "_doc_lr": "Positve float",
+            "epochs": "1",
+            "_doc_epochs": "Positve Integer",
+            "optimizer": "5",
+            "_doc_optimizer": " GD:0 | CGD:1 | SGD:2 | QuasiNeuton:3 | LVM:4 | ADAM:5 |",
+            "optimizerArgs": "",
+            "_doc_optimizerArgs": "String",
+            "infraType": "0",
+            "_doc_infraType": " opennn:0 | wolfengine:1 |",
+            "distributedSystemType": "0",
+            "_doc_distributedSystemType": " none:0 | fedClientAvg:1 | fedServerAvg:2 |",
+            "distributedSystemArgs": "",
+            "_doc_distributedSystemArgs": "String",
+            "distributedSystemToken": "none",
+            "_doc_distributedSystemToken": "Token that associates distributed group of workers and parameter-server"
+        }
+    }
+}
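Each entry in "workers" points to a key of the top-level "model_sha" object, so several workers can share a single model definition (here w1 and w2 reference the same SHA). As a hypothetical illustration (not part of this commit; the module and function names are invented), such a configuration can be resolved with jsx, which is already a dependency of the release:

-module(dc_lookup_example).
-export([model_of_worker/2]).

%% DcJsonBin is the raw JSON binary of the configuration above,
%% WorkerName is a binary such as <<"w1">>.
model_of_worker(DcJsonBin, WorkerName) ->
    Dc = jsx:decode(DcJsonBin, [return_maps]),
    Workers = maps:get(<<"workers">>, Dc),
    [Sha] = [maps:get(<<"model_sha">>, W) || W <- Workers,
             maps:get(<<"name">>, W) =:= WorkerName],
    %% the top-level "model_sha" object maps each SHA to its model definition
    maps:get(Sha, maps:get(<<"model_sha">>, Dc)).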
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+{
+    "experimentName": "synthetic_3_gausians",
+    "experimentType": "classification",
+    "batchSize": 10,
+    "csvFilePath": "/tmp/nerlnet/data/NerlnetData-master/nerlnet/synthetic_norm/synthetic_full.csv",
+    "numOfFeatures": "5",
+    "numOfLabels": "3",
+    "headersNames": "Norm(0:1),Norm(4:1),Norm(10:3)",
+    "Phases":
+    [
+        {
+            "phaseName": "training_phase",
+            "phaseType": "training",
+            "sourcePieces":
+            [
+                {
+                    "sourceName": "s1",
+                    "startingSample": "0",
+                    "numOfBatches": "5",
+                    "workers": "w1,w2",
+                    "nerltensorType": "float"
+                }
+            ]
+        },
+        {
+            "phaseName": "prediction_phase",
+            "phaseType": "prediction",
+            "sourcePieces":
+            [
+                {
+                    "sourceName": "s1",
+                    "startingSample": "50000",
+                    "numOfBatches": "5",
+                    "workers": "w1,w2",
+                    "nerltensorType": "float"
+                }
+            ]
+        }
+    ]
+}
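With batchSize 10, each source piece above streams numOfBatches × batchSize = 5 × 10 = 50 samples from s1 to workers w1 and w2; the training phase starts at sample 0 and the prediction phase at sample 50000 of the CSV file.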

src_erl/NerlnetApp/rebar.config

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
   {apps, [nerlnetApp]}
 ]}.
 {base_dir, "/usr/local/lib/nerlnet-lib/NErlNet/build/rebar"}.
-{relx, [{release, {nerlnetApp, "1.4.3"}, [nerlnetApp,cowboy,jsx,kernel,stdlib,inets]},
+{relx, [{release, {nerlnetApp, "1.4.3"}, [nerlnetApp,cowboy,jsx,kernel,stdlib,inets,sasl,os_mon]},
   {dev_mode, true},
   {include_erts, true},
   %{include_src, true},
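sasl and os_mon are standard OTP applications; os_mon provides cpu_sup, which is presumably what the new per-core CPU-utilization stats (stats:update_cpu_util_per_core/2, stats:reset_query_cpu_util_cores/0) build on. A minimal sketch of reading per-core utilization with cpu_sup (the function name per_core_util/0 is illustrative only, not taken from this commit):

%% Requires the os_mon application, which is now part of the release.
per_core_util() ->
    {ok, _} = application:ensure_all_started(os_mon),
    %% cpu_sup:util([per_cpu]) returns [{CpuId, Busy, NonBusy, MiscList}]
    [{CpuId, Busy} || {CpuId, Busy, _NonBusy, _Misc} <- cpu_sup:util([per_cpu])].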

src_erl/NerlnetApp/src/Bridge/onnWorkers/nerlNIF.erl

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ call_to_train(ModelID, {DataTensor, Type}, WorkerPid , BatchID , SourceName)->
   receive
     {nerlnif, nan, TrainTime} ->
       gen_statem:cast(WorkerPid,{loss, nan , TrainTime , BatchID , SourceName}); %TODO Guy - Please the behavior when this case happens
-    {nerlnif , LossTensor, LossTensorType , TrainTime}->
+    {nerlnif , LossTensor, LossTensorType , TrainTime}-> % TrainTime is in microseconds
       gen_statem:cast(WorkerPid,{loss, {LossTensor, LossTensorType} , TrainTime , BatchID , SourceName})
   after ?TRAIN_TIMEOUT -> %TODO inspect this timeout
     ?LOG_ERROR("Worker train timeout reached! bid:~p s:~p",[BatchID , SourceName]),
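The new comment pins down the unit of TrainTime returned by the NIF: microseconds. This matches the client side of this commit, where the value is accumulated via stats:increment_time_train_active(ClientPerformanceEts, trunc(TimeNIF)).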

src_erl/NerlnetApp/src/Client/clientStatem.erl

Lines changed: 28 additions & 3 deletions
@@ -75,7 +75,10 @@ init({MyName,NerlnetGraph, ClientWorkers , WorkerShaMap , WorkerToClientMap , Sh
     EtsRef = ets:new(client_data, [set, public]), %% client_data is responsible for functional attributes
     EtsStats = ets:new(ets_stats, [set]), %% ets_stats is responsible for holding all the ets stats (client + workers)
     ClientStatsEts = stats:generate_stats_ets(), %% client stats ets inside ets_stats
+    % TODO add flag to control generate performance stats ets
+    ClientPerformanceEts = stats:generate_performance_stats_ets(), %% client performance stats ets inside ets_stats
     ets:insert(EtsStats, {MyName, ClientStatsEts}),
+    ets:insert(EtsStats, {performance_stats, ClientPerformanceEts}),
     put(ets_stats, EtsStats),
     ets:insert(EtsRef, {workerToClient, WorkerToClientMap}), % All workers in the network (map to their client)
     ets:insert(EtsRef, {workersNames, ClientWorkers}), % All THIS Client's workers
@@ -105,6 +108,7 @@ init({MyName,NerlnetGraph, ClientWorkers , WorkerShaMap , WorkerToClientMap , Sh
     put(client_data, EtsRef),
     put(ets_stats, EtsStats),
     put(client_stats_ets , ClientStatsEts),
+    put(performance_stats_ets , ClientPerformanceEts),
     put(my_pid , self()),

     {ok, idle, #client_statem_state{myName= MyName, etsRef = EtsRef}}.
@@ -130,8 +134,7 @@ waitforWorkers(cast, In = {stateChange,WorkerName}, State = #client_statem_state
             stats:increment_messages_sent(ClientStatsEts),
             ?LOG_INFO("Client ~p and its workers are ready~n",[MyName]),
             {next_state, NextState, State#client_statem_state{waitforWorkers = []}};
-        _ -> %io:format("Client ~p is waiting for workers ~p~n",[MyName,NewWaitforWorkers]),
-             {next_state, waitforWorkers, State#client_statem_state{waitforWorkers = NewWaitforWorkers}}
+        _ -> {next_state, waitforWorkers, State#client_statem_state{waitforWorkers = NewWaitforWorkers}}
     end;

 waitforWorkers(cast, In = {worker_to_worker_msg, FromWorker, ToWorker, Data}, State = #client_statem_state{etsRef = EtsRef}) ->
@@ -173,8 +176,10 @@ idle(cast, _In = {statistics}, State = #client_statem_state{ myName = MyName, et
     ClientStatsEncStr = stats:encode_ets_to_http_bin_str(ClientStatsEts),
     stats:increment_messages_received(ClientStatsEts),
     ListStatsEts = ets:tab2list(EtsStats) -- [{MyName , ClientStatsEts}],
+    PerformenceStatsEts = get(performance_stats_ets),
+    ClientPerformenceStatsEncStr = ?PERF_STATS_SEPERATOR ++ stats:encode_ets_to_http_bin_str(PerformenceStatsEts) ++ ?PERF_STATS_SEPERATOR,
     WorkersStatsEncStr = create_encoded_stats_str(ListStatsEts),
-    DataToSend = ClientStatsEncStr ++ WorkersStatsEncStr,
+    DataToSend = ClientStatsEncStr ++ ClientPerformenceStatsEncStr ++ WorkersStatsEncStr,
     StatsBody = {MyName , DataToSend},
     {RouterHost,RouterPort} = ets:lookup_element(EtsRef, my_router, ?DATA_IDX),
     nerl_tools:http_router_request(RouterHost, RouterPort, [?MAIN_SERVER_ATOM], atom_to_list(statistics), StatsBody),
@@ -184,19 +189,27 @@ idle(cast, _In = {statistics}, State = #client_statem_state{ myName = MyName, et
 % Main Server triggers this state
 idle(cast, In = {training}, State = #client_statem_state{myName = _MyName, etsRef = EtsRef}) ->
     ClientStatsEts = get(client_stats_ets),
+    PerformanceStatsEts = get(performance_stats_ets),
     stats:increment_messages_received(ClientStatsEts),
     stats:increment_bytes_received(ClientStatsEts , nerl_tools:calculate_size(In)),
     MessageToCast = {training},
     cast_message_to_workers(EtsRef, MessageToCast),
     ets:update_element(EtsRef, all_workers_done, {?DATA_IDX, false}),
+    stats:performance_stats_reset(PerformanceStatsEts),
+    stats:tic(ClientStatsEts, time_train_total),
+    stats:reset_query_cpu_util_cores(),
     {next_state, waitforWorkers, State#client_statem_state{waitforWorkers = clientWorkersFunctions:get_workers_names(EtsRef), nextState = training}};

 idle(cast, In = {predict}, State = #client_statem_state{etsRef = EtsRef}) ->
     ClientStatsEts = get(client_stats_ets),
+    PerformanceStatsEts = get(performance_stats_ets),
     stats:increment_messages_received(ClientStatsEts),
     stats:increment_bytes_received(ClientStatsEts , nerl_tools:calculate_size(In)),
     MessageToCast = {predict},
     cast_message_to_workers(EtsRef, MessageToCast),
+    stats:performance_stats_reset(PerformanceStatsEts),
+    stats:tic(ClientStatsEts, time_predict_total),
+    stats:reset_query_cpu_util_cores(),
     {next_state, waitforWorkers, State#client_statem_state{waitforWorkers = clientWorkersFunctions:get_workers_names(EtsRef),nextState = predict}};

 idle(cast, EventContent, State = #client_statem_state{etsRef = EtsRef , myName = MyName}) ->
@@ -302,6 +315,7 @@ training(cast, In = {stream_ended , Pair}, State = #client_statem_state{etsRef =
 % From MainServer
 training(cast, In = {idle}, State = #client_statem_state{myName = MyName, etsRef = EtsRef}) ->
     ClientStatsEts = get(client_stats_ets),
+    ClientPerformanceEts = get(performance_stats_ets),
     stats:increment_messages_received(ClientStatsEts),
     stats:increment_bytes_received(ClientStatsEts , nerl_tools:calculate_size(In)),
     MessageToCast = {idle},
@@ -310,6 +324,9 @@ training(cast, In = {idle}, State = #client_statem_state{myName = MyName, etsRef
         true -> cast_message_to_workers(EtsRef, MessageToCast),
                 Workers = clientWorkersFunctions:get_workers_names(EtsRef),
                 ?LOG_INFO("~p sent idle to workers: ~p , waiting for confirmation...~n",[MyName, ets:lookup_element(EtsRef, workersNames, ?DATA_IDX)]),
+                Elapsed = stats:toc(ClientStatsEts, time_train_total),
+                stats:increment_time_train_total(ClientPerformanceEts, Elapsed),
+                stats:update_cpu_util_per_core(ClientPerformanceEts, train), % Update CPU utilization for training phase
                 {next_state, waitforWorkers, State#client_statem_state{etsRef = EtsRef, waitforWorkers = Workers , nextState = idle}};
         false -> MyPid = get(my_pid),
                  spawn(fun() -> timer:sleep(10), gen_statem:cast(MyPid, {idle}) end), % Trigger this action until all workers are done
@@ -323,9 +340,11 @@ training(cast, _In = {predict}, State = #client_statem_state{myName = MyName, et

 training(cast, In = {loss, WorkerName ,SourceName ,LossTensor ,TimeNIF , WorkerToken,BatchID ,BatchTS}, State = #client_statem_state{myName = MyName,etsRef = EtsRef}) ->
     ClientStatsEts = get(client_stats_ets),
+    ClientPerformanceEts = get(performance_stats_ets),
     stats:increment_messages_received(ClientStatsEts),
     stats:increment_bytes_received(ClientStatsEts , nerl_tools:calculate_size(In)),
     {RouterHost,RouterPort} = ets:lookup_element(EtsRef, my_router, ?DATA_IDX),
+    stats:increment_time_train_active(ClientPerformanceEts, trunc(TimeNIF)), % in microseconds
     MessageBody = {WorkerName , SourceName , LossTensor , TimeNIF , WorkerToken, BatchID , BatchTS},
     nerl_tools:http_router_request(RouterHost, RouterPort, [?MAIN_SERVER_ATOM], atom_to_list(lossFunction), MessageBody), %% Change lossFunction atom to lossValue
     stats:increment_messages_sent(ClientStatsEts),
@@ -401,6 +420,7 @@ predict(cast, In = {stream_ended , Pair}, State = #client_statem_state{etsRef =
 % From MainServer
 predict(cast, In = {idle}, State = #client_statem_state{myName = MyName, etsRef = EtsRef}) ->
     ClientStatsEts = get(client_stats_ets),
+    ClientPerformanceEts = get(performance_stats_ets),
     stats:increment_messages_received(ClientStatsEts),
     stats:increment_bytes_received(ClientStatsEts , nerl_tools:calculate_size(In)),
     MessageToCast = {idle},
@@ -409,13 +429,17 @@ predict(cast, In = {idle}, State = #client_statem_state{myName = MyName, etsRef
         true -> cast_message_to_workers(EtsRef, MessageToCast),
                 Workers = clientWorkersFunctions:get_workers_names(EtsRef),
                 ?LOG_INFO("~p sent idle to workers: ~p , waiting for confirmation...~n",[MyName, ets:lookup_element(EtsRef, workersNames, ?DATA_IDX)]),
+                Elapsed = stats:toc(ClientStatsEts, time_predict_total),
+                stats:increment_time_predict_total(ClientPerformanceEts, Elapsed),
+                stats:update_cpu_util_per_core(ClientPerformanceEts, predict), % Update CPU utilization for predict phase
                 {next_state, waitforWorkers, State#client_statem_state{etsRef = EtsRef, waitforWorkers = Workers , nextState = idle}};
         false -> gen_statem:cast(get(my_pid) , {idle}), % Trigger this action until all workers are done
                  {keep_state, State}
     end;

 predict(cast, In = {predictRes,WorkerName, SourceName ,{PredictNerlTensor, NetlTensorType} , TimeTook , WorkerToken, BatchID , BatchTS}, State = #client_statem_state{myName = _MyName, etsRef = EtsRef}) ->
     ClientStatsEts = get(client_stats_ets),
+    ClientPerformanceEts = get(performance_stats_ets),
     stats:increment_messages_received(ClientStatsEts),
     stats:increment_bytes_received(ClientStatsEts , nerl_tools:calculate_size(In)),

@@ -425,6 +449,7 @@ predict(cast, In = {predictRes,WorkerName, SourceName ,{PredictNerlTensor, NetlT

     stats:increment_messages_sent(ClientStatsEts),
     stats:increment_bytes_sent(ClientStatsEts , nerl_tools:calculate_size(MessageBody)),
+    stats:increment_time_predict_active(ClientPerformanceEts, trunc(TimeTook)), % in microseconds
     {next_state, predict, State#client_statem_state{etsRef = EtsRef}};

 % TODO from predict directly to training?!?!?
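The elapsed-time bookkeeping above relies on a tic/toc pair from the stats module, whose implementation is not part of this diff. A hypothetical sketch of the pattern, assuming the start timestamp is stored in the stats ETS under a per-key entry:

%% Illustrative only; the real stats:tic/2 and stats:toc/2 live in stats.erl.
tic(StatsEts, Key) ->
    ets:insert(StatsEts, {{tic, Key}, erlang:monotonic_time(microsecond)}).

toc(StatsEts, Key) ->
    Start = ets:lookup_element(StatsEts, {tic, Key}, 2),
    erlang:monotonic_time(microsecond) - Start.  % elapsed microseconds since tic/2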

src_erl/NerlnetApp/src/MainServer/mainGenserver.erl

Lines changed: 1 addition & 1 deletion
@@ -200,7 +200,7 @@ handle_cast({statistics,Body}, State = #main_genserver_state{myName = MyName}) -
     %% statistics arrived from Entity
     {From, StatsEtsEncStr} = binary_to_term(Body),
     set_entity_stats_ets_str(From, StatsEtsEncStr),
-
+
     % increase counter_received_stats ets by 1
     ets:update_counter(get(main_server_ets), counter_received_stats, 1),
     stats:increment_messages_received(StatsEts),
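This hunk appears to be a whitespace-only cleanup of the blank line after set_entity_stats_ets_str/2; the surrounding logic is unchanged.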
