Commit 82316bf

Merge pull request #406 from leondavi/perf_stats
Performance stats ETS - initialize work
2 parents: 9e93b49 + aa47f86

22 files changed: +584 -22 lines

.github/workflows/pr.yml

Lines changed: 6 additions & 0 deletions
@@ -40,3 +40,9 @@ jobs:
         run: |
           ./tests/NerlnetFullFlowTest.sh
         timeout-minutes: 20
+      - name: Post FullFlow test
+        id: fullflowpost
+        if: steps.fullflow.outcome == 'success'
+        run: |
+          ./tests/NerlnetFullFlowTestPost.sh
+        timeout-minutes: 5
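Note: the new Post FullFlow step is gated on the outcome of the existing FullFlow test step (id: fullflow), so ./tests/NerlnetFullFlowTestPost.sh runs only after a successful full-flow test, with its own 5-minute timeout.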

NerlnetRun.sh

Lines changed: 2 additions & 0 deletions
@@ -59,6 +59,8 @@ function init()
     if [ $is_rasp -gt "0" ]; then
         export LD_PRELOAD=/usr/lib/arm-linux-gnueabihf/libatomic.so.1.2.0
     fi
+
+    pkill beam.smp
 }

 function status()
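The added pkill beam.smp in init() terminates any beam.smp (Erlang VM) processes left over from a previous run, presumably so that stale Nerlnet instances do not keep ports occupied when a new run starts.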
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+{
+    "connectionsMap":
+    {
+        "r1":["mainServer", "c1", "s1"]
+    }
+}
Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
+{
+    "nerlnetSettings": {
+        "frequency": "200",
+        "batchSize": "10"
+    },
+    "mainServer": {
+        "port": "8081",
+        "args": ""
+    },
+    "apiServer": {
+        "port": "8082",
+        "args": ""
+    },
+    "devices": [
+        {
+            "name": "pc1",
+            "ipv4": "127.0.0.1",
+            "entities": "c1,r1,s1,apiServer,mainServer"
+        }
+    ],
+    "routers": [
+        {
+            "name": "r1",
+            "port": "8086",
+            "policy": "0"
+        }
+    ],
+    "sources": [
+        {
+            "name": "s1",
+            "port": "8085",
+            "frequency": "200",
+            "policy": "0",
+            "epochs": "1",
+            "type": "0"
+        }
+    ],
+    "clients": [
+        {
+            "name": "c1",
+            "port": "8083",
+            "workers": "w1,w2"
+        }
+    ],
+    "workers": [
+        {
+            "name": "w1",
+            "model_sha": "d8df752e0a2e8f01de8f66e9cec941cdbc65d144ecf90ab7713e69d65e7e82aa"
+        },
+        {
+            "name": "w2",
+            "model_sha": "d8df752e0a2e8f01de8f66e9cec941cdbc65d144ecf90ab7713e69d65e7e82aa"
+        }
+    ],
+    "model_sha": {
+        "d8df752e0a2e8f01de8f66e9cec941cdbc65d144ecf90ab7713e69d65e7e82aa": {
+            "modelType": "0",
+            "_doc_modelType": " nn:0 | approximation:1 | classification:2 | forecasting:3 | image-classification:4 | text-classification:5 | text-generation:6 | auto-association:7 | autoencoder:8 | ae-classifier:9 |",
+            "modelArgs": "",
+            "layersSizes": "5,16,8,3",
+            "_doc_layersSizes": "List of postive integers [L0, L1, ..., LN]",
+            "layerTypesList": "1,3,3,3",
+            "_doc_LayerTypes": " Default:0 | Scaling:1 | CNN:2 | Perceptron:3 | Pooling:4 | Probabilistic:5 | LSTM:6 | Reccurrent:7 | Unscaling:8 |",
+            "layers_functions": "1,7,7,11",
+            "_doc_layers_functions_activation": " Threshold:1 | Sign:2 | Logistic:3 | Tanh:4 | Linear:5 | ReLU:6 | eLU:7 | SeLU:8 | Soft-plus:9 | Soft-sign:10 | Hard-sigmoid:11 |",
+            "_doc_layer_functions_pooling": " none:1 | Max:2 | Avg:3 |",
+            "_doc_layer_functions_probabilistic": " Binary:1 | Logistic:2 | Competitive:3 | Softmax:4 |",
+            "_doc_layer_functions_scaler": " none:1 | MinMax:2 | MeanStd:3 | STD:4 | Log:5 |",
+            "lossMethod": "2",
+            "_doc_lossMethod": " SSE:1 | MSE:2 | NSE:3 | MinkowskiE:4 | WSE:5 | CEE:6 |",
+            "lossArgs": "",
+            "_doc_lossArgs": "reg=L2, reg=L1, reg=NoRegularization (can be also empty)",
+            "lr": "0.001",
+            "_doc_lr": "Positve float",
+            "epochs": "1",
+            "_doc_epochs": "Positve Integer",
+            "optimizer": "5",
+            "_doc_optimizer": " GD:0 | CGD:1 | SGD:2 | QuasiNeuton:3 | LVM:4 | ADAM:5 |",
+            "optimizerArgs": "",
+            "_doc_optimizerArgs": "String",
+            "infraType": "0",
+            "_doc_infraType": " opennn:0 | wolfengine:1 |",
+            "distributedSystemType": "0",
+            "_doc_distributedSystemType": " none:0 | fedClientAvg:1 | fedServerAvg:2 |",
+            "distributedSystemArgs": "",
+            "_doc_distributedSystemArgs": "String",
+            "distributedSystemToken": "none",
+            "_doc_distributedSystemToken": "Token that associates distributed group of workers and parameter-server"
+        }
+    }
+}
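Each entry in "workers" points to a key of the top-level "model_sha" object, so several workers can share a single model definition (here w1 and w2 reference the same SHA). As a hypothetical illustration (not part of this commit; the module and function names are invented), such a configuration can be resolved with jsx, which is already a dependency of the release:

-module(dc_lookup_example).
-export([model_of_worker/2]).

%% DcJsonBin is the raw JSON binary of the configuration above,
%% WorkerName is a binary such as <<"w1">>.
model_of_worker(DcJsonBin, WorkerName) ->
    Dc = jsx:decode(DcJsonBin, [return_maps]),
    Workers = maps:get(<<"workers">>, Dc),
    [Sha] = [maps:get(<<"model_sha">>, W) || W <- Workers,
             maps:get(<<"name">>, W) =:= WorkerName],
    %% the top-level "model_sha" object maps each SHA to its model definition
    maps:get(Sha, maps:get(<<"model_sha">>, Dc)).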
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+{
+    "experimentName": "synthetic_3_gausians",
+    "experimentType": "classification",
+    "batchSize": 10,
+    "csvFilePath": "/tmp/nerlnet/data/NerlnetData-master/nerlnet/synthetic_norm/synthetic_full.csv",
+    "numOfFeatures": "5",
+    "numOfLabels": "3",
+    "headersNames": "Norm(0:1),Norm(4:1),Norm(10:3)",
+    "Phases":
+    [
+        {
+            "phaseName": "training_phase",
+            "phaseType": "training",
+            "sourcePieces":
+            [
+                {
+                    "sourceName": "s1",
+                    "startingSample": "0",
+                    "numOfBatches": "5",
+                    "workers": "w1,w2",
+                    "nerltensorType": "float"
+                }
+            ]
+        },
+        {
+            "phaseName": "prediction_phase",
+            "phaseType": "prediction",
+            "sourcePieces":
+            [
+                {
+                    "sourceName": "s1",
+                    "startingSample": "50000",
+                    "numOfBatches": "5",
+                    "workers": "w1,w2",
+                    "nerltensorType": "float"
+                }
+            ]
+        }
+    ]
+}
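With batchSize 10, each source piece above streams numOfBatches × batchSize = 5 × 10 = 50 samples from s1 to workers w1 and w2; the training phase starts at sample 0 and the prediction phase at sample 50000 of the CSV file.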

src_erl/NerlnetApp/rebar.config

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
   {apps, [nerlnetApp]}
 ]}.
 {base_dir, "/usr/local/lib/nerlnet-lib/NErlNet/build/rebar"}.
-{relx, [{release, {nerlnetApp, "1.4.3"}, [nerlnetApp,cowboy,jsx,kernel,stdlib,inets]},
+{relx, [{release, {nerlnetApp, "1.4.3"}, [nerlnetApp,cowboy,jsx,kernel,stdlib,inets,sasl,os_mon]},
   {dev_mode, true},
   {include_erts, true},
   %{include_src, true},
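sasl and os_mon are standard OTP applications; os_mon provides cpu_sup, which is presumably what the new per-core CPU-utilization stats (stats:update_cpu_util_per_core/2, stats:reset_query_cpu_util_cores/0) build on. A minimal sketch of reading per-core utilization with cpu_sup (the function name per_core_util/0 is illustrative only, not taken from this commit):

%% Requires the os_mon application, which is now part of the release.
per_core_util() ->
    {ok, _} = application:ensure_all_started(os_mon),
    %% cpu_sup:util([per_cpu]) returns [{CpuId, Busy, NonBusy, MiscList}]
    [{CpuId, Busy} || {CpuId, Busy, _NonBusy, _Misc} <- cpu_sup:util([per_cpu])].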

src_erl/NerlnetApp/src/Bridge/onnWorkers/nerlNIF.erl

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ call_to_train(ModelID, {DataTensor, Type}, WorkerPid , BatchID , SourceName)->
   receive
     {nerlnif, nan, TrainTime} ->
       gen_statem:cast(WorkerPid,{loss, nan , TrainTime , BatchID , SourceName}); %TODO Guy - Please the behavior when this case happens
-    {nerlnif , LossTensor, LossTensorType , TrainTime}->
+    {nerlnif , LossTensor, LossTensorType , TrainTime}-> % TrainTime is in microseconds
       gen_statem:cast(WorkerPid,{loss, {LossTensor, LossTensorType} , TrainTime , BatchID , SourceName})
   after ?TRAIN_TIMEOUT -> %TODO inspect this timeout
     ?LOG_ERROR("Worker train timeout reached! bid:~p s:~p",[BatchID , SourceName]),
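The new comment pins down the unit of TrainTime returned by the NIF: microseconds. This matches the client side of this commit, where the value is accumulated via stats:increment_time_train_active(ClientPerformanceEts, trunc(TimeNIF)).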

src_erl/NerlnetApp/src/Client/clientStatem.erl

Lines changed: 28 additions & 3 deletions
@@ -75,7 +75,10 @@ init({MyName,NerlnetGraph, ClientWorkers , WorkerShaMap , WorkerToClientMap , Sh
     EtsRef = ets:new(client_data, [set, public]), %% client_data is responsible for functional attributes
     EtsStats = ets:new(ets_stats, [set]), %% ets_stats is responsible for holding all the ets stats (client + workers)
     ClientStatsEts = stats:generate_stats_ets(), %% client stats ets inside ets_stats
+    % TODO add flag to control generate performance stats ets
+    ClientPerformanceEts = stats:generate_performance_stats_ets(), %% client performance stats ets inside ets_stats
     ets:insert(EtsStats, {MyName, ClientStatsEts}),
+    ets:insert(EtsStats, {performance_stats, ClientPerformanceEts}),
     put(ets_stats, EtsStats),
     ets:insert(EtsRef, {workerToClient, WorkerToClientMap}), % All workers in the network (map to their client)
     ets:insert(EtsRef, {workersNames, ClientWorkers}), % All THIS Client's workers
@@ -105,6 +108,7 @@ init({MyName,NerlnetGraph, ClientWorkers , WorkerShaMap , WorkerToClientMap , Sh
     put(client_data, EtsRef),
     put(ets_stats, EtsStats),
     put(client_stats_ets , ClientStatsEts),
+    put(performance_stats_ets , ClientPerformanceEts),
     put(my_pid , self()),

     {ok, idle, #client_statem_state{myName= MyName, etsRef = EtsRef}}.
@@ -130,8 +134,7 @@ waitforWorkers(cast, In = {stateChange,WorkerName}, State = #client_statem_state
             stats:increment_messages_sent(ClientStatsEts),
             ?LOG_INFO("Client ~p and its workers are ready~n",[MyName]),
             {next_state, NextState, State#client_statem_state{waitforWorkers = []}};
-        _ -> %io:format("Client ~p is waiting for workers ~p~n",[MyName,NewWaitforWorkers]),
-             {next_state, waitforWorkers, State#client_statem_state{waitforWorkers = NewWaitforWorkers}}
+        _ -> {next_state, waitforWorkers, State#client_statem_state{waitforWorkers = NewWaitforWorkers}}
     end;

 waitforWorkers(cast, In = {worker_to_worker_msg, FromWorker, ToWorker, Data}, State = #client_statem_state{etsRef = EtsRef}) ->
@@ -173,8 +176,10 @@ idle(cast, _In = {statistics}, State = #client_statem_state{ myName = MyName, et
     ClientStatsEncStr = stats:encode_ets_to_http_bin_str(ClientStatsEts),
     stats:increment_messages_received(ClientStatsEts),
     ListStatsEts = ets:tab2list(EtsStats) -- [{MyName , ClientStatsEts}],
+    PerformenceStatsEts = get(performance_stats_ets),
+    ClientPerformenceStatsEncStr = ?PERF_STATS_SEPERATOR ++ stats:encode_ets_to_http_bin_str(PerformenceStatsEts) ++ ?PERF_STATS_SEPERATOR,
     WorkersStatsEncStr = create_encoded_stats_str(ListStatsEts),
-    DataToSend = ClientStatsEncStr ++ WorkersStatsEncStr,
+    DataToSend = ClientStatsEncStr ++ ClientPerformenceStatsEncStr ++ WorkersStatsEncStr,
     StatsBody = {MyName , DataToSend},
     {RouterHost,RouterPort} = ets:lookup_element(EtsRef, my_router, ?DATA_IDX),
     nerl_tools:http_router_request(RouterHost, RouterPort, [?MAIN_SERVER_ATOM], atom_to_list(statistics), StatsBody),
@@ -184,19 +189,27 @@ idle(cast, _In = {statistics}, State = #client_statem_state{ myName = MyName, et
 % Main Server triggers this state
 idle(cast, In = {training}, State = #client_statem_state{myName = _MyName, etsRef = EtsRef}) ->
     ClientStatsEts = get(client_stats_ets),
+    PerformanceStatsEts = get(performance_stats_ets),
     stats:increment_messages_received(ClientStatsEts),
     stats:increment_bytes_received(ClientStatsEts , nerl_tools:calculate_size(In)),
     MessageToCast = {training},
     cast_message_to_workers(EtsRef, MessageToCast),
     ets:update_element(EtsRef, all_workers_done, {?DATA_IDX, false}),
+    stats:performance_stats_reset(PerformanceStatsEts),
+    stats:tic(ClientStatsEts, time_train_total),
+    stats:reset_query_cpu_util_cores(),
     {next_state, waitforWorkers, State#client_statem_state{waitforWorkers = clientWorkersFunctions:get_workers_names(EtsRef), nextState = training}};

 idle(cast, In = {predict}, State = #client_statem_state{etsRef = EtsRef}) ->
     ClientStatsEts = get(client_stats_ets),
+    PerformanceStatsEts = get(performance_stats_ets),
     stats:increment_messages_received(ClientStatsEts),
     stats:increment_bytes_received(ClientStatsEts , nerl_tools:calculate_size(In)),
     MessageToCast = {predict},
     cast_message_to_workers(EtsRef, MessageToCast),
+    stats:performance_stats_reset(PerformanceStatsEts),
+    stats:tic(ClientStatsEts, time_predict_total),
+    stats:reset_query_cpu_util_cores(),
     {next_state, waitforWorkers, State#client_statem_state{waitforWorkers = clientWorkersFunctions:get_workers_names(EtsRef),nextState = predict}};

 idle(cast, EventContent, State = #client_statem_state{etsRef = EtsRef , myName = MyName}) ->
@@ -302,6 +315,7 @@ training(cast, In = {stream_ended , Pair}, State = #client_statem_state{etsRef =
 % From MainServer
 training(cast, In = {idle}, State = #client_statem_state{myName = MyName, etsRef = EtsRef}) ->
     ClientStatsEts = get(client_stats_ets),
+    ClientPerformanceEts = get(performance_stats_ets),
     stats:increment_messages_received(ClientStatsEts),
     stats:increment_bytes_received(ClientStatsEts , nerl_tools:calculate_size(In)),
     MessageToCast = {idle},
@@ -310,6 +324,9 @@ training(cast, In = {idle}, State = #client_statem_state{myName = MyName, etsRef
         true -> cast_message_to_workers(EtsRef, MessageToCast),
                 Workers = clientWorkersFunctions:get_workers_names(EtsRef),
                 ?LOG_INFO("~p sent idle to workers: ~p , waiting for confirmation...~n",[MyName, ets:lookup_element(EtsRef, workersNames, ?DATA_IDX)]),
+                Elapsed = stats:toc(ClientStatsEts, time_train_total),
+                stats:increment_time_train_total(ClientPerformanceEts, Elapsed),
+                stats:update_cpu_util_per_core(ClientPerformanceEts, train), % Update CPU utilization for training phase
                 {next_state, waitforWorkers, State#client_statem_state{etsRef = EtsRef, waitforWorkers = Workers , nextState = idle}};
         false -> MyPid = get(my_pid),
                  spawn(fun() -> timer:sleep(10), gen_statem:cast(MyPid, {idle}) end), % Trigger this action until all workers are done
@@ -323,9 +340,11 @@ training(cast, _In = {predict}, State = #client_statem_state{myName = MyName, et

 training(cast, In = {loss, WorkerName ,SourceName ,LossTensor ,TimeNIF , WorkerToken,BatchID ,BatchTS}, State = #client_statem_state{myName = MyName,etsRef = EtsRef}) ->
     ClientStatsEts = get(client_stats_ets),
+    ClientPerformanceEts = get(performance_stats_ets),
     stats:increment_messages_received(ClientStatsEts),
     stats:increment_bytes_received(ClientStatsEts , nerl_tools:calculate_size(In)),
     {RouterHost,RouterPort} = ets:lookup_element(EtsRef, my_router, ?DATA_IDX),
+    stats:increment_time_train_active(ClientPerformanceEts, trunc(TimeNIF)), % in microseconds
     MessageBody = {WorkerName , SourceName , LossTensor , TimeNIF , WorkerToken, BatchID , BatchTS},
     nerl_tools:http_router_request(RouterHost, RouterPort, [?MAIN_SERVER_ATOM], atom_to_list(lossFunction), MessageBody), %% Change lossFunction atom to lossValue
     stats:increment_messages_sent(ClientStatsEts),
@@ -401,6 +420,7 @@ predict(cast, In = {stream_ended , Pair}, State = #client_statem_state{etsRef =
 % From MainServer
 predict(cast, In = {idle}, State = #client_statem_state{myName = MyName, etsRef = EtsRef}) ->
     ClientStatsEts = get(client_stats_ets),
+    ClientPerformanceEts = get(performance_stats_ets),
     stats:increment_messages_received(ClientStatsEts),
     stats:increment_bytes_received(ClientStatsEts , nerl_tools:calculate_size(In)),
     MessageToCast = {idle},
@@ -409,13 +429,17 @@ predict(cast, In = {idle}, State = #client_statem_state{myName = MyName, etsRef
         true -> cast_message_to_workers(EtsRef, MessageToCast),
                 Workers = clientWorkersFunctions:get_workers_names(EtsRef),
                 ?LOG_INFO("~p sent idle to workers: ~p , waiting for confirmation...~n",[MyName, ets:lookup_element(EtsRef, workersNames, ?DATA_IDX)]),
+                Elapsed = stats:toc(ClientStatsEts, time_predict_total),
+                stats:increment_time_predict_total(ClientPerformanceEts, Elapsed),
+                stats:update_cpu_util_per_core(ClientPerformanceEts, predict), % Update CPU utilization for predict phase
                 {next_state, waitforWorkers, State#client_statem_state{etsRef = EtsRef, waitforWorkers = Workers , nextState = idle}};
         false -> gen_statem:cast(get(my_pid) , {idle}), % Trigger this action until all workers are done
                  {keep_state, State}
     end;

 predict(cast, In = {predictRes,WorkerName, SourceName ,{PredictNerlTensor, NetlTensorType} , TimeTook , WorkerToken, BatchID , BatchTS}, State = #client_statem_state{myName = _MyName, etsRef = EtsRef}) ->
     ClientStatsEts = get(client_stats_ets),
+    ClientPerformanceEts = get(performance_stats_ets),
     stats:increment_messages_received(ClientStatsEts),
     stats:increment_bytes_received(ClientStatsEts , nerl_tools:calculate_size(In)),

@@ -425,6 +449,7 @@ predict(cast, In = {predictRes,WorkerName, SourceName ,{PredictNerlTensor, NetlT

     stats:increment_messages_sent(ClientStatsEts),
     stats:increment_bytes_sent(ClientStatsEts , nerl_tools:calculate_size(MessageBody)),
+    stats:increment_time_predict_active(ClientPerformanceEts, trunc(TimeTook)), % in microseconds
     {next_state, predict, State#client_statem_state{etsRef = EtsRef}};

 % TODO from predict directly to training?!?!?
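The elapsed-time bookkeeping above relies on a tic/toc pair from the stats module, whose implementation is not part of this diff. A hypothetical sketch of the pattern, assuming the start timestamp is stored in the stats ETS under a per-key entry:

%% Illustrative only; the real stats:tic/2 and stats:toc/2 live in stats.erl.
tic(StatsEts, Key) ->
    ets:insert(StatsEts, {{tic, Key}, erlang:monotonic_time(microsecond)}).

toc(StatsEts, Key) ->
    Start = ets:lookup_element(StatsEts, {tic, Key}, 2),
    erlang:monotonic_time(microsecond) - Start.  % elapsed microseconds since tic/2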

src_erl/NerlnetApp/src/MainServer/mainGenserver.erl

Lines changed: 1 addition & 1 deletion
@@ -200,7 +200,7 @@ handle_cast({statistics,Body}, State = #main_genserver_state{myName = MyName}) -
     %% statistics arrived from Entity
     {From, StatsEtsEncStr} = binary_to_term(Body),
     set_entity_stats_ets_str(From, StatsEtsEncStr),
-
+
     % increase counter_received_stats ets by 1
     ets:update_counter(get(main_server_ets), counter_received_stats, 1),
     stats:increment_messages_received(StatsEts),
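This hunk appears to be a whitespace-only cleanup of the blank line after set_entity_stats_ets_str/2; the surrounding logic is unchanged.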
