Skip to content

Commit

Permalink
Merge pull request #294 from leondavi/nerlplanner_fix
Browse files Browse the repository at this point in the history
[NerlnetApp] Critical bugs in distributed running
  • Loading branch information
leondavi authored Mar 24, 2024
2 parents e83e7ed + f8c4989 commit 3d4006a
Show file tree
Hide file tree
Showing 27 changed files with 475 additions and 231 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"connectionsMap":
{
"r1":["mainServer", "r2"],
"r2":["r3", "s1"],
"r3":["r4", "c1","s2"],
"r4":["r5", "c2","s3"],
"r5":["r6", "c3"],
"r6":["r3", "r4"]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
{
"nerlnetSettings": {
"frequency": "50",
"batchSize": "100"
},
"mainServer": {
"port": "8844",
"args": ""
},
"apiServer": {
"port": "8845",
"args": ""
},
"devices": [
{
"name": "mac",
"ipv4": "192.168.0.248",
"entities": "c3,r1,r4,r5,r6,s1,mainServer,apiServer"
},
{
"name": "jet1",
"ipv4": "192.168.0.226",
"entities": "c1,r2,s2"
},
{
"name": "jet2",
"ipv4": "192.168.0.228",
"entities": "c2,r3,s3"
}
],
"routers": [
{
"name": "r1",
"port": "8900",
"policy": "0"
},
{
"name": "r2",
"port": "8902",
"policy": "0"
},
{
"name": "r3",
"port": "8903",
"policy": "0"
},
{
"name": "r4",
"port": "8904",
"policy": "0"
},
{
"name": "r5",
"port": "8905",
"policy": "0"
},
{
"name": "r6",
"port": "8906",
"policy": "0"
}
],
"sources": [
{
"name": "s1",
"port": "8853",
"frequency": "50",
"policy": "0",
"epochs": "1",
"type": "0"
},
{
"name": "s2",
"port": "8854",
"frequency": "50",
"policy": "0",
"epochs": "1",
"type": "0"
},
{
"name": "s3",
"port": "8855",
"frequency": "50",
"policy": "0",
"epochs": "1",
"type": "0"
}
],
"clients": [
{
"name": "c1",
"port": "8846",
"workers": "w1,w2"
},
{
"name": "c2",
"port": "8847",
"workers": "w3,w4"
},
{
"name": "c3",
"port": "8851",
"workers": "w5"
}
],
"workers": [
{
"name": "w1",
"model_sha": "1f3078160da415bda29a65e6c854f938c291d38166d0cb6e89cec2fd81678613"
},
{
"name": "w2",
"model_sha": "1f3078160da415bda29a65e6c854f938c291d38166d0cb6e89cec2fd81678613"
},
{
"name": "w3",
"model_sha": "1f3078160da415bda29a65e6c854f938c291d38166d0cb6e89cec2fd81678613"
},
{
"name": "w4",
"model_sha": "1f3078160da415bda29a65e6c854f938c291d38166d0cb6e89cec2fd81678613"
},
{
"name": "w5",
"model_sha": "1f3078160da415bda29a65e6c854f938c291d38166d0cb6e89cec2fd81678613"
}
],
"model_sha": {
"1f3078160da415bda29a65e6c854f938c291d38166d0cb6e89cec2fd81678613": {
"modelType": "0",
"_doc_modelType": " nn:0 | approximation:1 | classification:2 | forecasting:3 | image_classification:4 | text_classification:5 | text_generation:6 | auto_association:7 | autoencoder:8 | ae_classifier:9 |",
"layersSizes": "5,16,8,3",
"_doc_layersSizes": "List of positive integers [L0, L1, ..., LN]",
"layerTypesList": "1,3,3,3",
"_doc_LayerTypes": " Default:0 | Scaling:1 | CNN:2 | Perceptron:3 | Pooling:4 | Probabilistic:5 | LSTM:6 | Recurrent:7 | Unscaling:8 | Bounding:9 |",
"layers_functions": "1,7,7,11",
"_doc_layers_functions_activation": " Threshold:1 | Sign:2 | Logistic:3 | Tanh:4 | Linear:5 | ReLU:6 | eLU:7 | SeLU:8 | Soft-plus:9 | Soft-sign:10 | Hard-sigmoid:11 |",
"_doc_layer_functions_pooling": " none:1 | Max:2 | Avg:3 |",
"_doc_layer_functions_probabilistic": " Binary:1 | Logistic:2 | Competitive:3 | Softmax:4 |",
"_doc_layer_functions_scaler": " none:1 | MinMax:2 | MeanStd:3 | STD:4 | Log:5 |",
"lossMethod": "2",
"_doc_lossMethod": " SSE:1 | MSE:2 | NSE:3 | MinkowskiE:4 | WSE:5 | CEE:6 |",
"lr": "0.01",
"_doc_lr": "Positive float",
"epochs": "1",
"_doc_epochs": "Positive Integer",
"optimizer": "5",
"_doc_optimizer": " GD:0 | CGD:1 | SGD:2 | QuasiNewton:3 | LVM:4 | ADAM:5 |",
"optimizerArgs": "none",
"_doc_optimizerArgs": "String",
"infraType": "0",
"_doc_infraType": " opennn:0 | wolfengine:1 |",
"distributedSystemType": "0",
"_doc_distributedSystemType": " none:0 | fedClientAvg:1 | fedServerAvg:2 |",
"distributedSystemArgs": "none",
"_doc_distributedSystemArgs": "String",
"distributedSystemToken": "none",
"_doc_distributedSystemToken": "Token that associates distributed group of workers and parameter-server"
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
{
"experimentName": "synthetic_3_gausians",
"batchSize": 100,
"csvFilePath": "/tmp/nerlnet/data/NerlnetData-master/nerlnet/synthetic/synthetic_full.csv",
"numOfFeatures": "5",
"numOfLabels": "3",
"headersNames": "Norm(0:1),Norm(4:1),Norm(10:3)",
"Phases":
[
{
"phaseName": "training_phase",
"phaseType": "training",
"sourcePieces":
[
{
"sourceName": "s1",
"startingSample": "0",
"numOfBatches": "200",
"workers": "w1,w2"
},
{
"sourceName": "s2",
"startingSample": "20000",
"numOfBatches": "200",
"workers": "w3,w4"
},
{
"sourceName": "s3",
"startingSample": "40000",
"numOfBatches": "200",
"workers": "w5,w1,w2,w3,w4"
}
]
},
{
"phaseName": "prediction_phase",
"phaseType": "prediction",
"sourcePieces":
[
{
"sourceName": "s1",
"startingSample": "40000",
"numOfBatches": "300",
"workers": "w1,w2,w3,w4"
},
{
"sourceName": "s2",
"startingSample": "40000",
"numOfBatches": "300",
"workers": "w1,w2,w3,w4,w5"
},
{
"sourceName": "s3",
"startingSample": "40000",
"numOfBatches": "300",
"workers": "w5"
}
]
}
]
}

2 changes: 1 addition & 1 deletion src_erl/NerlnetApp/src/Init/jsonHandler.erl
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ init(Req0, [ApplicationPid]) ->
ApplicationPid ! {jsonAddress,{lists:nth(1, Data),lists:nth(2, Data)}};
_Other ->
{ok,Body,_} = cowboy_req:read_body(Req0), %% shouldn't be here, files expected
io:format("got Req: ~p~nData: ~p~n",[Req0, Body])
io:format("Error - Got an unknown request: ~p~nData: ~p~n",[Req0, Body])
end,

Reply = io_lib:format("nerlnet starting", []),
Expand Down
3 changes: 2 additions & 1 deletion src_erl/NerlnetApp/src/MainServer/ackHandler.erl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ init(Req0, [Who,Main_genserver_Pid]) ->
dataReady -> gen_server:cast(Main_genserver_Pid, {sourceAckDataReady,Body}); %% when source data is ready
sourceDone -> gen_server:cast(Main_genserver_Pid, {sourceDone,Body}); %% when source finished casting
clientAck -> gen_server:cast(Main_genserver_Pid, {clientAck,Body}); %% when client received message (new state)
jsonReceived -> gen_server:cast(Main_genserver_Pid, {jsonReceived,Body}) %% when other devices got the json and ready to start
jsonReceived -> gen_server:cast(Main_genserver_Pid, {jsonReceived,Body}); %% when other devices got the json and ready to start
apiserver_ack_validation -> Main_genserver_Pid ! {apiserver_ack_validation, Body} % This ack validates transmission with flask
end,
Reply = io_lib:format("Body Received: ~p ~n ", [Body]),
Req = cowboy_req:reply(200,
Expand Down
4 changes: 2 additions & 2 deletions src_erl/NerlnetApp/src/MainServer/initHandler.erl
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ init(Req0, [Main_genServer_Pid]) ->
Decoded_body = binary_to_list(Body),
%Decoded_body = read_all_data(Req0),
% io:format("GOT DATA: ~p~n",[Decoded_body]),
[SourceName, _WorkersStr, _Epochs, _Data] = string:split(Decoded_body, "#", all),
[Index, TotalSources, SourceName, _WorkersStr, _Epochs, _Data] = string:split(Decoded_body, "#", all),
%WorkersList = string:split(WorkersStr, ",", all),
gen_server:cast(Main_genServer_Pid,{initCSV, SourceName, Body}),
gen_server:cast(Main_genServer_Pid,{initCSV, Index, TotalSources, SourceName, Body}),
%[Source|WorkersAndInput] = re:split(binary_to_list(Body), "#", [{return, list}]),
%{Workers,SourceData} = getWorkerInput(WorkersAndInput,[]),

Expand Down
Loading

0 comments on commit 3d4006a

Please sign in to comment.