Skip to content

Commit a2cfe5f

Browse files
authored
Merge pull request #298 from leondavi/results_opt
[ApiServer] Optimize communication of model phase results
2 parents 07ee1ef + 8aa5858 commit a2cfe5f

File tree

13 files changed

+107
-120
lines changed

13 files changed

+107
-120
lines changed

src_cpp/opennnBridge/openNNnif.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,8 @@ void* PredictFun(void* arg)
102102
// Stop the timer and calculate the time taken for prediction
103103
high_resolution_clock::time_point stop = high_resolution_clock::now();
104104
auto duration = duration_cast<microseconds>(stop - PredictNNptr->start_time);
105-
nifpp::TERM predict_time = nifpp::make(env, duration.count());
105+
106+
ERL_NIF_TERM predict_time = enif_make_double(env, duration.count());
106107
nifpp::str_atom nerlnif_atom_str(NERLNIF_ATOM_STR);
107108
nifpp::TERM nerlnif_atom = nifpp::make(env , nerlnif_atom_str);
108109
ERL_NIF_TERM predict_res_and_time = enif_make_tuple(env, 4 , nerlnif_atom , prediction , nifpp::make(env, PredictNNptr->return_tensor_type) , predict_time);

src_erl/NerlnetApp/src/Bridge/nerlNIF.erl

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,7 @@ call_to_train(ModelID, {DataTensor, Type}, WorkerPid , BatchID , SourceName)->
5353
{nerlnif, nan, TrainTime} ->
5454
gen_statem:cast(WorkerPid,{loss, nan , TrainTime , BatchID , SourceName}); %TODO Guy - Please the behavior when this case happens
5555
{nerlnif , LossTensor, LossTensorType , TrainTime}->
56-
{ErlTensor, ErlTensorType} = nerltensor_conversion({LossTensor, LossTensorType}, erl_float), % TODO Guy - Please do the conversion in main server
57-
gen_statem:cast(WorkerPid,{loss, {ErlTensor, ErlTensorType} , TrainTime , BatchID , SourceName})
56+
gen_statem:cast(WorkerPid,{loss, {LossTensor, LossTensorType} , TrainTime , BatchID , SourceName})
5857
after ?TRAIN_TIMEOUT -> %TODO inspect this timeout
5958
?LOG_ERROR("Worker train timeout reached! bid:~p s:~p",[BatchID , SourceName]),
6059
gen_statem:cast(WorkerPid,{loss, timeout , SourceName}) %% TODO Guy Define train timeout state
@@ -64,11 +63,11 @@ call_to_predict(ModelID, {BatchTensor, Type}, WorkerPid, BatchID , SourceName)->
6463
ok = predict_nif(ModelID, BatchTensor, Type),
6564
receive
6665

67-
{nerlnif , PredNerlTensor, NewType, TimeTook}-> %% nerlnif atom means a message from the nif implementation
66+
{nerlnif , PredNerlTensor, PredNerlTensorType, TimeNif}-> %% nerlnif atom means a message from the nif implementation
6867
% io:format("pred_nif done~n"),
6968
% {PredTen, _NewType} = nerltensor_conversion({PredNerlTensor, NewType}, erl_float),
7069
% io:format("Pred returned: ~p~n", [PredNerlTensor]),
71-
gen_statem:cast(WorkerPid,{predictRes,PredNerlTensor, NewType, TimeTook, BatchID , SourceName});
70+
gen_statem:cast(WorkerPid,{predictRes,PredNerlTensor, PredNerlTensorType, TimeNif, BatchID , SourceName});
7271
Error ->
7372
?LOG_ERROR("received wrong prediction_nif format: ~p" ,[Error]),
7473
throw("received wrong prediction_nif format")

src_erl/NerlnetApp/src/Bridge/onnWorkers/workerGeneric.erl

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -161,24 +161,23 @@ idle(cast, _Param, State) ->
161161

162162
%% Waiting for receiving results or loss function
163163
%% Got nan or inf from loss function - Error, loss function too big for double
164-
wait(cast, {loss , nan , TimeNIF , BatchID , SourceName}, State = #workerGeneric_state{myName = MyName, nextState = NextState}) ->
164+
wait(cast, {loss, nan , TrainTime , BatchID , SourceName}, State = #workerGeneric_state{myName = MyName, nextState = NextState}) ->
165165
stats:increment_by_value(get(worker_stats_ets), nan_loss_count, 1),
166-
gen_statem:cast(get(client_pid),{loss, MyName , SourceName ,nan , TimeNIF ,BatchID}),
166+
gen_statem:cast(get(client_pid),{loss, MyName , SourceName ,nan , TrainTime ,BatchID}),
167167
{next_state, NextState, State};
168168

169-
wait(cast, {loss, LossTensor , TimeNIF , BatchID , SourceName}, State = #workerGeneric_state{myName = MyName, nextState = NextState, modelID=_ModelID, distributedBehaviorFunc = DistributedBehaviorFunc, distributedWorkerData = DistributedWorkerData}) ->
170-
% {[_ , _ , _ , LossValue] , _} = LossTensor,
171-
% io:format("Got Loss Value ~p~n",[LossValue]),
169+
170+
wait(cast, {loss, {LossTensor, LossTensorType} , TrainTime , BatchID , SourceName}, State = #workerGeneric_state{myName = MyName, nextState = NextState, modelID=_ModelID, distributedBehaviorFunc = DistributedBehaviorFunc, distributedWorkerData = DistributedWorkerData}) ->
172171
BatchTimeStamp = erlang:system_time(nanosecond),
173-
gen_statem:cast(get(client_pid),{loss, MyName, SourceName ,LossTensor , TimeNIF , BatchID , BatchTimeStamp}), %% TODO Add Time and Time_NIF to the cast
172+
gen_statem:cast(get(client_pid),{loss, MyName, SourceName ,{LossTensor, LossTensorType} , TrainTime , BatchID , BatchTimeStamp}),
174173
ToUpdate = DistributedBehaviorFunc(post_train, {get(generic_worker_ets),DistributedWorkerData}),
175174
if ToUpdate -> {next_state, update, State#workerGeneric_state{nextState=NextState}};
176175
true -> {next_state, NextState, State}
177176
end;
178177

179-
wait(cast, {predictRes,PredNerlTensor, Type, TimeNIF, BatchID , SourceName}, State = #workerGeneric_state{myName = MyName, nextState = NextState, distributedBehaviorFunc = DistributedBehaviorFunc, distributedWorkerData = DistributedWorkerData}) ->
178+
wait(cast, {predictRes, PredNerlTensor, PredNerlTensorType, TimeNif, BatchID , SourceName}, State = #workerGeneric_state{myName = MyName, nextState = NextState, distributedBehaviorFunc = DistributedBehaviorFunc, distributedWorkerData = DistributedWorkerData}) ->
180179
BatchTimeStamp = erlang:system_time(nanosecond),
181-
gen_statem:cast(get(client_pid),{predictRes,MyName,SourceName, {PredNerlTensor, Type}, TimeNIF , BatchID , BatchTimeStamp}),
180+
gen_statem:cast(get(client_pid),{predictRes,MyName, SourceName, {PredNerlTensor, PredNerlTensorType}, TimeNif , BatchID , BatchTimeStamp}),
182181
Update = DistributedBehaviorFunc(post_predict, {get(generic_worker_ets),DistributedWorkerData}),
183182
if Update ->
184183
{next_state, update, State#workerGeneric_state{nextState=NextState}};

src_erl/NerlnetApp/src/Client/clientStatem.erl

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ training(cast, _In = {predict}, State = #client_statem_state{myName = MyName, et
279279
?LOG_ERROR("Wrong request , client ~p can't go from training to predict directly", [MyName]),
280280
{next_state, training, State#client_statem_state{etsRef = EtsRef}};
281281

282-
training(cast, In = {loss , WorkerName , SourceName , LossTensor , TimeNIF , BatchID , BatchTS}, State = #client_statem_state{myName = MyName,etsRef = EtsRef}) ->
282+
training(cast, In = {loss, WorkerName ,SourceName ,LossTensor ,TimeNIF ,BatchID ,BatchTS}, State = #client_statem_state{myName = MyName,etsRef = EtsRef}) ->
283283
ClientStatsEts = get(client_stats_ets),
284284
stats:increment_messages_received(ClientStatsEts),
285285
stats:increment_bytes_received(ClientStatsEts , nerl_tools:calculate_size(In)),
@@ -315,14 +315,15 @@ predict(cast, In = {sample,Body}, State = #client_statem_state{etsRef = EtsRef})
315315
end,
316316
{next_state, predict, State#client_statem_state{etsRef = EtsRef}};
317317

318-
predict(cast, In = {predictRes,WorkerName, SourceName ,{PredictNerlTensor, Type} , TimeTook , BatchID , BatchTS}, State = #client_statem_state{myName = _MyName, etsRef = EtsRef}) ->
318+
predict(cast, In = {predictRes,WorkerName, SourceName ,{PredictNerlTensor, NetlTensorType} , TimeTook , BatchID , BatchTS}, State = #client_statem_state{myName = _MyName, etsRef = EtsRef}) ->
319319
ClientStatsEts = get(client_stats_ets),
320320
stats:increment_messages_received(ClientStatsEts),
321321
stats:increment_bytes_received(ClientStatsEts , nerl_tools:calculate_size(In)),
322-
322+
323323
{RouterHost,RouterPort} = ets:lookup_element(EtsRef, my_router, ?DATA_IDX),
324-
MessageBody = {atom_to_list(WorkerName), SourceName, BatchID, {PredictNerlTensor , Type} , TimeTook , BatchTS}, %% SHOULD INCLUDE TYPE?
324+
MessageBody = {WorkerName, SourceName, {PredictNerlTensor , NetlTensorType}, TimeTook, BatchID, BatchTS}, %% SHOULD INCLUDE TYPE?
325325
nerl_tools:http_router_request(RouterHost, RouterPort, [?MAIN_SERVER_ATOM], atom_to_list(predictRes), MessageBody),
326+
326327
stats:increment_messages_sent(ClientStatsEts),
327328
stats:increment_bytes_sent(ClientStatsEts , nerl_tools:calculate_size(MessageBody)),
328329
{next_state, predict, State#client_statem_state{etsRef = EtsRef}};

src_erl/NerlnetApp/src/MainServer/mainGenserver.erl

Lines changed: 20 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -262,16 +262,16 @@ handle_cast({clientAck,Body}, State = #main_genserver_state{clientsWaitingList =
262262
ClientName = binary_to_term(Body),
263263
NewWaitingList = WaitingList--[ClientName], % waitingList is initialized in the clientsTraining or clientsPredict handle_cast calls
264264
if length(NewWaitingList) == 0 ->
265-
ResultsToSendStr = generate_phase_result_data_to_send_from_ets_as_str(),
266-
NothingToSend = string:is_empty(ResultsToSendStr),
265+
PhaseResultsDataMap = generate_phase_result_data_map(),
266+
NothingToSend = string:is_empty(PhaseResultsDataMap),
267267
if
268268
NothingToSend -> pass;
269269
true -> Action = case get(active_phase) of
270270
training -> trainRes;
271271
prediction -> predRes
272272
end,
273273
{RouterHost,RouterPort} = ets:lookup_element(get(main_server_ets), my_router, ?DATA_IDX), % get main_server's router
274-
nerl_tools:http_router_request(RouterHost, RouterPort, [?API_SERVER_ATOM], atom_to_list(Action), ResultsToSendStr),
274+
nerl_tools:http_router_request(RouterHost, RouterPort, [?API_SERVER_ATOM], atom_to_list(Action), {json, PhaseResultsDataMap}),
275275
stats:increment_messages_sent(StatsEts),
276276
clean_phase_result_data_to_send_ets() % getting ready for next phase after data was sent to APIServer
277277
end,
@@ -324,11 +324,12 @@ handle_cast({lossFunction,Body}, State = #main_genserver_state{myName = MyName})
324324
stats:increment_messages_received(StatsEts),
325325
try
326326
case binary_to_term(Body) of
327-
{WorkerName , SourceName , {LossTensor , _Type} , TimeNIF , BatchID , BatchTS} ->
328-
ToSend = ?PHASE_RES_DATA_SEPARATOR ++ atom_to_list(WorkerName) ++ ?PHASE_RES_WORKER_NAME_SEPERATOR ++ atom_to_list(SourceName) ++
329-
?PHASE_RES_VALUES_SEPERATOR ++ nerl_tools:string_format("~p",[LossTensor]) ++ ?PHASE_RES_VALUES_SEPERATOR ++ float_to_list(TimeNIF) ++
330-
?PHASE_RES_VALUES_SEPERATOR ++ integer_to_list(BatchID) ++ ?PHASE_RES_VALUES_SEPERATOR ++ integer_to_list(BatchTS) ++ ?PHASE_RES_DATA_SEPARATOR,
331-
store_phase_result_data_to_send_ets({WorkerName, BatchID , BatchTS}, ToSend);
327+
{WorkerName , SourceName , {LossNerlTensor , LossNerlTensorType} , TimeNIF , BatchID , BatchTS} ->
328+
Key = atom_to_list(WorkerName) ++ ?PHASE_RES_VALUES_IN_KEY_SEPARATOR ++ atom_to_list(SourceName) ++
329+
?PHASE_RES_VALUES_IN_KEY_SEPARATOR ++ integer_to_list(BatchID) ++ ?PHASE_RES_VALUES_IN_KEY_SEPARATOR ++
330+
integer_to_list(BatchTS) ++ ?PHASE_RES_VALUES_IN_KEY_SEPARATOR ++ float_to_list(TimeNIF) ++ ?PHASE_RES_VALUES_IN_KEY_SEPARATOR ++
331+
atom_to_list(LossNerlTensorType),
332+
store_phase_result_data_to_send_ets(Key, binary_to_list(LossNerlTensor));
332333
_ELSE ->
333334
?LOG_ERROR("~p Wrong loss function pattern received from client and its worker ~p", [MyName, Body])
334335
end
@@ -344,18 +345,12 @@ handle_cast({predictRes,Body}, State) ->
344345
_BatchSize = ets:lookup_element(get(main_server_ets), batch_size, ?DATA_IDX),
345346
stats:increment_messages_received(StatsEts),
346347
try
347-
{WorkerName, SourceName, BatchID, {NerlTensor, Type}, TimeNIF , BatchTS} = binary_to_term(Body), %% TODO: add convention with client
348-
%io:format("WorkerName: ~p, InputName: ~p, BatchID: ~p, Type: ~p~n",[WorkerName, InputName, BatchID, Type]),
349-
{DecodedNerlTensor, _Type} =
350-
if
351-
(NerlTensor==<<>>) -> ?LOG_ERROR(?LOG_HEADER++"Got empty tensor"), empty_nerltensor_err;
352-
true -> nerlNIF:nerltensor_conversion({NerlTensor, Type}, nerlNIF:erl_type_conversion(Type)) % converting nerltensor from binary to erlang type using NerlNIF
353-
end,
354-
ToSend = ?PHASE_RES_DATA_SEPARATOR ++ WorkerName ++ ?PHASE_RES_WORKER_NAME_SEPERATOR ++ atom_to_list(SourceName) ++
355-
?PHASE_RES_VALUES_SEPERATOR ++ nerl_tools:string_format("~p",[DecodedNerlTensor]) ++ ?PHASE_RES_VALUES_SEPERATOR ++
356-
integer_to_list(TimeNIF) ++ ?PHASE_RES_VALUES_SEPERATOR ++ integer_to_list(BatchID) ++ ?PHASE_RES_VALUES_SEPERATOR ++
357-
integer_to_list(BatchTS) ++ ?PHASE_RES_DATA_SEPARATOR,
358-
store_phase_result_data_to_send_ets({WorkerName, BatchID , BatchTS}, ToSend)
348+
{WorkerName, SourceName, {NerlTensor, NerlTensorType}, TimeNIF , BatchID, BatchTS} = binary_to_term(Body),
349+
Key = atom_to_list(WorkerName) ++ ?PHASE_RES_VALUES_IN_KEY_SEPARATOR ++ atom_to_list(SourceName) ++
350+
?PHASE_RES_VALUES_IN_KEY_SEPARATOR ++ integer_to_list(BatchID) ++ ?PHASE_RES_VALUES_IN_KEY_SEPARATOR ++
351+
integer_to_list(BatchTS) ++ ?PHASE_RES_VALUES_IN_KEY_SEPARATOR ++ float_to_list(TimeNIF) ++ ?PHASE_RES_VALUES_IN_KEY_SEPARATOR ++
352+
atom_to_list(NerlTensorType),
353+
store_phase_result_data_to_send_ets(Key, binary_to_list(NerlTensor))
359354
catch Err:E ->
360355
?LOG_ERROR(?LOG_HEADER++"Error receiving predict result ~p",[{Err,E}])
361356
end,
@@ -471,19 +466,14 @@ retransmission_to_apiserver(HttpRouterRequestFunc, Trials) ->
471466
end.
472467

473468

474-
store_phase_result_data_to_send_ets({WorkerName, BatchID , BatchTS}, DataToSendStr) ->
475-
Key = {WorkerName, BatchID , BatchTS},
476-
ets:insert(get(phase_res_data_ets),{Key, DataToSendStr}).
477-
469+
store_phase_result_data_to_send_ets(Key, NerlTensorData) ->
470+
KeyBin = list_to_binary(Key),
471+
ets:insert(get(phase_res_data_ets),{KeyBin, NerlTensorData}).
478472

479-
generate_phase_result_data_string_from_list([], _ResString) -> _ResString;
480-
generate_phase_result_data_string_from_list(ListOfData, ResString) ->
481-
NewResString = ResString++element(?DATA_IDX,hd(ListOfData)),
482-
generate_phase_result_data_string_from_list(tl(ListOfData), NewResString).
483473

484-
generate_phase_result_data_to_send_from_ets_as_str() ->
474+
generate_phase_result_data_map() ->
485475
ListOfData = ets:tab2list(get(phase_res_data_ets)),
486-
generate_phase_result_data_string_from_list(ListOfData, ""). % String to send is retruned
476+
ListOfData.
487477

488478
clean_phase_result_data_to_send_ets() ->
489479
ets:delete_all_objects(get(phase_res_data_ets)).
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
-define(API_SERVER_ACTION_ACK, "ackPy").
22

3+
-define(PHASE_RES_VALUES_IN_KEY_SEPARATOR, "#").
34
-define(PHASE_RES_WORKER_NAME_SEPERATOR, "#").
45
-define(PHASE_RES_VALUES_SEPERATOR, "|").
56
-define(PHASE_RES_DATA_SEPARATOR, "?").

src_erl/NerlnetApp/src/Source/sourceStatem.erl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222

2323

2424
%% definitions
25-
-define(SENDING_FREQUENCY_OVERHEAD_FIX_FACTOR_PERC, 0.85).
25+
-define(SENDING_FREQUENCY_OVERHEAD_FIX_FACTOR_PERC, 0.75).
2626
-define(MICRO_TO_MILLI_FACTOR, 0.001).
2727

2828

src_erl/NerlnetApp/src/nerl_tools.erl

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,14 +31,22 @@ http_router_request(RouterHost, RouterPort, DestinationsList, ActionStr, Body) -
3131
end.
3232

3333

34+
http_request(Host, Port, Path, {json, Body}) ->
35+
io:format("Sending Json to ~p:~p~n",[Host,Port]),
36+
JsonContentType = ?HTTP_CONTENT_TYPE_JSON,
37+
Json = jsx:encode(Body),
38+
http_request(Host, Port,Path, JsonContentType, Json);
39+
http_request(Host, Port,Path, Body) ->
40+
DefaultContentType = ?HTTP_CONTENT_TYPE_FORM_URLENCODED,
41+
http_request(Host, Port,Path, DefaultContentType, Body).
3442

3543
%% send message between entities
36-
http_request(Host, Port,Path, Body) when is_atom(Body) -> http_request(Host, Port,Path, atom_to_list(Body));
37-
http_request(Host, Port,Path, Body) when is_binary(Host) -> http_request(binary_to_list(Host), Port,Path, Body);
38-
http_request(Host, Port,Path, Body)->
39-
URL = "http://" ++ Host ++ ":"++integer_to_list(Port) ++ "/" ++ Path,
44+
http_request(Host, Port, Path, ContentType, Body) when is_atom(Body) -> http_request(Host, Port,Path, ContentType, atom_to_list(Body));
45+
http_request(Host, Port, Path, ContentType, Body) when is_binary(Host) -> http_request(binary_to_list(Host), Port,Path, ContentType, Body);
46+
http_request(Host, Port, Path, ContentType, Body)->
47+
URL = "http://" ++ Host ++ ":"++integer_to_list(Port) ++ "/" ++ Path, % Path is the action
4048
httpc:set_options([{proxy, {{Host, Port},[Host]}}]),
41-
httpc:request(post,{URL, [],"application/x-www-form-urlencoded",Body}, [], []).
49+
httpc:request(post,{URL, [], ContentType, Body}, [], []).
4250

4351
get_client_worker_pairs([],_WorkersMap,Ret)-> Ret;
4452
get_client_worker_pairs([WorkerName|WorkersNames],WorkersMap,Ret)->

src_erl/NerlnetApp/src/nerl_tools.hrl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@
1313
-define(VALIDATION_OF_TRANSMISSION_WITH_API_SERVER_INTERVAL_MS, 100). % interval in milliseconds between consecutive resends
1414
%% ETS definitions
1515

16+
%% HTTP Content type definitions
17+
-define(HTTP_CONTENT_TYPE_MULTI_PART_FORM_DATA, "multipart/form-data").
18+
-define(HTTP_CONTENT_TYPE_JSON, "application/json").
19+
-define(HTTP_CONTENT_TYPE_FORM_URLENCODED, "application/x-www-form-urlencoded").
20+
1621
% 2 elements ETS:
1722
-define(KEY_IDX, 1).
1823
-define(DATA_IDX, 2).

0 commit comments

Comments
 (0)