From a05dcc8b41f5b84f2d8ac00afed305b4f776f28c Mon Sep 17 00:00:00 2001 From: Fan Date: Mon, 27 Feb 2023 13:19:48 -0500 Subject: [PATCH 1/2] Fix legacy import --- examples/notebook/fedscale_demo_client.ipynb | 235 ++++---------- examples/notebook/fedscale_demo_server.ipynb | 325 +++---------------- 2 files changed, 104 insertions(+), 456 deletions(-) diff --git a/examples/notebook/fedscale_demo_client.ipynb b/examples/notebook/fedscale_demo_client.ipynb index f2ce4e8d..c30cd55a 100644 --- a/examples/notebook/fedscale_demo_client.ipynb +++ b/examples/notebook/fedscale_demo_client.ipynb @@ -26,14 +26,30 @@ "name": "stderr", "output_type": "stream", "text": [ - "(10-05) 10:56:27 INFO [executor.py:61] (EXECUTOR:1) is setting up environ ...\n" + "(02-27) 12:24:33 INFO [fllibs.py:97] Initializing the model ...\n", + "(02-27) 12:24:33 INFO [executor.py:75] (EXECUTOR:1) is setting up environ ...\n", + "0.0%" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Files already downloaded and verified\n", + "Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /Users/fan/cifar10/cifar-10-python.tar.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100.0%\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracting /Users/fan/cifar10/cifar-10-python.tar.gz to /Users/fan/cifar10/\n", "Files already downloaded and verified\n" ] }, @@ -41,174 +57,41 @@ "name": "stderr", "output_type": "stream", "text": [ - "(10-05) 10:56:29 INFO [executor.py:117] Data partitioner starts ...\n", - "(10-05) 10:56:29 INFO [divide_data.py:106] Randomly partitioning data, 50000 samples...\n", - "(10-05) 10:56:29 INFO [divide_data.py:106] Randomly partitioning data, 10000 samples...\n", - "(10-05) 10:56:29 INFO [executor.py:128] Data partitioner completes ...\n", - "(10-05) 10:56:29 INFO [channel_context.py:21] %%%%%%%%%% Opening grpc connection to 127.0.0.1 %%%%%%%%%%\n", - "(10-05) 10:56:29 INFO [executor.py:414] Start monitoring events ...\n", - "(10-05) 10:56:31 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 10:56:38 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 6.385307352168408, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:56:39 INFO [client.py:32] Start to train (CLIENT: 2) ...\n", - "(10-05) 10:56:46 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 5.383798982996671, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:56:46 INFO [client.py:32] Start to train (CLIENT: 3) ...\n", - "(10-05) 10:56:53 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 5.385428038255249, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:56:53 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 10:57:00 INFO [client.py:67] Training of (CLIENT: 4) completes, {'clientId': 4, 'moving_loss': 5.255024715696292, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:57:02 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 10:57:09 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 4.171052567362976, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:57:09 INFO [client.py:32] Start to train (CLIENT: 2) ...\n", - "(10-05) 10:57:16 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 5.34112758911072, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:57:17 INFO [client.py:32] Start to train (CLIENT: 3) ...\n", 
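Note: aside from re-executed notebook outputs, the substantive change this patch carries for the client demo is the import fix named in the subject line: the demo cell stops importing Client from the legacy module fedscale.cloud.execution.client and imports TorchClient from fedscale.cloud.execution.torch_client instead. A minimal sketch of the patched demo cell as it appears later in this file (CPU-only run, per the notebook):

    import sys, os

    import fedscale.cloud.config_parser as parser
    # Legacy import was: from fedscale.cloud.execution.client import Client
    from fedscale.cloud.execution.torch_client import TorchClient
    from fedscale.cloud.execution.executor import Executor

    ### On CPU
    parser.args.use_cuda = "False"
    Demo_Executor = Executor(parser.args)
    Demo_Executor.run()
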
- "(10-05) 10:57:23 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 6.283451225568975, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:57:24 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 10:57:31 INFO [client.py:67] Training of (CLIENT: 4) completes, {'clientId': 4, 'moving_loss': 5.414917396137178, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:57:33 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 10:57:40 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 4.728196259173867, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:57:40 INFO [client.py:32] Start to train (CLIENT: 2) ...\n", - "(10-05) 10:57:47 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 4.245132194605622, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:57:47 INFO [client.py:32] Start to train (CLIENT: 3) ...\n", - "(10-05) 10:57:55 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 4.600091122132928, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:57:55 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 10:58:02 INFO [client.py:67] Training of (CLIENT: 4) completes, {'clientId': 4, 'moving_loss': 4.824014905330292, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:58:04 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 10:58:11 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 4.123896865647862, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:58:11 INFO [client.py:32] Start to train (CLIENT: 2) ...\n", - "(10-05) 10:58:18 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 4.305955504712801, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:58:18 INFO [client.py:32] Start to train (CLIENT: 3) ...\n", - "(10-05) 10:58:25 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 5.471528879133904, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:58:25 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 10:58:32 INFO [client.py:67] Training of (CLIENT: 4) completes, {'clientId': 4, 'moving_loss': 3.8462213926046678, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:58:53 INFO [model_test_module.py:306] Rank 1: Test set: Average loss: 2.4887, Top-1 Accuracy: 1000.0/10000 (0.1), Top-5 Accuracy: 0.5142\n", - "(10-05) 10:58:53 INFO [executor.py:376] After aggregation round 5, CumulTime 146.3842, eval_time 18.7353, test_loss 2.4887, test_accuracy 10.00%, test_5_accuracy 51.42% \n", - "\n", - "(10-05) 10:58:54 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 10:59:01 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 4.542471038460385, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:59:01 INFO [client.py:32] Start to train (CLIENT: 2) ...\n", - "(10-05) 10:59:08 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 3.3273613208657293, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:59:09 INFO [client.py:32] Start to train (CLIENT: 3) ...\n", - "(10-05) 10:59:15 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 4.458196635807365, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:59:16 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 10:59:22 INFO [client.py:67] Training of (CLIENT: 4) 
completes, {'clientId': 4, 'moving_loss': 3.2138776843070707, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:59:25 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 10:59:32 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 3.80448099041252, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:59:32 INFO [client.py:32] Start to train (CLIENT: 2) ...\n", - "(10-05) 10:59:39 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 3.3631498863039644, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:59:39 INFO [client.py:32] Start to train (CLIENT: 3) ...\n", - "(10-05) 10:59:46 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 3.59191329329352, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:59:46 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 10:59:53 INFO [client.py:67] Training of (CLIENT: 4) completes, {'clientId': 4, 'moving_loss': 2.848148092967869, 'trained_size': 600, 'success': True}\n", - "(10-05) 10:59:56 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 11:00:03 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 2.872066663595638, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:00:03 INFO [client.py:32] Start to train (CLIENT: 2) ...\n", - "(10-05) 11:00:10 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 3.5066577600041056, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:00:10 INFO [client.py:32] Start to train (CLIENT: 3) ...\n", - "(10-05) 11:00:17 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 2.966638272894974, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:00:17 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 11:00:24 INFO [client.py:67] Training of (CLIENT: 4) completes, {'clientId': 4, 'moving_loss': 2.885736187690472, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:00:26 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 11:00:33 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 3.212822436472684, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:00:33 INFO [client.py:32] Start to train (CLIENT: 2) ...\n", - "(10-05) 11:00:40 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 2.6508591739685192, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:00:40 INFO [client.py:32] Start to train (CLIENT: 3) ...\n", - "(10-05) 11:00:47 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 2.887190611917579, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:00:47 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 11:00:54 INFO [client.py:67] Training of (CLIENT: 4) completes, {'clientId': 4, 'moving_loss': 2.704264756492174, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:00:57 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 11:01:04 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 3.152766867797803, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:01:04 INFO [client.py:32] Start to train (CLIENT: 2) ...\n", - "(10-05) 11:01:11 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 2.4352049035450603, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:01:11 INFO [client.py:32] Start to train 
(CLIENT: 3) ...\n", - "(10-05) 11:01:17 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 2.9751960506664665, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:01:18 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 11:01:24 INFO [client.py:67] Training of (CLIENT: 4) completes, {'clientId': 4, 'moving_loss': 2.7658717371408987, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:01:45 INFO [model_test_module.py:306] Rank 1: Test set: Average loss: 2.3363, Top-1 Accuracy: 1766.0/10000 (0.1766), Top-5 Accuracy: 0.6799\n", - "(10-05) 11:01:45 INFO [executor.py:376] After aggregation round 10, CumulTime 318.6031, eval_time 18.3159, test_loss 2.3363, test_accuracy 17.66%, test_5_accuracy 67.99% \n", - "\n", - "(10-05) 11:01:46 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 11:01:53 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 2.471355666196691, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:01:53 INFO [client.py:32] Start to train (CLIENT: 2) ...\n", - "(10-05) 11:02:00 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 2.4356494496458128, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:02:00 INFO [client.py:32] Start to train (CLIENT: 3) ...\n", - "(10-05) 11:02:07 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 2.576381177196222, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:02:07 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 11:02:15 INFO [client.py:67] Training of (CLIENT: 4) completes, {'clientId': 4, 'moving_loss': 2.3470000365460093, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:02:17 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 11:02:24 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 2.509514877440946, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:02:24 INFO [client.py:32] Start to train (CLIENT: 2) ...\n", - "(10-05) 11:02:31 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 2.2464047509950236, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:02:31 INFO [client.py:32] Start to train (CLIENT: 3) ...\n", - "(10-05) 11:02:38 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 2.4089918757961515, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:02:38 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 11:02:45 INFO [client.py:67] Training of (CLIENT: 4) completes, {'clientId': 4, 'moving_loss': 2.35296483668845, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:02:48 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 11:02:54 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 2.430718451410085, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:02:55 INFO [client.py:32] Start to train (CLIENT: 2) ...\n", - "(10-05) 11:03:01 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 2.4424235744065412, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:03:02 INFO [client.py:32] Start to train (CLIENT: 3) ...\n", - "(10-05) 11:03:09 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 2.3308809210055044, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:03:09 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 11:03:16 INFO 
[client.py:67] Training of (CLIENT: 4) completes, {'clientId': 4, 'moving_loss': 2.302724640744112, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:03:18 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 11:03:25 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 2.5397319132390055, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:03:25 INFO [client.py:32] Start to train (CLIENT: 2) ...\n", - "(10-05) 11:03:32 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 2.447979369095947, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:03:33 INFO [client.py:32] Start to train (CLIENT: 3) ...\n", - "(10-05) 11:03:39 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 2.412356183045489, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:03:39 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 11:03:46 INFO [client.py:67] Training of (CLIENT: 4) completes, {'clientId': 4, 'moving_loss': 2.2552800768384884, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:03:49 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 11:03:56 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 2.3954310684108044, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:03:56 INFO [client.py:32] Start to train (CLIENT: 2) ...\n", - "(10-05) 11:04:02 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 2.356739756125249, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:04:03 INFO [client.py:32] Start to train (CLIENT: 3) ...\n", - "(10-05) 11:04:10 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 2.3305684278581604, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:04:10 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 11:04:17 INFO [client.py:67] Training of (CLIENT: 4) completes, {'clientId': 4, 'moving_loss': 2.2867733053916672, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:04:38 INFO [model_test_module.py:306] Rank 1: Test set: Average loss: 2.2923, Top-1 Accuracy: 2431.0/10000 (0.2431), Top-5 Accuracy: 0.7641\n", - "(10-05) 11:04:38 INFO [executor.py:376] After aggregation round 15, CumulTime 491.1768, eval_time 18.6568, test_loss 2.2923, test_accuracy 24.31%, test_5_accuracy 76.41% \n", + "(02-27) 12:24:53 INFO [executor.py:123] Data partitioner starts ...\n", + "(02-27) 12:24:53 INFO [divide_data.py:105] Randomly partitioning data, 50000 samples...\n", + "(02-27) 12:24:53 INFO [divide_data.py:105] Randomly partitioning data, 10000 samples...\n", + "(02-27) 12:24:53 INFO [executor.py:134] Data partitioner completes ...\n", + "(02-27) 12:24:53 INFO [channel_context.py:20] %%%%%%%%%% Opening grpc connection to 127.0.0.1 %%%%%%%%%%\n", + "(02-27) 12:24:53 INFO [executor.py:372] Start monitoring events ...\n", + "(02-27) 12:26:30 INFO [model_test_module.py:307] Rank 1: Test set: Average loss: 2.3027, Top-1 Accuracy: 1000.0/10000 (0.1), Top-5 Accuracy: 0.5\n", + "(02-27) 12:26:30 INFO [torch_client.py:264] Test results: Eval_time 94.6244, test_loss 2.3027, test_accuracy 10.00%, test_5_accuracy 50.00% \n", "\n", - "(10-05) 11:04:39 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 11:04:46 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 2.3175615307159316, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:04:46 INFO [client.py:32] Start to train 
(CLIENT: 2) ...\n", - "(10-05) 11:04:53 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 2.198226276340273, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:04:53 INFO [client.py:32] Start to train (CLIENT: 3) ...\n", - "(10-05) 11:05:00 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 2.317745547848251, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:05:00 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 11:05:07 INFO [client.py:67] Training of (CLIENT: 4) completes, {'clientId': 4, 'moving_loss': 2.2811246385331883, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:05:10 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 11:05:16 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 2.200125896722598, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:05:17 INFO [client.py:32] Start to train (CLIENT: 2) ...\n", - "(10-05) 11:05:23 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 2.2880915137374966, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:05:23 INFO [client.py:32] Start to train (CLIENT: 3) ...\n", - "(10-05) 11:05:30 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 2.2059975263981326, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:05:30 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 11:05:37 INFO [client.py:67] Training of (CLIENT: 4) completes, {'clientId': 4, 'moving_loss': 2.34091048582892, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:05:40 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 11:05:46 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 2.149645458253341, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:05:47 INFO [client.py:32] Start to train (CLIENT: 2) ...\n", - "(10-05) 11:05:54 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 2.113924674635463, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:05:54 INFO [client.py:32] Start to train (CLIENT: 3) ...\n", - "(10-05) 11:06:01 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 2.2351507674495643, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:06:01 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 11:06:08 INFO [client.py:67] Training of (CLIENT: 4) completes, {'clientId': 4, 'moving_loss': 2.2538349455169575, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:06:10 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 11:06:17 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 2.1228796089115454, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:06:17 INFO [client.py:32] Start to train (CLIENT: 2) ...\n", - "(10-05) 11:06:25 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 2.079776040856923, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:06:25 INFO [client.py:32] Start to train (CLIENT: 3) ...\n", - "(10-05) 11:06:31 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 2.064791808439913, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:06:31 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 11:06:38 INFO [client.py:67] Training of (CLIENT: 4) completes, {'clientId': 4, 'moving_loss': 2.107674873224351, 'trained_size': 
600, 'success': True}\n", - "(10-05) 11:06:41 INFO [client.py:32] Start to train (CLIENT: 1) ...\n", - "(10-05) 11:06:48 INFO [client.py:67] Training of (CLIENT: 1) completes, {'clientId': 1, 'moving_loss': 2.1673746026503267, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:06:48 INFO [client.py:32] Start to train (CLIENT: 2) ...\n", - "(10-05) 11:06:55 INFO [client.py:67] Training of (CLIENT: 2) completes, {'clientId': 2, 'moving_loss': 2.0539934263054516, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:06:55 INFO [client.py:32] Start to train (CLIENT: 3) ...\n", - "(10-05) 11:07:02 INFO [client.py:67] Training of (CLIENT: 3) completes, {'clientId': 3, 'moving_loss': 2.0785532106098277, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:07:02 INFO [client.py:32] Start to train (CLIENT: 4) ...\n", - "(10-05) 11:07:09 INFO [client.py:67] Training of (CLIENT: 4) completes, {'clientId': 4, 'moving_loss': 2.1002533861294888, 'trained_size': 600, 'success': True}\n", - "(10-05) 11:07:10 INFO [channel_context.py:33] %%%%%%%%%% Closing grpc connection to the aggregator %%%%%%%%%%\n" + "(02-27) 12:26:31 INFO [torch_client.py:49] Start to train (CLIENT: 1) ...\n", + "(02-27) 12:27:33 INFO [torch_client.py:84] Training of (CLIENT: 1) completes, {'client_id': 1, 'moving_loss': 6.559604595482319, 'trained_size': 600, 'success': True}\n", + "(02-27) 12:27:33 INFO [torch_client.py:49] Start to train (CLIENT: 2) ...\n", + "(02-27) 12:28:36 INFO [torch_client.py:84] Training of (CLIENT: 2) completes, {'client_id': 2, 'moving_loss': 5.693940820821615, 'trained_size': 600, 'success': True}\n", + "(02-27) 12:28:36 INFO [torch_client.py:49] Start to train (CLIENT: 3) ...\n", + "(02-27) 12:29:37 INFO [torch_client.py:84] Training of (CLIENT: 3) completes, {'client_id': 3, 'moving_loss': 5.515216224300269, 'trained_size': 600, 'success': True}\n", + "(02-27) 12:29:38 INFO [torch_client.py:49] Start to train (CLIENT: 4) ...\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 15\u001b[0m\n\u001b[1;32m 13\u001b[0m parser\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39muse_cuda \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFalse\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 14\u001b[0m Demo_Executor \u001b[38;5;241m=\u001b[39m Executor(parser\u001b[38;5;241m.\u001b[39margs)\n\u001b[0;32m---> 15\u001b[0m \u001b[43mDemo_Executor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/anaconda3/envs/fedscale/lib/python3.10/site-packages/fedscale/cloud/execution/executor.py:144\u001b[0m, in \u001b[0;36mExecutor.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining_sets, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtesting_sets \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minit_data()\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msetup_communication()\n\u001b[0;32m--> 144\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mevent_monitor\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m~/anaconda3/envs/fedscale/lib/python3.10/site-packages/fedscale/cloud/execution/executor.py:385\u001b[0m, in \u001b[0;36mExecutor.event_monitor\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 383\u001b[0m train_config[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m train_model\n\u001b[1;32m 384\u001b[0m train_config[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclient_id\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mint\u001b[39m(train_config[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclient_id\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m--> 385\u001b[0m client_id, train_res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mTrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrain_config\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 387\u001b[0m \u001b[38;5;66;03m# Upload model updates\u001b[39;00m\n\u001b[1;32m 388\u001b[0m future_call \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maggregator_communicator\u001b[38;5;241m.\u001b[39mstub\u001b[38;5;241m.\u001b[39mCLIENT_EXECUTE_COMPLETION\u001b[38;5;241m.\u001b[39mfuture(\n\u001b[1;32m 389\u001b[0m job_api_pb2\u001b[38;5;241m.\u001b[39mCompleteRequest(client_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mstr\u001b[39m(client_id), executor_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecutor_id,\n\u001b[1;32m 390\u001b[0m event\u001b[38;5;241m=\u001b[39mcommons\u001b[38;5;241m.\u001b[39mUPLOAD_MODEL, status\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, msg\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 391\u001b[0m meta_result\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, data_result\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mserialize_response(train_res)\n\u001b[1;32m 392\u001b[0m ))\n", + "File \u001b[0;32m~/anaconda3/envs/fedscale/lib/python3.10/site-packages/fedscale/cloud/execution/executor.py:204\u001b[0m, in \u001b[0;36mExecutor.Train\u001b[0;34m(self, config)\u001b[0m\n\u001b[1;32m 202\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m object must be a non-null value in the training config.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 203\u001b[0m client_conf \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moverride_conf(train_config)\n\u001b[0;32m--> 204\u001b[0m train_res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_handler\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 205\u001b[0m \u001b[43m \u001b[49m\u001b[43mclient_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mclient_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconf\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mclient_conf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmodel\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 207\u001b[0m \u001b[38;5;66;03m# Report execution completion meta information\u001b[39;00m\n\u001b[1;32m 208\u001b[0m 
response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maggregator_communicator\u001b[38;5;241m.\u001b[39mstub\u001b[38;5;241m.\u001b[39mCLIENT_EXECUTE_COMPLETION(\n\u001b[1;32m 209\u001b[0m job_api_pb2\u001b[38;5;241m.\u001b[39mCompleteRequest(\n\u001b[1;32m 210\u001b[0m client_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mstr\u001b[39m(client_id), executor_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecutor_id,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 213\u001b[0m )\n\u001b[1;32m 214\u001b[0m )\n", + "File \u001b[0;32m~/anaconda3/envs/fedscale/lib/python3.10/site-packages/fedscale/cloud/execution/executor.py:309\u001b[0m, in \u001b[0;36mExecutor.training_handler\u001b[0;34m(self, client_id, conf, model)\u001b[0m\n\u001b[1;32m 303\u001b[0m client_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining_sets \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mtask \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrl\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m \\\n\u001b[1;32m 304\u001b[0m select_dataset(client_id, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining_sets,\n\u001b[1;32m 305\u001b[0m batch_size\u001b[38;5;241m=\u001b[39mconf\u001b[38;5;241m.\u001b[39mbatch_size, args\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs,\n\u001b[1;32m 306\u001b[0m collate_fn\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcollate_fn\n\u001b[1;32m 307\u001b[0m )\n\u001b[1;32m 308\u001b[0m client \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_client_trainer(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs)\n\u001b[0;32m--> 309\u001b[0m train_res \u001b[38;5;241m=\u001b[39m \u001b[43mclient\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 310\u001b[0m \u001b[43m \u001b[49m\u001b[43mclient_data\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mclient_data\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel_adapter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconf\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 312\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m train_res\n", + "File \u001b[0;32m~/anaconda3/envs/fedscale/lib/python3.10/site-packages/fedscale/cloud/execution/torch_client.py:71\u001b[0m, in \u001b[0;36mTorchClient.train\u001b[0;34m(self, client_data, model, conf)\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompleted_steps \u001b[38;5;241m<\u001b[39m conf\u001b[38;5;241m.\u001b[39mlocal_steps:\n\u001b[1;32m 70\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 71\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mclient_data\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptimizer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcriterion\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 72\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m ex:\n\u001b[1;32m 73\u001b[0m error_type \u001b[38;5;241m=\u001b[39m ex\n", + "File \u001b[0;32m~/anaconda3/envs/fedscale/lib/python3.10/site-packages/fedscale/cloud/execution/torch_client.py:235\u001b[0m, in \u001b[0;36mTorchClient.train_step\u001b[0;34m(self, client_data, conf, model, optimizer, criterion)\u001b[0m\n\u001b[1;32m 233\u001b[0m \u001b[38;5;66;03m# ========= Define the backward loss ==============\u001b[39;00m\n\u001b[1;32m 234\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mzero_grad()\n\u001b[0;32m--> 235\u001b[0m \u001b[43mloss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 236\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mstep()\n\u001b[1;32m 238\u001b[0m \u001b[38;5;66;03m# ========= Weight handler ========================\u001b[39;00m\n", + "File \u001b[0;32m~/anaconda3/envs/fedscale/lib/python3.10/site-packages/torch/_tensor.py:488\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 478\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 479\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m 480\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[1;32m 481\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 486\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[1;32m 487\u001b[0m )\n\u001b[0;32m--> 488\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 489\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/anaconda3/envs/fedscale/lib/python3.10/site-packages/torch/autograd/__init__.py:197\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 192\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[1;32m 194\u001b[0m \u001b[38;5;66;03m# The reason we repeat same the comment below is that\u001b[39;00m\n\u001b[1;32m 195\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[1;32m 196\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[0;32m--> 197\u001b[0m \u001b[43mVariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward 
pass\u001b[39;49;00m\n\u001b[1;32m 198\u001b[0m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 199\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], @@ -222,18 +105,26 @@ "import sys, os\n", "\n", "import fedscale.cloud.config_parser as parser\n", - "from fedscale.cloud.execution.client import Client\n", + "from fedscale.cloud.execution.torch_client import TorchClient\n", "from fedscale.cloud.execution.executor import Executor\n", "### On CPU\n", "parser.args.use_cuda = \"False\"\n", "Demo_Executor = Executor(parser.args)\n", "Demo_Executor.run()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "227bece8", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3.7.13 ('fedscale': conda)", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -247,7 +138,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.13" + "version": "3.10.9" }, "vscode": { "interpreter": { diff --git a/examples/notebook/fedscale_demo_server.ipynb b/examples/notebook/fedscale_demo_server.ipynb index bc5ff058..e0efac07 100644 --- a/examples/notebook/fedscale_demo_server.ipynb +++ b/examples/notebook/fedscale_demo_server.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, "id": "a1e48395", "metadata": {}, "outputs": [ @@ -26,287 +26,37 @@ "name": "stderr", "output_type": "stream", "text": [ - "(10-05) 10:56:16 INFO [aggregator.py:33] Job args Namespace(adam_epsilon=1e-08, arrival_interval=3, async_buffer=10, async_mode=False, backbone='./resnet50.pth', backend='gloo', batch_size=30, bidirectional=True, blacklist_max_len=0.3, blacklist_rounds=-1, block_size=64, cfg_file='./utils/rcnn/cfgs/res101.yml', checkin_period=50, clf_block_size=32, clip_bound=0.9, clip_threshold=3.0, clock_factor=1.1624548736462095, conf_path='~/dataset/', connection_timeout=60, cuda_device=None, cut_off_util=0.05, data_cache='', data_dir='~/cifar10/', data_map_file=None, data_set='cifar10', decay_factor=0.98, decay_round=10, device_avail_file=None, device_conf_file='/tmp/client.cfg', dump_epoch=10000000000.0, embedding_file='glove.840B.300d.txt', engine='pytorch', epsilon=0.9, eval_interval=5, executor_configs='127.0.0.1:[1]', experiment_mode='simulation', exploration_alpha=0.3, exploration_decay=0.98, exploration_factor=0.9, exploration_min=0.3, filter_less=32, filter_more=1000000000000000.0, finetune=False, gamma=0.9, gradient_policy=None, hidden_layers=7, hidden_size=256, input_dim=0, job_name='demo_job', labels_path='labels.json', learning_rate=0.05, line_by_line=False, local_steps=20, log_path='./', loss_decay=0.2, malicious_factor=1000000000000000.0, max_concurrency=100, max_staleness=5, memory_capacity=2000, min_learning_rate=5e-05, mlm=False, mlm_probability=0.15, model='shufflenet_v2_x2_0', 
model_size=65536, model_zoo='torchcv', n_actions=2, n_states=4, noise_dir=None, noise_factor=0.1, noise_max=0.5, noise_min=0.0, noise_prob=0.4, num_class=10, num_classes=35, num_executors=1, num_loaders=2, num_participants=4, output_dim=0, overcommitment=1.3, overwrite_cache=False, pacer_delta=5, pacer_step=20, proxy_mu=0.1, ps_ip='127.0.0.1', ps_port='29501', rnn_type='lstm', round_penalty=2.0, round_threshold=30, rounds=20, sample_mode='random', sample_rate=16000, sample_seed=233, sample_window=5.0, spec_augment=False, speed_volume_perturb=False, target_delta=0.0001, target_replace_iter=15, task='cv', test_bsz=128, test_manifest='data/test_manifest.csv', test_output_dir='./logs/server', test_ratio=1.0, test_size_file='', this_rank=1, time_stamp='logs', train_manifest='data/train_manifest.csv', train_size_file='', train_uniform=False, upload_step=20, use_cuda=True, vocab_tag_size=500, vocab_token_size=10000, weight_decay=0, window='hamming', window_size=0.02, window_stride=0.01, yogi_beta=0.9, yogi_beta2=0.99, yogi_eta=0.003, yogi_tau=1e-08)\n", - "2022-10-05 10:56:16.673184: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n", - "2022-10-05 10:56:16.673223: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n", - "2022-10-05 10:56:16.732819: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", - "2022-10-05 10:56:17.784849: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n", - "2022-10-05 10:56:17.784957: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n", - "2022-10-05 10:56:17.784973: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. 
If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n", - "(10-05) 10:56:19 INFO [aggregator.py:129] Initiating control plane communication ...\n", - "(10-05) 10:56:19 INFO [aggregator.py:153] %%%%%%%%%% Opening aggregator sever using port [::]:29501 %%%%%%%%%%\n", - "(10-05) 10:56:19 INFO [fllibs.py:120] Initializing the model ...\n", - "(10-05) 10:56:19 INFO [aggregator.py:871] Start monitoring events ...\n", - "(10-05) 10:56:29 INFO [aggregator.py:263] Received executor 1 information, 1/1\n", - "(10-05) 10:56:29 INFO [aggregator.py:234] Loading 4 client traces ...\n", - "(10-05) 10:56:29 INFO [aggregator.py:251] Info of all feasible clients {'total_feasible_clients': 4, 'total_num_samples': 50000}\n", - "(10-05) 10:56:29 INFO [aggregator.py:529] Wall clock: 0 s, round: 1, Planned participants: 0, Succeed participants: 0, Training loss: 0.0\n", - "(10-05) 10:56:29 INFO [client_manager.py:202] Wall clock time: 0, 4 clients online, 0 clients offline\n", - "(10-05) 10:56:29 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 10:56:30 INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'client_train'])\n", - "(10-05) 10:56:30 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 10:56:31 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:56:31 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:56:38 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:56:38 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:56:46 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:56:46 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:56:53 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:56:53 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:57:00 INFO [aggregator.py:529] Wall clock: 339847 s, round: 2, Planned participants: 4, Succeed participants: 4, Training loss: 5.602389772279155\n", - "(10-05) 10:57:00 INFO [client_manager.py:202] Wall clock time: 339847, 4 clients online, 0 clients offline\n", - "(10-05) 10:57:00 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 10:57:01 INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'client_train'])\n", - "(10-05) 10:57:01 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 10:57:02 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:57:02 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:57:09 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:57:09 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:57:16 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:57:16 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:57:23 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:57:23 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:57:31 INFO [aggregator.py:529] Wall clock: 679693 s, round: 3, Planned participants: 4, Succeed participants: 4, Training loss: 5.302637194544962\n", - "(10-05) 
10:57:31 INFO [client_manager.py:202] Wall clock time: 679693, 4 clients online, 0 clients offline\n", - "(10-05) 10:57:31 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 10:57:32 INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'client_train'])\n", - "(10-05) 10:57:32 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 10:57:33 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:57:33 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:57:40 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:57:40 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:57:47 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:57:47 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:57:55 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:57:55 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:58:02 INFO [aggregator.py:529] Wall clock: 1019540 s, round: 4, Planned participants: 4, Succeed participants: 4, Training loss: 4.599358620310677\n", - "(10-05) 10:58:02 INFO [client_manager.py:202] Wall clock time: 1019540, 4 clients online, 0 clients offline\n", - "(10-05) 10:58:02 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 10:58:03 INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'client_train'])\n", - "(10-05) 10:58:03 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 10:58:04 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:58:04 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:58:11 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:58:11 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:58:18 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:58:18 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:58:25 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:58:25 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:58:32 INFO [aggregator.py:529] Wall clock: 1359387 s, round: 5, Planned participants: 4, Succeed participants: 4, Training loss: 4.436900660524809\n", - "(10-05) 10:58:32 INFO [client_manager.py:202] Wall clock time: 1359387, 4 clients online, 0 clients offline\n", - "(10-05) 10:58:32 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 10:58:33 INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'model_test'])\n", - "(10-05) 10:58:33 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 10:58:34 INFO [aggregator.py:804] ====event queue 1, deque(['model_test'])\n", - "(10-05) 10:58:34 INFO [aggregator.py:827] Issue EVENT (model_test) to EXECUTOR (1)\n", - "(10-05) 10:58:53 INFO [aggragation.py:64] FL Testing in round: 5, virtual_clock: 1359386.95, top_1: 10.0 %, top_5: 51.42 %, test loss: 2.4887, test len: 10000\n", - "(10-05) 10:58:54 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:58:54 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 
10:59:01 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:59:01 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:59:08 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:59:08 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:59:15 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:59:15 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:59:23 INFO [aggregator.py:529] Wall clock: 1699234 s, round: 6, Planned participants: 4, Succeed participants: 4, Training loss: 3.8854766698601377\n", - "(10-05) 10:59:23 INFO [client_manager.py:202] Wall clock time: 1699234, 4 clients online, 0 clients offline\n", - "(10-05) 10:59:23 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 10:59:23 INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'client_train'])\n", - "(10-05) 10:59:23 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 10:59:25 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:59:25 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:59:32 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:59:32 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:59:39 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:59:39 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:59:46 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:59:46 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 10:59:53 INFO [aggregator.py:529] Wall clock: 2039080 s, round: 7, Planned participants: 4, Succeed participants: 4, Training loss: 3.4019230657444686\n", - "(10-05) 10:59:53 INFO [client_manager.py:202] Wall clock time: 2039080, 4 clients online, 0 clients offline\n", - "(10-05) 10:59:53 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 10:59:54 INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'client_train'])\n", - "(10-05) 10:59:54 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 10:59:56 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 10:59:56 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:00:03 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:00:03 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:00:10 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:00:10 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:00:17 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:00:17 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:00:24 INFO [aggregator.py:529] Wall clock: 2378927 s, round: 8, Planned participants: 4, Succeed participants: 4, Training loss: 3.0577747210462976\n", - "(10-05) 11:00:24 INFO [client_manager.py:202] Wall clock time: 2378927, 4 clients online, 0 clients offline\n", - "(10-05) 11:00:24 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 11:00:25 
INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'client_train'])\n", - "(10-05) 11:00:25 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 11:00:26 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:00:26 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:00:33 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:00:33 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:00:40 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:00:40 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:00:47 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:00:47 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:00:54 INFO [aggregator.py:529] Wall clock: 2718774 s, round: 9, Planned participants: 4, Succeed participants: 4, Training loss: 2.8637842447127393\n", - "(10-05) 11:00:54 INFO [client_manager.py:202] Wall clock time: 2718774, 4 clients online, 0 clients offline\n", - "(10-05) 11:00:54 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 11:00:55 INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'client_train'])\n", - "(10-05) 11:00:55 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 11:00:56 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:00:56 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:01:04 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:01:04 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:01:11 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:01:11 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:01:17 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:01:17 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:01:25 INFO [aggregator.py:529] Wall clock: 3058621 s, round: 10, Planned participants: 4, Succeed participants: 4, Training loss: 2.832259889787557\n", - "(10-05) 11:01:25 INFO [client_manager.py:202] Wall clock time: 3058621, 4 clients online, 0 clients offline\n", - "(10-05) 11:01:25 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 11:01:25 INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'model_test'])\n", - "(10-05) 11:01:26 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 11:01:27 INFO [aggregator.py:804] ====event queue 1, deque(['model_test'])\n", - "(10-05) 11:01:27 INFO [aggregator.py:827] Issue EVENT (model_test) to EXECUTOR (1)\n", - "(10-05) 11:01:45 INFO [aggragation.py:64] FL Testing in round: 10, virtual_clock: 3058620.6374999997, top_1: 17.66 %, top_5: 67.99 %, test loss: 2.3363, test len: 10000\n", - "(10-05) 11:01:46 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:01:46 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:01:53 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:01:53 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:02:00 INFO 
[aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:02:00 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:02:07 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:02:07 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:02:15 INFO [aggregator.py:529] Wall clock: 3398467 s, round: 11, Planned participants: 4, Succeed participants: 4, Training loss: 2.4575965823961834\n", - "(10-05) 11:02:15 INFO [client_manager.py:202] Wall clock time: 3398467, 4 clients online, 0 clients offline\n", - "(10-05) 11:02:15 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 11:02:16 INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'client_train'])\n", - "(10-05) 11:02:16 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 11:02:17 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:02:17 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:02:24 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:02:24 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:02:31 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:02:31 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:02:38 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:02:38 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:02:45 INFO [aggregator.py:529] Wall clock: 3738314 s, round: 12, Planned participants: 4, Succeed participants: 4, Training loss: 2.3794690852301428\n", - "(10-05) 11:02:45 INFO [client_manager.py:202] Wall clock time: 3738314, 4 clients online, 0 clients offline\n", - "(10-05) 11:02:45 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 11:02:46 INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'client_train'])\n", - "(10-05) 11:02:46 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 11:02:47 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:02:47 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:02:54 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:02:54 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:03:01 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:03:01 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:03:09 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:03:09 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:03:16 INFO [aggregator.py:529] Wall clock: 4078161 s, round: 13, Planned participants: 4, Succeed participants: 4, Training loss: 2.376686896891561\n", - "(10-05) 11:03:16 INFO [client_manager.py:202] Wall clock time: 4078161, 4 clients online, 0 clients offline\n", - "(10-05) 11:03:16 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 11:03:17 INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'client_train'])\n", - "(10-05) 11:03:17 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 
11:03:18 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:03:18 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:03:25 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:03:25 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:03:32 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:03:32 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:03:39 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:03:39 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:03:46 INFO [aggregator.py:529] Wall clock: 4418008 s, round: 14, Planned participants: 4, Succeed participants: 4, Training loss: 2.4138368855547325\n", - "(10-05) 11:03:46 INFO [client_manager.py:202] Wall clock time: 4418008, 4 clients online, 0 clients offline\n", - "(10-05) 11:03:46 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 11:03:47 INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'client_train'])\n", - "(10-05) 11:03:47 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 11:03:49 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:03:49 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:03:56 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:03:56 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:04:02 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:04:02 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:04:10 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:04:10 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:04:17 INFO [aggregator.py:529] Wall clock: 4757854 s, round: 15, Planned participants: 4, Succeed participants: 4, Training loss: 2.3423781394464704\n", - "(10-05) 11:04:17 INFO [client_manager.py:202] Wall clock time: 4757854, 4 clients online, 0 clients offline\n", - "(10-05) 11:04:17 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 11:04:18 INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'model_test'])\n", - "(10-05) 11:04:18 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 11:04:19 INFO [aggregator.py:804] ====event queue 1, deque(['model_test'])\n", - "(10-05) 11:04:19 INFO [aggregator.py:827] Issue EVENT (model_test) to EXECUTOR (1)\n", - "(10-05) 11:04:38 INFO [aggragation.py:64] FL Testing in round: 15, virtual_clock: 4757854.324999999, top_1: 24.31 %, top_5: 76.41 %, test loss: 2.2923, test len: 10000\n", - "(10-05) 11:04:39 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:04:39 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:04:46 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:04:46 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:04:53 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:04:53 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:05:00 INFO 
[aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:05:00 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:05:07 INFO [aggregator.py:529] Wall clock: 5097701 s, round: 16, Planned participants: 4, Succeed participants: 4, Training loss: 2.2786644983594107\n", - "(10-05) 11:05:08 INFO [client_manager.py:202] Wall clock time: 5097701, 4 clients online, 0 clients offline\n", - "(10-05) 11:05:08 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 11:05:08 INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'client_train'])\n", - "(10-05) 11:05:08 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 11:05:10 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:05:10 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:05:16 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:05:16 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:05:23 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:05:23 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:05:30 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:05:30 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:05:37 INFO [aggregator.py:529] Wall clock: 5437548 s, round: 17, Planned participants: 4, Succeed participants: 4, Training loss: 2.2587813556717866\n", - "(10-05) 11:05:37 INFO [client_manager.py:202] Wall clock time: 5437548, 4 clients online, 0 clients offline\n", - "(10-05) 11:05:37 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 11:05:38 INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'client_train'])\n", - "(10-05) 11:05:38 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 11:05:40 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:05:40 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:05:46 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:05:46 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:05:54 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:05:54 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:06:01 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:06:01 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:06:08 INFO [aggregator.py:529] Wall clock: 5777395 s, round: 18, Planned participants: 4, Succeed participants: 4, Training loss: 2.1881389614638316\n", - "(10-05) 11:06:08 INFO [client_manager.py:202] Wall clock time: 5777395, 4 clients online, 0 clients offline\n", - "(10-05) 11:06:08 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 11:06:09 INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'client_train'])\n", - "(10-05) 11:06:09 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 11:06:10 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:06:10 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 
11:06:17 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:06:17 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:06:25 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:06:25 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:06:31 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:06:31 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:06:39 INFO [aggregator.py:529] Wall clock: 6117241 s, round: 19, Planned participants: 4, Succeed participants: 4, Training loss: 2.093780582858183\n", - "(10-05) 11:06:39 INFO [client_manager.py:202] Wall clock time: 6117241, 4 clients online, 0 clients offline\n", - "(10-05) 11:06:39 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n", - "(10-05) 11:06:39 INFO [aggregator.py:804] ====event queue 1, deque(['update_model', 'client_train'])\n", - "(10-05) 11:06:40 INFO [aggregator.py:827] Issue EVENT (update_model) to EXECUTOR (1)\n", - "(10-05) 11:06:41 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:06:41 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:06:48 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:06:48 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:06:55 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:06:55 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:07:02 INFO [aggregator.py:804] ====event queue 1, deque(['client_train'])\n", - "(10-05) 11:07:02 INFO [aggregator.py:827] Issue EVENT (client_train) to EXECUTOR (1)\n", - "(10-05) 11:07:09 INFO [aggregator.py:529] Wall clock: 6457088 s, round: 20, Planned participants: 4, Succeed participants: 4, Training loss: 2.1000436564237734\n", - "(10-05) 11:07:09 INFO [client_manager.py:202] Wall clock time: 6457088, 4 clients online, 0 clients offline\n", - "(10-05) 11:07:09 INFO [aggregator.py:541] Selected participants to run: [1, 2, 3, 4]\n" + "(02-27) 12:24:08 INFO [aggregator.py:44] Job args Namespace(job_name='demo_job', log_path='./', wandb_token='', ps_ip='127.0.0.1', ps_port='29500', this_rank=1, connection_timeout=60, experiment_mode='simulation', engine='pytorch', num_executors=1, executor_configs='127.0.0.1:[1]', num_participants=4, data_map_file=None, use_cuda=True, cuda_device=None, time_stamp='logs', task='cv', device_avail_file=None, clock_factor=1.1624548736462095, model_zoo='torchcv', data_dir='~/cifar10/', device_conf_file='/tmp/client.cfg', model='shufflenet_v2_x2_0', data_set='cifar10', sample_mode='random', filter_less=32, filter_more=1000000000000000.0, train_uniform=False, conf_path='~/dataset/', overcommitment=1.3, model_size=65536, round_threshold=30, round_penalty=2.0, clip_bound=0.9, blacklist_rounds=-1, blacklist_max_len=0.3, embedding_file='glove.840B.300d.txt', input_shape=[1, 3, 28, 28], save_checkpoint=False, rounds=50, local_steps=20, batch_size=30, test_bsz=128, backend='gloo', learning_rate=0.05, min_learning_rate=5e-05, input_dim=0, output_dim=0, dump_epoch=10000000000.0, decay_factor=0.98, decay_round=10, num_loaders=2, eval_interval=5, sample_seed=233, test_ratio=1.0, loss_decay=0.2, exploration_min=0.3, cut_off_util=0.05, gradient_policy=None, yogi_eta=0.003, yogi_tau=1e-08, yogi_beta=0.9, yogi_beta2=0.99, 
proxy_mu=0.1, cfg_file='./utils/rcnn/cfgs/res101.yml', test_output_dir='./logs/server', train_size_file='', test_size_file='', data_cache='', backbone='./resnet50.pth', malicious_factor=1000000000000000.0, max_concurrency=10, max_staleness=5, noise_factor=0.1, clip_threshold=3.0, target_delta=0.0001, pacer_delta=5, pacer_step=20, exploration_alpha=0.3, exploration_factor=0.9, exploration_decay=0.98, sample_window=5.0, line_by_line=False, clf_block_size=32, mlm=False, mlm_probability=0.15, overwrite_cache=False, block_size=64, weight_decay=0, adam_epsilon=1e-08, vocab_token_size=10000, vocab_tag_size=500, epsilon=0.9, gamma=0.9, memory_capacity=2000, target_replace_iter=15, n_actions=2, n_states=4, num_classes=35, train_manifest='data/train_manifest.csv', test_manifest='data/test_manifest.csv', sample_rate=16000, labels_path='labels.json', window_size=0.02, window_stride=0.01, window='hamming', hidden_size=256, hidden_layers=7, rnn_type='lstm', finetune=False, speed_volume_perturb=False, spec_augment=False, noise_dir=None, noise_prob=0.4, noise_min=0.0, noise_max=0.5, bidirectional=True, num_class=10)\n", + "(02-27) 12:24:08 INFO [aggregator.py:156] Initiating control plane communication ...\n", + "(02-27) 12:24:08 INFO [aggregator.py:180] %%%%%%%%%% Opening aggregator sever using port [::]:29500 %%%%%%%%%%\n", + "(02-27) 12:24:08 INFO [fllibs.py:97] Initializing the model ...\n", + "(02-27) 12:24:08 INFO [aggregator.py:869] Start monitoring events ...\n", + "(02-27) 12:24:53 INFO [aggregator.py:298] Received executor 1 information, 1/1\n", + "(02-27) 12:24:53 INFO [aggregator.py:264] Loading 4 client traces ...\n", + "(02-27) 12:24:53 INFO [aggregator.py:285] Info of all feasible clients {'total_feasible_clients': 4, 'total_num_samples': 50000}\n", + "(02-27) 12:24:53 INFO [aggregator.py:517] Wall clock: 0 s, round: 1, Planned participants: 0, Succeed participants: 0, Training loss: 0.0\n", + "(02-27) 12:24:53 INFO [client_manager.py:194] Wall clock time: 0, 4 clients online, 0 clients offline\n", + "(02-27) 12:24:53 INFO [aggregator.py:531] Selected participants to run: [1, 2, 3, 4]\n", + "(02-27) 12:24:54 INFO [aggregator.py:825] Issue EVENT (update_model) to EXECUTOR (1)\n", + "(02-27) 12:24:55 INFO [aggregator.py:825] Issue EVENT (model_test) to EXECUTOR (1)\n", + "(02-27) 12:26:30 INFO [aggregator.py:490] FL Testing in round: 1, virtual_clock: 0.0, results: {'round': 1, 'clock': 0.0, 'top_1': 0.1, 'top_5': 0.5, 'loss': 2.3026571062546743}\n", + "(02-27) 12:26:31 INFO [aggregator.py:825] Issue EVENT (client_train) to EXECUTOR (1)\n", + "(02-27) 12:27:33 INFO [aggregator.py:825] Issue EVENT (client_train) to EXECUTOR (1)\n", + "(02-27) 12:28:36 INFO [aggregator.py:825] Issue EVENT (client_train) to EXECUTOR (1)\n", + "(02-27) 12:29:38 INFO [aggregator.py:825] Issue EVENT (client_train) to EXECUTOR (1)\n" ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "(10-05) 11:07:10 INFO [aggregator.py:804] ====event queue 1, deque(['terminate_executor'])\n", - "(10-05) 11:07:10 INFO [aggregator.py:827] Issue EVENT (terminate_executor) to EXECUTOR (1)\n" + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 9\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;66;03m### On CPU\u001b[39;00m\n\u001b[1;32m 8\u001b[0m 
parser\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39muse_cuda \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFalse\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 9\u001b[0m \u001b[43mDemo_Aggregator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Documents/Research/code/FedScale/fedscale/cloud/aggregation/aggregator.py:390\u001b[0m, in \u001b[0;36mAggregator.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 386\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minit_model()\n\u001b[1;32m 387\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_update_size \u001b[38;5;241m=\u001b[39m sys\u001b[38;5;241m.\u001b[39mgetsizeof(\n\u001b[1;32m 388\u001b[0m pickle\u001b[38;5;241m.\u001b[39mdumps(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_wrapper)) \u001b[38;5;241m/\u001b[39m \u001b[38;5;241m1024.0\u001b[39m \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m8.\u001b[39m \u001b[38;5;66;03m# kbits\u001b[39;00m\n\u001b[0;32m--> 390\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mevent_monitor\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 391\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstop()\n", + "File \u001b[0;32m~/Documents/Research/code/FedScale/fedscale/cloud/aggregation/aggregator.py:906\u001b[0m, in \u001b[0;36mAggregator.event_monitor\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 902\u001b[0m logging\u001b[38;5;241m.\u001b[39merror(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEvent \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcurrent_event\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m is not defined\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 904\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 905\u001b[0m \u001b[38;5;66;03m# execute every 100 ms\u001b[39;00m\n\u001b[0;32m--> 906\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0.1\u001b[39;49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], @@ -314,7 +64,7 @@ "import sys, os\n", "\n", "import fedscale.cloud.config_parser as parser\n", - "from fedscale.cloud.execution.client import Client\n", + "from fedscale.cloud.execution.torch_client import TorchClient\n", "from fedscale.cloud.aggregation.aggregator import Aggregator\n", "Demo_Aggregator = Aggregator(parser.args)\n", "### On CPU\n", @@ -332,24 +82,31 @@ "name": "stdout", "output_type": "stream", "text": [ - "TensorFlow installation not found - running with reduced feature set.\n", "\n", "NOTE: Using experimental fast data loading logic. To disable, pass\n", " \"--load_fast=false\" and report issues on GitHub. 
More details:\n",
      "  https://github.com/tensorflow/tensorboard/issues/4784\n",
      "\n",
-     "TensorBoard 2.8.0 at http://clnode219.clemson.cloudlab.us:6007/ (Press CTRL+C to quit)\n"
+     "TensorBoard 2.11.2 at http://localhost:6007/ (Press CTRL+C to quit)\n"
     ]
    }
   ],
   "source": [
    "!tensorboard --logdir=./logs/demo_job --port=6007 --bind_all"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f2a40035",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-    "display_name": "Python 3.7.13 ('fedscale': conda)",
+    "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -363,7 +120,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-    "version": "3.7.13"
+    "version": "3.10.9"
   },
   "vscode": {
    "interpreter": {

From 93c7548d8efd0ce58801948858409e4a9fc92dca Mon Sep 17 00:00:00 2001
From: fanlai0990
Date: Sat, 11 Mar 2023 11:48:36 -0500
Subject: [PATCH 2/2] [Doc] Deployment Readme

---
 fedscale/edge/android/README-App.md |  30 +++++++++
 fedscale/edge/android/README.md     | 100 +++++++++++++++-------------
 2 files changed, 85 insertions(+), 45 deletions(-)
 create mode 100644 fedscale/edge/android/README-App.md

diff --git a/fedscale/edge/android/README-App.md b/fedscale/edge/android/README-App.md
new file mode 100644
index 00000000..51f39469
--- /dev/null
+++ b/fedscale/edge/android/README-App.md
@@ -0,0 +1,30 @@
+## FedScale Example Mobile App
+
+We provide a sample app in which you can choose to
+- Train/test models with TFLite or Alibaba MNN.
+- Fine-tune models locally **after** receiving the model from the cloud.
+
+Please follow these steps to download and build the sample Android app.
+
+1. Download and unzip the [sample dataset (TrainTest.zip)](https://drive.google.com/file/d/1nfi3SVzjaE0LPxwj_5DNdqi6rK7BU8kb/view?usp=sharing) into the `assets/` directory. Remove `TrainTest.zip` after unzipping to save space on your mobile device. After unzipping, you should see 3 files and 2 directories under `assets/`:
+    1. `TrainSet`: Training set directory, containing 320 images.
+    2. `TestSet`: Testing set directory, containing 32 images.
+    3. `conf.json`: Configuration file for the mobile app.
+    4. `train_labels.txt`: Training label file with format `