
Commit 6ebe33e

Provide GPU version of lifelong cityscapes example.
Signed-off-by: Jie Pu <[email protected]>
1 parent 3e8de61 commit 6ebe33e


3 files changed (+162 −0 lines)


examples/README.md (+2)
@@ -22,6 +22,8 @@ Example: [Using Federated Learning Job in Surface Defect Detection Scenario](./f

### Lifelong Learning

Example: [Using Lifelong Learning Job in Thermal Comfort Prediction Scenario](./lifelong_learning/atcii/README.md)

Example: [Using Lifelong Learning in Campus Robot Delivery Scenario](./lifelong_learning/cityscapes/README.md)

### Multi-Edge Inference

Example: [Using ReID to Track an Infected COVID-19 Carrier in Pandemic Scenario](./multiedgeinference/pedestrian_tracking/README.md)

examples/lifelong_learning/cityscapes/cityscapes-segmentation-lifelong-learning-tutorial.md (+17)
@@ -211,6 +211,23 @@ spec:
EOF
```

### GPU enabled (optional)
If you want GPUs to accelerate training or inference in Sedna, you can follow the steps below to enable GPU support:

> 1. Follow the instructions in [nvidia-device-plugin](https://github.com/NVIDIA/k8s-device-plugin#quick-start) to make nvidia-docker the docker runtime.
> 2. Set the config `devicePluginEnabled` to `true` and restart edgecore on the GPU edge node.
> 3. Deploy the [device-plugin daemonset](https://github.com/NVIDIA/k8s-device-plugin#enabling-gpu-support-in-kubernetes) and check that the device-plugin pod is running on the GPU edge node.
> 4. Check the capacity and allocatable GPU resources in the GPU edge node status (see the sketch after this list).
> 5. Deploy the [cuda-add pod](https://github.com/NVIDIA/k8s-device-plugin#enabling-gpu-support-in-kubernetes) and wait for it to reach the Running state; this can take a while since the cuda-add image is 1.97 GB.
> 6. Check the cuda-add pod status; a "Test PASSED" message in its log means the GPU has been enabled successfully.

The discussion can be found in this [issue](https://github.com/kubeedge/kubeedge/issues/2324#issuecomment-726645832).
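As a reference for steps 2, 4 and 6, here is a minimal verification sketch. It assumes the edgecore config lives at `/etc/kubeedge/config/edgecore.yaml`, that edgecore runs as a systemd service, and it uses `gpu-edge-node` and `cuda-add` as placeholder node and pod names; none of these values come from this commit, so adjust them to your environment.

```shell
# Step 2 (on the GPU edge node): turn on the device plugin in the edgecore config
# and restart edgecore. The config path and key location can differ between KubeEdge versions.
sudo sed -i 's/devicePluginEnabled: false/devicePluginEnabled: true/' /etc/kubeedge/config/edgecore.yaml
sudo systemctl restart edgecore

# Step 4 (from the cloud side): the GPU should appear in the node's capacity and allocatable.
# "gpu-edge-node" is a placeholder for your GPU edge node name.
kubectl describe node gpu-edge-node | grep -A 6 -E "Capacity|Allocatable"

# Step 6: the test pod should log "Test PASSED" once the GPU is usable.
kubectl logs cuda-add
```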
Once the GPU plugin has been enabled, you can use the [robot-dog-delivery-gpu.yaml](./yaml/robot-dog-delivery-gpu.yaml) configuration to create and run the lifelong learning job.

GPU support in other Sedna features can be enabled in the same way.

## 1.5 Check Lifelong Learning Job
**(1). Query lifelong learning service status**

examples/lifelong_learning/cityscapes/yaml/robot-dog-delivery-gpu.yaml (+143)

@@ -0,0 +1,143 @@
apiVersion: sedna.io/v1alpha1
kind: LifelongLearningJob
metadata:
  name: $job_name
spec:
  dataset:
    name: "lifelong-robo-dataset"
    trainProb: 0.8
  trainSpec:
    template:
      spec:
        nodeName: $TRAIN_NODE
        dnsPolicy: ClusterFirstWithHostNet
        containers:
          - image: $cloud_image
            name: train-worker
            imagePullPolicy: IfNotPresent
            args: [ "train.py" ]
            env:
              - name: "num_class"
                value: "24"
              - name: "epoches"
                value: "1"
              - name: "attribute"
                value: "real, sim"
              - name: "city"
                value: "berlin"
              - name: "BACKEND_TYPE"
                value: "PYTORCH"
              - name: "train_ratio"
                value: "0.9"
              - name: "gpu_ids"
                value: "0"
            resources:
              limits:
                nvidia.com/gpu: 1  # requesting 1 GPU
                cpu: 6
                memory: 12Gi
              requests:
                cpu: 4
                memory: 12Gi
                nvidia.com/gpu: 1  # requesting 1 GPU
            volumeMounts:
              - mountPath: /dev/shm
                name: cache-volume
        volumes:
          - emptyDir:
              medium: Memory
              sizeLimit: 256Mi
            name: cache-volume
    trigger:
      checkPeriodSeconds: 30
      timer:
        start: 00:00
        end: 24:00
      condition:
        operator: ">"
        threshold: 100
        metric: num_of_samples
  evalSpec:
    template:
      spec:
        nodeName: $EVAL_NODE
        dnsPolicy: ClusterFirstWithHostNet
        containers:
          - image: $cloud_image
            name: eval-worker
            imagePullPolicy: IfNotPresent
            args: [ "evaluate.py" ]
            env:
              - name: "operator"
                value: "<"
              - name: "model_threshold"
                value: "0"
              - name: "num_class"
                value: "24"
              - name: "BACKEND_TYPE"
                value: "PYTORCH"
              - name: "gpu_ids"
                value: "0"
            resources:
              limits:
                cpu: 6
                memory: 12Gi
                nvidia.com/gpu: 1  # requesting 1 GPU
              requests:
                cpu: 4
                memory: 12Gi
                nvidia.com/gpu: 1  # requesting 1 GPU
  deploySpec:
    template:
      spec:
        nodeName: $INFER_NODE
        dnsPolicy: ClusterFirstWithHostNet
        hostNetwork: true
        containers:
          - image: $edge_image
            name: infer-worker
            imagePullPolicy: IfNotPresent
            args: [ "predict.py" ]
            env:
              - name: "test_data"
                value: "/data/test_data"
              - name: "num_class"
                value: "24"
              - name: "unseen_save_url"
                value: "/data/unseen_samples"
              - name: "INFERENCE_RESULT_DIR"
                value: "/data/infer_results"
              - name: "BACKEND_TYPE"
                value: "PYTORCH"
              - name: "gpu_ids"
                value: "0"
            volumeMounts:
              - name: unseenurl
                mountPath: /data/unseen_samples
              - name: inferdata
                mountPath: /data/infer_results
              - name: testdata
                mountPath: /data/test_data
            resources:
              limits:
                cpu: 6
                memory: 12Gi
                nvidia.com/gpu: 1  # requesting 1 GPU
              requests:
                cpu: 4
                memory: 12Gi
                nvidia.com/gpu: 1  # requesting 1 GPU
        volumes:
          - name: unseenurl
            hostPath:
              path: /data/unseen_samples
              type: DirectoryOrCreate
          - name: inferdata
            hostPath:
              path: /data/infer_results
              type: DirectoryOrCreate
          - name: testdata
            hostPath:
              path: /data/test_data
              type: DirectoryOrCreate
  outputDir: $OUTPUT/$job_name
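The manifest above uses shell-style placeholders (`$job_name`, `$TRAIN_NODE`, `$EVAL_NODE`, `$INFER_NODE`, `$cloud_image`, `$edge_image`, `$OUTPUT`) that must be filled in before the job is created. Below is a minimal sketch of one way to do that with `envsubst`; the exported values are illustrative placeholders, not values taken from this commit, and the tutorial itself may instead inline the manifest in a `kubectl create -f - <<EOF` heredoc.

```shell
# Placeholder values for illustration only; replace with your own node names, images and paths.
export job_name=robot-dog-delivery-gpu
export TRAIN_NODE=cloud-node
export EVAL_NODE=cloud-node
export INFER_NODE=edge-node
export cloud_image=example-registry/segmentation-train:latest
export edge_image=example-registry/segmentation-infer:latest
export OUTPUT=/data/lifelong-output

# Substitute the $variables and create the LifelongLearningJob.
envsubst < robot-dog-delivery-gpu.yaml | kubectl create -f -

# Check that the job and its workers are scheduled onto the GPU nodes.
kubectl get lifelonglearningjob $job_name
kubectl get pods -o wide
```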
