From c7cfe0e3b3f28bdd31ca84c7c3bbff616d1393d0 Mon Sep 17 00:00:00 2001 From: Jie Pu Date: Sat, 29 Jul 2023 15:40:21 +0800 Subject: [PATCH] Provide GPU version of lifelong cityscapes example. Signed-off-by: Jie Pu --- examples/README.md | 2 + ...segmentation-lifelong-learning-tutorial.md | 17 +++ .../yaml/robot-dog-delivery-gpu.yaml | 143 ++++++++++++++++++ 3 files changed, 162 insertions(+) create mode 100644 examples/lifelong_learning/cityscapes/yaml/robot-dog-delivery-gpu.yaml diff --git a/examples/README.md b/examples/README.md index 09e15c0e7..4efeb6f60 100644 --- a/examples/README.md +++ b/examples/README.md @@ -22,6 +22,8 @@ Example: [Using Federated Learning Job in Surface Defect Detection Scenario](./f ### Lifelong Learning Example: [Using Lifelong Learning Job in Thermal Comfort Prediction Scenario](./lifelong_learning/atcii/README.md) +Example: [Using Lifelong Learning in Campus Robot Delivery Scenario](./lifelong_learning/cityscapes/README.md) + ### Multi-Edge Inference Example: [Using ReID to Track an Infected COVID-19 Carrier in Pandemic Scenario](./multiedgeinference/pedestrian_tracking/README.md) diff --git a/examples/lifelong_learning/cityscapes/cityscapes-segmentation-lifelong-learning-tutorial.md b/examples/lifelong_learning/cityscapes/cityscapes-segmentation-lifelong-learning-tutorial.md index 7d082b43e..38e6ef148 100644 --- a/examples/lifelong_learning/cityscapes/cityscapes-segmentation-lifelong-learning-tutorial.md +++ b/examples/lifelong_learning/cityscapes/cityscapes-segmentation-lifelong-learning-tutorial.md @@ -211,6 +211,23 @@ spec: EOF ``` +### GPU enabled (optional) +If you want to use a GPU to accelerate training or inference in Sedna, you can follow the steps below to enable GPU support: + +> 1. Follow the instructions in [nvidia-device-plugin](https://github.com/NVIDIA/k8s-device-plugin#quick-start) to make nvidia-docker the Docker runtime. +> 2. Set config `devicePluginEnabled` to `true` and restart edgecore on the GPU edge node. 
Please refer to this [doc](https://kubeedge.io/docs/setup/config#configuration-edge-side-kubeedge-worker-node) for configuring edgecore. +> 3. Deploy the [device-plugin daemonset](https://github.com/NVIDIA/k8s-device-plugin#enabling-gpu-support-in-kubernetes) and check the running status of the device-plugin pod on the GPU edge node. +> 4. Check the capacity and allocatable resources reported in the GPU edge node's status. +> 5. Deploy the [cuda-add pod](https://github.com/NVIDIA/k8s-device-plugin#enabling-gpu-support-in-kubernetes), and wait some time for the pod to be running since the size of the cuda-add image is 1.97GB. +> 6. Check the cuda-add pod status; a "Test PASSED" message in its log means the GPU is enabled successfully. + +The discussion can be found in this [issue](https://github.com/kubeedge/kubeedge/issues/2324#issuecomment-726645832). + +Once the GPU plugin has been enabled, you can use the [robot-dog-delivery-gpu.yaml](./yaml/robot-dog-delivery-gpu.yaml) configuration to create and run the lifelong learning job. + +GPU support for other Sedna features can be enabled by following similar steps. + + ## 1.5 Check Lifelong Learning Job **(1). 
Query lifelong learning service status**
diff --git a/examples/lifelong_learning/cityscapes/yaml/robot-dog-delivery-gpu.yaml b/examples/lifelong_learning/cityscapes/yaml/robot-dog-delivery-gpu.yaml
new file mode 100644
index 000000000..cb41cd4c6
--- /dev/null
+++ b/examples/lifelong_learning/cityscapes/yaml/robot-dog-delivery-gpu.yaml
@@ -0,0 +1,143 @@
+apiVersion: sedna.io/v1alpha1
+kind: LifelongLearningJob
+metadata:
+  name: $job_name
+spec:
+  dataset:
+    name: "lifelong-robo-dataset"
+    trainProb: 0.8
+  trainSpec:
+    template:
+      spec:
+        nodeName: $TRAIN_NODE
+        dnsPolicy: ClusterFirstWithHostNet
+        containers:
+          - image: $cloud_image
+            name: train-worker
+            imagePullPolicy: IfNotPresent
+            args: [ "train.py" ]
+            env:
+              - name: "num_class"
+                value: "24"
+              - name: "epoches"
+                value: "1"
+              - name: "attribute"
+                value: "real, sim"
+              - name: "city"
+                value: "berlin"
+              - name: "BACKEND_TYPE"
+                value: "PYTORCH"
+              - name: "train_ratio"
+                value: "0.9"
+              - name: "gpu_ids"
+                value: "0"
+            resources:
+              limits:
+                cpu: 6
+                memory: 12Gi
+                nvidia.com/gpu: 1  # requesting 1 GPU
+              requests:
+                cpu: 4
+                memory: 12Gi
+                nvidia.com/gpu: 1  # GPU request must equal the GPU limit
+            volumeMounts:
+              - mountPath: /dev/shm
+                name: cache-volume
+        volumes:
+          - emptyDir:
+              medium: Memory  # RAM-backed volume, mounted at /dev/shm above
+              sizeLimit: 256Mi
+            name: cache-volume
+    trigger:
+      checkPeriodSeconds: 30
+      timer:
+        start: "00:00"  # quoted: YAML 1.1 loaders may read bare HH:MM as a base-60 number
+        end: "24:00"  # quoted: a bare 24:00 can load as the integer 1440
+      condition:
+        operator: ">"
+        threshold: 100
+        metric: num_of_samples
+  evalSpec:
+    template:
+      spec:
+        nodeName: $EVAL_NODE
+        dnsPolicy: ClusterFirstWithHostNet
+        containers:
+          - image: $cloud_image
+            name: eval-worker
+            imagePullPolicy: IfNotPresent
+            args: [ "evaluate.py" ]
+            env:
+              - name: "operator"
+                value: "<"
+              - name: "model_threshold"
+                value: "0"
+              - name: "num_class"
+                value: "24"
+              - name: "BACKEND_TYPE"
+                value: "PYTORCH"
+              - name: "gpu_ids"
+                value: "0"
+            resources:
+              limits:
+                cpu: 6
+                memory: 12Gi
+                nvidia.com/gpu: 1  # requesting 1 GPU
+              requests:
+                cpu: 4
+                memory: 12Gi
+                nvidia.com/gpu: 1  # GPU request must equal the GPU limit
+  deploySpec:
+    template:
+      spec:
+        nodeName: $INFER_NODE
+        dnsPolicy: ClusterFirstWithHostNet
+        hostNetwork: true
+        containers:
+          - image: $edge_image
+            name: infer-worker
+            imagePullPolicy: IfNotPresent
+            args: [ "predict.py" ]
+            env:
+              - name: "test_data"
+                value: "/data/test_data"
+              - name: "num_class"
+                value: "24"
+              - name: "unseen_save_url"
+                value: "/data/unseen_samples"
+              - name: "INFERENCE_RESULT_DIR"
+                value: "/data/infer_results"
+              - name: "BACKEND_TYPE"
+                value: "PYTORCH"
+              - name: "gpu_ids"
+                value: "0"
+            volumeMounts:
+              - name: unseenurl
+                mountPath: /data/unseen_samples
+              - name: inferdata
+                mountPath: /data/infer_results
+              - name: testdata
+                mountPath: /data/test_data
+            resources:
+              limits:
+                cpu: 6
+                memory: 12Gi
+                nvidia.com/gpu: 1  # requesting 1 GPU
+              requests:
+                cpu: 4
+                memory: 12Gi
+                nvidia.com/gpu: 1  # GPU request must equal the GPU limit
+        volumes:
+          - name: unseenurl
+            hostPath:
+              path: /data/unseen_samples
+              type: DirectoryOrCreate
+          - name: inferdata
+            hostPath:
+              path: /data/infer_results
+              type: DirectoryOrCreate
+          - name: testdata
+            hostPath:
+              path: /data/test_data
+              type: DirectoryOrCreate
+  outputDir: $OUTPUT/$job_name