diff --git a/Dockerfile b/Dockerfile index a4fcadb..c8ca053 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ RUN apt update -y && apt install -y gcc make wget ARG GO_VERSION=1.22.5 RUN wget https://golang.google.cn/dl/go$GO_VERSION.linux-arm64.tar.gz RUN rm -rf /usr/local/go && tar -C /usr/local -xzf go$GO_VERSION.linux-arm64.tar.gz -ENV PATH=/usr/local/go/bin:$PATH +ENV PATH=/usr/local/go/bin:/root/go/bin:$PATH ARG GOPROXY ARG VERSION ADD . /build diff --git a/Makefile b/Makefile index 71bd182..342cb2a 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ all: ascend-device-plugin tidy: $(GO) mod tidy -lint: +lint: tidy $(GO) install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.61.0 golangci-lint run diff --git a/README.md b/README.md new file mode 100644 index 0000000..c7d6e06 --- /dev/null +++ b/README.md @@ -0,0 +1,60 @@ +# Ascend Device Plugin + +## 说明 + +基于[HAMi](https://github.com/Project-HAMi/HAMi)调度机制的ascend device plugin。 + +支持基于显存调度,显存是基于昇腾的虚拟化模板来切分的,会找到满足显存需求的最小模板来作为容器的显存。 + +启动容器依赖[ascend-docker-runtime](https://gitee.com/ascend/ascend-docker-runtime)。 + +## 编译 + +### 编译二进制文件 + +```bash +make all +``` + +### 编译镜像 + +```bash +docker buildx build -t $IMAGE_NAME . +``` + +## 部署 + +由于和HAMi的一些依赖关系,部署集成在HAMi的部署中,修改HAMi chart values中的以下部分即可。 + +```yaml +devices: + ascend: + enabled: true + image: "ascend-device-plugin:master" + imagePullPolicy: IfNotPresent + extraArgs: [] + nodeSelector: + ascend: "on" + tolerations: [] + resources: + - huawei.com/Ascend910A + - huawei.com/Ascend910A-memory + - huawei.com/Ascend910B + - huawei.com/Ascend910B-memory + - huawei.com/Ascend310P + - huawei.com/Ascend310P-memory +``` + +## 使用 + +```yaml +... + containers: + - name: npu-pod + ... 
+ resources: + limits: + huawei.com/Ascend910B: "1" + # 不填写显存默认使用整张卡 + huawei.com/Ascend910B-memory: "4096" +``` diff --git a/go.mod b/go.mod index 2bd40b9..5bdc517 100644 --- a/go.mod +++ b/go.mod @@ -66,6 +66,6 @@ require ( ) replace ( - github.com/Project-HAMi/HAMi v0.0.0 => github.com/zoyopei/HAMi v0.0.0-20240911093519-601839823f68 + github.com/Project-HAMi/HAMi v0.0.0 => github.com/zoyopei/HAMi v0.0.0-20240913070807-899199680605 huawei.com/npu-exporter/v6 => gitee.com/ascend/ascend-npu-exporter/v6 v6.0.0-RC2.b001 ) diff --git a/go.sum b/go.sum index c9b7d2e..d0b2403 100644 --- a/go.sum +++ b/go.sum @@ -125,8 +125,8 @@ github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673/go.mod h1:N3UwUGtsr github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -github.com/zoyopei/HAMi v0.0.0-20240911093519-601839823f68 h1:WV8Qod2BWFsOqfRr4/X7EXve1bnJgh4EX8OW4doXGUY= -github.com/zoyopei/HAMi v0.0.0-20240911093519-601839823f68/go.mod h1:lY4bmpcPiKWg0bVPCJFRH6xDW8p5PouIk/nIIU1I2d8= +github.com/zoyopei/HAMi v0.0.0-20240913070807-899199680605 h1:5Zf/OYHoYhEQlaIs3/mtso6wlH3bAp5cFrpw2X+PZsc= +github.com/zoyopei/HAMi v0.0.0-20240913070807-899199680605/go.mod h1:lY4bmpcPiKWg0bVPCJFRH6xDW8p5PouIk/nIIU1I2d8= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= diff --git a/manager.go b/manager.go index 2d7c053..3d1ca4b 100644 --- a/manager.go +++ b/manager.go @@ -123,7 +123,7 @@ func (am *AscendManager) UpdateDevice() error { return err } am.devs = append(am.devs, &Device{ - UUID: fmt.Sprintf("%s-%d", 
am.config.CommonWord, ID), + UUID: fmt.Sprintf("%s-%d", am.config.CommonWord, phyID), LogicID: ID, PhyID: phyID, CardID: cardID, diff --git a/server.go b/server.go index c32d73e..e84b631 100644 --- a/server.go +++ b/server.go @@ -23,15 +23,15 @@ import ( "net" "os" "path" - "strconv" - "strings" "time" "github.com/Project-HAMi/HAMi/pkg/api" + "github.com/Project-HAMi/HAMi/pkg/device/ascend" "github.com/Project-HAMi/HAMi/pkg/util" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/json" "k8s.io/klog/v2" "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" ) @@ -187,9 +187,10 @@ func (ps *PluginServer) registerKubelet() error { func (ps *PluginServer) registerHAMi() error { devs := ps.mgr.GetDevices() apiDevices := make([]*api.DeviceInfo, 0, len(devs)) - for _, dev := range devs { + // hami currently believes that the index starts from 0 and is continuous. + for i, dev := range devs { apiDevices = append(apiDevices, &api.DeviceInfo{ - Index: int(dev.PhyID), + Index: i, ID: dev.UUID, Count: 1, Devmem: int32(dev.Memory), @@ -248,25 +249,20 @@ func (ps *PluginServer) parsePodAnnotation(pod *v1.Pod) ([]int32, []string, erro if !ok { return nil, nil, fmt.Errorf("annotation %s not set", "huawei.com/Ascend") } + var rtInfo []ascend.RuntimeInfo + err := json.Unmarshal([]byte(anno), &rtInfo) + if err != nil { + return nil, nil, fmt.Errorf("annotation %s value %s invalid", ps.allocAnno, anno) + } var IDs []int32 var temps []string - ss := strings.Split(anno, ",") - for _, s := range ss { - if s == "" { - continue + for _, info := range rtInfo { + d := ps.mgr.GetDeviceByUUID(info.UUID) + if d == nil { + return nil, nil, fmt.Errorf("unknown uuid: %s", info.UUID) } - is := strings.Split(s, "-") - phyID := 0 - temp := "" - phyID, err := strconv.Atoi(is[0]) - if err != nil { - return nil, nil, fmt.Errorf("annotation %s value %s invalid", ps.allocAnno, anno) - } - if len(is) == 2 { - temp = is[1] - } - IDs = 
append(IDs, int32(phyID)) - temps = append(temps, temp) + IDs = append(IDs, d.PhyID) + temps = append(temps, info.Temp) } if len(IDs) == 0 { return nil, nil, fmt.Errorf("annotation %s value %s invalid", ps.allocAnno, anno) @@ -343,7 +339,9 @@ func (ps *PluginServer) Allocate(ctx context.Context, reqs *v1beta1.AllocateRequ } resp.Envs = make(map[string]string) resp.Envs["ASCEND_VISIBLE_DEVICES"] = ascendVisibleDevices - resp.Envs["ASCEND_VNPU_SPECS"] = ascendVNPUSpec + if ascendVNPUSpec != "" { + resp.Envs["ASCEND_VNPU_SPECS"] = ascendVNPUSpec + } klog.V(5).Infof("allocate response: %v", resp) return &v1beta1.AllocateResponse{ContainerResponses: []*v1beta1.ContainerAllocateResponse{&resp}}, nil }