Skip to content

Commit

Permalink
change pod alloc annotations
Browse files Browse the repository at this point in the history
Signed-off-by: zoyopei <[email protected]>
  • Loading branch information
zoyopei committed Sep 13, 2024
1 parent 56f7924 commit aa1f98a
Show file tree
Hide file tree
Showing 7 changed files with 85 additions and 27 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ RUN apt update -y && apt install -y gcc make wget
ARG GO_VERSION=1.22.5
RUN wget https://golang.google.cn/dl/go$GO_VERSION.linux-arm64.tar.gz
RUN rm -rf /usr/local/go && tar -C /usr/local -xzf go$GO_VERSION.linux-arm64.tar.gz
ENV PATH=/usr/local/go/bin:$PATH
ENV PATH=/usr/local/go/bin:/root/go/bin:$PATH
ARG GOPROXY
ARG VERSION
ADD . /build
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ all: ascend-device-plugin
tidy:
$(GO) mod tidy

lint:
lint: tidy
$(GO) install github.com/golangci/golangci-lint/cmd/[email protected]
golangci-lint run

Expand Down
60 changes: 60 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Ascend Device Plugin

## 说明

基于[HAMi](https://github.com/Project-HAMi/HAMi)调度机制的ascend device plugin。

支持按显存进行调度:显存按照昇腾的虚拟化模板进行切分,调度时会选择满足显存需求的最小模板,作为容器实际分配到的显存规格。

启动容器依赖[ascend-docker-runtime](https://gitee.com/ascend/ascend-docker-runtime)。

## 编译

### 编译二进制文件

```bash
make all
```

### 编译镜像

```bash
docker buildx build -t $IMAGE_NAME .
```

## 部署

由于本插件依赖HAMi,其部署已集成在HAMi的部署流程中,只需修改HAMi chart的values中的以下部分即可。

```yaml
devices:
  ascend:
    enabled: true
    image: "ascend-device-plugin:master"
    imagePullPolicy: IfNotPresent
    extraArgs: []
    nodeSelector:
      ascend: "on"
    tolerations: []
    resources:
      - huawei.com/Ascend910A
      - huawei.com/Ascend910A-memory
      - huawei.com/Ascend910B
      - huawei.com/Ascend910B-memory
      - huawei.com/Ascend310P
      - huawei.com/Ascend310P-memory
```

## 使用

```yaml
...
containers:
  - name: npu_pod
    ...
    resources:
      limits:
        huawei.com/Ascend910B: "1"
        # 不填写显存默认使用整张卡
        huawei.com/Ascend910B-memory: "4096"
```
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,6 @@ require (
)

replace (
github.com/Project-HAMi/HAMi v0.0.0 => github.com/zoyopei/HAMi v0.0.0-20240911093519-601839823f68
github.com/Project-HAMi/HAMi v0.0.0 => github.com/zoyopei/HAMi v0.0.0-20240913070807-899199680605
huawei.com/npu-exporter/v6 => gitee.com/ascend/ascend-npu-exporter/v6 v6.0.0-RC2.b001
)
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,8 @@ github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673/go.mod h1:N3UwUGtsr
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
github.com/zoyopei/HAMi v0.0.0-20240911093519-601839823f68 h1:WV8Qod2BWFsOqfRr4/X7EXve1bnJgh4EX8OW4doXGUY=
github.com/zoyopei/HAMi v0.0.0-20240911093519-601839823f68/go.mod h1:lY4bmpcPiKWg0bVPCJFRH6xDW8p5PouIk/nIIU1I2d8=
github.com/zoyopei/HAMi v0.0.0-20240913070807-899199680605 h1:5Zf/OYHoYhEQlaIs3/mtso6wlH3bAp5cFrpw2X+PZsc=
github.com/zoyopei/HAMi v0.0.0-20240913070807-899199680605/go.mod h1:lY4bmpcPiKWg0bVPCJFRH6xDW8p5PouIk/nIIU1I2d8=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
Expand Down
2 changes: 1 addition & 1 deletion manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ func (am *AscendManager) UpdateDevice() error {
return err
}
am.devs = append(am.devs, &Device{
UUID: fmt.Sprintf("%s-%d", am.config.CommonWord, ID),
UUID: fmt.Sprintf("%s-%d", am.config.CommonWord, phyID),
LogicID: ID,
PhyID: phyID,
CardID: cardID,
Expand Down
40 changes: 19 additions & 21 deletions server.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,15 @@ import (
"net"
"os"
"path"
"strconv"
"strings"
"time"

"github.com/Project-HAMi/HAMi/pkg/api"
"github.com/Project-HAMi/HAMi/pkg/device/ascend"
"github.com/Project-HAMi/HAMi/pkg/util"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/json"
"k8s.io/klog/v2"
"k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)
Expand Down Expand Up @@ -187,9 +187,10 @@ func (ps *PluginServer) registerKubelet() error {
func (ps *PluginServer) registerHAMi() error {
devs := ps.mgr.GetDevices()
apiDevices := make([]*api.DeviceInfo, 0, len(devs))
for _, dev := range devs {
// hami currently believes that the index starts from 0 and is continuous.
for i, dev := range devs {
apiDevices = append(apiDevices, &api.DeviceInfo{
Index: int(dev.PhyID),
Index: i,
ID: dev.UUID,
Count: 1,
Devmem: int32(dev.Memory),
Expand Down Expand Up @@ -248,25 +249,20 @@ func (ps *PluginServer) parsePodAnnotation(pod *v1.Pod) ([]int32, []string, erro
if !ok {
return nil, nil, fmt.Errorf("annotation %s not set", "huawei.com/Ascend")
}
var rtInfo []ascend.RuntimeInfo
err := json.Unmarshal([]byte(anno), &rtInfo)
if err != nil {
return nil, nil, fmt.Errorf("annotation %s value %s invalid", ps.allocAnno, anno)
}
var IDs []int32
var temps []string
ss := strings.Split(anno, ",")
for _, s := range ss {
if s == "" {
continue
for _, info := range rtInfo {
d := ps.mgr.GetDeviceByUUID(info.UUID)
if d == nil {
return nil, nil, fmt.Errorf("unknown uuid: %s", info.UUID)
}
is := strings.Split(s, "-")
phyID := 0
temp := ""
phyID, err := strconv.Atoi(is[0])
if err != nil {
return nil, nil, fmt.Errorf("annotation %s value %s invalid", ps.allocAnno, anno)
}
if len(is) == 2 {
temp = is[1]
}
IDs = append(IDs, int32(phyID))
temps = append(temps, temp)
IDs = append(IDs, d.PhyID)
temps = append(temps, info.Temp)
}
if len(IDs) == 0 {
return nil, nil, fmt.Errorf("annotation %s value %s invalid", ps.allocAnno, anno)
Expand Down Expand Up @@ -343,7 +339,9 @@ func (ps *PluginServer) Allocate(ctx context.Context, reqs *v1beta1.AllocateRequ
}
resp.Envs = make(map[string]string)
resp.Envs["ASCEND_VISIBLE_DEVICES"] = ascendVisibleDevices
resp.Envs["ASCEND_VNPU_SPECS"] = ascendVNPUSpec
if ascendVNPUSpec != "" {
resp.Envs["ASCEND_VNPU_SPECS"] = ascendVNPUSpec
}
klog.V(5).Infof("allocate response: %v", resp)
return &v1beta1.AllocateResponse{ContainerResponses: []*v1beta1.ContainerAllocateResponse{&resp}}, nil
}
Expand Down

0 comments on commit aa1f98a

Please sign in to comment.