Skip to content

Commit d711478

Browse files
committed
Add basic support for GPUs that can be partitioned
1 parent ab22d52 commit d711478

File tree

2,768 files changed

+28350
-4
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

2,768 files changed

+28350
-4
lines changed

cmd/k8s-device-plugin/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,8 @@ func (p *Plugin) PreStartContainer(ctx context.Context, r *pluginapi.PreStartCon
128128
func (p *Plugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
129129
p.AMDGPUs = amdgpu.GetAMDGPUs()
130130

131+
glog.Infof("Found %d AMDGPUs", len(p.AMDGPUs))
132+
131133
devs := make([]*pluginapi.Device, len(p.AMDGPUs))
132134

133135
// limit scope for hwloc

internal/pkg/amdgpu/amdgpu.go

Lines changed: 69 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,21 +95,52 @@ func GetAMDGPUs() map[string]map[string]int {
9595
matches, _ := filepath.Glob("/sys/module/amdgpu/drivers/pci:amdgpu/[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]:*")
9696

9797
devices := make(map[string]map[string]int)
98+
card, renderD := 0, 128
9899

99100
for _, path := range matches {
100101
glog.Info(path)
101102
devPaths, _ := filepath.Glob(path + "/drm/*")
102-
devices[filepath.Base(path)] = make(map[string]int)
103103

104104
for _, devPath := range devPaths {
105105
switch name := filepath.Base(devPath); {
106106
case name[0:4] == "card":
107-
devices[filepath.Base(path)][name[0:4]], _ = strconv.Atoi(name[4:])
107+
card, _ = strconv.Atoi(name[4:])
108108
case name[0:7] == "renderD":
109-
devices[filepath.Base(path)][name[0:7]], _ = strconv.Atoi(name[7:])
109+
renderD, _ = strconv.Atoi(name[7:])
110110
}
111111
}
112+
113+
devices[filepath.Base(path)] = map[string]int{"card": card, "renderD": renderD}
114+
}
115+
116+
// certain products have additional devices (such as MI300's partitions)
117+
// e.g. /sys/devices/platform/amdgpu_xcp_30
118+
platformMatches, _ := filepath.Glob("/sys/devices/platform/amdgpu_xcp_*")
119+
120+
// This is needed because some of the visible renderD nodes are not actually valid.
121+
// Their validity depends on topology information from KFD
122+
topoRenderNodes := renderNodeSetFromTopology()
123+
124+
for _, path := range platformMatches {
125+
glog.Info(path)
126+
devPaths, _ := filepath.Glob(path + "/drm/*")
127+
128+
for _, devPath := range devPaths {
129+
switch name := filepath.Base(devPath); {
130+
case name[0:4] == "card":
131+
card, _ = strconv.Atoi(name[4:])
132+
case name[0:7] == "renderD":
133+
renderD, _ = strconv.Atoi(name[7:])
134+
}
135+
}
136+
137+
if !topoRenderNodes[renderD] {
138+
continue
139+
}
140+
141+
devices[filepath.Base(path)] = map[string]int{"card": card, "renderD": renderD}
112142
}
143+
113144
return devices
114145
}
115146

@@ -274,3 +305,38 @@ func parseDebugFSFirmwareInfo(path string) (map[string]uint32, map[string]uint32
274305

275306
return feat, fw
276307
}
308+
309+
// topoDrmRenderMinorRe matches a "drm_render_minor" entry followed by a single
// whitespace character and captures the decimal number after it.
var topoDrmRenderMinorRe = regexp.MustCompile(`drm_render_minor\s(\d+)`)
310+
311+
func renderNodeSetFromTopology(topoRootParam ...string) map[int]bool {
312+
topoRoot := "/sys/class/kfd/kfd"
313+
if len(topoRootParam) == 1 {
314+
topoRoot = topoRootParam[0]
315+
}
316+
317+
renderNodes := make(map[int]bool)
318+
var nodeFiles []string
319+
var err error
320+
321+
if nodeFiles, err = filepath.Glob(topoRoot + "/topology/nodes/*/properties"); err != nil {
322+
glog.Fatalf("glob error: %s", err)
323+
return renderNodes
324+
}
325+
326+
for _, nodeFile := range nodeFiles {
327+
glog.Info("Parsing " + nodeFile)
328+
v, e := ParseTopologyProperties(nodeFile, topoDrmRenderMinorRe)
329+
if e != nil {
330+
glog.Error(e)
331+
continue
332+
}
333+
334+
if v <= 0 {
335+
continue
336+
}
337+
338+
renderNodes[int(v)] = true
339+
}
340+
341+
return renderNodes
342+
}

internal/pkg/amdgpu/amdgpu_test.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@
1717
package amdgpu
1818

1919
import (
20+
"encoding/json"
2021
"fmt"
2122
"io/ioutil"
2223
"path/filepath"
24+
"reflect"
2325
"regexp"
2426
"strings"
2527
"testing"
@@ -213,3 +215,17 @@ func TestParseDebugFSFirmwareInfo(t *testing.T) {
213215
t.Errorf("Incorrect parsing of amdgpu firmware info from debugfs")
214216
}
215217
}
218+
219+
func TestRenderNodeSetFromTopology(t *testing.T) {
220+
renderNodes := renderNodeSetFromTopology("../../../testdata/topology-parsing-mi308")
221+
222+
expNodes := map[int]bool{128: true, 129: true}
223+
if !reflect.DeepEqual(renderNodes, expNodes) {
224+
val, _ := json.MarshalIndent(renderNodes, "", " ")
225+
exp, _ := json.MarshalIndent(expNodes, "", " ")
226+
227+
t.Errorf("RenderNode set was incorrect")
228+
t.Errorf("Got: %s", val)
229+
t.Errorf("Want: %s", exp)
230+
}
231+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
33
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
0
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
type 5
2+
version_major 0
3+
version_minor 0
4+
node_from 0
5+
node_to 1
6+
weight 21
7+
min_latency 0
8+
max_latency 0
9+
min_bandwidth 0
10+
max_bandwidth 0
11+
recommended_transfer_size 0
12+
flags 0
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
type 2
2+
version_major 0
3+
version_minor 0
4+
node_from 0
5+
node_to 2
6+
weight 20
7+
min_latency 0
8+
max_latency 0
9+
min_bandwidth 312
10+
max_bandwidth 64000
11+
recommended_transfer_size 0
12+
flags 3
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
type 2
2+
version_major 0
3+
version_minor 0
4+
node_from 0
5+
node_to 11
6+
weight 20
7+
min_latency 0
8+
max_latency 0
9+
min_bandwidth 312
10+
max_bandwidth 64000
11+
recommended_transfer_size 0
12+
flags 3
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
type 2
2+
version_major 0
3+
version_minor 0
4+
node_from 0
5+
node_to 12
6+
weight 20
7+
min_latency 0
8+
max_latency 0
9+
min_bandwidth 312
10+
max_bandwidth 64000
11+
recommended_transfer_size 0
12+
flags 3
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
type 2
2+
version_major 0
3+
version_minor 0
4+
node_from 0
5+
node_to 13
6+
weight 20
7+
min_latency 0
8+
max_latency 0
9+
min_bandwidth 312
10+
max_bandwidth 64000
11+
recommended_transfer_size 0
12+
flags 3

0 commit comments

Comments
 (0)