@@ -63,6 +63,9 @@ type NvidiaDevicePlugin struct {
63
63
server * grpc.Server
64
64
health chan * rm.Device
65
65
stop chan interface {}
66
+
67
+ mpsDaemon * mps.Daemon
68
+ mpsHostRoot mps.Root
66
69
}
67
70
68
71
// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin
@@ -74,6 +77,13 @@ func NewNvidiaDevicePlugin(config *spec.Config, resourceManager rm.ResourceManag
74
77
pluginName := "nvidia-" + name
75
78
pluginPath := filepath .Join (pluginapi .DevicePluginPath , pluginName )
76
79
80
+ var mpsDaemon * mps.Daemon
81
+ var mpsHostRoot mps.Root
82
+ if config .Sharing .SharingStrategy () != spec .SharingStrategyMPS {
83
+ mpsDaemon = mps .NewDaemon (resourceManager , mps .ContainerRoot )
84
+ mpsHostRoot = mps .Root (* config .Flags .CommandLineFlags .MpsRoot )
85
+ }
86
+
77
87
return & NvidiaDevicePlugin {
78
88
rm : resourceManager ,
79
89
config : config ,
@@ -83,6 +93,9 @@ func NewNvidiaDevicePlugin(config *spec.Config, resourceManager rm.ResourceManag
83
93
cdiHandler : cdiHandler ,
84
94
cdiAnnotationPrefix : * config .Flags .Plugin .CDIAnnotationPrefix ,
85
95
96
+ mpsDaemon : mpsDaemon ,
97
+ mpsHostRoot : mpsHostRoot ,
98
+
86
99
// These will be reinitialized every
87
100
// time the plugin server is restarted.
88
101
server : nil ,
@@ -148,11 +161,12 @@ func (plugin *NvidiaDevicePlugin) waitForMPSDaemon() error {
148
161
if plugin .config .Sharing .SharingStrategy () != spec .SharingStrategyMPS {
149
162
return nil
150
163
}
151
- // TODO: Check the started file here.
164
+ // TODO: Check the .ready file here.
152
165
// TODO: Have some retry strategy here.
153
- if err := mps . NewDaemon ( plugin .rm ) .AssertHealthy (); err != nil {
166
+ if err := plugin .mpsDaemon .AssertHealthy (); err != nil {
154
167
return fmt .Errorf ("error checking MPS daemon health: %w" , err )
155
168
}
169
+ klog .InfoS ("MPS daemon is healthy" , "resource" , plugin .rm .Resource ())
156
170
return nil
157
171
}
158
172
@@ -329,7 +343,6 @@ func (plugin *NvidiaDevicePlugin) getAllocateResponse(requestIds []string) (*plu
329
343
response := & pluginapi.ContainerAllocateResponse {
330
344
Envs : make (map [string ]string ),
331
345
}
332
-
333
346
if plugin .deviceListStrategies .IsCDIEnabled () {
334
347
responseID := uuid .New ().String ()
335
348
if err := plugin .updateResponseForCDI (response , responseID , deviceIDs ... ); err != nil {
@@ -361,26 +374,24 @@ func (plugin *NvidiaDevicePlugin) getAllocateResponse(requestIds []string) (*plu
361
374
// This includes per-resource pipe and log directories as well as a global daemon-specific shm
362
375
// and assumes that an MPS control daemon has already been started.
363
376
func (plugin NvidiaDevicePlugin ) updateResponseForMPS (response * pluginapi.ContainerAllocateResponse ) {
364
- pipeDir := filepath .Join ("/mps" , string (plugin .rm .Resource ()), "pipe" )
365
- response .Envs ["CUDA_MPS_PIPE_DIRECTORY" ] = pipeDir
377
+ // TODO: We should check that the deviceIDs are shared using MPS.
378
+ for k , v := range plugin .mpsDaemon .Envvars () {
379
+ response .Envs [k ] = v
380
+ }
381
+
382
+ resourceName := plugin .rm .Resource ()
366
383
response .Mounts = append (response .Mounts ,
367
384
& pluginapi.Mount {
368
- ContainerPath : pipeDir ,
369
- HostPath : filepath . Join ( "/var/lib/kubelet/device-plugins" , pipeDir ),
385
+ ContainerPath : plugin . mpsDaemon . PipeDir () ,
386
+ HostPath : plugin . mpsHostRoot . PipeDir ( resourceName ),
370
387
},
371
- )
372
- logDir := filepath .Join ("/mps" , string (plugin .rm .Resource ()), "log" )
373
- response .Envs ["CUDA_MPS_LOG_DIRECTORY" ] = logDir
374
- response .Mounts = append (response .Mounts ,
375
388
& pluginapi.Mount {
376
- ContainerPath : logDir ,
377
- HostPath : filepath . Join ( "/var/lib/kubelet/device-plugins" , logDir ),
389
+ ContainerPath : plugin . mpsDaemon . PipeDir () ,
390
+ HostPath : plugin . mpsHostRoot . LogDir ( resourceName ),
378
391
},
379
- )
380
- response .Mounts = append (response .Mounts ,
381
392
& pluginapi.Mount {
382
- ContainerPath : "/dev/shm" ,
383
- HostPath : "/var/lib/kubelet/device-plugins/mps/shm" ,
393
+ ContainerPath : plugin . mpsDaemon . ShmDir () ,
394
+ HostPath : plugin . mpsHostRoot . ShmDir ( resourceName ) ,
384
395
},
385
396
)
386
397
}
0 commit comments