Skip to content

Commit 4867e7a

Browse files
authored
fix: compute request normalize to tflops bug (#472)
1 parent 6a943cd commit 4867e7a

File tree

7 files changed

+532
-26
lines changed

7 files changed

+532
-26
lines changed

internal/autoscaler/recommender/percentile_recommender.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -329,20 +329,20 @@ func (p *PercentileRecommender) handleResourceScaling(
329329

330330
// Must be inside the scaling range
331331
targetReqValue := targetReq.AsApproximateFloat64()
332-
if targetReqValue < minAllowedReq {
332+
if minAllowedReq != 0 && targetReqValue < minAllowedReq {
333333
targetReqValue = minAllowedReq
334334
targetReq = *resource.NewQuantity(int64(targetReqValue), targetReq.Format)
335335
}
336-
if targetReqValue > maxAllowedReq {
336+
if maxAllowedReq != 0 && targetReqValue > maxAllowedReq {
337337
targetReqValue = maxAllowedReq
338338
targetReq = *resource.NewQuantity(int64(targetReqValue), targetReq.Format)
339339
}
340340
targetLimValue := targetLim.AsApproximateFloat64()
341-
if targetLimValue < minAllowedLim {
341+
if minAllowedLim != 0 && targetLimValue < minAllowedLim {
342342
targetLimValue = minAllowedLim
343343
targetLim = *resource.NewQuantity(int64(targetLimValue), targetLim.Format)
344344
}
345-
if targetLimValue > maxAllowedLim {
345+
if maxAllowedLim != 0 && targetLimValue > maxAllowedLim {
346346
targetLimValue = maxAllowedLim
347347
targetLim = *resource.NewQuantity(int64(targetLimValue), targetLim.Format)
348348
}

internal/autoscaler/workload/handler.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -330,8 +330,16 @@ func (h *handler) GetMaxAllowedResourcesSpec(workload *State) (*tfv1.Resource, e
330330
availableVram := gpu.Status.Available.Vram.DeepCopy()
331331
for _, worker := range workers {
332332
// Add back this workload's allocated resources to get the total available for this workload
333-
availableTflops.Add(allocRequests[string(worker.UID)].Request.Tflops)
334-
availableVram.Add(allocRequests[string(worker.UID)].Request.Vram)
333+
allocReq := allocRequests[string(worker.UID)]
334+
var reqTflops resource.Quantity
335+
if gpu.Status.Capacity != nil && !allocReq.Request.ComputePercent.IsZero() {
336+
requiredTflops := utils.ComputePercentToTflops(gpu.Status.Capacity.Tflops, allocReq.Request)
337+
reqTflops = *requiredTflops
338+
} else {
339+
reqTflops = allocReq.Request.Tflops
340+
}
341+
availableTflops.Add(reqTflops)
342+
availableVram.Add(allocReq.Request.Vram)
335343
}
336344

337345
workerCount := int64(len(workers))

internal/gpuallocator/gpuallocator.go

Lines changed: 73 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,14 @@ func (s *GpuAllocator) FilterWithPreempt(
217217
return nil, nil, fmt.Errorf("gpu %s not found", gpuName)
218218
}
219219
gpuCopy := gpu.DeepCopy()
220-
gpuCopy.Status.Available.Tflops.Add(preemptAllocRequest.Request.Tflops)
220+
var reqTflops resource.Quantity
221+
if !preemptAllocRequest.Request.ComputePercent.IsZero() {
222+
requiredTflops := utils.ComputePercentToTflops(gpuCopy.Status.Capacity.Tflops, preemptAllocRequest.Request)
223+
reqTflops = *requiredTflops
224+
} else {
225+
reqTflops = preemptAllocRequest.Request.Tflops
226+
}
227+
gpuCopy.Status.Available.Tflops.Add(reqTflops)
221228
gpuCopy.Status.Available.Vram.Add(preemptAllocRequest.Request.Vram)
222229
toFilterGPUs = append(toFilterGPUs, gpuCopy)
223230
}
@@ -302,9 +309,16 @@ func (s *GpuAllocator) Bind(
302309
if gpu.Status.Available == nil {
303310
return nil, fmt.Errorf("GPU %s has nil available resources", selectedGPU)
304311
}
305-
if gpu.Status.Available.Tflops.Cmp(req.Request.Tflops) < 0 {
312+
var reqTflops resource.Quantity
313+
if !req.Request.ComputePercent.IsZero() {
314+
requiredTflops := utils.ComputePercentToTflops(gpu.Status.Capacity.Tflops, req.Request)
315+
reqTflops = *requiredTflops
316+
} else {
317+
reqTflops = req.Request.Tflops
318+
}
319+
if gpu.Status.Available.Tflops.Cmp(reqTflops) < 0 {
306320
return nil, fmt.Errorf("GPU %s insufficient TFLOPs: available %s, requested %s",
307-
selectedGPU, gpu.Status.Available.Tflops.String(), req.Request.Tflops.String())
321+
selectedGPU, gpu.Status.Available.Tflops.String(), reqTflops.String())
308322
}
309323
if gpu.Status.Available.Vram.Cmp(req.Request.Vram) < 0 {
310324
return nil, fmt.Errorf("GPU %s insufficient VRAM: available %s, requested %s",
@@ -675,8 +689,23 @@ func (s *GpuAllocator) checkGPUCapacityAndQuota(gpu *tfv1.GPU, oldRes, newRes tf
675689
Vram: remainVram,
676690
}
677691

678-
remainTflops.Add(oldRes.Tflops)
679-
remainTflops.Sub(newRes.Tflops)
692+
// Get actual TFLOPs values, converting from ComputePercent if needed
693+
var oldTflops, newTflops resource.Quantity
694+
if !oldRes.ComputePercent.IsZero() {
695+
requiredTflops := utils.ComputePercentToTflops(gpu.Status.Capacity.Tflops, oldRes)
696+
oldTflops = *requiredTflops
697+
} else {
698+
oldTflops = oldRes.Tflops
699+
}
700+
if !newRes.ComputePercent.IsZero() {
701+
requiredTflops := utils.ComputePercentToTflops(gpu.Status.Capacity.Tflops, newRes)
702+
newTflops = *requiredTflops
703+
} else {
704+
newTflops = newRes.Tflops
705+
}
706+
707+
remainTflops.Add(oldTflops)
708+
remainTflops.Sub(newTflops)
680709
if remainTflops.Cmp(resource.Quantity{}) < 0 {
681710
return remainRes, ScalingQuotaExceededError
682711
}
@@ -1272,9 +1301,37 @@ func (s *GpuAllocator) CheckQuotaAndFilterSingleNodePreempt(
12721301
if existingAllocation == nil {
12731302
continue
12741303
}
1275-
toPreemptUsage.Requests.Tflops.Add(existingAllocation.Request.Tflops)
1304+
// Get actual TFLOPs values, converting from ComputePercent if needed
1305+
// We need GPU capacity to convert, so we get it from the first GPU of the allocation
1306+
var reqTflops, limitTflops resource.Quantity
1307+
if len(existingAllocation.GPUNames) > 0 {
1308+
gpuNameNs := types.NamespacedName{Name: existingAllocation.GPUNames[0]}
1309+
if gpu, exists := s.gpuStore[gpuNameNs]; exists && gpu.Status.Capacity != nil {
1310+
if !existingAllocation.Request.ComputePercent.IsZero() {
1311+
requiredTflops := utils.ComputePercentToTflops(gpu.Status.Capacity.Tflops, existingAllocation.Request)
1312+
reqTflops = *requiredTflops
1313+
} else {
1314+
reqTflops = existingAllocation.Request.Tflops
1315+
}
1316+
if !existingAllocation.Limit.ComputePercent.IsZero() {
1317+
requiredTflops := utils.ComputePercentToTflops(gpu.Status.Capacity.Tflops, existingAllocation.Limit)
1318+
limitTflops = *requiredTflops
1319+
} else {
1320+
limitTflops = existingAllocation.Limit.Tflops
1321+
}
1322+
} else {
1323+
// Fall back to direct TFLOPs if the GPU is not found or has no capacity
1324+
reqTflops = existingAllocation.Request.Tflops
1325+
limitTflops = existingAllocation.Limit.Tflops
1326+
}
1327+
} else {
1328+
// Fall back to direct TFLOPs if the allocation has no GPUs
1329+
reqTflops = existingAllocation.Request.Tflops
1330+
limitTflops = existingAllocation.Limit.Tflops
1331+
}
1332+
toPreemptUsage.Requests.Tflops.Add(reqTflops)
12761333
toPreemptUsage.Requests.Vram.Add(existingAllocation.Request.Vram)
1277-
toPreemptUsage.Limits.Tflops.Add(existingAllocation.Limit.Tflops)
1334+
toPreemptUsage.Limits.Tflops.Add(limitTflops)
12781335
toPreemptUsage.Limits.Vram.Add(existingAllocation.Limit.Vram)
12791336
preemptAllocRequests = append(preemptAllocRequests, existingAllocation)
12801337
}
@@ -1364,7 +1421,15 @@ func (s *GpuAllocator) reconcileAllocationState() {
13641421
gpuKey := types.NamespacedName{Name: gpuId}
13651422
gpuAvailableRes, ok := actualAvailableMap[gpuKey]
13661423
if ok {
1367-
gpuAvailableRes.Tflops.Sub(allocRequest.Request.Tflops)
1424+
var reqTflops resource.Quantity
1425+
gpu := s.gpuStore[gpuKey]
1426+
if gpu != nil && gpu.Status.Capacity != nil && !allocRequest.Request.ComputePercent.IsZero() {
1427+
requiredTflops := utils.ComputePercentToTflops(gpu.Status.Capacity.Tflops, allocRequest.Request)
1428+
reqTflops = *requiredTflops
1429+
} else {
1430+
reqTflops = allocRequest.Request.Tflops
1431+
}
1432+
gpuAvailableRes.Tflops.Sub(reqTflops)
13681433
gpuAvailableRes.Vram.Sub(allocRequest.Request.Vram)
13691434
}
13701435
addRunningApp(ctx, s.gpuStore[gpuKey], allocRequest)

internal/gpuallocator/node_capacity.go

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,31 @@ func RefreshGPUNodeCapacity(
7575
allocRequests := allocator.GetAllocationReqByNodeName(node.Name)
7676
for _, allocRequest := range allocRequests {
7777
vramAvailable.Sub(allocRequest.Limit.Vram)
78-
tflopsAvailable.Sub(allocRequest.Limit.Tflops)
78+
// Get actual TFLOPs value, converting from ComputePercent if needed
79+
var limitTflops resource.Quantity
80+
if len(allocRequest.GPUNames) > 0 {
81+
// Try to find the GPU to get capacity for conversion
82+
for _, gpu := range gpuList.Items {
83+
if gpu.Name == allocRequest.GPUNames[0] && gpu.Status.Capacity != nil {
84+
if !allocRequest.Limit.ComputePercent.IsZero() {
85+
requiredTflops := utils.ComputePercentToTflops(gpu.Status.Capacity.Tflops, allocRequest.Limit)
86+
limitTflops = *requiredTflops
87+
} else {
88+
limitTflops = allocRequest.Limit.Tflops
89+
}
90+
break
91+
}
92+
}
93+
// If the GPU was not found (or has no capacity), fall back to direct TFLOPs
94+
if limitTflops.IsZero() && !allocRequest.Limit.Tflops.IsZero() {
95+
limitTflops = allocRequest.Limit.Tflops
96+
}
97+
} else {
98+
limitTflops = allocRequest.Limit.Tflops
99+
}
100+
if !limitTflops.IsZero() {
101+
tflopsAvailable.Sub(limitTflops)
102+
}
79103
}
80104
node.Status.VirtualAvailableVRAM = &vramAvailable
81105
node.Status.VirtualAvailableTFlops = &tflopsAvailable

0 commit comments

Comments
 (0)