Skip to content

Commit 14f103d

Browse files
klihubaskervin
authored and committed
topology-aware: check grants, look for stale ones and duplicates.
Check grants, looking for grants with stale allocations or duplicate containers (detected using fully qualified names). Dump total memory and CPU granted. Signed-off-by: Krisztian Litkey <[email protected]>
1 parent c88101a commit 14f103d

File tree

1 file changed

+70
-5
lines changed

1 file changed

+70
-5
lines changed

cmd/plugins/topology-aware/policy/topology-aware-policy.go

Lines changed: 70 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ func (p *policy) Start() error {
134134
p.checkColdstartOff()
135135

136136
p.root.Dump("<post-start>")
137+
p.checkAllocations(" <post-start>")
137138

138139
return nil
139140
}
@@ -148,19 +149,82 @@ func (p *policy) Sync(add []cache.Container, del []cache.Container) error {
148149
p.AllocateResources(c)
149150
}
150151

152+
p.checkAllocations(" <post-sync>")
153+
151154
return nil
152155
}
153156

157+
func (p *policy) checkAllocations(format string, args ...interface{}) {
158+
var (
159+
prefix = fmt.Sprintf(format, args...)
160+
cpuExcl = 0
161+
cpuPart = 0
162+
mem = uint64(0)
163+
ctr = map[string]Grant{}
164+
dup = map[string][]Grant{}
165+
)
166+
167+
getMemorySize := func(g Grant) uint64 {
168+
var (
169+
limit = g.MemLimit()
170+
total = uint64(0)
171+
)
172+
for _, memType := range []memoryType{memoryDRAM, memoryPMEM, memoryHBM} {
173+
total += limit[memType]
174+
}
175+
return total
176+
}
177+
178+
for _, g := range p.allocations.grants {
179+
log.Debug("%s %s (%s)", prefix, g, g.GetContainer().GetID())
180+
full := g.ExclusiveCPUs().Size()
181+
part := g.CPUPortion()
182+
cpuExcl += full
183+
cpuPart += part
184+
185+
mem += getMemorySize(g)
186+
187+
_, ok := p.cache.LookupContainer(g.GetContainer().GetID())
188+
if !ok {
189+
log.Error("%s %s STALE container among allocations, not found in cache", prefix, g)
190+
}
191+
192+
key := g.GetContainer().PrettyName()
193+
old, ok := ctr[key]
194+
if ok {
195+
if len(dup[key]) == 0 {
196+
dup[key] = []Grant{old, g}
197+
} else {
198+
dup[key] = append(dup[key], g)
199+
}
200+
} else {
201+
ctr[key] = g
202+
}
203+
}
204+
205+
for key, grants := range dup {
206+
log.Error("%s DUPLICATE allocation entries for container %s", prefix, key)
207+
for _, g := range grants {
208+
log.Error("%s %s (%s)", prefix, g, g.GetContainer().GetID())
209+
}
210+
}
211+
212+
log.Info("%s total CPU granted: %dm (%d exclusive + %dm shared), total memory granted: %s",
213+
prefix, 1000*cpuExcl+cpuPart, cpuExcl, cpuPart, prettyMem(mem))
214+
215+
}
216+
154217
// AllocateResources is a resource allocation request for this policy.
155218
func (p *policy) AllocateResources(container cache.Container) error {
156-
log.Debug("allocating resources for %s...", container.PrettyName())
219+
log.Debug("allocating resources for %s (%s)...", container.PrettyName(), container.GetID())
157220

158221
err := p.allocateResources(container, "")
159222
if err != nil {
160223
return err
161224
}
162225

163226
p.root.Dump("<post-alloc>")
227+
p.checkAllocations(" <post-alloc %s>", container.PrettyName())
164228

165229
return nil
166230
}
@@ -179,13 +243,14 @@ func (p *policy) allocateResources(container cache.Container, poolHint string) e
179243

180244
// ReleaseResources is a resource release request for this policy.
181245
func (p *policy) ReleaseResources(container cache.Container) error {
182-
log.Debug("releasing resources of %s...", container.PrettyName())
246+
log.Debug("releasing resources for %s (%s)...", container.PrettyName(), container.GetID())
183247

184248
if grant, found := p.releasePool(container); found {
185249
p.updateSharedAllocations(&grant)
186250
}
187251

188252
p.root.Dump("<post-release>")
253+
p.checkAllocations(" <post-release %s>", container.PrettyName())
189254

190255
return nil
191256
}
@@ -208,6 +273,7 @@ func (p *policy) UpdateResources(container cache.Container) error {
208273
}
209274

210275
p.root.Dump("<post-update>")
276+
p.checkAllocations(" <post-update %s>", container.PrettyName())
211277

212278
return nil
213279
}
@@ -394,7 +460,7 @@ func (p *policy) reallocateResources(containers []cache.Container, pools map[str
394460
p.releasePool(c)
395461
}
396462
for _, c := range containers {
397-
log.Debug("reallocating resources for %s...", c.PrettyName())
463+
log.Debug("reallocating resources for %s (%s)...", c.PrettyName(), c.GetID())
398464

399465
grant, err := p.allocatePool(c, pools[c.GetID()])
400466
if err != nil {
@@ -410,8 +476,6 @@ func (p *policy) reallocateResources(containers []cache.Container, pools map[str
410476

411477
p.updateSharedAllocations(nil)
412478

413-
p.root.Dump("<post-realloc>")
414-
415479
return nil
416480
}
417481

@@ -452,6 +516,7 @@ func (p *policy) Reconfigure(newCfg interface{}) error {
452516
}
453517

454518
p.root.Dump("<post-config>")
519+
p.checkAllocations(" <post-config>")
455520

456521
return nil
457522
}

0 commit comments

Comments
 (0)