Skip to content

Commit

Permalink
Improve reclaimer logging
Browse files: browse the repository at this point in the history
Signed-off-by: Nicolas Belouin <[email protected]>
  • Loading branch information
diconico07 committed Apr 30, 2024
1 parent cef2054 commit 2e3c3a5
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -858,6 +858,7 @@ pub async fn reconcile(
plugin
}
Some(plugin) => {
// TODO: Add a way to handle a change in the instance's capacity.
plugin.update_slots(&instance.spec.device_usage).await?;
plugin.clone()
}
Expand Down
8 changes: 6 additions & 2 deletions agent/src/plugin_manager/device_plugin_slot_reclaimer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,7 @@ pub async fn start_reclaimer(dp_manager: Arc<DevicePluginManager>) {
loop {
trace!("reclaiming unused slots - start");
if let Ok(used_slots) = get_used_slots().await {
trace!("used slots: {:?}", used_slots);
let theoretical_slots = dp_manager.get_used_slots().await;
trace!("theoretical slots: {:?}", theoretical_slots);
let mut new_stalled_slots: HashMap<String, Instant> = HashMap::new();
let reclaim_iteration_start = Instant::now();
for slot_to_reclaim in theoretical_slots.difference(&used_slots) {
Expand All @@ -91,6 +89,12 @@ pub async fn start_reclaimer(dp_manager: Arc<DevicePluginManager>) {
.await
.is_err()
{
warn!(
"Failed to free slot {}, will try again in {}s",
slot_to_reclaim,
SLOT_RECLAIM_INTERVAL.as_secs()
);
// To try again we just keep the slot as stalled
new_stalled_slots.insert(slot_to_reclaim.to_string(), at.to_owned());
};
} else {
Expand Down

0 comments on commit 2e3c3a5

Please sign in to comment.