From f60062e975aab1c9b0a4bc9ec4a87b3e61c081e5 Mon Sep 17 00:00:00 2001 From: Maksim Petrov <47208721+vmapetr@users.noreply.github.com> Date: Wed, 14 Aug 2024 23:59:32 +0200 Subject: [PATCH] Improve CPU resource monitor for Linux systems (#4888) * Replace free call on linux * Minor fixes * Rework CPU info for linux * Fix typos * Fix issue with memory monitor * Add comments * Remove task run delegates * Fix for macos * Expand comments for metrics methods * Remove unnecessary cancellation exits * Add FF to disable resource monitor on debug runs * Minor fix * Fix warning * Replace DisableResourceMonitorDebugOutput with EnableResourceMonitorDebugOutput * Fix typo * Update variable name * Fix typo --- src/Agent.Sdk/Knob/AgentKnobs.cs | 6 + src/Agent.Worker/JobRunner.cs | 9 +- src/Agent.Worker/ResourceMetricsManager.cs | 238 +++++++++------------ src/Misc/layoutbin/en-US/strings.json | 3 +- 4 files changed, 117 insertions(+), 139 deletions(-) diff --git a/src/Agent.Sdk/Knob/AgentKnobs.cs b/src/Agent.Sdk/Knob/AgentKnobs.cs index d2658b5bfc..a790c35ba0 100644 --- a/src/Agent.Sdk/Knob/AgentKnobs.cs +++ b/src/Agent.Sdk/Knob/AgentKnobs.cs @@ -592,6 +592,12 @@ public class AgentKnobs new EnvironmentKnobSource("AGENT_DISABLE_DRAIN_QUEUES_AFTER_TASK"), new BuiltInDefaultKnobSource("false")); + public static readonly Knob EnableResourceMonitorDebugOutput = new Knob( + nameof(EnableResourceMonitorDebugOutput), + "If true, the agent will show the resource monitor output for debug runs", + new RuntimeKnobSource("AZP_ENABLE_RESOURCE_MONITOR_DEBUG_OUTPUT"), + new EnvironmentKnobSource("AZP_ENABLE_RESOURCE_MONITOR_DEBUG_OUTPUT"), + new BuiltInDefaultKnobSource("false")); public static readonly Knob EnableResourceUtilizationWarnings = new Knob( nameof(EnableResourceUtilizationWarnings), diff --git a/src/Agent.Worker/JobRunner.cs b/src/Agent.Worker/JobRunner.cs index be979e4a89..0a291a146a 100644 --- a/src/Agent.Worker/JobRunner.cs +++ b/src/Agent.Worker/JobRunner.cs @@ -119,7 +119,14 @@ public async Task RunAsync(Pipelines.AgentJobRequestMessage message, if (string.Equals(systemDebug, "true", StringComparison.OrdinalIgnoreCase)) { - _ = resourceDiagnosticManager.RunDebugResourceMonitorAsync(); + if (AgentKnobs.EnableResourceMonitorDebugOutput.GetValue(jobContext).AsBoolean()) + { + _ = resourceDiagnosticManager.RunDebugResourceMonitorAsync(); + } + else + { + jobContext.Debug(StringUtil.Loc("ResourceUtilizationDebugOutputIsDisabled")); + } } agentShutdownRegistration = HostContext.AgentShutdownToken.Register(() => diff --git a/src/Agent.Worker/ResourceMetricsManager.cs b/src/Agent.Worker/ResourceMetricsManager.cs index e20c33371c..e56f9cb35e 100644 --- a/src/Agent.Worker/ResourceMetricsManager.cs +++ b/src/Agent.Worker/ResourceMetricsManager.cs @@ -111,7 +111,7 @@ private void PublishTelemetry(string message, string taskId) #endregion #region MetricMethods - private async Task GetCpuInfoAsync() + private async Task GetCpuInfoAsync(CancellationToken cancellationToken) { if (_cpuInfo.Updated >= DateTime.Now - TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL)) { @@ -120,13 +120,6 @@ private async Task GetCpuInfoAsync() if (PlatformUtil.RunningOnWindows) { - using var timeoutTokenSource = new CancellationTokenSource(); - timeoutTokenSource.CancelAfter(TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL)); - - using var linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource( - _context.CancellationToken, - timeoutTokenSource.Token); - await Task.Run(() => { using var query = new ManagementObjectSearcher("SELECT PercentIdleTime FROM Win32_PerfFormattedData_PerfOS_Processor WHERE Name=\"_Total\""); @@ -139,66 +132,58 @@ await Task.Run(() => _cpuInfo.Updated = DateTime.Now; _cpuInfo.Usage = 100 - cpuInfoIdle; } - }, linkedTokenSource.Token); + }, cancellationToken); } if (PlatformUtil.RunningOnLinux) { - using var processInvoker = HostContext.CreateService(); + List samples = new(); + int samplesCount = 10; - using var timeoutTokenSource = new CancellationTokenSource(); - timeoutTokenSource.CancelAfter(TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL)); - - using var linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource( - _context.CancellationToken, - timeoutTokenSource.Token); - - processInvoker.OutputDataReceived += delegate (object sender, ProcessDataReceivedEventArgs message) + // /proc/stat updates linearly in real time and shows CPU time counters during the whole system uptime + // so we need to collect multiple samples to calculate CPU usage + for (int i = 0; i < samplesCount + 1; i++) { - var processInvokerOutput = message.Data; - - var cpuInfoNice = int.Parse(processInvokerOutput.Split(' ', (char)StringSplitOptions.RemoveEmptyEntries)[2]); - var cpuInfoIdle = int.Parse(processInvokerOutput.Split(' ', (char)StringSplitOptions.RemoveEmptyEntries)[4]); - var cpuInfoIOWait = int.Parse(processInvokerOutput.Split(' ', (char)StringSplitOptions.RemoveEmptyEntries)[5]); - - lock (_cpuInfoLock) + string[] strings = await File.ReadAllLinesAsync("/proc/stat", cancellationToken); + if (cancellationToken.IsCancellationRequested) { - _cpuInfo.Updated = DateTime.Now; - _cpuInfo.Usage = (double)(cpuInfoNice + cpuInfoIdle) * 100 / (cpuInfoNice + cpuInfoIdle + cpuInfoIOWait); + return; } - }; - processInvoker.ErrorDataReceived += delegate (object sender, ProcessDataReceivedEventArgs message) + samples.Add(strings[0] + .Split(' ', StringSplitOptions.RemoveEmptyEntries) + .Skip(1) + .Select(float.Parse) + .ToArray()); + + await Task.Delay(100, cancellationToken); + } + + // The CPU time counters in the /proc/stat are: + // user, nice, system, idle, iowait, irq, softirq, steal, guest, guest_nice + // + // We need to get deltas for idle and total CPU time using the gathered samples + // and calculate the average to provide the CPU utilization in the moment + double cpuUsage = 0.0; + for (int i = 1; i < samplesCount + 1; i++) { - Trace.Error($"Error on receiving CPU info: {message.Data}"); - }; + double idle = samples[i][3] - samples[i - 1][3]; + double total = samples[i].Sum() - samples[i - 1].Sum(); - var filePath = "grep"; - var arguments = "\"cpu \" /proc/stat"; - await processInvoker.ExecuteAsync( - workingDirectory: string.Empty, - fileName: filePath, - arguments: arguments, - environment: null, - requireExitCodeZero: true, - outputEncoding: null, - killProcessOnCancel: true, - cancellationToken: linkedTokenSource.Token); - } + cpuUsage += 1.0 - (idle / total); + } + lock (_cpuInfoLock) + { + _cpuInfo.Updated = DateTime.Now; + _cpuInfo.Usage = (cpuUsage / samplesCount) * 100; + } + } if (PlatformUtil.RunningOnMacOS) { - List outputs = new List(); - using var processInvoker = HostContext.CreateService(); - using var timeoutTokenSource = new CancellationTokenSource(); - timeoutTokenSource.CancelAfter(TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL)); - - using var linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource( - _context.CancellationToken, - timeoutTokenSource.Token); - + List outputs = new List(); processInvoker.OutputDataReceived += delegate (object sender, ProcessDataReceivedEventArgs message) { outputs.Add(message.Data); @@ -219,7 +204,7 @@ await processInvoker.ExecuteAsync( requireExitCodeZero: true, outputEncoding: null, killProcessOnCancel: true, - cancellationToken: linkedTokenSource.Token); + cancellationToken: cancellationToken); // Use second sample for more accurate calculation var cpuInfoIdle = double.Parse(outputs[1].Split(' ', (char)StringSplitOptions.RemoveEmptyEntries)[6].Trim('%')); @@ -251,7 +236,7 @@ private void GetDiskInfo() } } - private async Task GetMemoryInfoAsync() + private async Task GetMemoryInfoAsync(CancellationToken cancellationToken) { if (_memoryInfo.Updated >= DateTime.Now - TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL)) { @@ -260,13 +245,6 @@ private async Task GetMemoryInfoAsync() if (PlatformUtil.RunningOnWindows) { - using var timeoutTokenSource = new CancellationTokenSource(); - timeoutTokenSource.CancelAfter(TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL)); - - using var linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource( - _context.CancellationToken, - timeoutTokenSource.Token); - await Task.Run(() => { using var query = new ManagementObjectSearcher("SELECT FreePhysicalMemory, TotalVisibleMemorySize FROM CIM_OperatingSystem"); @@ -281,69 +259,29 @@ await Task.Run(() => _memoryInfo.TotalMemoryMB = totalMemory / 1024; _memoryInfo.UsedMemoryMB = (totalMemory - freeMemory) / 1024; } - }, linkedTokenSource.Token); + }, cancellationToken); } if (PlatformUtil.RunningOnLinux) { - // Some compact Linux distributions like UBI may not have "free" utility installed, or it may have a custom output - // We don't want to break currently existing pipelines with ADO warnings - // so related errors thrown here will be sent to the trace or debug logs by caller methods - - using var processInvoker = HostContext.CreateService(); - - using var timeoutTokenSource = new CancellationTokenSource(); - timeoutTokenSource.CancelAfter(TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL)); - - using var linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource( - _context.CancellationToken, - timeoutTokenSource.Token); - - processInvoker.OutputDataReceived += delegate (object sender, ProcessDataReceivedEventArgs message) + string[] memoryInfo = await File.ReadAllLinesAsync("/proc/meminfo", cancellationToken); + if (cancellationToken.IsCancellationRequested) { - if (!message.Data.StartsWith("Mem:")) - { - return; - } - - var processInvokerOutputString = message.Data; - var memoryInfoString = processInvokerOutputString.Split(" ", StringSplitOptions.RemoveEmptyEntries); - - if (memoryInfoString.Length != 7) - { - throw new Exception("\"free\" utility has non-default output"); - } - - lock (_memoryInfoLock) - { - _memoryInfo.Updated = DateTime.Now; - _memoryInfo.TotalMemoryMB = long.Parse(memoryInfoString[1]); - _memoryInfo.UsedMemoryMB = long.Parse(memoryInfoString[2]); - } - }; + return; + } - processInvoker.ErrorDataReceived += delegate (object sender, ProcessDataReceivedEventArgs message) - { - Trace.Error($"Error on receiving memory info: {message.Data}"); - }; + // The /proc/meminfo file contains several memory counters. To calculate the available memory + // we need to get the total memory and the available memory counters + // The available memory contains the sum of free, cached, and buffer memory + // it shows more accurate information about the memory usage than the free memory counter + int totalMemory = int.Parse(memoryInfo[0].Split(" ", StringSplitOptions.RemoveEmptyEntries)[1]); + int availableMemory = int.Parse(memoryInfo[2].Split(" ", StringSplitOptions.RemoveEmptyEntries)[1]); - try - { - var filePath = "free"; - var arguments = "-m"; - await processInvoker.ExecuteAsync( - workingDirectory: string.Empty, - fileName: filePath, - arguments: arguments, - environment: null, - requireExitCodeZero: true, - outputEncoding: null, - killProcessOnCancel: true, - cancellationToken: linkedTokenSource.Token); - } - catch (Win32Exception ex) + lock (_memoryInfoLock) { - throw new Exception($"\"free\" utility is unavailable. Exception: {ex.Message}"); + _memoryInfo.Updated = DateTime.Now; + _memoryInfo.TotalMemoryMB = totalMemory / 1024; + _memoryInfo.UsedMemoryMB = (totalMemory - availableMemory) / 1024; } } @@ -353,17 +291,9 @@ await processInvoker.ExecuteAsync( // but unfortunately it returns values in pages and has no built-in arguments for custom output // so we need to parse and cast the output manually - List outputs = new List(); - using var processInvoker = HostContext.CreateService(); - using var timeoutTokenSource = new CancellationTokenSource(); - timeoutTokenSource.CancelAfter(TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL)); - - using var linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource( - _context.CancellationToken, - timeoutTokenSource.Token); - + List outputs = new List(); processInvoker.OutputDataReceived += delegate (object sender, ProcessDataReceivedEventArgs message) { outputs.Add(message.Data); @@ -383,7 +313,7 @@ await processInvoker.ExecuteAsync( requireExitCodeZero: true, outputEncoding: null, killProcessOnCancel: true, - cancellationToken: linkedTokenSource.Token); + cancellationToken: cancellationToken); var pageSize = int.Parse(outputs[0].Split(" ", StringSplitOptions.RemoveEmptyEntries)[7]); @@ -408,11 +338,11 @@ await processInvoker.ExecuteAsync( #endregion #region StringMethods - private async Task GetCpuInfoStringAsync() + private async Task GetCpuInfoStringAsync(CancellationToken cancellationToken) { try { - await GetCpuInfoAsync(); + await GetCpuInfoAsync(cancellationToken); return StringUtil.Loc("ResourceMonitorCPUInfo", $"{_cpuInfo.Usage:0.00}"); } @@ -428,7 +358,10 @@ private string GetDiskInfoString() { GetDiskInfo(); - return StringUtil.Loc("ResourceMonitorDiskInfo", _diskInfo.VolumeRoot, $"{_diskInfo.FreeDiskSpaceMB:0.00}", $"{_diskInfo.TotalDiskSpaceMB:0.00}"); + return StringUtil.Loc("ResourceMonitorDiskInfo", + _diskInfo.VolumeRoot, + $"{_diskInfo.FreeDiskSpaceMB:0.00}", + $"{_diskInfo.TotalDiskSpaceMB:0.00}"); } catch (Exception ex) { @@ -436,13 +369,15 @@ private string GetDiskInfoString() } } - private async Task GetMemoryInfoStringAsync() + private async Task GetMemoryInfoStringAsync(CancellationToken cancellationToken) { try { - await GetMemoryInfoAsync(); + await GetMemoryInfoAsync(cancellationToken); - return StringUtil.Loc("ResourceMonitorMemoryInfo", $"{_memoryInfo.UsedMemoryMB:0.00}", $"{_memoryInfo.TotalMemoryMB:0.00}"); + return StringUtil.Loc("ResourceMonitorMemoryInfo", + $"{_memoryInfo.UsedMemoryMB:0.00}", + $"{_memoryInfo.TotalMemoryMB:0.00}"); } catch (Exception ex) { @@ -456,7 +391,17 @@ public async Task RunDebugResourceMonitorAsync() { while (!_context.CancellationToken.IsCancellationRequested) { - _context.Debug(StringUtil.Loc("ResourceMonitorAgentEnvironmentResource", GetDiskInfoString(), await GetMemoryInfoStringAsync(), await GetCpuInfoStringAsync())); + using var timeoutTokenSource = new CancellationTokenSource(); + timeoutTokenSource.CancelAfter(TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL)); + + using var linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource( + _context.CancellationToken, + timeoutTokenSource.Token); + + _context.Debug(StringUtil.Loc("ResourceMonitorAgentEnvironmentResource", + GetDiskInfoString(), + await GetMemoryInfoStringAsync(linkedTokenSource.Token), + await GetCpuInfoStringAsync(linkedTokenSource.Token))); await Task.Delay(ACTIVE_MODE_INTERVAL, _context.CancellationToken); } @@ -475,7 +420,10 @@ public async Task RunDiskSpaceUtilizationMonitorAsync() if (freeDiskSpacePercentage <= AVAILABLE_DISK_SPACE_PERCENTAGE_THRESHOLD) { - _context.Warning(StringUtil.Loc("ResourceMonitorFreeDiskSpaceIsLowerThanThreshold", _diskInfo.VolumeRoot, AVAILABLE_DISK_SPACE_PERCENTAGE_THRESHOLD, $"{usedDiskSpacePercentage:0.00}")); + _context.Warning(StringUtil.Loc("ResourceMonitorFreeDiskSpaceIsLowerThanThreshold", + _diskInfo.VolumeRoot, + AVAILABLE_DISK_SPACE_PERCENTAGE_THRESHOLD, + $"{usedDiskSpacePercentage:0.00}")); break; } @@ -495,15 +443,24 @@ public async Task RunMemoryUtilizationMonitorAsync() { while (!_context.CancellationToken.IsCancellationRequested) { + using var timeoutTokenSource = new CancellationTokenSource(); + timeoutTokenSource.CancelAfter(TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL)); + + using var linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource( + _context.CancellationToken, + timeoutTokenSource.Token); + try { - await GetMemoryInfoAsync(); + await GetMemoryInfoAsync(linkedTokenSource.Token); var usedMemoryPercentage = Math.Round(((_memoryInfo.UsedMemoryMB / (double)_memoryInfo.TotalMemoryMB) * 100.0), 2); if (100.0 - usedMemoryPercentage <= AVAILABLE_MEMORY_PERCENTAGE_THRESHOLD) { - _context.Warning(StringUtil.Loc("ResourceMonitorMemorySpaceIsLowerThanThreshold", AVAILABLE_MEMORY_PERCENTAGE_THRESHOLD, $"{usedMemoryPercentage:0.00}")); + _context.Warning(StringUtil.Loc("ResourceMonitorMemorySpaceIsLowerThanThreshold", + AVAILABLE_MEMORY_PERCENTAGE_THRESHOLD, + $"{usedMemoryPercentage:0.00}")); break; } @@ -523,9 +480,16 @@ public async Task RunCpuUtilizationMonitorAsync(string taskId) { while (!_context.CancellationToken.IsCancellationRequested) { + using var timeoutTokenSource = new CancellationTokenSource(); + timeoutTokenSource.CancelAfter(TimeSpan.FromMilliseconds(METRICS_UPDATE_INTERVAL)); + + using var linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource( + _context.CancellationToken, + timeoutTokenSource.Token); + try { - await GetCpuInfoAsync(); + await GetCpuInfoAsync(linkedTokenSource.Token); if (_cpuInfo.Usage >= CPU_UTILIZATION_PERCENTAGE_THRESHOLD) { diff --git a/src/Misc/layoutbin/en-US/strings.json b/src/Misc/layoutbin/en-US/strings.json index f96afaa75c..81242bf1f4 100644 --- a/src/Misc/layoutbin/en-US/strings.json +++ b/src/Misc/layoutbin/en-US/strings.json @@ -503,7 +503,8 @@ "ResourceMonitorMemoryInfo": "Memory: Used {0} MB out of {1} MB", "ResourceMonitorMemoryInfoError": "Unable to get memory info, exception: {0}", "ResourceMonitorMemorySpaceIsLowerThanThreshold": "Free memory is lower than {0}%; Currently used: {1}%", - "ResourceUtilizationWarningsIsDisabled": "Resource Utilization warnings is disabled, switch \"DISABLE_RESOURCE_UTILIZATION_WARNINGS\" variable to \"true\" if you want to enable it", + "ResourceUtilizationDebugOutputIsDisabled": "Resource Utilization output for debug runs is disabled, switch \"AZP_ENABLE_RESOURCE_MONITOR_DEBUG_OUTPUT\" variable to \"true\" if you want to enable it", + "ResourceUtilizationWarningsIsDisabled": "Resource Utilization warnings is disabled, switch \"DISABLE_RESOURCE_UTILIZATION_WARNINGS\" variable to \"false\" if you want to enable it", "RestartIn15SecMessage": "Restarting the machine in 15 seconds...", "RestartMessage": "Restart the machine to launch agent and for autologon settings to take effect.", "ReStreamLogsToFilesError": "You cannot use --disableloguploads and --reStreamLogsToFiles at the same time!",