diff --git a/src/Agent.Listener/DistributedTask.Pipelines/Yaml/Contracts/DeploymentTarget.cs b/src/Agent.Listener/DistributedTask.Pipelines/Yaml/Contracts/DeploymentTarget.cs index fc4899a203..5783c45f2f 100644 --- a/src/Agent.Listener/DistributedTask.Pipelines/Yaml/Contracts/DeploymentTarget.cs +++ b/src/Agent.Listener/DistributedTask.Pipelines/Yaml/Contracts/DeploymentTarget.cs @@ -19,5 +19,8 @@ internal sealed class DeploymentTarget: IPhaseTarget internal IList Tags { get; set; } internal String TimeoutInMinutes { get; set; } + + /// Number of retries for task failure + internal String RetryCountOnTaskFailure { get; set; } } } diff --git a/src/Agent.Listener/DistributedTask.Pipelines/Yaml/Contracts/TaskStep.cs b/src/Agent.Listener/DistributedTask.Pipelines/Yaml/Contracts/TaskStep.cs index 87e97afdb7..3e66e34dcc 100644 --- a/src/Agent.Listener/DistributedTask.Pipelines/Yaml/Contracts/TaskStep.cs +++ b/src/Agent.Listener/DistributedTask.Pipelines/Yaml/Contracts/TaskStep.cs @@ -24,6 +24,9 @@ internal sealed class TaskStep : ISimpleStep internal Int32 TimeoutInMinutes { get; set; } + /// Number of retries for task failure + internal Int32 RetryCountOnTaskFailure { get; set; } + public ISimpleStep Clone() { return new TaskStep() @@ -36,6 +39,7 @@ public ISimpleStep Clone() Inputs = new Dictionary(Inputs ?? new Dictionary(0, StringComparer.OrdinalIgnoreCase)), Reference = Reference?.Clone(), TimeoutInMinutes = TimeoutInMinutes, + RetryCountOnTaskFailure = RetryCountOnTaskFailure }; } } diff --git a/src/Agent.Listener/DistributedTask.Pipelines/Yaml/TypeConverters/ConverterUtil.phases.cs b/src/Agent.Listener/DistributedTask.Pipelines/Yaml/TypeConverters/ConverterUtil.phases.cs index d9392fe767..d65ce5dae0 100644 --- a/src/Agent.Listener/DistributedTask.Pipelines/Yaml/TypeConverters/ConverterUtil.phases.cs +++ b/src/Agent.Listener/DistributedTask.Pipelines/Yaml/TypeConverters/ConverterUtil.phases.cs @@ -194,6 +194,10 @@ internal static DeploymentTarget ReadDeploymentTarget(IParser parser) result.TimeoutInMinutes = ReadNonEmptyString(parser); break; + case YamlConstants.RetryCountOnTaskFailure: + result.RetryCountOnTaskFailure = ReadNonEmptyString(parser); + break; + default: throw new SyntaxErrorException(scalar.Start, scalar.End, $"Unexpected property: '{scalar.Value}'"); } diff --git a/src/Agent.Listener/DistributedTask.Pipelines/Yaml/TypeConverters/YamlConstants.cs b/src/Agent.Listener/DistributedTask.Pipelines/Yaml/TypeConverters/YamlConstants.cs index c513bee19f..b315e16cf8 100644 --- a/src/Agent.Listener/DistributedTask.Pipelines/Yaml/TypeConverters/YamlConstants.cs +++ b/src/Agent.Listener/DistributedTask.Pipelines/Yaml/TypeConverters/YamlConstants.cs @@ -50,5 +50,6 @@ internal static class YamlConstants internal const String Value = "value"; internal const String Variables = "variables"; internal const String WorkingDirectory = "workingDirectory"; + internal const String RetryCountOnTaskFailure = "retryCountOnTaskFailure"; } } \ No newline at end of file diff --git a/src/Agent.Worker/ExecutionContext.cs b/src/Agent.Worker/ExecutionContext.cs index b7e2feb8ec..4584b6d3b3 100644 --- a/src/Agent.Worker/ExecutionContext.cs +++ b/src/Agent.Worker/ExecutionContext.cs @@ -74,6 +74,16 @@ public interface IExecutionContext : IAgentService, IKnobValueContext void SetStepTarget(Pipelines.StepTarget target); string TranslatePathForStepTarget(string val); IHostContext GetHostContext(); + /// + /// Re-initializes force completed - between next retry attempt + /// + /// + void ReInitializeForceCompleted(); + /// + /// Cancel force task completion between retry attempts + /// + /// + void CancelForceTaskCompletion(); } public sealed class ExecutionContext : AgentService, IExecutionContext, IDisposable @@ -96,6 +106,7 @@ public sealed class ExecutionContext : AgentService, IExecutionContext, IDisposa private Guid _detailTimelineId; private int _childTimelineRecordOrder = 0; private CancellationTokenSource _cancellationTokenSource; + private CancellationTokenSource _forceCompleteCancellationTokenSource = new CancellationTokenSource(); private TaskCompletionSource _forceCompleted = new TaskCompletionSource(); private bool _throttlingReported = false; private ExecutionTargetInfo _defaultStepTarget; @@ -112,6 +123,7 @@ public sealed class ExecutionContext : AgentService, IExecutionContext, IDisposa public Guid Id => _record.Id; public Task ForceCompleted => _forceCompleted.Task; public CancellationToken CancellationToken => _cancellationTokenSource.Token; + public CancellationToken ForceCompleteCancellationToken => _forceCompleteCancellationTokenSource.Token; public List Endpoints { get; private set; } public List SecureFiles { get; private set; } public List Repositories { get; private set; } @@ -181,11 +193,20 @@ public void ForceTaskComplete() Trace.Info("Force finish current task in 5 sec."); Task.Run(async () => { - await Task.Delay(TimeSpan.FromSeconds(5)); - _forceCompleted?.TrySetResult(1); + await Task.Delay(TimeSpan.FromSeconds(5), ForceCompleteCancellationToken); + if (!ForceCompleteCancellationToken.IsCancellationRequested) + { + _forceCompleted?.TrySetResult(1); + } }); } + public void CancelForceTaskCompletion() + { + Trace.Info($"Forced completion canceled"); + this._forceCompleteCancellationTokenSource.Cancel(); + } + [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1721: Property names should not match get methods")] public IHostContext GetHostContext() { @@ -834,9 +855,16 @@ public IScopedEnvironment GetScopedEnvironment() return new SystemEnvironment(); } + public void ReInitializeForceCompleted() + { + this._forceCompleted = new TaskCompletionSource(); + this._forceCompleteCancellationTokenSource = new CancellationTokenSource(); + } + public void Dispose() { _cancellationTokenSource?.Dispose(); + _forceCompleteCancellationTokenSource?.Dispose(); _buildLogsWriter?.Dispose(); _buildLogsWriter = null; diff --git a/src/Agent.Worker/Handlers/NodeHandler.cs b/src/Agent.Worker/Handlers/NodeHandler.cs index 80e9599b35..4eda287a15 100644 --- a/src/Agent.Worker/Handlers/NodeHandler.cs +++ b/src/Agent.Worker/Handlers/NodeHandler.cs @@ -134,29 +134,37 @@ public async Task RunAsync() outputEncoding = Encoding.UTF8; } - // Execute the process. Exit code 0 should always be returned. - // A non-zero exit code indicates infrastructural failure. - // Task failure should be communicated over STDOUT using ## commands. - Task step = StepHost.ExecuteAsync(workingDirectory: StepHost.ResolvePathForStepHost(workingDirectory), - fileName: StepHost.ResolvePathForStepHost(file), - arguments: arguments, - environment: Environment, - requireExitCodeZero: true, - outputEncoding: outputEncoding, - killProcessOnCancel: false, - inheritConsoleHandler: !ExecutionContext.Variables.Retain_Default_Encoding, - cancellationToken: ExecutionContext.CancellationToken); - - // Wait for either the node exit or force finish through ##vso command - await System.Threading.Tasks.Task.WhenAny(step, ExecutionContext.ForceCompleted); - - if (ExecutionContext.ForceCompleted.IsCompleted) + try { - ExecutionContext.Debug("The task was marked as \"done\", but the process has not closed after 5 seconds. Treating the task as complete."); + // Execute the process. Exit code 0 should always be returned. + // A non-zero exit code indicates infrastructural failure. + // Task failure should be communicated over STDOUT using ## commands. + Task step = StepHost.ExecuteAsync(workingDirectory: StepHost.ResolvePathForStepHost(workingDirectory), + fileName: StepHost.ResolvePathForStepHost(file), + arguments: arguments, + environment: Environment, + requireExitCodeZero: true, + outputEncoding: outputEncoding, + killProcessOnCancel: false, + inheritConsoleHandler: !ExecutionContext.Variables.Retain_Default_Encoding, + cancellationToken: ExecutionContext.CancellationToken); + + // Wait for either the node exit or force finish through ##vso command + await System.Threading.Tasks.Task.WhenAny(step, ExecutionContext.ForceCompleted); + + if (ExecutionContext.ForceCompleted.IsCompleted) + { + ExecutionContext.Debug("The task was marked as \"done\", but the process has not closed after 5 seconds. Treating the task as complete."); + } + else + { + await step; + } } - else + finally { - await step; + StepHost.OutputDataReceived -= OnDataReceived; + StepHost.ErrorDataReceived -= OnDataReceived; } } diff --git a/src/Agent.Worker/Handlers/PowerShell3Handler.cs b/src/Agent.Worker/Handlers/PowerShell3Handler.cs index 11864cda69..2fc271ea75 100644 --- a/src/Agent.Worker/Handlers/PowerShell3Handler.cs +++ b/src/Agent.Worker/Handlers/PowerShell3Handler.cs @@ -67,15 +67,23 @@ public async Task RunAsync() // Execute the process. Exit code 0 should always be returned. // A non-zero exit code indicates infrastructural failure. // Task failure should be communicated over STDOUT using ## commands. - await StepHost.ExecuteAsync(workingDirectory: StepHost.ResolvePathForStepHost(scriptDirectory), - fileName: powerShellExe, - arguments: powerShellExeArgs, - environment: Environment, - requireExitCodeZero: true, - outputEncoding: null, - killProcessOnCancel: false, - inheritConsoleHandler: !ExecutionContext.Variables.Retain_Default_Encoding, - cancellationToken: ExecutionContext.CancellationToken); + try + { + await StepHost.ExecuteAsync(workingDirectory: StepHost.ResolvePathForStepHost(scriptDirectory), + fileName: powerShellExe, + arguments: powerShellExeArgs, + environment: Environment, + requireExitCodeZero: true, + outputEncoding: null, + killProcessOnCancel: false, + inheritConsoleHandler: !ExecutionContext.Variables.Retain_Default_Encoding, + cancellationToken: ExecutionContext.CancellationToken); + } + finally + { + StepHost.OutputDataReceived -= OnDataReceived; + StepHost.ErrorDataReceived -= OnDataReceived; + } } private void OnDataReceived(object sender, ProcessDataReceivedEventArgs e) diff --git a/src/Agent.Worker/RetryHelper.cs b/src/Agent.Worker/RetryHelper.cs index 1ac3754eb3..9e3aabe1b0 100644 --- a/src/Agent.Worker/RetryHelper.cs +++ b/src/Agent.Worker/RetryHelper.cs @@ -1,15 +1,29 @@ using System; using System.Threading.Tasks; +using Microsoft.TeamFoundation.DistributedTask.WebApi; namespace Microsoft.VisualStudio.Services.Agent.Worker { internal class RetryHelper { + /// + /// Returns exponential delay - depending on number of retry + /// Considers that retryNumber starts from 0 + /// Initial delay - 1 second + /// + /// + public static int ExponentialDelay(int retryNumber) + { + return (int)(Math.Pow(retryNumber + 1, 2) * 1000); + } + + public RetryHelper(IExecutionContext executionContext, int maxRetries = 3) { Debug = (str) => executionContext.Debug(str); Warning = (str) => executionContext.Warning(str); MaxRetries = maxRetries; + ExecutionContext = executionContext; } public RetryHelper(IAsyncCommandContext commandContext, int maxRetries = 3) @@ -48,6 +62,59 @@ public async Task Retry(Func> action, Func timeDelayInte } + /// + /// Runs action with maxRetries number of retries + /// + /// Action to execute with retries + /// Function to calculate delay between retries depending on retry number. Should take retry number as argument and consider that it starts from 0. + /// + public async Task RetryStep(Func action, Func timeDelayInterval) + { + int retryCounter = 0; + do + { + using (new SimpleTimer($"RetryHelper Method:{action.Method} ", Debug)) + { + var delayInterval = timeDelayInterval(retryCounter); + try + { + if (retryCounter > 0) + { + //ReInitialize _forceCompleted and _forceCompleteCancellationTokenSource + ExecutionContext.ReInitializeForceCompleted(); + } + + Debug($"Invoking Method: {action.Method}. Attempt count: {retryCounter}"); + await action(); + + if (ExecutionContext.Result != TaskResult.Failed || ExhaustedRetryCount(retryCounter)) + { + return; + } + else + { + string exceptionMessage = $"Task result {ExecutionContext.Result}"; + ExecutionContext.Result = null; + Warning($"RetryHelper encountered task failure, will retry (attempt #: {retryCounter + 1} out of {this.MaxRetries}) after {delayInterval} ms"); + } + } + catch (Exception ex) + { + if (!ShouldRetryStepOnException(ex) || ExhaustedRetryCount(retryCounter)) + { + throw; + } + Warning($"RetryHelper encountered exception, will retry (attempt #: {retryCounter + 1} {ex.Message}) afer {delayInterval} ms"); + } + //Cancel force task completion before the next attempt + ExecutionContext.CancelForceTaskCompletion(); + + await Task.Delay(timeDelayInterval(retryCounter), ExecutionContext.CancellationToken); + retryCounter++; + } + } while (true); + } + private bool ExhaustedRetryCount(int retryCount) { if (retryCount >= MaxRetries) @@ -58,8 +125,14 @@ private bool ExhaustedRetryCount(int retryCount) return false; } + private bool ShouldRetryStepOnException(Exception exception) + { + return !(exception is TimeoutException) && !(exception is OperationCanceledException); + } + private readonly int MaxRetries; private readonly Action Debug; private readonly Action Warning; + private readonly IExecutionContext ExecutionContext; } } diff --git a/src/Agent.Worker/TaskRunner.cs b/src/Agent.Worker/TaskRunner.cs index a36e62d291..56917596be 100644 --- a/src/Agent.Worker/TaskRunner.cs +++ b/src/Agent.Worker/TaskRunner.cs @@ -53,6 +53,8 @@ public sealed class TaskRunner : AgentService, ITaskRunner public Pipelines.StepTarget Target => Task?.Target; + const int RetryCountOnTaskFailureLimit = 10; + public async Task RunAsync() { // Validate args. @@ -362,7 +364,23 @@ public async Task RunAsync() taskDirectory: definition.Directory); // Run the task. - await handler.RunAsync(); + int retryCount = this.Task.RetryCountOnTaskFailure; + + if (retryCount > 0) + { + if (retryCount > RetryCountOnTaskFailureLimit) + { + ExecutionContext.Warning(StringUtil.Loc("RetryCountLimitExceeded", RetryCountOnTaskFailureLimit, retryCount)); + retryCount = RetryCountOnTaskFailureLimit; + } + + RetryHelper rh = new RetryHelper(ExecutionContext, retryCount); + await rh.RetryStep(async () => await handler.RunAsync(), RetryHelper.ExponentialDelay); + } + else + { + await handler.RunAsync(); + } } } diff --git a/src/Common.props b/src/Common.props index ddf36ed233..8a0743483e 100644 --- a/src/Common.props +++ b/src/Common.props @@ -10,7 +10,7 @@ OS_UNKNOWN ARCH_UNKNOWN - 0.5.170-private + 0.5.171-private $(CodeAnalysis) diff --git a/src/Misc/layoutbin/en-US/strings.json b/src/Misc/layoutbin/en-US/strings.json index 31b086b684..6c6862641a 100644 --- a/src/Misc/layoutbin/en-US/strings.json +++ b/src/Misc/layoutbin/en-US/strings.json @@ -474,6 +474,7 @@ "RepositoryNotExist": "Can't update repository, the repository does not exist.", "RestartIn15SecMessage": "Restarting the machine in 15 seconds...", "RestartMessage": "Restart the machine to launch agent and for autologon settings to take effect.", + "RetryCountLimitExceeded": "The maximum allowed number of attempts is {0} but got {1}. Retry attempts count will be decreased to {0}.", "RMApiFailure": "Api {0} failed with an error code {1}", "RMArtifactContainerDetailsInvalidError": "The artifact does not have valid container details: {0}", "RMArtifactContainerDetailsNotFoundError": "The artifact does not contain container details: {0}",