Skip to content

Commit

Permalink
Add the ability to retry the pipeline task if execution failed (#3505)
Browse files Browse the repository at this point in the history
  • Loading branch information
max-zaytsev authored Oct 14, 2021
1 parent 4b7aaaf commit 9839e29
Show file tree
Hide file tree
Showing 11 changed files with 181 additions and 33 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,8 @@ internal sealed class DeploymentTarget: IPhaseTarget
internal IList<String> Tags { get; set; }

internal String TimeoutInMinutes { get; set; }

/// <summary>Number of retries for task failure</summary>
internal String RetryCountOnTaskFailure { get; set; }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ internal sealed class TaskStep : ISimpleStep

internal Int32 TimeoutInMinutes { get; set; }

/// <summary>Number of retries for task failure</summary>
internal Int32 RetryCountOnTaskFailure { get; set; }

public ISimpleStep Clone()
{
return new TaskStep()
Expand All @@ -36,6 +39,7 @@ public ISimpleStep Clone()
Inputs = new Dictionary<String, String>(Inputs ?? new Dictionary<String, String>(0, StringComparer.OrdinalIgnoreCase)),
Reference = Reference?.Clone(),
TimeoutInMinutes = TimeoutInMinutes,
RetryCountOnTaskFailure = RetryCountOnTaskFailure
};
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,10 @@ internal static DeploymentTarget ReadDeploymentTarget(IParser parser)
result.TimeoutInMinutes = ReadNonEmptyString(parser);
break;

case YamlConstants.RetryCountOnTaskFailure:
result.RetryCountOnTaskFailure = ReadNonEmptyString(parser);
break;

default:
throw new SyntaxErrorException(scalar.Start, scalar.End, $"Unexpected property: '{scalar.Value}'");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,5 +50,6 @@ internal static class YamlConstants
internal const String Value = "value";
internal const String Variables = "variables";
internal const String WorkingDirectory = "workingDirectory";
internal const String RetryCountOnTaskFailure = "retryCountOnTaskFailure";
}
}
32 changes: 30 additions & 2 deletions src/Agent.Worker/ExecutionContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,16 @@ public interface IExecutionContext : IAgentService, IKnobValueContext
void SetStepTarget(Pipelines.StepTarget target);
string TranslatePathForStepTarget(string val);
IHostContext GetHostContext();
/// <summary>
/// Re-initializes the force-completed state so that it can be used again
/// by the next retry attempt.
/// </summary>
void ReInitializeForceCompleted();
/// <summary>
/// Cancels a pending forced task completion; intended to be called
/// between retry attempts.
/// </summary>
void CancelForceTaskCompletion();
}

public sealed class ExecutionContext : AgentService, IExecutionContext, IDisposable
Expand All @@ -96,6 +106,7 @@ public sealed class ExecutionContext : AgentService, IExecutionContext, IDisposa
private Guid _detailTimelineId;
private int _childTimelineRecordOrder = 0;
private CancellationTokenSource _cancellationTokenSource;
private CancellationTokenSource _forceCompleteCancellationTokenSource = new CancellationTokenSource();
private TaskCompletionSource<int> _forceCompleted = new TaskCompletionSource<int>();
private bool _throttlingReported = false;
private ExecutionTargetInfo _defaultStepTarget;
Expand All @@ -112,6 +123,7 @@ public sealed class ExecutionContext : AgentService, IExecutionContext, IDisposa
public Guid Id => _record.Id;
public Task ForceCompleted => _forceCompleted.Task;
public CancellationToken CancellationToken => _cancellationTokenSource.Token;
public CancellationToken ForceCompleteCancellationToken => _forceCompleteCancellationTokenSource.Token;
public List<ServiceEndpoint> Endpoints { get; private set; }
public List<SecureFile> SecureFiles { get; private set; }
public List<Pipelines.RepositoryResource> Repositories { get; private set; }
Expand Down Expand Up @@ -181,11 +193,20 @@ public void ForceTaskComplete()
Trace.Info("Force finish current task in 5 sec.");
Task.Run(async () =>
{
await Task.Delay(TimeSpan.FromSeconds(5));
_forceCompleted?.TrySetResult(1);
await Task.Delay(TimeSpan.FromSeconds(5), ForceCompleteCancellationToken);
if (!ForceCompleteCancellationToken.IsCancellationRequested)
{
_forceCompleted?.TrySetResult(1);
}
});
}

/// <summary>
/// Requests cancellation on the force-complete cancellation token source,
/// logging the fact first. Used between retry attempts to cancel a pending
/// forced task completion.
/// </summary>
public void CancelForceTaskCompletion()
{
    Trace.Info("Forced completion canceled");
    _forceCompleteCancellationTokenSource.Cancel();
}

[System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1721: Property names should not match get methods")]
public IHostContext GetHostContext()
{
Expand Down Expand Up @@ -834,9 +855,16 @@ public IScopedEnvironment GetScopedEnvironment()
return new SystemEnvironment();
}

/// <summary>
/// Replaces the force-completed task source and its cancellation token source
/// with fresh instances, so a new attempt starts from a clean
/// (not completed, not canceled) state.
/// </summary>
public void ReInitializeForceCompleted()
{
    var freshCompletionSource = new TaskCompletionSource<int>();
    var freshCancellationSource = new CancellationTokenSource();

    _forceCompleted = freshCompletionSource;
    _forceCompleteCancellationTokenSource = freshCancellationSource;
}

public void Dispose()
{
_cancellationTokenSource?.Dispose();
_forceCompleteCancellationTokenSource?.Dispose();

_buildLogsWriter?.Dispose();
_buildLogsWriter = null;
Expand Down
48 changes: 28 additions & 20 deletions src/Agent.Worker/Handlers/NodeHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -134,29 +134,37 @@ public async Task RunAsync()
outputEncoding = Encoding.UTF8;
}

// Execute the process. Exit code 0 should always be returned.
// A non-zero exit code indicates infrastructural failure.
// Task failure should be communicated over STDOUT using ## commands.
Task step = StepHost.ExecuteAsync(workingDirectory: StepHost.ResolvePathForStepHost(workingDirectory),
fileName: StepHost.ResolvePathForStepHost(file),
arguments: arguments,
environment: Environment,
requireExitCodeZero: true,
outputEncoding: outputEncoding,
killProcessOnCancel: false,
inheritConsoleHandler: !ExecutionContext.Variables.Retain_Default_Encoding,
cancellationToken: ExecutionContext.CancellationToken);

// Wait for either the node exit or force finish through ##vso command
await System.Threading.Tasks.Task.WhenAny(step, ExecutionContext.ForceCompleted);

if (ExecutionContext.ForceCompleted.IsCompleted)
try
{
ExecutionContext.Debug("The task was marked as \"done\", but the process has not closed after 5 seconds. Treating the task as complete.");
// Execute the process. Exit code 0 should always be returned.
// A non-zero exit code indicates infrastructural failure.
// Task failure should be communicated over STDOUT using ## commands.
Task step = StepHost.ExecuteAsync(workingDirectory: StepHost.ResolvePathForStepHost(workingDirectory),
fileName: StepHost.ResolvePathForStepHost(file),
arguments: arguments,
environment: Environment,
requireExitCodeZero: true,
outputEncoding: outputEncoding,
killProcessOnCancel: false,
inheritConsoleHandler: !ExecutionContext.Variables.Retain_Default_Encoding,
cancellationToken: ExecutionContext.CancellationToken);

// Wait for either the node exit or force finish through ##vso command
await System.Threading.Tasks.Task.WhenAny(step, ExecutionContext.ForceCompleted);

if (ExecutionContext.ForceCompleted.IsCompleted)
{
ExecutionContext.Debug("The task was marked as \"done\", but the process has not closed after 5 seconds. Treating the task as complete.");
}
else
{
await step;
}
}
else
finally
{
await step;
StepHost.OutputDataReceived -= OnDataReceived;
StepHost.ErrorDataReceived -= OnDataReceived;
}
}

Expand Down
26 changes: 17 additions & 9 deletions src/Agent.Worker/Handlers/PowerShell3Handler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,23 @@ public async Task RunAsync()
// Execute the process. Exit code 0 should always be returned.
// A non-zero exit code indicates infrastructural failure.
// Task failure should be communicated over STDOUT using ## commands.
await StepHost.ExecuteAsync(workingDirectory: StepHost.ResolvePathForStepHost(scriptDirectory),
fileName: powerShellExe,
arguments: powerShellExeArgs,
environment: Environment,
requireExitCodeZero: true,
outputEncoding: null,
killProcessOnCancel: false,
inheritConsoleHandler: !ExecutionContext.Variables.Retain_Default_Encoding,
cancellationToken: ExecutionContext.CancellationToken);
try
{
await StepHost.ExecuteAsync(workingDirectory: StepHost.ResolvePathForStepHost(scriptDirectory),
fileName: powerShellExe,
arguments: powerShellExeArgs,
environment: Environment,
requireExitCodeZero: true,
outputEncoding: null,
killProcessOnCancel: false,
inheritConsoleHandler: !ExecutionContext.Variables.Retain_Default_Encoding,
cancellationToken: ExecutionContext.CancellationToken);
}
finally
{
StepHost.OutputDataReceived -= OnDataReceived;
StepHost.ErrorDataReceived -= OnDataReceived;
}
}

private void OnDataReceived(object sender, ProcessDataReceivedEventArgs e)
Expand Down
73 changes: 73 additions & 0 deletions src/Agent.Worker/RetryHelper.cs
Original file line number Diff line number Diff line change
@@ -1,15 +1,29 @@
using System;
using System.Threading.Tasks;
using Microsoft.TeamFoundation.DistributedTask.WebApi;

namespace Microsoft.VisualStudio.Services.Agent.Worker
{
internal class RetryHelper
{
/// <summary>
/// Computes the delay to wait before the next retry.
/// The result is (retryNumber + 1) squared, in seconds, expressed as milliseconds:
/// 1000 ms for retry 0, 4000 ms for retry 1, 9000 ms for retry 2, and so on.
/// Note that, despite the method name, the growth is quadratic rather than exponential.
/// </summary>
/// <param name="retryNumber">Retry number, starting from 0.</param>
/// <returns>Delay in milliseconds.</returns>
public static int ExponentialDelay(int retryNumber)
{
    double attemptOrdinal = retryNumber + 1;
    double delaySeconds = Math.Pow(attemptOrdinal, 2);
    return (int)(delaySeconds * 1000);
}


/// <summary>
/// Creates a retry helper bound to an execution context. Debug and warning
/// messages are forwarded to that context, and the context itself is kept
/// for use by the helper.
/// </summary>
/// <param name="executionContext">Execution context used for logging and stored on the helper.</param>
/// <param name="maxRetries">Maximum number of retries; defaults to 3.</param>
public RetryHelper(IExecutionContext executionContext, int maxRetries = 3)
{
    ExecutionContext = executionContext;
    MaxRetries = maxRetries;
    Debug = (message) => executionContext.Debug(message);
    Warning = (message) => executionContext.Warning(message);
}

public RetryHelper(IAsyncCommandContext commandContext, int maxRetries = 3)
Expand Down Expand Up @@ -48,6 +62,59 @@ public async Task<T> Retry<T>(Func<Task<T>> action, Func<int, int> timeDelayInte

}

/// <summary>
/// Runs the step action and re-runs it while it keeps failing, until the retry count is exhausted.
/// An attempt is treated as failed when, after the action completes, the execution context result
/// is <see cref="TaskResult.Failed"/>, or when the action throws an exception that is neither a
/// <see cref="TimeoutException"/> nor an <see cref="OperationCanceledException"/>.
/// Between attempts the pending forced task completion is canceled, the helper waits for the
/// computed delay, and the force-completed state is re-initialized before the next invocation.
/// </summary>
/// <param name="action">Action to execute with retries</param>
/// <param name="timeDelayInterval">Function to calculate delay (in milliseconds) between retries depending on retry number. Should take retry number as argument and consider that it starts from 0.</param>
/// <returns>A task that completes when an attempt does not fail or the retry count is exhausted.</returns>
/// <exception cref="ArgumentNullException"><paramref name="action"/> or <paramref name="timeDelayInterval"/> is null.</exception>
/// <exception cref="InvalidOperationException">The helper has no execution context to track the step result with.</exception>
public async Task RetryStep(Func<Task> action, Func<int, int> timeDelayInterval)
{
    if (action == null)
    {
        throw new ArgumentNullException(nameof(action));
    }

    if (timeDelayInterval == null)
    {
        throw new ArgumentNullException(nameof(timeDelayInterval));
    }

    // The step result and force-completion state live on the execution context;
    // fail before running the step rather than with a NullReferenceException after it.
    if (ExecutionContext == null)
    {
        throw new InvalidOperationException($"{nameof(RetryStep)} requires a {nameof(RetryHelper)} created with an execution context.");
    }

    int retryCounter = 0;
    while (true)
    {
        using (new SimpleTimer($"RetryHelper Method:{action.Method} ", Debug))
        {
            // Computed once per attempt so the logged delay and the actual wait always agree.
            int delayInterval = timeDelayInterval(retryCounter);
            try
            {
                if (retryCounter > 0)
                {
                    //ReInitialize _forceCompleted and _forceCompleteCancellationTokenSource
                    ExecutionContext.ReInitializeForceCompleted();
                }

                Debug($"Invoking Method: {action.Method}. Attempt count: {retryCounter}");
                await action();

                if (ExecutionContext.Result != TaskResult.Failed || ExhaustedRetryCount(retryCounter))
                {
                    return;
                }

                // Clear the failed result so the next attempt starts without it.
                ExecutionContext.Result = null;
                Warning($"RetryHelper encountered task failure, will retry (attempt #: {retryCounter + 1} out of {this.MaxRetries}) after {delayInterval} ms");
            }
            catch (Exception ex)
            {
                if (!ShouldRetryStepOnException(ex) || ExhaustedRetryCount(retryCounter))
                {
                    throw;
                }

                Warning($"RetryHelper encountered exception, will retry (attempt #: {retryCounter + 1} {ex.Message}) after {delayInterval} ms");
            }

            //Cancel force task completion before the next attempt
            ExecutionContext.CancelForceTaskCompletion();

            await Task.Delay(delayInterval, ExecutionContext.CancellationToken);
            retryCounter++;
        }
    }
}

private bool ExhaustedRetryCount(int retryCount)
{
if (retryCount >= MaxRetries)
Expand All @@ -58,8 +125,14 @@ private bool ExhaustedRetryCount(int retryCount)
return false;
}

/// <summary>
/// Decides whether a step that threw the given exception may be retried.
/// Timeouts and cancellations (including exception types derived from them) are not retried.
/// </summary>
/// <param name="exception">Exception thrown by the step.</param>
/// <returns>False for TimeoutException and OperationCanceledException; true otherwise.</returns>
private bool ShouldRetryStepOnException(Exception exception)
{
    bool isTimeoutOrCancellation = exception is TimeoutException || exception is OperationCanceledException;
    return !isTimeoutOrCancellation;
}

// Maximum number of retries, as passed to the constructor.
private readonly int MaxRetries;
// Callback used to write debug messages.
private readonly Action<string> Debug;
// Callback used to write warning messages.
private readonly Action<string> Warning;
// Execution context assigned by the IExecutionContext-based constructor.
private readonly IExecutionContext ExecutionContext;
}
}
20 changes: 19 additions & 1 deletion src/Agent.Worker/TaskRunner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ public sealed class TaskRunner : AgentService, ITaskRunner

public Pipelines.StepTarget Target => Task?.Target;

const int RetryCountOnTaskFailureLimit = 10;

public async Task RunAsync()
{
// Validate args.
Expand Down Expand Up @@ -362,7 +364,23 @@ public async Task RunAsync()
taskDirectory: definition.Directory);

// Run the task.
await handler.RunAsync();
int retryCount = this.Task.RetryCountOnTaskFailure;

if (retryCount > 0)
{
if (retryCount > RetryCountOnTaskFailureLimit)
{
ExecutionContext.Warning(StringUtil.Loc("RetryCountLimitExceeded", RetryCountOnTaskFailureLimit, retryCount));
retryCount = RetryCountOnTaskFailureLimit;
}

RetryHelper rh = new RetryHelper(ExecutionContext, retryCount);
await rh.RetryStep(async () => await handler.RunAsync(), RetryHelper.ExponentialDelay);
}
else
{
await handler.RunAsync();
}
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/Common.props
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
<OSPlatform>OS_UNKNOWN</OSPlatform>
<OSArchitecture>ARCH_UNKNOWN</OSArchitecture>
<DebugConstant></DebugConstant>
<VssApiVersion>0.5.170-private</VssApiVersion>
<VssApiVersion>0.5.171-private</VssApiVersion>
<CodeAnalysis>$(CodeAnalysis)</CodeAnalysis>
</PropertyGroup>

Expand Down
1 change: 1 addition & 0 deletions src/Misc/layoutbin/en-US/strings.json
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,7 @@
"RepositoryNotExist": "Can't update repository, the repository does not exist.",
"RestartIn15SecMessage": "Restarting the machine in 15 seconds...",
"RestartMessage": "Restart the machine to launch agent and for autologon settings to take effect.",
"RetryCountLimitExceeded": "The maximum allowed number of attempts is {0} but got {1}. Retry attempts count will be decreased to {0}.",
"RMApiFailure": "Api {0} failed with an error code {1}",
"RMArtifactContainerDetailsInvalidError": "The artifact does not have valid container details: {0}",
"RMArtifactContainerDetailsNotFoundError": "The artifact does not contain container details: {0}",
Expand Down

0 comments on commit 9839e29

Please sign in to comment.