Skip to content

Commit

Permalink
Refactor conditions to return error
Browse files Browse the repository at this point in the history
Currently, all conditions steps return InternalServerError and only
looking at the RP logs will give detailed understanding on the failure,
this PR attempts to change this by adding a wrapper function to exiting
poll function to return readable error.
  • Loading branch information
SrinivasAtmakuri committed Jul 11, 2023
1 parent f87f198 commit 748932d
Show file tree
Hide file tree
Showing 2 changed files with 137 additions and 0 deletions.
42 changes: 42 additions & 0 deletions pkg/util/steps/condition.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,35 @@ package steps

import (
"context"
"errors"
"fmt"
"net/http"
"strings"
"time"

"github.com/sirupsen/logrus"
"k8s.io/apimachinery/pkg/util/wait"

"github.com/Azure/ARO-RP/pkg/api"
)

// Functions that run as condition-steps return Error
// instead of InternalServerError
// Efforts are being made to not have generic Hive errors but specific, actionable failure cases.
// Instead of providing Hive-specific error messages to customers, the below will send a timeout error message.
var timeoutConditionErrors = map[string]string{
"apiServersReady": "Kube API has not initialised successfully and is unavailable.",
"minimumWorkerNodesReady": "Minimum number of worker nodes have not been successfully created.",
"operatorConsoleExists": "Console Cluster Operator has failed to initialize successfully.",
"operatorConsoleReady": "Console Cluster Operator has not started successfully.",
"clusterVersionReady": "Cluster Verion is not reporting status as ready.",
"ingressControllerReady": "Ingress Cluster Operator has not started successfully.",
"aroDeploymentReady": "ARO Cluster Operator has failed to initialize successfully.",
"ensureAROOperatorRunningDesiredVersion": "ARO Cluster Operator is not running desired version.",
"hiveClusterDeploymentReady": "Timed out waiting for a condition, cluster Installation is unsuccessful.",
"hiveClusterInstallationComplete": "Timed out waiting for a condition, cluster Installation is unsuccessful.",
}

// conditionFunction is a function that takes a context and returns whether the
// condition has been met and an error.
//
Expand Down Expand Up @@ -66,9 +88,29 @@ func (c conditionStep) run(ctx context.Context, log *logrus.Entry) error {
log.Warnf("step %s failed but has configured 'fail=%t'. Continuing. Error: %s", c, c.fail, err.Error())
return nil
}
if errors.Is(err, wait.ErrWaitTimeout) {
return enrichConditionTimeoutError(c.f)
}
return err
}

// Instead of giving Generic, timed out waiting for a condition, error
// returns enriched error messages mentioned in timeoutConditionErrors
func enrichConditionTimeoutError(f conditionFunction) error {
funcNameParts := strings.Split(FriendlyName(f), ".")
funcName := strings.TrimRight(funcNameParts[len(funcNameParts)-1], "-fm")

message, exists := timeoutConditionErrors[funcName]
if !exists {
return errors.New("timed out waiting for the condition")
}
return api.NewCloudError(
http.StatusInternalServerError,
api.CloudErrorCodeDeploymentFailed,
"", message+"Please retry, if issue persists: raise azure support ticket",
)
}

func (c conditionStep) String() string {
return fmt.Sprintf("[Condition %s, timeout %s]", FriendlyName(c.f), c.timeout)
}
Expand Down
95 changes: 95 additions & 0 deletions pkg/util/steps/condition_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package steps

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
"context"
"testing"
)

// functionnames that will be used in the conditionFunction below
// All the keys of map timeoutConditionErrors
func apiServersReady(context.Context) (bool, error) { return false, nil }
func minimumWorkerNodesReady(context.Context) (bool, error) { return false, nil }
func operatorConsoleExists(context.Context) (bool, error) { return false, nil }
func operatorConsoleReady(context.Context) (bool, error) { return false, nil }
func clusterVersionReady(context.Context) (bool, error) { return false, nil }
func ingressControllerReady(context.Context) (bool, error) { return false, nil }
func aroDeploymentReady(context.Context) (bool, error) { return false, nil }
func ensureAROOperatorRunningDesiredVersion(context.Context) (bool, error) { return false, nil }
func hiveClusterDeploymentReady(context.Context) (bool, error) { return false, nil }
func hiveClusterInstallationComplete(context.Context) (bool, error) { return false, nil }

func TestEnrichConditionTimeoutError(t *testing.T) {
for _, tt := range []struct {
desc string
function conditionFunction
wantErr string
}{
// Verify response for func's mention in timeoutConditionErrors and
// Emit generic Error if an unknown func
{
// unknown function
desc: "test conditionfail for func - unknownFunc",
function: timingOutCondition,
wantErr: "timed out waiting for the condition",
},
{
desc: "test conditionfail for func - apiServersReady",
function: apiServersReady,
wantErr: "500: DeploymentFailed: : Kube API has not initialised successfully and is unavailable.Please retry, if issue persists: raise azure support ticket",
},
{
desc: "test conditionfail for func - minimumWorkerNodesReady",
function: minimumWorkerNodesReady,
wantErr: "500: DeploymentFailed: : Minimum number of worker nodes have not been successfully created.Please retry, if issue persists: raise azure support ticket",
},
{
desc: "test conditionfail for func - operatorConsoleExists",
function: operatorConsoleExists,
wantErr: "500: DeploymentFailed: : Console Cluster Operator has failed to initialize successfully.Please retry, if issue persists: raise azure support ticket",
},
{
desc: "test conditionfail for func - operatorConsoleReady",
function: operatorConsoleReady,
wantErr: "500: DeploymentFailed: : Console Cluster Operator has not started successfully.Please retry, if issue persists: raise azure support ticket",
},
{
desc: "test conditionfail for func - clusterVersionReady",
function: clusterVersionReady,
wantErr: "500: DeploymentFailed: : Cluster Verion is not reporting status as ready.Please retry, if issue persists: raise azure support ticket",
},
{
desc: "test conditionfail for func - clusterVersionReady",
function: ingressControllerReady,
wantErr: "500: DeploymentFailed: : Ingress Cluster Operator has not started successfully.Please retry, if issue persists: raise azure support ticket",
},
{
desc: "test conditionfail for func - aroDeploymentReady",
function: aroDeploymentReady,
wantErr: "500: DeploymentFailed: : ARO Cluster Operator has failed to initialize successfully.Please retry, if issue persists: raise azure support ticket",
},
{
desc: "test conditionfail for func - ensureAROOperatorRunningDesiredVersion",
function: ensureAROOperatorRunningDesiredVersion,
wantErr: "500: DeploymentFailed: : ARO Cluster Operator is not running desired version.Please retry, if issue persists: raise azure support ticket",
},
{
desc: "test conditionfail for func - hiveClusterDeploymentReady",
function: hiveClusterDeploymentReady,
wantErr: "500: DeploymentFailed: : Timed out waiting for a condition, cluster Installation is unsuccessful.Please retry, if issue persists: raise azure support ticket",
},
{
desc: "test conditionfail for func - hiveClusterInstallationComplete",
function: hiveClusterInstallationComplete,
wantErr: "500: DeploymentFailed: : Timed out waiting for a condition, cluster Installation is unsuccessful.Please retry, if issue persists: raise azure support ticket",
},
} {
t.Run(tt.desc, func(t *testing.T) {
if got := enrichConditionTimeoutError(tt.function); got.Error() != tt.wantErr {
t.Errorf("invlaid enrichConditionTimeoutError: %s, got: %s", tt.wantErr, got)
}
})
}
}

0 comments on commit 748932d

Please sign in to comment.