diff --git a/hack/genhiveconfig/genhiveconfig.go b/hack/genhiveconfig/genhiveconfig.go
index eb49c3c521c..4986b710af5 100644
--- a/hack/genhiveconfig/genhiveconfig.go
+++ b/hack/genhiveconfig/genhiveconfig.go
@@ -18,7 +18,6 @@ import (
 const (
 	hiveNamespaceName  = "hive"
 	configMapName      = "additional-install-log-regexes"
-	configMapPath      = "hack/hive-config/hive-additional-install-log-regexes.yaml"
 	regexDataEntryName = "regexes"
 )
 
@@ -29,7 +28,9 @@ type installLogRegex struct {
 	InstallFailingMessage string `json:"installFailingMessage"`
 }
 
-func run(ctx context.Context) error {
+// run generates the install-log-regex configmap from failure.Reasons and writes
+// it to path, or to stdout when path is empty.
+func run(ctx context.Context, path string) error {
 	ilrs := []installLogRegex{}
 
 	for _, reason := range failure.Reasons {
@@ -59,7 +60,14 @@ func run(ctx context.Context) error {
 	if err != nil {
 		return err
 	}
-	return os.WriteFile(configMapPath, configmapRaw, 0666)
+
+	// Without a path the configmap goes to stdout. The print builtin is avoided
+	// here: it is a bootstrapping helper that writes to standard error.
+	if path == "" {
+		_, err = os.Stdout.Write(configmapRaw)
+		return err
+	}
+	return os.WriteFile(path, configmapRaw, 0666)
 }
 
 func failureReasonToInstallLogRegex(reason failure.InstallFailingReason) installLogRegex {
@@ -78,7 +86,13 @@ func failureReasonToInstallLogRegex(reason failure.InstallFailingReason) install
 func main() {
 	log := utillog.GetLogger()
 
-	if err := run(context.Background()); err != nil {
+	// The optional first argument is the output file; without it run writes to stdout.
+	path := ""
+	if len(os.Args) > 1 {
+		path = os.Args[1]
+	}
+
+	if err := run(context.Background(), path); err != nil {
 		log.Fatal(err)
 	}
 }
diff --git a/hack/hive-config/generate.go b/hack/hive-config/generate.go
new file mode 100644
index 00000000000..ed707639f86
--- /dev/null
+++ b/hack/hive-config/generate.go
@@ -0,0 +1,6 @@
+package main
+
+// Copyright (c) Microsoft Corporation.
+// Licensed under the Apache License 2.0.
+
+//go:generate go run ../genhiveconfig ./hive-additional-install-log-regexes.yaml
diff --git a/hack/hive-config/hive-additional-install-log-regexes.yaml b/hack/hive-config/hive-additional-install-log-regexes.yaml
index a6d4d75db91..ad5adc77410 100644
--- a/hack/hive-config/hive-additional-install-log-regexes.yaml
+++ b/hack/hive-config/hive-additional-install-log-regexes.yaml
@@ -1,8 +1,13 @@
 apiVersion: v1
 data:
   regexes: |
-    - installFailingMessage: The template deployment failed. Please see details for more
-      information.
+    - installFailingMessage: Deployment failed due to RequestDisallowedByPolicy. Please
+      see details for more information.
+      installFailingReason: AzureRequestDisallowedByPolicy
+      name: AzureRequestDisallowedByPolicy
+      searchRegexStrings:
+      - '"code":\w?"InvalidTemplateDeployment".*"code":\w?"RequestDisallowedByPolicy"'
+    - installFailingMessage: Deployment failed. Please see details for more information.
       installFailingReason: AzureInvalidTemplateDeployment
       name: AzureInvalidTemplateDeployment
       searchRegexStrings:
diff --git a/pkg/hive/failure/handler.go b/pkg/hive/failure/handler.go
new file mode 100644
index 00000000000..7d4a78b393a
--- /dev/null
+++ b/pkg/hive/failure/handler.go
@@ -0,0 +1,124 @@
+package failure
+
+// Copyright (c) Microsoft Corporation.
+// Licensed under the Apache License 2.0.
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"net/http"
+	"regexp"
+
+	mgmtfeatures "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2019-07-01/features"
+	hivev1 "github.com/openshift/hive/apis/hive/v1"
+	corev1 "k8s.io/api/core/v1"
+
+	"github.com/Azure/ARO-RP/pkg/api"
+)
+
+// deploymentFailedRegex captures the JSON payload that follows the
+// "DeploymentFailed" error line of an install log. It is compiled once at
+// package scope instead of on every call.
+var deploymentFailedRegex = regexp.MustCompile(`level=error msg=400: DeploymentFailed: : Deployment failed. Details: : : (\{.*\})`)
+
+// genericErr is returned for failed provisions whose reason has no dedicated
+// handling.
+var genericErr = &api.CloudError{
+	StatusCode: http.StatusInternalServerError,
+	CloudErrorBody: &api.CloudErrorBody{
+		Code:    api.CloudErrorCodeInternalServerError,
+		Message: "Deployment failed.",
+	},
+}
+
+// HandleProvisionFailed converts a ProvisionFailed condition into the error
+// reported for the cluster deployment. It returns nil when cond.Status is not
+// True. For the AzureRequestDisallowedByPolicy and AzureInvalidTemplateDeployment
+// reasons the error details are extracted from installLog; every other reason
+// yields genericErr. A nil or unparsable installLog results in a plain error
+// rather than a panic.
+func HandleProvisionFailed(ctx context.Context, cd *hivev1.ClusterDeployment, cond hivev1.ClusterDeploymentCondition, installLog *string) error {
+	if cond.Status != corev1.ConditionTrue {
+		return nil
+	}
+
+	switch cond.Reason {
+	case AzureRequestDisallowedByPolicy.Reason:
+		return deploymentFailedError(AzureRequestDisallowedByPolicy.Message, installLog)
+	case AzureInvalidTemplateDeployment.Reason:
+		return deploymentFailedError(AzureInvalidTemplateDeployment.Message, installLog)
+	default:
+		return genericErr
+	}
+}
+
+// deploymentFailedError builds the error for a reason whose details live in the
+// install log.
+func deploymentFailedError(message string, installLog *string) error {
+	if installLog == nil {
+		return errors.New("install log is not available")
+	}
+
+	armError, err := parseDeploymentFailedJson(*installLog)
+	if err != nil {
+		return err
+	}
+	return wrapArmError(message, *armError)
+}
+
+// parseDeploymentFailedJson extracts and decodes the ARM error JSON logged with
+// the DeploymentFailed error line. It returns an error when the log has no such
+// line or when the payload is not valid JSON.
+func parseDeploymentFailedJson(installLog string) (*mgmtfeatures.ErrorResponse, error) {
+	// FindStringSubmatch returns nil when nothing matches, so check before indexing.
+	match := deploymentFailedRegex.FindStringSubmatch(installLog)
+	if len(match) < 2 {
+		return nil, errors.New("install log does not contain a DeploymentFailed error")
+	}
+
+	armResponse := &mgmtfeatures.ErrorResponse{}
+	if err := json.Unmarshal([]byte(match[1]), armResponse); err != nil {
+		return nil, fmt.Errorf("decoding DeploymentFailed details: %w", err)
+	}
+	return armResponse, nil
+}
+
+// wrapArmError turns an ARM error response into a 400 DeploymentFailed
+// CloudError carrying errorMessage and one detail entry per ARM detail.
+func wrapArmError(errorMessage string, armError mgmtfeatures.ErrorResponse) *api.CloudError {
+	// Details is a pointer and stays nil when the payload has no "details" field.
+	var details []api.CloudErrorBody
+	if armError.Details != nil {
+		details = make([]api.CloudErrorBody, len(*armError.Details))
+		for i, detail := range *armError.Details {
+			details[i] = errorResponseToCloudErrorBody(detail)
+		}
+	}
+
+	return &api.CloudError{
+		StatusCode: http.StatusBadRequest,
+		CloudErrorBody: &api.CloudErrorBody{
+			Code:    api.CloudErrorCodeDeploymentFailed,
+			Message: errorMessage,
+			Details: details,
+		},
+	}
+}
+
+// errorResponseToCloudErrorBody copies code, message and target of an ARM error
+// into a CloudErrorBody, leaving fields absent from the response empty.
+func errorResponseToCloudErrorBody(errorResponse mgmtfeatures.ErrorResponse) api.CloudErrorBody {
+	var body api.CloudErrorBody
+	if errorResponse.Code != nil {
+		body.Code = *errorResponse.Code
+	}
+	if errorResponse.Message != nil {
+		body.Message = *errorResponse.Message
+	}
+	if errorResponse.Target != nil {
+		body.Target = *errorResponse.Target
+	}
+	return body
+}
diff --git a/pkg/hive/failure/reasons.go b/pkg/hive/failure/reasons.go
index cb6536402a8..414e0ed03e8 100644
--- a/pkg/hive/failure/reasons.go
+++ b/pkg/hive/failure/reasons.go
@@ -22,7 +22,7 @@ var Reasons = []InstallFailingReason{
 var AzureRequestDisallowedByPolicy = InstallFailingReason{
 	Name:    "AzureRequestDisallowedByPolicy",
 	Reason:  "AzureRequestDisallowedByPolicy",
-	Message: "Cluster Deployment was disallowed by policy. Please see install log for more information.",
+	Message: "Deployment failed due to RequestDisallowedByPolicy. Please see details for more information.",
 	SearchRegexes: []*regexp.Regexp{
 		regexp.MustCompile(`"code":\w?"InvalidTemplateDeployment".*"code":\w?"RequestDisallowedByPolicy"`),
 	},
@@ -31,7 +31,7 @@ var AzureRequestDisallowedByPolicy = InstallFailingReason{
 var AzureInvalidTemplateDeployment = InstallFailingReason{
 	Name:    "AzureInvalidTemplateDeployment",
 	Reason:  "AzureInvalidTemplateDeployment",
-	Message: "The template deployment failed. 
Please see install log for more information.",
+	Message: "Deployment failed. Please see details for more information.",
 	SearchRegexes: []*regexp.Regexp{
 		regexp.MustCompile(`"code":\w?"InvalidTemplateDeployment"`),
 	},
diff --git a/pkg/hive/manager.go b/pkg/hive/manager.go
index 4ad4d744a7c..311df397dc1 100644
--- a/pkg/hive/manager.go
+++ b/pkg/hive/manager.go
@@ -5,7 +5,8 @@ package hive
 
 import (
 	"context"
+	"errors"
 	"fmt"
 
 	hivev1 "github.com/openshift/hive/apis/hive/v1"
 	"github.com/sirupsen/logrus"
@@ -19,6 +20,7 @@ import (
 
 	"github.com/Azure/ARO-RP/pkg/api"
 	"github.com/Azure/ARO-RP/pkg/env"
+	"github.com/Azure/ARO-RP/pkg/hive/failure"
 	"github.com/Azure/ARO-RP/pkg/util/dynamichelper"
 	utillog "github.com/Azure/ARO-RP/pkg/util/log"
 	"github.com/Azure/ARO-RP/pkg/util/uuid"
@@ -195,14 +197,13 @@ func (hr *clusterManager) IsClusterInstallationComplete(ctx context.Context, doc
 		return true, nil
 	}
 
-	checkFailureConditions := map[hivev1.ClusterDeploymentConditionType]corev1.ConditionStatus{
-		hivev1.ProvisionFailedCondition: corev1.ConditionTrue,
-	}
-
 	for _, cond := range cd.Status.Conditions {
-		conditionStatus, found := checkFailureConditions[cond.Type]
-		if found && conditionStatus == cond.Status {
-			return false, fmt.Errorf("clusterdeployment has failed: %s == %s", cond.Type, cond.Status)
+		if cond.Type == hivev1.ProvisionFailedCondition && cond.Status == corev1.ConditionTrue {
+			log, err := hr.installLogsForDeployment(ctx, cd)
+			if err != nil {
+				return false, err
+			}
+			return false, failure.HandleProvisionFailed(ctx, cd, cond, log)
 		}
 	}
 
@@ -237,3 +238,32 @@ func (hr *clusterManager) ResetCorrelationData(ctx context.Context, doc *api.Ope
 		return hr.hiveClientset.Update(ctx, cd)
 	})
 }
+
+// installLogsForDeployment returns the install log of the latest
+// ClusterProvision of cd, i.e. the one with the highest Spec.Attempt. The
+// returned pointer is the provision's Spec.InstallLog and may be nil.
+func (hr *clusterManager) installLogsForDeployment(ctx context.Context, cd *hivev1.ClusterDeployment) (*string, error) {
+	provisionList := &hivev1.ClusterProvisionList{}
+	if err := hr.hiveClientset.List(
+		ctx,
+		provisionList,
+		client.InNamespace(cd.Namespace),
+		client.MatchingLabels(map[string]string{"hive.openshift.io/cluster-deployment-name": cd.Name}),
+	); err != nil {
+		hr.log.WithError(err).Warn("could not list provisions for clusterdeployment")
+		return nil, err
+	}
+	if len(provisionList.Items) == 0 {
+		return nil, errors.New("no provisions for deployment")
+	}
+
+	// A single pass finds the highest attempt; no need to copy and sort the list.
+	latest := &provisionList.Items[0]
+	for i := 1; i < len(provisionList.Items); i++ {
+		if p := &provisionList.Items[i]; p.Spec.Attempt > latest.Spec.Attempt {
+			latest = p
+		}
+	}
+
+	return latest.Spec.InstallLog, nil
+}
diff --git a/pkg/hive/manager_test.go b/pkg/hive/manager_test.go
index 0de2aa46d4c..082bd05560a 100644
--- a/pkg/hive/manager_test.go
+++ b/pkg/hive/manager_test.go
@@ -5,6 +5,7 @@ package hive
 
 import (
 	"context"
+	"net/http"
 	"reflect"
 	"testing"
 
@@ -17,6 +18,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client/fake"
 
 	"github.com/Azure/ARO-RP/pkg/api"
+	"github.com/Azure/ARO-RP/pkg/hive/failure"
 	"github.com/Azure/ARO-RP/pkg/util/cmp"
 	"github.com/Azure/ARO-RP/pkg/util/uuid"
 	uuidfake "github.com/Azure/ARO-RP/pkg/util/uuid/fake"
@@ -179,71 +181,200 @@ func TestIsClusterInstallationComplete(t *testing.T) {
 		},
 	}
 
+	genericErr := &api.CloudError{
+		StatusCode: http.StatusInternalServerError,
+		CloudErrorBody: &api.CloudErrorBody{
+			Code:    api.CloudErrorCodeInternalServerError,
+			Message: "Deployment failed.",
+		},
+	}
+
+	makeClusterDeployment := func(installed bool, provisionFailedCond hivev1.ClusterDeploymentCondition) *hivev1.ClusterDeployment {
+		return &hivev1.ClusterDeployment{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      ClusterDeploymentName,
+				Namespace: fakeNamespace,
+			},
+			Spec: hivev1.ClusterDeploymentSpec{
+				Installed: installed,
+			},
+			Status: hivev1.ClusterDeploymentStatus{
+				Conditions: []hivev1.ClusterDeploymentCondition{provisionFailedCond},
+			},
+		}
+	}
+	makeClusterProvision := func(installLog 
string) *hivev1.ClusterProvision { + return &hivev1.ClusterProvision{ + ObjectMeta: metav1.ObjectMeta{ + Name: ClusterDeploymentName + "-0-bbbbb", + Namespace: fakeNamespace, + Labels: map[string]string{ + "hive.openshift.io/cluster-deployment-name": ClusterDeploymentName, + }, + }, + Spec: hivev1.ClusterProvisionSpec{ + InstallLog: &installLog, + }, + } + } + for _, tt := range []struct { name string - cd kruntime.Object + cd *hivev1.ClusterDeployment + cp *hivev1.ClusterProvision wantResult bool - wantErr string + wantErr error }{ { name: "is installed", - cd: &hivev1.ClusterDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: ClusterDeploymentName, - Namespace: fakeNamespace, - }, - Spec: hivev1.ClusterDeploymentSpec{ - Installed: true, - }, - Status: hivev1.ClusterDeploymentStatus{ - Conditions: []hivev1.ClusterDeploymentCondition{ - { - Type: hivev1.ProvisionFailedCondition, - Status: corev1.ConditionFalse, - }, - }, + cd: makeClusterDeployment( + true, + hivev1.ClusterDeploymentCondition{ + Type: hivev1.ProvisionFailedCondition, + Status: corev1.ConditionFalse, }, - }, + ), wantResult: true, }, { name: "is not installed yet", - cd: &hivev1.ClusterDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: ClusterDeploymentName, - Namespace: fakeNamespace, + cd: makeClusterDeployment( + false, + hivev1.ClusterDeploymentCondition{ + Type: hivev1.ProvisionFailedCondition, + Status: corev1.ConditionFalse, + }, + ), + wantResult: false, + }, + { + name: "has failed provisioning - no Reason", + cd: makeClusterDeployment( + false, + hivev1.ClusterDeploymentCondition{ + Type: hivev1.ProvisionFailedCondition, + Status: corev1.ConditionTrue, }, - Spec: hivev1.ClusterDeploymentSpec{ - Installed: false, + ), + wantErr: genericErr, + wantResult: false, + }, + { + name: "has failed provisioning - Known Reason not relevant to ARO", + cd: makeClusterDeployment( + false, + hivev1.ClusterDeploymentCondition{ + Type: hivev1.ProvisionFailedCondition, + Status: corev1.ConditionTrue, + 
Reason: "AWSInsufficientCapacity", }, - Status: hivev1.ClusterDeploymentStatus{ - Conditions: []hivev1.ClusterDeploymentCondition{ - { - Type: hivev1.ProvisionFailedCondition, - Status: corev1.ConditionFalse, - }, - }, + ), + wantErr: genericErr, + wantResult: false, + }, + { + name: "has failed provisioning - UnknownError", + cd: makeClusterDeployment( + false, + hivev1.ClusterDeploymentCondition{ + Type: hivev1.ProvisionFailedCondition, + Status: corev1.ConditionTrue, + Reason: "UnknownError", }, - }, + ), + wantErr: genericErr, wantResult: false, }, { - name: "has failed provisioning", - cd: &hivev1.ClusterDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: ClusterDeploymentName, - Namespace: fakeNamespace, + name: "has failed provisioning - RequestDisallowedByPolicy", + cd: makeClusterDeployment( + false, + hivev1.ClusterDeploymentCondition{ + Type: hivev1.ProvisionFailedCondition, + Status: corev1.ConditionTrue, + Reason: failure.AzureRequestDisallowedByPolicy.Reason, }, - Status: hivev1.ClusterDeploymentStatus{ - Conditions: []hivev1.ClusterDeploymentCondition{ + ), + cp: makeClusterProvision(`level=info msg=running in local development mode + level=info msg=creating development InstanceMetadata + level=info msg=InstanceMetadata: running on AzurePublicCloud + level=info msg=running step [Action github.com/Azure/ARO-RP/pkg/installer.(*manager).Manifests.func1] + level=info msg=running step [Action github.com/Azure/ARO-RP/pkg/installer.(*manager).Manifests.func2] + level=info msg=resolving graph + level=info msg=running step [Action github.com/Azure/ARO-RP/pkg/installer.(*manager).Manifests.func3] + level=info msg=checking if graph exists + level=info msg=save graph + Generates the Ignition Config asset + + level=info msg=running in local development mode + level=info msg=creating development InstanceMetadata + level=info msg=InstanceMetadata: running on AzurePublicCloud + level=info msg=running step [AuthorizationRefreshingAction [Action 
github.com/Azure/ARO-RP/pkg/installer.(*manager).deployResourceTemplate-fm]] + level=info msg=load persisted graph + level=info msg=deploying resources template + level=error msg=step [AuthorizationRefreshingAction [Action github.com/Azure/ARO-RP/pkg/installer.(*manager).deployResourceTemplate-fm]] encountered error: 400: DeploymentFailed: : Deployment failed. Details: : : {"code": "InvalidTemplateDeployment","message": "The template deployment failed with multiple errors. Please see details for more information.","target": null,"details": [{"code": "RequestDisallowedByPolicy","message": "Resource 'aro-test-aaaaa-bootstrap' was disallowed by policy.","target": "aro-test-aaaaa-bootstrap"},{"code": "RequestDisallowedByPolicy","message": "Resource 'aro-test-aaaaa-master-0' was disallowed by policy.","target": "aro-test-aaaaa-master-0"},{"code": "RequestDisallowedByPolicy","message": "Resource 'aro-test-aaaaa-master-1' was disallowed by policy.","target": "aro-test-aaaaa-master-1"},{"code": "RequestDisallowedByPolicy","message": "Resource 'aro-test-aaaaa-master-2' was disallowed by policy.","target": "aro-test-aaaaa-master-2"}]} + level=error msg=400: DeploymentFailed: : Deployment failed. Details: : : {"code": "InvalidTemplateDeployment","message": "The template deployment failed with multiple errors. 
Please see details for more information.","target": null,"details": [{"code": "RequestDisallowedByPolicy","message": "Resource 'aro-test-aaaaa-bootstrap' was disallowed by policy.","target": "aro-test-aaaaa-bootstrap"},{"code": "RequestDisallowedByPolicy","message": "Resource 'aro-test-aaaaa-master-0' was disallowed by policy.","target": "aro-test-aaaaa-master-0"},{"code": "RequestDisallowedByPolicy","message": "Resource 'aro-test-aaaaa-master-1' was disallowed by policy.","target": "aro-test-aaaaa-master-1"},{"code": "RequestDisallowedByPolicy","message": "Resource 'aro-test-aaaaa-master-2' was disallowed by policy.","target": "aro-test-aaaaa-master-2"}]}`), + wantErr: &api.CloudError{ + StatusCode: http.StatusBadRequest, + CloudErrorBody: &api.CloudErrorBody{ + Code: api.CloudErrorCodeDeploymentFailed, + Message: "Deployment failed due to RequestDisallowedByPolicy. Please see details for more information.", + Details: []api.CloudErrorBody{ { - Type: hivev1.ProvisionFailedCondition, - Status: corev1.ConditionTrue, + Code: api.CloudErrorCodeRequestDisallowedByPolicy, + Message: "Resource 'aro-test-aaaaa-bootstrap' was disallowed by policy.", + Target: "aro-test-aaaaa-bootstrap", + }, + { + Code: api.CloudErrorCodeRequestDisallowedByPolicy, + Message: "Resource 'aro-test-aaaaa-master-0' was disallowed by policy.", + Target: "aro-test-aaaaa-master-0", + }, + { + Code: api.CloudErrorCodeRequestDisallowedByPolicy, + Message: "Resource 'aro-test-aaaaa-master-1' was disallowed by policy.", + Target: "aro-test-aaaaa-master-1", + }, + { + Code: api.CloudErrorCodeRequestDisallowedByPolicy, + Message: "Resource 'aro-test-aaaaa-master-2' was disallowed by policy.", + Target: "aro-test-aaaaa-master-2", }, }, }, }, - wantErr: "clusterdeployment has failed: ProvisionFailed == True", + wantResult: false, + }, + { + name: "has failed provisioning - InvalidTemplateDeployment", + cd: makeClusterDeployment( + false, + hivev1.ClusterDeploymentCondition{ + Type: 
hivev1.ProvisionFailedCondition, + Status: corev1.ConditionTrue, + Reason: failure.AzureInvalidTemplateDeployment.Reason, + }, + ), + cp: makeClusterProvision(`level=info msg=running in local development mode + level=info msg=creating development InstanceMetadata + level=info msg=InstanceMetadata: running on AzurePublicCloud + level=info msg=running step [Action github.com/Azure/ARO-RP/pkg/installer.(*manager).Manifests.func1] + level=info msg=running step [Action github.com/Azure/ARO-RP/pkg/installer.(*manager).Manifests.func2] + level=info msg=resolving graph + level=info msg=running step [Action github.com/Azure/ARO-RP/pkg/installer.(*manager).Manifests.func3] + level=info msg=checking if graph exists + level=info msg=save graph + Generates the Ignition Config asset + + level=info msg=running in local development mode + level=info msg=creating development InstanceMetadata + level=info msg=InstanceMetadata: running on AzurePublicCloud + level=info msg=running step [AuthorizationRefreshingAction [Action github.com/Azure/ARO-RP/pkg/installer.(*manager).deployResourceTemplate-fm]] + level=info msg=load persisted graph + level=info msg=deploying resources template + level=error msg=step [AuthorizationRefreshingAction [Action github.com/Azure/ARO-RP/pkg/installer.(*manager).deployResourceTemplate-fm]] encountered error: 400: DeploymentFailed: : Deployment failed. Details: : : {"code": "InvalidTemplateDeployment","message": "The template deployment failed with multiple errors. Please see details for more information.","target": null,"details": []} + level=error msg=400: DeploymentFailed: : Deployment failed. Details: : : {"code": "InvalidTemplateDeployment","message": "The template deployment failed with multiple errors. 
Please see details for more information.","target": null,"details": []}`), + wantErr: genericErr, wantResult: false, }, } { @@ -252,13 +383,20 @@ func TestIsClusterInstallationComplete(t *testing.T) { if tt.cd != nil { fakeClientBuilder = fakeClientBuilder.WithRuntimeObjects(tt.cd) } + if tt.cp != nil { + fakeClientBuilder = fakeClientBuilder.WithRuntimeObjects(tt.cp) + } else { + fakeClientBuilder = fakeClientBuilder.WithRuntimeObjects(makeClusterProvision("")) + } c := clusterManager{ hiveClientset: fakeClientBuilder.Build(), log: logrus.NewEntry(logrus.StandardLogger()), } result, err := c.IsClusterInstallationComplete(context.Background(), doc) - utilerror.AssertErrorMessage(t, err, tt.wantErr) + if diff := cmp.Diff(tt.wantErr, err); diff != "" { + t.Error(diff) + } if tt.wantResult != result { t.Error(result)