Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle Hive ProvisionFailed=True conditions within RP and pass errors in API response #3116

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions hack/genhiveconfig/genhiveconfig.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
package main

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
"context"
"os"

"github.com/ghodss/yaml"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

failure "github.com/Azure/ARO-RP/pkg/hive/failure"
utillog "github.com/Azure/ARO-RP/pkg/util/log"
)

const (
hiveNamespaceName = "hive"
configMapName = "additional-install-log-regexes"
regexDataEntryName = "regexes"
)

type installLogRegex struct {
Name string `json:"name"`
SearchRegexStrings []string `json:"searchRegexStrings"`
InstallFailingReason string `json:"installFailingReason"`
InstallFailingMessage string `json:"installFailingMessage"`
}

func run(ctx context.Context, path string) error {
ilrs := []installLogRegex{}

for _, reason := range failure.Reasons {
ilrs = append(ilrs, failureReasonToInstallLogRegex(reason))
}

ilrsRaw, err := yaml.Marshal(ilrs)
if err != nil {
return err
}

configmap := &corev1.ConfigMap{
TypeMeta: metav1.TypeMeta{
APIVersion: corev1.SchemeGroupVersion.String(),
Kind: "ConfigMap",
},
ObjectMeta: metav1.ObjectMeta{
Namespace: hiveNamespaceName,
Name: configMapName,
},
Data: map[string]string{
regexDataEntryName: string(ilrsRaw),
},
}

configmapRaw, err := yaml.Marshal(configmap)
if err != nil {
return err
}

if path != "" {
return os.WriteFile(path, configmapRaw, 0666)
} else {
Comment on lines +62 to +64
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: if we can always genhiveconfig.go | oc apply -f - is there a reason we'd need to write to FS, or can we remove this logic for simplicity?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't plumbed this file through to our actual deployment pipelines. As of right now it's manually copied from this repository into the pipelines and applied statically alongside other static Hive resources.

print(string(configmapRaw))
return nil
}
}

func failureReasonToInstallLogRegex(reason failure.InstallFailingReason) installLogRegex {
ilr := installLogRegex{
Name: reason.Name,
InstallFailingReason: reason.Reason,
InstallFailingMessage: reason.Message,
SearchRegexStrings: []string{},
}
for _, regex := range reason.SearchRegexes {
ilr.SearchRegexStrings = append(ilr.SearchRegexStrings, regex.String())
}
return ilr
}

func main() {
log := utillog.GetLogger()

path := ""
if len(os.Args) > 1 {
path = os.Args[1]
}

if err := run(context.Background(), path); err != nil {
log.Fatal(err)
}
}
37 changes: 37 additions & 0 deletions hack/genhiveconfig/genhiveconfig_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package main

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
"reflect"
"regexp"
"testing"

"github.com/Azure/ARO-RP/pkg/hive/failure"
)

func TestFailureReasonToInstallLogRegex(t *testing.T) {
input := failure.InstallFailingReason{
Name: "TestReason",
Reason: "AzureTestReason",
Message: "This is a sentence.",
SearchRegexes: []*regexp.Regexp{
regexp.MustCompile(".*"),
regexp.MustCompile("^$"),
},
}

want := installLogRegex{
Name: "TestReason",
InstallFailingReason: "AzureTestReason",
InstallFailingMessage: "This is a sentence.",
SearchRegexStrings: []string{".*", "^$"},
}

got := failureReasonToInstallLogRegex(input)

if !reflect.DeepEqual(got, want) {
t.Errorf("got %v, want %v", got, want)
}
}
6 changes: 6 additions & 0 deletions hack/hive-config/generate.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package hiveconfig

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

//go:generate go run ../genhiveconfig ./hive-additional-install-log-regexes.yaml
19 changes: 19 additions & 0 deletions hack/hive-config/hive-additional-install-log-regexes.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
apiVersion: v1
data:
regexes: |
- installFailingMessage: Deployment failed due to RequestDisallowedByPolicy. Please
see details for more information.
installFailingReason: AzureRequestDisallowedByPolicy
name: AzureRequestDisallowedByPolicy
searchRegexStrings:
- '"code":\w?"InvalidTemplateDeployment".*"code":\w?"RequestDisallowedByPolicy"'
- installFailingMessage: Deployment failed. Please see details for more information.
installFailingReason: AzureInvalidTemplateDeployment
name: AzureInvalidTemplateDeployment
searchRegexStrings:
- '"code":\w?"InvalidTemplateDeployment"'
kind: ConfigMap
Copy link
Collaborator

@SudoBrendan SudoBrendan Aug 29, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, ok now I'm wondering why we need genhiveconfig.go at all? What's wrong with us keeping a static resource in source control somewhere we apply via GitOps/etc? I guess I don't quite see the value of that script given this resource's existence since it's 99% of the way there, though I'm sure there's a reason.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The primary benefit here is to get unit tests for these regex patterns, as well as to tie our RP-side implementation to the same strings used in the ConfigMap.

We could just maintain the CM directly, just like Hive itself does, though they also maintain unit tests that send that ConfigMap through their implementation and assert that the regexes do as they're expected.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where this is this going to be deployed?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

another point to consider - when we want to add a regex (perhaps for 423s on storage account api calls) we need to do a full RP release vs a config+hive release.

metadata:
creationTimestamp: null
name: additional-install-log-regexes
namespace: hive
96 changes: 96 additions & 0 deletions pkg/hive/failure/handler.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
package failure
Copy link
Collaborator

@SudoBrendan SudoBrendan Aug 29, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

super-nit: I'm not sure I like the noun handler for this, since my assumption would be that this pkg is meant to implement golang's standard lib's net/http.Handler, but it might just be me :p

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, open to better name ideas for this "ThingThatTakesHiveProvisionFailed=TrueConditionsAndGeneratesAUsefulUserErrorFromThem"er (transformer? enricher?)


// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
"context"
"encoding/json"
"net/http"
"regexp"

mgmtfeatures "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2019-07-01/features"
hivev1 "github.com/openshift/hive/apis/hive/v1"
corev1 "k8s.io/api/core/v1"

"github.com/Azure/ARO-RP/pkg/api"
)

var genericErr = &api.CloudError{
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be worth exporting this / other err definitions and use them in your tests. I see we redeclare this in manager_test.go. Thoughts?

StatusCode: http.StatusInternalServerError,
CloudErrorBody: &api.CloudErrorBody{
Code: api.CloudErrorCodeInternalServerError,
Message: "Deployment failed.",
},
}

func HandleProvisionFailed(ctx context.Context, cd *hivev1.ClusterDeployment, cond hivev1.ClusterDeploymentCondition, installLog *string) error {
if cond.Status != corev1.ConditionTrue {
return nil
}

switch cond.Reason {
case AzureRequestDisallowedByPolicy.Reason:
armError, err := parseDeploymentFailedJson(*installLog)
if err != nil {
return err
}

return wrapArmError(
AzureRequestDisallowedByPolicy.Message,
*armError,
)
case AzureInvalidTemplateDeployment.Reason:
armError, err := parseDeploymentFailedJson(*installLog)
if err != nil {
return err
}

return wrapArmError(
AzureInvalidTemplateDeployment.Message,
*armError,
)
Comment on lines +43 to +52
Copy link
Collaborator

@SudoBrendan SudoBrendan Aug 29, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we update the PR description to include this too? It looks like we're processing two cases.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added this extra case and reasoning. Open to suggestions on changing how we handle this case as well (e.g. below comment)

default:
return genericErr
}
}

func parseDeploymentFailedJson(installLog string) (*mgmtfeatures.ErrorResponse, error) {
regex := regexp.MustCompile(`level=error msg=400: DeploymentFailed: : Deployment failed. Details: : : (\{.*\})`)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Trying to run my head through this... Is there ever a case where we might leak something like our credentials through an error like this to a customer? Should we perform sanitation of data like service principal IDs/tokens/passwords to be safe?

Copy link
Collaborator Author

@tsatam tsatam Aug 29, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My understanding was that the previous RP implementation (before Hive) would directly return this object to the customer, but I can validate exactly what was done then and ensure we do at least as thorough a job as the previous implementation did.

I'm also open to implementing specific transformations for specific sub-errors here, and not returning the generic error at all, and I think if we want to pursue that path, we should align other parts of the RP that handle RequestDisallowedByPolicy errors (e.g.

if detailedErr, ok := err.(autorest.DetailedError); ok {
if strings.Contains(detailedErr.Original.Error(), "RequestDisallowedByPolicy") {
return &api.CloudError{
StatusCode: http.StatusBadRequest,
CloudErrorBody: &api.CloudErrorBody{
Code: api.CloudErrorCodeRequestDisallowedByPolicy,
Message: fmt.Sprintf("Resource %s was disallowed by policy.",
subnetId[strings.LastIndex(subnetId, "/")+1:],
),
Details: []api.CloudErrorBody{
{
Code: api.CloudErrorCodeRequestDisallowedByPolicy,
Message: fmt.Sprintf("Policy definition : %s\nPolicy Assignment : %s",
regexp.MustCompile(`policyDefinitionName":"([^"]+)"`).FindStringSubmatch(detailedErr.Original.Error())[1],
regexp.MustCompile(`policyAssignmentName":"([^"]+)"`).FindStringSubmatch(detailedErr.Original.Error())[1],
),
},
},
},
}
}
}
) to return errors the same way.

rawJson := regex.FindStringSubmatch(installLog)[1]

armResponse := &mgmtfeatures.ErrorResponse{}
if err := json.Unmarshal([]byte(rawJson), armResponse); err != nil {
return nil, err
}
return armResponse, nil
}

func wrapArmError(errorMessage string, armError mgmtfeatures.ErrorResponse) *api.CloudError {
details := make([]api.CloudErrorBody, len(*armError.Details))
for i, detail := range *armError.Details {
details[i] = errorResponseToCloudErrorBody(detail)
}

return &api.CloudError{
StatusCode: http.StatusBadRequest,
CloudErrorBody: &api.CloudErrorBody{
Code: api.CloudErrorCodeDeploymentFailed,
Message: errorMessage,
Details: details,
},
}
}

func errorResponseToCloudErrorBody(errorResponse mgmtfeatures.ErrorResponse) api.CloudErrorBody {
body := api.CloudErrorBody{
Code: *errorResponse.Code,
Message: *errorResponse.Message,
}

if errorResponse.Target != nil {
body.Target = *errorResponse.Target
}

return body
}
38 changes: 38 additions & 0 deletions pkg/hive/failure/reasons.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package failure
Copy link
Contributor

@s-amann s-amann Oct 25, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we have to have this file? it is essentially a duplicate of our configuration. I understand the benefit of testing the regex, but we are testing a copy of it by duplicating the yaml into a go file.

hack/hive-config/hive-additional-install-log-regexes.yaml

edit: I suppose since the file going to production isn't the one above either, this is less of an issue but the same regex are now in triplicate.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The YAML is a generated artifact, which make generate will keep up-to-date based on the source-of-truth in this file.

As is, we would need to manually copy this YAML into our deployment pipeline repository, but I believe it would be possible to automate that as well.


// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import "regexp"

type InstallFailingReason struct {
Name string
Reason string
Message string
SearchRegexes []*regexp.Regexp
}

var Reasons = []InstallFailingReason{
// Order within this array determines precedence. Earlier entries will take
// priority over later ones.
Comment on lines +16 to +17
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure I understand this comment. Isn't this just establishing a list of errors that we range over? If not, how is priority decided?

AzureRequestDisallowedByPolicy,
AzureInvalidTemplateDeployment,
}

var AzureRequestDisallowedByPolicy = InstallFailingReason{
Name: "AzureRequestDisallowedByPolicy",
Reason: "AzureRequestDisallowedByPolicy",
Message: "Deployment failed due to RequestDisallowedByPolicy. Please see details for more information.",
SearchRegexes: []*regexp.Regexp{
regexp.MustCompile(`"code":\w?"InvalidTemplateDeployment".*"code":\w?"RequestDisallowedByPolicy"`),
},
}

var AzureInvalidTemplateDeployment = InstallFailingReason{
Name: "AzureInvalidTemplateDeployment",
Reason: "AzureInvalidTemplateDeployment",
Message: "Deployment failed. Please see details for more information.",
SearchRegexes: []*regexp.Regexp{
regexp.MustCompile(`"code":\w?"InvalidTemplateDeployment"`),
},
}
95 changes: 95 additions & 0 deletions pkg/hive/failure/reasons_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package failure

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
"reflect"
"regexp"
"testing"
)

func TestInstallFailingReasonRegexes(t *testing.T) {
for _, tt := range []struct {
name string
installLog string
want InstallFailingReason
}{
{
name: "InvalidTemplateDeployment - no known errors",
installLog: `
level=info msg=running in local development mode
level=info msg=creating development InstanceMetadata
level=info msg=InstanceMetadata: running on AzurePublicCloud
level=info msg=running step [Action github.com/openshift/ARO-Installer/pkg/installer.(*manager).Manifests.func1]
level=info msg=running step [Action github.com/openshift/ARO-Installer/pkg/installer.(*manager).Manifests.func2]
level=info msg=resolving graph
level=info msg=running step [Action github.com/openshift/ARO-Installer/pkg/installer.(*manager).Manifests.func3]
level=info msg=checking if graph exists
level=info msg=save graph
Generates the Ignition Config asset

level=info msg=running in local development mode
level=info msg=creating development InstanceMetadata
level=info msg=InstanceMetadata: running on AzurePublicCloud
level=info msg=running step [AuthorizationRetryingAction github.com/openshift/ARO-Installer/pkg/installer.(*manager).deployResourceTemplate-fm]
level=info msg=load persisted graph
level=info msg=deploying resources template
level=error msg=step [AuthorizationRetryingAction github.com/openshift/ARO-Installer/pkg/installer.(*manager).deployResourceTemplate-fm] encountered error: 400: DeploymentFailed: : Deployment failed. Details: : : {"code":"InvalidTemplateDeployment","message":"The template deployment failed with multiple errors. Please see details for more information.","details":[]}
level=error msg=400: DeploymentFailed: : Deployment failed. Details: : : {"code":"InvalidTemplateDeployment","message":"The template deployment failed with multiple errors. Please see details for more information.","details":[]}`,
want: AzureInvalidTemplateDeployment,
},
{
name: "InvalidTemplateDeployment - RequestDisallowedByPolicy",
installLog: `
level=info msg=running in local development mode
level=info msg=creating development InstanceMetadata
level=info msg=InstanceMetadata: running on AzurePublicCloud
level=info msg=running step [Action github.com/openshift/ARO-Installer/pkg/installer.(*manager).Manifests.func1]
level=info msg=running step [Action github.com/openshift/ARO-Installer/pkg/installer.(*manager).Manifests.func2]
level=info msg=resolving graph
level=info msg=running step [Action github.com/openshift/ARO-Installer/pkg/installer.(*manager).Manifests.func3]
level=info msg=checking if graph exists
level=info msg=save graph
Generates the Ignition Config asset

level=info msg=running in local development mode
level=info msg=creating development InstanceMetadata
level=info msg=InstanceMetadata: running on AzurePublicCloud
level=info msg=running step [AuthorizationRetryingAction github.com/openshift/ARO-Installer/pkg/installer.(*manager).deployResourceTemplate-fm]
level=info msg=load persisted graph
level=info msg=deploying resources template
level=error msg=step [AuthorizationRetryingAction github.com/openshift/ARO-Installer/pkg/installer.(*manager).deployResourceTemplate-fm] encountered error: 400: DeploymentFailed: : Deployment failed. Details: : : {"code":"InvalidTemplateDeployment","message":"The template deployment failed with multiple errors. Please see details for more information.","details":[{"additionalInfo":[],"code":"RequestDisallowedByPolicy","message":"Resource 'test-bootstrap' was disallowed by policy. Policy identifiers: ''.","target":"test-bootstrap"}]}
level=error msg=400: DeploymentFailed: : Deployment failed. Details: : : {"code":"InvalidTemplateDeployment","message":"The template deployment failed with multiple errors. Please see details for more information.","details":[{"additionalInfo":[],"code":"RequestDisallowedByPolicy","message":"Resource 'test-bootstrap' was disallowed by policy. Policy identifiers: ''.","target":"test-bootstrap"}]}`,
want: AzureRequestDisallowedByPolicy,
},
} {
t.Run(tt.name, func(t *testing.T) {
// This test uses a "mock" version of Hive's real implementation for matching install logs against regex patterns.
// https://github.com/bennerv/hive/blob/fec14dcf0-plus-base-image-update/pkg/controller/clusterprovision/installlogmonitor.go#L83
// The purpose of this test is to test the regular expressions themselves, not the implementation.
got := mockHiveIdentifyReason(tt.installLog)

if !reflect.DeepEqual(got, tt.want) {
t.Errorf("got %v, want %v", got, tt.want)
}
})
}
}

func mockHiveIdentifyReason(installLog string) InstallFailingReason {
for _, reason := range Reasons {
for _, regex := range reason.SearchRegexes {
if regex.MatchString(installLog) {
return reason
}
}
}

return InstallFailingReason{
Name: "UnknownError",
Reason: "UnknownError",
Message: installLog,
SearchRegexes: []*regexp.Regexp{},
}
}
Loading
Loading