Skip to content

Commit

Permalink
[ARO-5368] Try and log the VM info + console log on failure (#3629)
Browse files Browse the repository at this point in the history
Dumps the VM info + console logs on failure so that we don't need to run the Geneva Action or have the control plane still around to get them. Also refactors frontend and geneva action to make use of the same code path.
  • Loading branch information
hawkowl authored Jul 15, 2024
1 parent 1b6e0e7 commit 31af734
Show file tree
Hide file tree
Showing 60 changed files with 806 additions and 230 deletions.
2 changes: 1 addition & 1 deletion cmd/aro/rp.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ func rp(ctx context.Context, log, audit *logrus.Entry) error {
if err != nil {
return err
}
f, err := frontend.NewFrontend(ctx, audit, log.WithField("component", "frontend"), _env, dbAsyncOperations, dbClusterManagerConfiguration, dbOpenShiftClusters, dbSubscriptions, dbOpenShiftVersions, dbPlatformWorkloadIdentityRoleSets, api.APIs, metrics, clusterm, feAead, hiveClusterManager, adminactions.NewKubeActions, adminactions.NewAzureActions, clusterdata.NewParallelEnricher(metrics, _env))
f, err := frontend.NewFrontend(ctx, audit, log.WithField("component", "frontend"), _env, dbAsyncOperations, dbClusterManagerConfiguration, dbOpenShiftClusters, dbSubscriptions, dbOpenShiftVersions, dbPlatformWorkloadIdentityRoleSets, api.APIs, metrics, clusterm, feAead, hiveClusterManager, adminactions.NewKubeActions, adminactions.NewAzureActions, adminactions.NewAppLensActions, clusterdata.NewParallelEnricher(metrics, _env))
if err != nil {
return err
}
Expand Down
34 changes: 34 additions & 0 deletions pkg/cluster/failurediagnostics/diagnostics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package failurediagnostics

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
"github.com/sirupsen/logrus"

"github.com/Azure/ARO-RP/pkg/api"
"github.com/Azure/ARO-RP/pkg/env"
"github.com/Azure/ARO-RP/pkg/util/azureclient/mgmt/compute"
)

// manager bundles the dependencies used to collect diagnostics about a
// cluster: a logger, the environment, the cluster document and a client for
// the cluster's virtual machines.
type manager struct {
// log is the logger diagnostics are written to.
log *logrus.Entry
// env is the environment handed to NewFailureDiagnostics.
env env.Interface
// doc is the cluster document the diagnostics are gathered for.
doc *api.OpenShiftClusterDocument

// virtualMachines is the compute client used to query the cluster's VMs.
virtualMachines compute.VirtualMachinesClient
}

// NewFailureDiagnostics builds a failure-diagnostics manager from the given
// logger, environment, cluster document and virtual machines client. The
// arguments are stored as-is; no validation is performed.
func NewFailureDiagnostics(
	log *logrus.Entry,
	_env env.Interface,
	doc *api.OpenShiftClusterDocument,
	virtualMachines compute.VirtualMachinesClient,
) *manager {
	d := new(manager)
	d.log = log
	d.env = _env
	d.doc = doc
	d.virtualMachines = virtualMachines
	return d
}
72 changes: 72 additions & 0 deletions pkg/cluster/failurediagnostics/virtualmachines.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package failurediagnostics

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
"bufio"
"bytes"
"context"
"fmt"

"github.com/Azure/ARO-RP/pkg/util/stringutils"
)

// LogVMSerialConsole lists the VMs in the cluster's resource group, records a
// JSON description of each VM, and writes every named VM's serial console
// output to the log one line at a time, tagged with the VM name in the
// "failedRoleInstance" field.
//
// Collection is best-effort: a missing client, a listing failure, a
// marshalling failure, a console retrieval failure or a scan failure is
// reported as a string in the returned slice instead of as an error, so the
// returned error is always nil. The returned value is a []interface{} whose
// elements are strings.
func (m *manager) LogVMSerialConsole(ctx context.Context) (interface{}, error) {
	items := make([]interface{}, 0)

	if m.virtualMachines == nil {
		items = append(items, "vmclient missing")
		return items, nil
	}

	// The resource group name is the last path segment of the resource group ID.
	resourceGroupName := stringutils.LastTokenByte(m.doc.OpenShiftCluster.Properties.ClusterProfile.ResourceGroupID, '/')
	vms, err := m.virtualMachines.List(ctx, resourceGroupName)
	if err != nil {
		items = append(items, fmt.Sprintf("vm listing error: %s", err))
		return items, nil
	}

	if len(vms) == 0 {
		items = append(items, "no VMs found")
		return items, nil
	}

	vmNames := make([]string, 0, len(vms))
	for _, v := range vms {
		// Collect the name before marshalling so that a marshalling failure
		// does not prevent the serial console of a named VM from being fetched.
		vmName := "<unknown>"
		if v.Name != nil {
			vmName = *v.Name
			vmNames = append(vmNames, vmName)
		}

		j, err := v.MarshalJSON()
		if err != nil {
			items = append(items, fmt.Sprintf("vm marshalling error: %s", err))
			continue
		}
		items = append(items, fmt.Sprintf("vm %s: %s", vmName, string(j)))
	}

	// Fetch the serial console output of each named VM and log it line by line.
	for _, vmName := range vmNames {
		blob := &bytes.Buffer{}
		err := m.virtualMachines.GetSerialConsoleForVM(ctx, resourceGroupName, vmName, blob)
		if err != nil {
			items = append(items, fmt.Sprintf("vm boot diagnostics retrieval error for %s: %s", vmName, err))
			continue
		}

		logForVM := m.log.WithField("failedRoleInstance", vmName)
		scanner := bufio.NewScanner(blob)
		for scanner.Scan() {
			logForVM.Info(scanner.Text())
		}
		// Scan stops early on an error (for example an over-long line);
		// surface it in the output rather than dropping it silently.
		if err := scanner.Err(); err != nil {
			items = append(items, fmt.Sprintf("blob storage scan on %s: %s", vmName, err))
		}
	}

	return items, nil
}
168 changes: 168 additions & 0 deletions pkg/cluster/failurediagnostics/virtualmachines_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
package failurediagnostics

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
"bytes"
"context"
"errors"
"io"
"strings"
"testing"

mgmtcompute "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-06-01/compute"
"github.com/Azure/go-autorest/autorest/to"
"github.com/go-test/deep"
"github.com/golang/mock/gomock"
"github.com/onsi/gomega"
"github.com/onsi/gomega/types"
"github.com/sirupsen/logrus"

"github.com/Azure/ARO-RP/pkg/api"
mock_compute "github.com/Azure/ARO-RP/pkg/util/mocks/azureclient/mgmt/compute"
testlog "github.com/Azure/ARO-RP/test/util/log"
)

// TestVirtualMachinesSerialConsole runs LogVMSerialConsole against a mocked
// virtual machines client and checks, for each case, the returned items, the
// emitted log entries, and that no error is returned.
func TestVirtualMachinesSerialConsole(t *testing.T) {
const (
key = "/subscriptions/00000000-0000-0000-0000-000000000000/resourcegroups/resourceGroup/providers/Microsoft.RedHatOpenShift/openShiftClusters/resourceName1"
clusterProfile = "/subscriptions/00000000-0000-0000-0000-000000000000/resourcegroups/resourceGroupCluster"
)

// Cluster document shared by all cases; its ResourceGroupID ends in
// "resourceGroupCluster", which is the resource group name the mocks expect.
oc := &api.OpenShiftClusterDocument{
Key: strings.ToLower(key),
OpenShiftCluster: &api.OpenShiftCluster{
ID: key,
Properties: api.OpenShiftClusterProperties{
ClusterProfile: api.ClusterProfile{
ResourceGroupID: clusterProfile,
},
StorageSuffix: "PrefixHere",
},
},
}

for _, tt := range []struct {
name string
expectedOutput interface{}
mock func(vmClient *mock_compute.MockVirtualMachinesClient)
expectedLogs []map[string]types.GomegaMatcher
}{
{
// Listing fails: the error is reported in the output and nothing is logged.
name: "failure to fetch VMs",
mock: func(vmClient *mock_compute.MockVirtualMachinesClient) {
vmClient.EXPECT().List(gomock.Any(), "resourceGroupCluster").Return(nil, errors.New("vm explod"))
},
expectedLogs: []map[string]types.GomegaMatcher{},
expectedOutput: []interface{}{
"vm listing error: vm explod",
},
},
{
// Listing succeeds with an empty result.
name: "no VMs returned",
mock: func(vmClient *mock_compute.MockVirtualMachinesClient) {
vmClient.EXPECT().List(gomock.Any(), "resourceGroupCluster").Return([]mgmtcompute.VirtualMachine{}, nil)
},
expectedLogs: []map[string]types.GomegaMatcher{},
expectedOutput: []interface{}{
"no VMs found",
},
},
{
// Listing succeeds but fetching the serial console fails: the VM info
// and the retrieval error are both reported, and nothing is logged.
name: "failure to get VM serial console",
mock: func(vmClient *mock_compute.MockVirtualMachinesClient) {
vmClient.EXPECT().List(gomock.Any(), "resourceGroupCluster").Return([]mgmtcompute.VirtualMachine{
{
Name: to.StringPtr("somename"),
Location: to.StringPtr("eastus"),
VirtualMachineProperties: &mgmtcompute.VirtualMachineProperties{
InstanceView: &mgmtcompute.VirtualMachineInstanceView{
BootDiagnostics: &mgmtcompute.BootDiagnosticsInstanceView{
SerialConsoleLogBlobURI: to.StringPtr("bogusurl"),
},
},
},
},
}, nil)

vmClient.EXPECT().GetSerialConsoleForVM(
gomock.Any(), "resourceGroupCluster", "somename", gomock.Any(),
).Times(1).Return(errors.New("explod"))
},
expectedLogs: []map[string]types.GomegaMatcher{},
expectedOutput: []interface{}{
// NOTE(review): the expected JSON has no name or instance view —
// presumably the SDK's MarshalJSON omits them; confirm against the SDK.
`vm somename: {"location":"eastus","properties":{}}`,
"vm boot diagnostics retrieval error for somename: explod",
},
},
{
// The mock writes two lines into the supplied writer; each line is
// expected as its own Info entry tagged with the VM name.
name: "success",
mock: func(vmClient *mock_compute.MockVirtualMachinesClient) {
vmClient.EXPECT().List(gomock.Any(), "resourceGroupCluster").Return([]mgmtcompute.VirtualMachine{
{
Name: to.StringPtr("somename"),
Location: to.StringPtr("eastus"),
VirtualMachineProperties: &mgmtcompute.VirtualMachineProperties{},
},
}, nil)

iothing := bytes.NewBufferString("hello\nthere :)")
vmClient.EXPECT().GetSerialConsoleForVM(
gomock.Any(), "resourceGroupCluster", "somename", gomock.Any(),
).Times(1).DoAndReturn(func(ctx context.Context,
rg string, vmName string, target io.Writer) error {
_, err := io.Copy(target, iothing)
return err
})
},
expectedLogs: []map[string]types.GomegaMatcher{
{
"level": gomega.Equal(logrus.InfoLevel),
"msg": gomega.Equal(`hello`),
"failedRoleInstance": gomega.Equal("somename"),
},
{
"level": gomega.Equal(logrus.InfoLevel),
"msg": gomega.Equal(`there :)`),
"failedRoleInstance": gomega.Equal("somename"),
},
},
expectedOutput: []interface{}{
`vm somename: {"location":"eastus","properties":{}}`,
},
},
} {
t.Run(tt.name, func(t *testing.T) {
ctx := context.Background()
hook, entry := testlog.New()

controller := gomock.NewController(t)
defer controller.Finish()

vmClient := mock_compute.NewMockVirtualMachinesClient(controller)

// Register this case's expectations on the mock client.
tt.mock(vmClient)

// Build the manager directly; env is left unset for these cases.
d := &manager{
log: entry,
doc: oc,
virtualMachines: vmClient,
}

out, err := d.LogVMSerialConsole(ctx)
if err != nil {
t.Errorf("returned %s, should never return an error", err)
}

err = testlog.AssertLoggingOutput(hook, tt.expectedLogs)
if err != nil {
t.Error(err)
}

// Report every difference between the actual and expected output.
for _, e := range deep.Equal(out, tt.expectedOutput) {
t.Error(e)
}
})
}
}
45 changes: 33 additions & 12 deletions pkg/cluster/gatherlogs.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,29 +9,50 @@ import (

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/Azure/ARO-RP/pkg/cluster/failurediagnostics"
"github.com/Azure/ARO-RP/pkg/util/steps"
)

// diagnosticStep pairs a diagnostic-gathering function with the format used
// to log its result in gatherFailureLogs.
type diagnosticStep struct {
// f gathers the diagnostic data.
f func(context.Context) (interface{}, error)
// isJSON selects the logging format: when true the result is logged as
// indented JSON; when false it is logged with %v, one line per entry if the
// result is a []interface{}.
isJSON bool
}

// gatherFailureLogs runs a series of diagnostic steps and writes their output
// to m.log, prefixed with the step's friendly name. A step that returns an
// error has the error logged and is then skipped.
//
// NOTE(review): this listing is a diff hunk in which removed and added lines
// appear interleaved without markers (for example both `o, err := f(ctx)` and
// `o, err := f.f(ctx)`); confirm against the actual file before relying on it.
func (m *manager) gatherFailureLogs(ctx context.Context) {
for _, f := range []func(context.Context) (interface{}, error){
m.logClusterVersion,
m.logNodes,
m.logClusterOperators,
m.logIngressControllers,
d := failurediagnostics.NewFailureDiagnostics(m.log, m.env, m.doc, m.virtualMachines)

for _, f := range []diagnosticStep{
{f: m.logClusterVersion, isJSON: true},
{f: m.logNodes, isJSON: true},
{f: m.logClusterOperators, isJSON: true},
{f: m.logIngressControllers, isJSON: true},
{f: d.LogVMSerialConsole, isJSON: false},
} {
o, err := f(ctx)
o, err := f.f(ctx)
if err != nil {
m.log.Error(err)
continue
}

b, err := json.MarshalIndent(o, "", " ")
if err != nil {
m.log.Error(err)
continue
// JSON steps are logged as a single indented JSON document.
if f.isJSON {
b, err := json.MarshalIndent(o, "", " ")
if err != nil {
m.log.Error(err)
continue
}

m.log.Printf("%s: %s", steps.FriendlyName(f.f), string(b))
} else {
// Non-JSON steps are logged one entry per line when the result is a
// []interface{}, otherwise as a single %v-formatted line.
entries, ok := o.([]interface{})
name := steps.FriendlyName(f.f)
if ok {
for _, i := range entries {
m.log.Printf("%s: %v", name, i)
}
} else {
m.log.Printf("%s: %v", steps.FriendlyName(f.f), o)
}
}

m.log.Printf("%s: %s", steps.FriendlyName(f), string(b))
}
}

Expand Down
Loading

0 comments on commit 31af734

Please sign in to comment.