diff --git a/pkg/cluster/gatherlogs.go b/pkg/cluster/gatherlogs.go
index 31633ce8dad..403f7583b0c 100644
--- a/pkg/cluster/gatherlogs.go
+++ b/pkg/cluster/gatherlogs.go
@@ -4,9 +4,14 @@ package cluster
 // Licensed under the Apache License 2.0.
 
 import (
+	"bufio"
 	"context"
 	"encoding/json"
+	"fmt"
+	"io"
+	"strings"
 
+	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
 	"github.com/Azure/ARO-RP/pkg/cluster/failurediagnostics"
@@ -26,6 +30,7 @@ func (m *manager) gatherFailureLogs(ctx context.Context) {
 		{f: m.logNodes, isJSON: true},
 		{f: m.logClusterOperators, isJSON: true},
 		{f: m.logIngressControllers, isJSON: true},
+		{f: m.logPodLogs, isJSON: false},
 		{f: d.LogVMSerialConsole, isJSON: false},
 	} {
 		o, err := f.f(ctx)
@@ -121,3 +126,52 @@ func (m *manager) logIngressControllers(ctx context.Context) (interface{}, error) {
 
 	return ics.Items, nil
 }
+
+// logPodLogs tails the last few log lines of each pod in the
+// openshift-azure-operator namespace and records each pod's status.
+func (m *manager) logPodLogs(ctx context.Context) (interface{}, error) {
+	if m.kubernetescli == nil {
+		return nil, nil
+	}
+
+	tailLines := int64(20)
+	podLogOptions := corev1.PodLogOptions{
+		TailLines: &tailLines,
+	}
+	items := make([]interface{}, 0)
+
+	pods, err := m.kubernetescli.CoreV1().Pods("openshift-azure-operator").List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return nil, err
+	}
+	for _, i := range pods.Items {
+		items = append(items, fmt.Sprintf("pod status %s: %v", i.Name, i.Status))
+
+		req := m.kubernetescli.CoreV1().Pods("openshift-azure-operator").GetLogs(i.Name, &podLogOptions)
+		logForPod := m.log.WithField("pod", i.Name)
+		logStream, err := req.Stream(ctx)
+		if err != nil {
+			items = append(items, fmt.Sprintf("pod logs retrieval error for %s: %s", i.Name, err))
+			continue
+		}
+
+		reader := bufio.NewReader(logStream)
+		for {
+			line, err := reader.ReadString('\n')
+			if len(line) > 0 {
+				logForPod.Info(strings.TrimSuffix(line, "\n"))
+			}
+			if err == io.EOF {
+				break
+			}
+			if err != nil {
+				m.log.Errorf("pod logs reading error for %s: %s", i.Name, err)
+				break
+			}
+		}
+		// Close the stream before moving on to the next pod rather than
+		// deferring, so streams do not accumulate until the method returns.
+		logStream.Close()
+	}
+	return items, nil
+}
diff --git a/pkg/operator/deploy/deploy.go b/pkg/operator/deploy/deploy.go
index 72cfa350cea..d7f8f94ffa9 100644
--- a/pkg/operator/deploy/deploy.go
+++ b/pkg/operator/deploy/deploy.go
@@ -7,6 +7,7 @@ import (
 	"bytes"
 	"context"
 	"embed"
+	"encoding/json"
 	"errors"
 	"fmt"
 	"path/filepath"
@@ -561,18 +562,71 @@ func (o *operator) EnsureUpgradeAnnotation(ctx context.Context) error {
 }
 
 func (o *operator) IsReady(ctx context.Context) (bool, error) {
-	ok, err := ready.CheckDeploymentIsReady(ctx, o.kubernetescli.AppsV1().Deployments(pkgoperator.Namespace), "aro-operator-master")()
-	o.log.Infof("deployment %q ok status is: %v, err is: %v", "aro-operator-master", ok, err)
-	if !ok || err != nil {
-		return ok, err
-	}
-	ok, err = ready.CheckDeploymentIsReady(ctx, o.kubernetescli.AppsV1().Deployments(pkgoperator.Namespace), "aro-operator-worker")()
-	o.log.Infof("deployment %q ok status is: %v, err is: %v", "aro-operator-worker", ok, err)
-	if !ok || err != nil {
-		return ok, err
-	}
+	deploymentOk := true
+	var deploymentErr error
+
+	deployments := o.kubernetescli.AppsV1().Deployments(pkgoperator.Namespace)
+	replicasets := o.kubernetescli.AppsV1().ReplicaSets(pkgoperator.Namespace)
+	pods := o.kubernetescli.CoreV1().Pods(pkgoperator.Namespace)
+
+	for _, deployment := range []string{"aro-operator-master", "aro-operator-worker"} {
+		ok, err := ready.CheckDeploymentIsReady(ctx, deployments, deployment)()
+		o.log.Infof("deployment %q ok status is: %v, err is: %v", deployment, ok, err)
+		deploymentOk = deploymentOk && ok
+		if deploymentErr == nil && err != nil {
+			deploymentErr = err
+		}
+		if ok {
+			continue
+		}
 
-	return true, nil
+		// The deployment is not ready: log its status, then the status of
+		// its replicasets and pods, to aid debugging.
+		d, err := deployments.Get(ctx, deployment, metav1.GetOptions{})
+		if err != nil {
+			o.log.Errorf("failed to get deployment %q: %s", deployment, err)
+			continue
+		}
+		j, err := json.Marshal(d.Status)
+		if err != nil {
+			o.log.Errorf("failed to serialize deployment %q status: %s", deployment, err)
+			continue
+		}
+		o.log.Infof("deployment %q status: %s", deployment, string(j))
+
+		// Gather and log the status of this deployment's replicasets; the
+		// List results already carry each status, so no per-object Get is needed.
+		rs, err := replicasets.List(ctx, metav1.ListOptions{LabelSelector: fmt.Sprintf("app=%s", deployment)})
+		if err != nil {
+			o.log.Errorf("failed to list replicasets: %s", err)
+			continue
+		}
+		for _, replicaset := range rs.Items {
+			j, err := json.Marshal(replicaset.Status)
+			if err != nil {
+				o.log.Errorf("failed to serialize replicaset %q status: %s", replicaset.Name, err)
+				continue
+			}
+			o.log.Infof("replicaset %q status: %s", replicaset.Name, string(j))
+		}
+
+		// Gather and log the status of this deployment's pods
+		ps, err := pods.List(ctx, metav1.ListOptions{LabelSelector: fmt.Sprintf("app=%s", deployment)})
+		if err != nil {
+			o.log.Errorf("failed to list pods: %s", err)
+			continue
+		}
+		for _, pod := range ps.Items {
+			j, err := json.Marshal(pod.Status)
+			if err != nil {
+				o.log.Errorf("failed to serialize pod %q status: %s", pod.Name, err)
+				continue
+			}
+			o.log.Infof("pod %q status: %s", pod.Name, string(j))
+		}
+	}
+
+	return deploymentOk, deploymentErr
 }
 
 func (o *operator) Restart(ctx context.Context, deploymentNames []string) error {
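
For reference, the tail-and-stream pattern that logPodLogs introduces above can be exercised standalone with client-go. The following is a minimal sketch and not part of the change: it assumes a reachable cluster via the KUBECONFIG environment variable and reuses the same namespace and 20-line tail; the package name and panic-based error handling are illustrative only.

package main

import (
	"bufio"
	"context"
	"fmt"
	"io"
	"os"
	"strings"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Build a client from the local kubeconfig (assumption: KUBECONFIG is set).
	config, err := clientcmd.BuildConfigFromFlags("", os.Getenv("KUBECONFIG"))
	if err != nil {
		panic(err)
	}
	cli, err := kubernetes.NewForConfig(config)
	if err != nil {
		panic(err)
	}

	ctx := context.Background()
	tailLines := int64(20)

	pods, err := cli.CoreV1().Pods("openshift-azure-operator").List(ctx, metav1.ListOptions{})
	if err != nil {
		panic(err)
	}
	for _, pod := range pods.Items {
		// Request only the last 20 lines of each pod's logs, mirroring logPodLogs.
		req := cli.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &corev1.PodLogOptions{TailLines: &tailLines})
		stream, err := req.Stream(ctx)
		if err != nil {
			fmt.Printf("pod logs retrieval error for %s: %s\n", pod.Name, err)
			continue
		}
		reader := bufio.NewReader(stream)
		for {
			line, err := reader.ReadString('\n')
			if len(line) > 0 {
				fmt.Printf("%s: %s\n", pod.Name, strings.TrimSuffix(line, "\n"))
			}
			if err == io.EOF {
				break
			}
			if err != nil {
				fmt.Printf("pod logs reading error for %s: %s\n", pod.Name, err)
				break
			}
		}
		stream.Close()
	}
}

Streaming through a bufio.Reader, as the change does, keeps memory bounded regardless of log size; the alternative of reading the whole response at once (for example via the request's DoRaw method) would buffer the entire tail in memory before any line is logged.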