Skip to content

Commit

Permalink
feat: distributed tracing span error (pod-delete only)
Browse files Browse the repository at this point in the history
Signed-off-by: Jaeyeon Park <[email protected]>
  • Loading branch information
moggaa committed Nov 28, 2024
1 parent 7e08c69 commit d37e04c
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 4 deletions.
5 changes: 5 additions & 0 deletions bin/experiment/experiment.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"errors"
"flag"
"fmt"
"os"

// Uncomment to load all auth plugins
Expand Down Expand Up @@ -68,6 +69,7 @@ import (
"github.com/litmuschaos/litmus-go/pkg/telemetry"
"github.com/sirupsen/logrus"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/codes"
)

func init() {
Expand Down Expand Up @@ -106,6 +108,8 @@ func main() {
//Getting kubeConfig and Generate ClientSets
if err := clients.GenerateClientSetFromKubeConfig(); err != nil {
log.Errorf("Unable to Get the kubeconfig, err: %v", err)
span.SetStatus(codes.Error, "Unable to Get the kubeconfig")
span.RecordError(err)
return
}

Expand Down Expand Up @@ -211,6 +215,7 @@ func main() {
k6Loadgen.Experiment(ctx, clients)
default:
log.Errorf("Unsupported -name %v, please provide the correct value of -name args", *experimentName)
span.SetStatus(codes.Error, fmt.Sprintf("Unsupported -name %v", *experimentName))
return
}
}
42 changes: 39 additions & 3 deletions chaoslib/litmus/pod-delete/lib/pod-delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/palantir/stacktrace"
"github.com/sirupsen/logrus"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/codes"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

Expand All @@ -46,14 +47,22 @@ func PreparePodDelete(ctx context.Context, experimentsDetails *experimentTypes.E
switch strings.ToLower(experimentsDetails.Sequence) {
case "serial":
if err := injectChaosInSerialMode(ctx, experimentsDetails, clients, chaosDetails, eventsDetails, resultDetails); err != nil {
span.SetStatus(codes.Error, "could not run chaos in serial mode")
span.RecordError(err)
return stacktrace.Propagate(err, "could not run chaos in serial mode")
}
case "parallel":
if err := injectChaosInParallelMode(ctx, experimentsDetails, clients, chaosDetails, eventsDetails, resultDetails); err != nil {
span.SetStatus(codes.Error, "could not run chaos in parallel mode")
span.RecordError(err)
return stacktrace.Propagate(err, "could not run chaos in parallel mode")
}
default:
return cerrors.Error{ErrorCode: cerrors.ErrorTypeGeneric, Reason: fmt.Sprintf("'%s' sequence is not supported", experimentsDetails.Sequence)}
errReason := fmt.Sprintf("sequence '%s' is not supported", experimentsDetails.Sequence)
span.SetStatus(codes.Error, errReason)
err := cerrors.Error{ErrorCode: cerrors.ErrorTypeGeneric, Reason: errReason}
span.RecordError(err)
return err
}

//Waiting for the ramp time after chaos injection
Expand All @@ -72,6 +81,8 @@ func injectChaosInSerialMode(ctx context.Context, experimentsDetails *experiment
// run the probes during chaos
if len(resultDetails.ProbeDetails) != 0 {
if err := probe.RunProbes(ctx, chaosDetails, clients, resultDetails, "DuringChaos", eventsDetails); err != nil {
span.SetStatus(codes.Error, "could not run the probes during chaos")
span.RecordError(err)
return err
}
}
Expand All @@ -85,18 +96,25 @@ func injectChaosInSerialMode(ctx context.Context, experimentsDetails *experiment
// Get the target pod details for the chaos execution
// if the target pod is not defined it will derive the random target pod list using pod affected percentage
if experimentsDetails.TargetPods == "" && chaosDetails.AppDetail == nil {
return cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "provide one of the appLabel or TARGET_PODS"}
span.SetStatus(codes.Error, "provide one of the appLabel or TARGET_PODS")
err := cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "provide one of the appLabel or TARGET_PODS"}
span.RecordError(err)
return err
}

targetPodList, err := common.GetTargetPods(experimentsDetails.NodeLabel, experimentsDetails.TargetPods, experimentsDetails.PodsAffectedPerc, clients, chaosDetails)
if err != nil {
span.SetStatus(codes.Error, "could not get target pods")
span.RecordError(err)
return stacktrace.Propagate(err, "could not get target pods")
}

// deriving the parent name of the target resources
for _, pod := range targetPodList.Items {
kind, parentName, err := workloads.GetPodOwnerTypeAndName(&pod, clients.DynamicClient)
if err != nil {
span.SetStatus(codes.Error, "could not get pod owner name and kind")
span.RecordError(err)
return stacktrace.Propagate(err, "could not get pod owner name and kind")
}
common.SetParentName(parentName, kind, pod.Namespace, chaosDetails)
Expand All @@ -123,12 +141,16 @@ func injectChaosInSerialMode(ctx context.Context, experimentsDetails *experiment
err = clients.KubeClient.CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, v1.DeleteOptions{})
}
if err != nil {
span.SetStatus(codes.Error, "could not delete the target pod")
span.RecordError(err)
return cerrors.Error{ErrorCode: cerrors.ErrorTypeChaosInject, Target: fmt.Sprintf("{podName: %s, namespace: %s}", pod.Name, pod.Namespace), Reason: fmt.Sprintf("failed to delete the target pod: %s", err.Error())}
}

switch chaosDetails.Randomness {
case true:
if err := common.RandomInterval(experimentsDetails.ChaosInterval); err != nil {
span.SetStatus(codes.Error, "could not get random chaos interval")
span.RecordError(err)
return stacktrace.Propagate(err, "could not get random chaos interval")
}
default:
Expand All @@ -149,6 +171,8 @@ func injectChaosInSerialMode(ctx context.Context, experimentsDetails *experiment
Namespace: parent.Namespace,
}
if err = status.CheckUnTerminatedPodStatusesByWorkloadName(target, experimentsDetails.Timeout, experimentsDetails.Delay, clients); err != nil {
span.SetStatus(codes.Error, "could not check pod statuses by workload names")
span.RecordError(err)
return stacktrace.Propagate(err, "could not check pod statuses by workload names")
}
}
Expand Down Expand Up @@ -184,17 +208,24 @@ func injectChaosInParallelMode(ctx context.Context, experimentsDetails *experime
// Get the target pod details for the chaos execution
// if the target pod is not defined it will derive the random target pod list using pod affected percentage
if experimentsDetails.TargetPods == "" && chaosDetails.AppDetail == nil {
return cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "please provide one of the appLabel or TARGET_PODS"}
span.SetStatus(codes.Error, "please provide one of the appLabel or TARGET_PODS")
err := cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "please provide one of the appLabel or TARGET_PODS"}
span.RecordError(err)
return err
}
targetPodList, err := common.GetTargetPods(experimentsDetails.NodeLabel, experimentsDetails.TargetPods, experimentsDetails.PodsAffectedPerc, clients, chaosDetails)
if err != nil {
span.SetStatus(codes.Error, "could not get target pods")
span.RecordError(err)
return stacktrace.Propagate(err, "could not get target pods")
}

// deriving the parent name of the target resources
for _, pod := range targetPodList.Items {
kind, parentName, err := workloads.GetPodOwnerTypeAndName(&pod, clients.DynamicClient)
if err != nil {
span.SetStatus(codes.Error, "could not get pod owner name and kind")
span.RecordError(err)
return stacktrace.Propagate(err, "could not get pod owner name and kind")
}
common.SetParentName(parentName, kind, pod.Namespace, chaosDetails)
Expand All @@ -221,13 +252,16 @@ func injectChaosInParallelMode(ctx context.Context, experimentsDetails *experime
err = clients.KubeClient.CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, v1.DeleteOptions{})
}
if err != nil {
span.SetStatus(codes.Error, "could not delete the target pod")
span.RecordError(err)
return cerrors.Error{ErrorCode: cerrors.ErrorTypeChaosInject, Target: fmt.Sprintf("{podName: %s, namespace: %s}", pod.Name, pod.Namespace), Reason: fmt.Sprintf("failed to delete the target pod: %s", err.Error())}
}
}

switch chaosDetails.Randomness {
case true:
if err := common.RandomInterval(experimentsDetails.ChaosInterval); err != nil {
span.SetStatus(codes.Error, "could not get random chaos interval")
return stacktrace.Propagate(err, "could not get random chaos interval")
}
default:
Expand All @@ -248,6 +282,8 @@ func injectChaosInParallelMode(ctx context.Context, experimentsDetails *experime
Namespace: parent.Namespace,
}
if err = status.CheckUnTerminatedPodStatusesByWorkloadName(target, experimentsDetails.Timeout, experimentsDetails.Delay, clients); err != nil {
span.SetStatus(codes.Error, "could not check pod statuses by workload names")
span.RecordError(err)
return stacktrace.Propagate(err, "could not check pod statuses by workload names")
}
}
Expand Down
22 changes: 22 additions & 0 deletions experiments/generic/pod-delete/experiment/pod-delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,14 @@ import (
"github.com/litmuschaos/litmus-go/pkg/types"
"github.com/litmuschaos/litmus-go/pkg/utils/common"
"github.com/sirupsen/logrus"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
)

// PodDelete injects the pod-delete chaos
func PodDelete(ctx context.Context, clients clients.ClientSets) {
span := trace.SpanFromContext(ctx)

experimentsDetails := experimentTypes.ExperimentDetails{}
resultDetails := types.ResultDetails{}
eventsDetails := types.EventDetails{}
Expand All @@ -40,6 +44,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
// Get values from chaosengine. Bail out upon error, as we haven't entered exp business logic yet
if err := types.GetValuesFromChaosEngine(&chaosDetails, clients, &resultDetails); err != nil {
log.Errorf("Unable to initialize the probes, err: %v", err)
span.SetStatus(codes.Error, "Unable to initialize the probes")
span.RecordError(err)
return
}
}
Expand All @@ -49,13 +55,17 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
if err := result.ChaosResult(&chaosDetails, clients, &resultDetails, "SOT"); err != nil {
log.Errorf("Unable to create the chaosresult, err: %v", err)
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
span.SetStatus(codes.Error, "Unable to create the chaosresult")
span.RecordError(err)
return
}

// Set the chaos result uid
if err := result.SetResultUID(&resultDetails, clients, &chaosDetails); err != nil {
log.Errorf("Unable to set the result uid, err: %v", err)
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
span.SetStatus(codes.Error, "Unable to set the result uid")
span.RecordError(err)
return
}

Expand Down Expand Up @@ -85,6 +95,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
log.Errorf("failed to create %v event inside chaosengine", types.PreChaosCheck)
}
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
span.SetStatus(codes.Error, "Application status check failed")
span.RecordError(err)
return
}
}
Expand All @@ -104,6 +116,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
log.Errorf("failed to create %v event inside chaosengine", types.PreChaosCheck)
}
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
span.SetStatus(codes.Error, "Probe Failed")
span.RecordError(err)
return
}
msg = common.GetStatusMessage(chaosDetails.DefaultHealthCheck, "AUT: Running", "Successful")
Expand All @@ -117,6 +131,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
if err := litmusLIB.PreparePodDelete(ctx, &experimentsDetails, clients, &resultDetails, &eventsDetails, &chaosDetails); err != nil {
log.Errorf("Chaos injection failed, err: %v", err)
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
span.SetStatus(codes.Error, "Chaos injection failed")
span.RecordError(err)
return
}

Expand All @@ -132,6 +148,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
types.SetEngineEventAttributes(&eventsDetails, types.PostChaosCheck, "AUT: Not Running", "Warning", &chaosDetails)
events.GenerateEvents(&eventsDetails, clients, &chaosDetails, "ChaosEngine")
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
span.SetStatus(codes.Error, "Application status check failed")
span.RecordError(err)
return
}
}
Expand All @@ -150,6 +168,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
log.Errorf("failed to create %v event inside chaosengine", types.PostChaosCheck)
}
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
span.SetStatus(codes.Error, "Probes Failed")
span.RecordError(err)
return
}
msg = common.GetStatusMessage(chaosDetails.DefaultHealthCheck, "AUT: Running", "Successful")
Expand All @@ -165,6 +185,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
if err := result.ChaosResult(&chaosDetails, clients, &resultDetails, "EOT"); err != nil {
log.Errorf("Unable to update the chaosresult, err: %v", err)
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
span.SetStatus(codes.Error, "Unable to update the chaosresult")
span.RecordError(err)
return
}

Expand Down
15 changes: 14 additions & 1 deletion pkg/probe/probe.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"github.com/palantir/stacktrace"
"github.com/sirupsen/logrus"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/codes"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

Expand All @@ -32,6 +33,8 @@ func RunProbes(ctx context.Context, chaosDetails *types.ChaosDetails, clients cl
// get the probes details from the chaosengine
probes, err := getProbesFromChaosEngine(chaosDetails, clients)
if err != nil {
span.SetStatus(codes.Error, "getProbesFromChaosEngine failed")
span.RecordError(err)
return err
}

Expand All @@ -42,6 +45,8 @@ func RunProbes(ctx context.Context, chaosDetails *types.ChaosDetails, clients cl
switch strings.ToLower(probe.Mode) {
case "sot", "edge", "continuous":
if err := execute(probe, chaosDetails, clients, resultDetails, phase); err != nil {
span.SetStatus(codes.Error, fmt.Sprintf("%s mode %s probe execute failed", probe.Mode, probe.Name))
span.RecordError(err)
return err
}
}
Expand All @@ -51,6 +56,8 @@ func RunProbes(ctx context.Context, chaosDetails *types.ChaosDetails, clients cl
for _, probe := range probes {
if strings.ToLower(probe.Mode) == "onchaos" {
if err := execute(probe, chaosDetails, clients, resultDetails, phase); err != nil {
span.SetStatus(codes.Error, fmt.Sprintf("%s mode %s probe execute failed", probe.Mode, probe.Name))
span.RecordError(err)
return err
}
}
Expand All @@ -72,13 +79,19 @@ func RunProbes(ctx context.Context, chaosDetails *types.ChaosDetails, clients cl
}
}
if len(probeError) != 0 {
return cerrors.PreserveError{ErrString: fmt.Sprintf("[%s]", strings.Join(probeError, ","))}
errString := fmt.Sprintf("[%s]", strings.Join(probeError, ","))
span.SetStatus(codes.Error, errString)
err := cerrors.PreserveError{ErrString: errString}
span.RecordError(err)
return err
}
// executes the eot and edge modes
for _, probe := range probes {
switch strings.ToLower(probe.Mode) {
case "eot", "edge":
if err := execute(probe, chaosDetails, clients, resultDetails, phase); err != nil {
span.SetStatus(codes.Error, fmt.Sprintf("%s mode %s probe execute failed", probe.Mode, probe.Name))
span.RecordError(err)
return err
}
}
Expand Down

0 comments on commit d37e04c

Please sign in to comment.