Skip to content

Commit

Permalink
Add group snapshot controller metrics
Browse files Browse the repository at this point in the history
this commit adds metrics for volumegroupsnapshot

Signed-off-by: yati1998 <[email protected]>
  • Loading branch information
yati1998 committed Jun 14, 2024
1 parent d505406 commit 8b2cf59
Show file tree
Hide file tree
Showing 5 changed files with 258 additions and 3 deletions.
59 changes: 56 additions & 3 deletions pkg/common-controller/groupsnapshot_controller_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (

crdv1alpha1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumegroupsnapshot/v1alpha1"
crdv1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"
"github.com/kubernetes-csi/external-snapshotter/v8/pkg/metrics"
"github.com/kubernetes-csi/external-snapshotter/v8/pkg/utils"
)

Expand Down Expand Up @@ -263,6 +264,14 @@ func (ctrl *csiSnapshotCommonController) deleteGroupSnapshot(groupSnapshot *crdv
_ = ctrl.snapshotStore.Delete(groupSnapshot)
klog.V(4).Infof("group snapshot %q deleted", utils.GroupSnapshotKey(groupSnapshot))

driverName, err := ctrl.getGroupSnapshotDriverName(groupSnapshot)
if err != nil {
klog.Errorf("failed to getGroupSnapshotDriverName while recording metrics for group snapshot %q: %v", utils.GroupSnapshotKey(groupSnapshot), err)
} else {
deleteOperationKey := metrics.NewOperationKey(metrics.DeleteGroupSnapshotOperationName, groupSnapshot.UID)
ctrl.metricsManager.RecordMetrics(deleteOperationKey, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName)
}

groupSnapshotContentName := ""
if groupSnapshot.Status != nil && groupSnapshot.Status.BoundVolumeGroupSnapshotContentName != nil {
groupSnapshotContentName = *groupSnapshot.Status.BoundVolumeGroupSnapshotContentName
Expand Down Expand Up @@ -376,9 +385,28 @@ func (ctrl *csiSnapshotCommonController) getGroupSnapshotContentFromStore(conten
func (ctrl *csiSnapshotCommonController) syncUnreadyGroupSnapshot(groupSnapshot *crdv1alpha1.VolumeGroupSnapshot) error {
uniqueGroupSnapshotName := utils.GroupSnapshotKey(groupSnapshot)
klog.V(5).Infof("syncUnreadyGroupSnapshot %s", uniqueGroupSnapshotName)
/*
TODO: Add metrics
*/
driverName, err := ctrl.getGroupSnapshotDriverName(groupSnapshot)
if err != nil {
klog.Errorf("failed to getGroupSnapshotDriverName while recording metrics for groupsnapshot %q: %s", utils.GroupSnapshotKey(groupSnapshot), err)
}

groupSnapshotProvisionType := metrics.DynamicSnapshotType
if groupSnapshot.Spec.Source.VolumeGroupSnapshotContentName != nil {
groupSnapshotProvisionType = metrics.PreProvisionedSnapshotType
}

// Start metrics operations for volumegroupsnapshot
if !utils.IsGroupSnapshotCreated(groupSnapshot) {
// Only start CreateGroupSnapshot operation if the snapshot has not been cut
ctrl.metricsManager.OperationStart(
metrics.NewOperationKey(metrics.CreateGroupSnapshotOperationName, groupSnapshot.UID),
metrics.NewOperationValue(driverName, groupSnapshotProvisionType),
)
}
ctrl.metricsManager.OperationStart(
metrics.NewOperationKey(metrics.CreateGroupSnapshotAndReadyOperationName, groupSnapshot.UID),
metrics.NewOperationValue(driverName, groupSnapshotProvisionType),
)

// Pre-provisioned snapshot
if groupSnapshot.Spec.Source.VolumeGroupSnapshotContentName != nil {
Expand Down Expand Up @@ -664,12 +692,20 @@ func (ctrl *csiSnapshotCommonController) updateGroupSnapshotStatus(groupSnapshot
groupSnapshotClone := groupSnapshotObj.DeepCopy()
groupSnapshotClone.Status = newStatus

// We need to record metrics before updating the status due to a bug causing cache entries after a failed UpdateStatus call.
// Must meet the following criteria to emit a successful CreateSnapshot status
// 1. Previous status was nil OR Previous status had a nil CreationTime
// 2. New status must be non-nil with a non-nil CreationTime
driverName := groupSnapshotContent.Spec.Driver
createOperationKey := metrics.NewOperationKey(metrics.CreateGroupSnapshotOperationName, groupSnapshot.UID)

// Must meet the following criteria to emit a successful CreateGroupSnapshot status
// 1. Previous status was nil OR Previous status had a nil CreationTime
// 2. New status must be non-nil with a non-nil CreationTime
if !utils.IsGroupSnapshotCreated(groupSnapshotObj) && utils.IsGroupSnapshotCreated(groupSnapshotClone) {
msg := fmt.Sprintf("GroupSnapshot %s was successfully created by the CSI driver.", utils.GroupSnapshotKey(groupSnapshot))
ctrl.eventRecorder.Event(groupSnapshot, v1.EventTypeNormal, "GroupSnapshotCreated", msg)
ctrl.metricsManager.RecordVolumeGroupMetrics(createOperationKey, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName)
}

// Must meet the following criteria to emit a successful CreateGroupSnapshotAndReady status
Expand All @@ -678,6 +714,8 @@ func (ctrl *csiSnapshotCommonController) updateGroupSnapshotStatus(groupSnapshot
if !utils.IsGroupSnapshotReady(groupSnapshotObj) && utils.IsGroupSnapshotReady(groupSnapshotClone) {
msg := fmt.Sprintf("GroupSnapshot %s is ready to use.", utils.GroupSnapshotKey(groupSnapshot))
ctrl.eventRecorder.Event(groupSnapshot, v1.EventTypeNormal, "GroupSnapshotReady", msg)
createAndReadyOperation := metrics.NewOperationKey(metrics.CreateGroupSnapshotAndReadyOperationName, groupSnapshot.UID)
ctrl.metricsManager.RecordMetrics(createAndReadyOperation, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName)
}

newGroupSnapshotObj, err := ctrl.clientset.GroupsnapshotV1alpha1().VolumeGroupSnapshots(groupSnapshotClone.Namespace).UpdateStatus(context.TODO(), groupSnapshotClone, metav1.UpdateOptions{})
Expand Down Expand Up @@ -1126,6 +1164,21 @@ func (ctrl *csiSnapshotCommonController) addGroupSnapshotFinalizer(groupSnapshot
func (ctrl *csiSnapshotCommonController) processGroupSnapshotWithDeletionTimestamp(groupSnapshot *crdv1alpha1.VolumeGroupSnapshot) error {
klog.V(5).Infof("processGroupSnapshotWithDeletionTimestamp VolumeGroupSnapshot[%s]: %s", utils.GroupSnapshotKey(groupSnapshot), utils.GetGroupSnapshotStatusForLogging(groupSnapshot))

driverName, err := ctrl.getGroupSnapshotDriverName(groupSnapshot)
if err != nil {
klog.Errorf("failed to getGroupSnapshotDriverName while recording metrics for group snapshot %q: %v", utils.GroupSnapshotKey(groupSnapshot), err)
}

groupSnapshotProvisionType := metrics.DynamicSnapshotType
if groupSnapshot.Spec.Source.VolumeGroupSnapshotContentName != nil {
groupSnapshotProvisionType = metrics.PreProvisionedSnapshotType
}

// Processing delete, start operation metric
deleteOperationKey := metrics.NewOperationKey(metrics.DeleteGroupSnapshotOperationName, groupSnapshot.UID)
deleteOperationValue := metrics.NewOperationValue(driverName, groupSnapshotProvisionType)
ctrl.metricsManager.OperationStart(deleteOperationKey, deleteOperationValue)

var groupSnapshotContentName string
if groupSnapshot.Status != nil && groupSnapshot.Status.BoundVolumeGroupSnapshotContentName != nil {
groupSnapshotContentName = *groupSnapshot.Status.BoundVolumeGroupSnapshotContentName
Expand Down
40 changes: 40 additions & 0 deletions pkg/common-controller/snapshot_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
corev1helpers "k8s.io/component-helpers/scheduling/corev1"
klog "k8s.io/klog/v2"

crdv1alpha1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumegroupsnapshot/v1alpha1"
crdv1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"
"github.com/kubernetes-csi/external-snapshotter/v8/pkg/metrics"
"github.com/kubernetes-csi/external-snapshotter/v8/pkg/utils"
Expand Down Expand Up @@ -1373,6 +1374,45 @@ func (ctrl *csiSnapshotCommonController) getSnapshotDriverName(vs *crdv1.VolumeS
return driverName, nil
}

// getGroupSnapshotDriverName is a helper function to get snapshot driver from the VolumeGroupSnapshot.
// We try to get the driverName in multiple ways, as snapshot controller metrics depend on the correct driverName.
func (ctrl *csiSnapshotCommonController) getGroupSnapshotDriverName(vgs *crdv1alpha1.VolumeGroupSnapshot) (string, error) {
klog.V(5).Infof("getSnapshotDriverName: VolumeSnapshot[%s]", vgs.Name)
var driverName string

// Pre-Provisioned snapshots have contentName as source
var contentName string
if vgs.Spec.Source.VolumeGroupSnapshotContentName != nil {
contentName = *vgs.Spec.Source.VolumeGroupSnapshotContentName
}

// Get Driver name from GroupSnapshotContent if we found a contentName
if contentName != "" {
content, err := ctrl.groupSnapshotContentLister.Get(contentName)
if err != nil {
klog.Errorf("getGroupSnapshotDriverName: failed to get groupSnapshotContent: %v", contentName)
} else {
driverName = content.Spec.Driver
}

if driverName != "" {
return driverName, nil
}
}

// Dynamic groupsnapshots will have a groupsnapshotclass with a driver
if vgs.Spec.VolumeGroupSnapshotClassName != nil {
class, err := ctrl.getSnapshotClass(*vgs.Spec.VolumeGroupSnapshotClassName)
if err != nil {
klog.Errorf("getGroupSnapshotDriverName: failed to get groupsnapshotClass: %v", *vgs.Spec.VolumeGroupSnapshotClassName)
} else {
driverName = class.Driver
}
}

return driverName, nil
}

// SetDefaultSnapshotClass is a helper function to figure out the default snapshot class.
// For pre-provisioned case, it's an no-op.
// For dynamic provisioning, it gets the default SnapshotClasses in the system if there is any(could be multiple),
Expand Down
4 changes: 4 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,10 @@ type MetricsManager interface {
// "Unknown" status of the passed-in operation is assumed.
RecordMetrics(op OperationKey, status OperationStatus, driverName string)

// RecordVolumeGroupMetrics records a metric for operations related to
// VolumeGroupSnapshot
RecordVolumeGroupMetrics(op OperationKey, status OperationStatus, driverName string)

// GetRegistry() returns the metrics.KubeRegistry used by this metrics manager.
GetRegistry() k8smetrics.KubeRegistry
}
Expand Down
88 changes: 88 additions & 0 deletions pkg/metrics/metrics_group.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"time"
)

const (
// CreateGroupSnapshotOperationName is the operation that tracks how long the controller takes to create a groupsnapshot.
// Specifically, the operation metric is emitted based on the following timestamps:
// - Start_time: controller notices the first time that there is a new VolumeGroupSnapshot CR to dynamically provision a groupsnapshot
// - End_time: controller notices that the CR has a status with CreationTime field set to be non-nil
CreateGroupSnapshotOperationName = "CreateGroupSnapshot"

// CreateGroupSnapshotAndReadyOperationName is the operation that tracks how long the controller takes to create a groupsnapshot and for it to be ready.
// Specifically, the operation metric is emitted based on the following timestamps:
// - Start_time: controller notices the first time that there is a new VolumeGroupSnapshot CR(both dynamic and pre-provisioned cases)
// - End_time: controller notices that the CR has a status with Ready To Use field set to be true
CreateGroupSnapshotAndReadyOperationName = "CreateGroupSnapshotAndReady"

// DeleteGroupSnapshotOperationName is the operation that tracks how long a groupsnapshot deletion takes.
// Specifically, the operation metric is emitted based on the following timestamps:
// - Start_time: controller notices the first time that there is a deletion timestamp placed on the VolumeGroupSnapshot CR and the CR is ready to be deleted.
// Note that if the CR is being used by a PVC for rehydration, the controller should *NOT* set the start_time.
// - End_time: controller removed all finalizers on the VolumeGroupSnapshot CR such that the CR is ready to be removed in the API server.
DeleteGroupSnapshotOperationName = "DeleteGroupSnapshot"
)

// RecordVolumeGroupMetrics emits operation metrics
func (opMgr *operationMetricsManager) RecordVolumeGroupMetrics(opKey OperationKey, opStatus OperationStatus, driverName string) {
opMgr.mu.Lock()
defer opMgr.mu.Unlock()
opVal, exists := opMgr.cache[opKey]
if !exists {
// the operation has not been cached, return directly
return
}
status := string(SnapshotStatusTypeUnknown)
if opStatus != nil {
status = opStatus.String()
}

// if we do not know the driverName while recording metrics,
// refer to the cached version instead.
if driverName == "" || driverName == unknownDriverName {
driverName = opVal.Driver
}

operationDuration := time.Since(opVal.startTime).Seconds()
opMgr.opLatencyMetrics.WithLabelValues(driverName, opKey.Name, opVal.SnapshotType, status).Observe(operationDuration)

// Report cancel metrics if we are deleting an unfinished VolumeGroupSnapshot
if opKey.Name == DeleteGroupSnapshotOperationName {
// check if we have a CreateGroupSnapshot operation pending for this
createKey := NewOperationKey(CreateGroupSnapshotOperationName, opKey.ResourceID)
obj, exists := opMgr.cache[createKey]
if exists {
// record a cancel metric if found
opMgr.recordCancelMetricLocked(obj, createKey, operationDuration)
}

// check if we have a CreateGroupSnapshotAndReady operation pending for this
createAndReadyKey := NewOperationKey(CreateGroupSnapshotAndReadyOperationName, opKey.ResourceID)
obj, exists = opMgr.cache[createAndReadyKey]
if exists {
// record a cancel metric if found
opMgr.recordCancelMetricLocked(obj, createAndReadyKey, operationDuration)
}
}

delete(opMgr.cache, opKey)
opMgr.opInFlight.Set(float64(len(opMgr.cache)))
}
70 changes: 70 additions & 0 deletions pkg/metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -738,3 +738,73 @@ func TestProcessStartTimeMetricExist(t *testing.T) {

t.Fatalf("Metrics does not contain %v. Scraped content: %v", processStartTimeMetric, metricsFamilies)
}

func TestRecordVolumeGroupMetrics(t *testing.T) {
mgr, srv := initMgr()
srvAddr := "http://" + srv.Addr + httpPattern
defer shutdown(srv)
// add an operation
opKey := OperationKey{
Name: "op1",
ResourceID: types.UID("uid1"),
}
opVal := NewOperationValue("driver", DynamicSnapshotType)
mgr.OperationStart(opKey, opVal)
// should create a Success data point with latency ~ 1100ms
time.Sleep(1100 * time.Millisecond)
success := &fakeOpStatus{
statusCode: 0,
}
mgr.RecordVolumeGroupMetrics(opKey, success, "driver")

// add another operation metric
opKey.Name = "op2"
opKey.ResourceID = types.UID("uid2")
mgr.OperationStart(opKey, opVal)
// should create a Failure data point with latency ~ 100ms
time.Sleep(100 * time.Millisecond)
failure := &fakeOpStatus{
statusCode: 1,
}
mgr.RecordVolumeGroupMetrics(opKey, failure, "driver")

expected :=
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
# TYPE snapshot_controller_operation_total_seconds histogram
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.25"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.5"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type=""} 1.1
snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type=""} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type=""} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type=""} 1
`
if err := verifyMetric(expected, srvAddr); err != nil {
t.Errorf("failed testing [%v]", err)
}
}

0 comments on commit 8b2cf59

Please sign in to comment.