Skip to content

Commit

Permalink
Add group snapshot controller metrics
Browse files Browse the repository at this point in the history
this commit adds metrics for volumegroupsnapshot

Signed-off-by: yati1998 <[email protected]>
  • Loading branch information
yati1998 committed Jun 17, 2024
1 parent d505406 commit 7e803ef
Show file tree
Hide file tree
Showing 4 changed files with 261 additions and 3 deletions.
98 changes: 95 additions & 3 deletions pkg/common-controller/groupsnapshot_controller_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (

crdv1alpha1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumegroupsnapshot/v1alpha1"
crdv1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"
"github.com/kubernetes-csi/external-snapshotter/v8/pkg/metrics"
"github.com/kubernetes-csi/external-snapshotter/v8/pkg/utils"
)

Expand Down Expand Up @@ -263,6 +264,14 @@ func (ctrl *csiSnapshotCommonController) deleteGroupSnapshot(groupSnapshot *crdv
_ = ctrl.snapshotStore.Delete(groupSnapshot)
klog.V(4).Infof("group snapshot %q deleted", utils.GroupSnapshotKey(groupSnapshot))

driverName, err := ctrl.getGroupSnapshotDriverName(groupSnapshot)
if err != nil {
klog.Errorf("failed to getGroupSnapshotDriverName while recording metrics for group snapshot %q: %v", utils.GroupSnapshotKey(groupSnapshot), err)
} else {
deleteOperationKey := metrics.NewOperationKey(metrics.DeleteGroupSnapshotOperationName, groupSnapshot.UID)
ctrl.metricsManager.RecordMetrics(deleteOperationKey, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName)
}

groupSnapshotContentName := ""
if groupSnapshot.Status != nil && groupSnapshot.Status.BoundVolumeGroupSnapshotContentName != nil {
groupSnapshotContentName = *groupSnapshot.Status.BoundVolumeGroupSnapshotContentName
Expand Down Expand Up @@ -376,9 +385,28 @@ func (ctrl *csiSnapshotCommonController) getGroupSnapshotContentFromStore(conten
func (ctrl *csiSnapshotCommonController) syncUnreadyGroupSnapshot(groupSnapshot *crdv1alpha1.VolumeGroupSnapshot) error {
uniqueGroupSnapshotName := utils.GroupSnapshotKey(groupSnapshot)
klog.V(5).Infof("syncUnreadyGroupSnapshot %s", uniqueGroupSnapshotName)
/*
TODO: Add metrics
*/
driverName, err := ctrl.getGroupSnapshotDriverName(groupSnapshot)
if err != nil {
klog.Errorf("failed to getGroupSnapshotDriverName while recording metrics for groupsnapshot %q: %s", utils.GroupSnapshotKey(groupSnapshot), err)
}

groupSnapshotProvisionType := metrics.DynamicGroupSnapshotType
if groupSnapshot.Spec.Source.VolumeGroupSnapshotContentName != nil {
groupSnapshotProvisionType = metrics.PreProvisionedGroupSnapshotType
}

// Start metrics operations for volumegroupsnapshot
if !utils.IsGroupSnapshotCreated(groupSnapshot) {
// Only start CreateGroupSnapshot operation if the groupsnapshot has not been cut
ctrl.metricsManager.OperationStart(
metrics.NewOperationKey(metrics.CreateGroupSnapshotOperationName, groupSnapshot.UID),
metrics.NewOperationValue(driverName, groupSnapshotProvisionType),
)
}
ctrl.metricsManager.OperationStart(
metrics.NewOperationKey(metrics.CreateGroupSnapshotAndReadyOperationName, groupSnapshot.UID),
metrics.NewOperationValue(driverName, groupSnapshotProvisionType),
)

// Pre-provisioned snapshot
if groupSnapshot.Spec.Source.VolumeGroupSnapshotContentName != nil {
Expand Down Expand Up @@ -664,12 +692,20 @@ func (ctrl *csiSnapshotCommonController) updateGroupSnapshotStatus(groupSnapshot
groupSnapshotClone := groupSnapshotObj.DeepCopy()
groupSnapshotClone.Status = newStatus

// We need to record metrics before updating the status due to a bug causing cache entries after a failed UpdateStatus call.
// Must meet the following criteria to emit a successful CreateGroupSnapshot status
// 1. Previous status was nil OR Previous status had a nil CreationTime
// 2. New status must be non-nil with a non-nil CreationTime
driverName := groupSnapshotContent.Spec.Driver
createOperationKey := metrics.NewOperationKey(metrics.CreateGroupSnapshotOperationName, groupSnapshot.UID)

// Must meet the following criteria to emit a successful CreateGroupSnapshot status
// 1. Previous status was nil OR Previous status had a nil CreationTime
// 2. New status must be non-nil with a non-nil CreationTime
if !utils.IsGroupSnapshotCreated(groupSnapshotObj) && utils.IsGroupSnapshotCreated(groupSnapshotClone) {
msg := fmt.Sprintf("GroupSnapshot %s was successfully created by the CSI driver.", utils.GroupSnapshotKey(groupSnapshot))
ctrl.eventRecorder.Event(groupSnapshot, v1.EventTypeNormal, "GroupSnapshotCreated", msg)
ctrl.metricsManager.RecordVolumeGroupSnapshotMetrics(createOperationKey, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName)
}

// Must meet the following criteria to emit a successful CreateGroupSnapshotAndReady status
Expand All @@ -678,6 +714,8 @@ func (ctrl *csiSnapshotCommonController) updateGroupSnapshotStatus(groupSnapshot
if !utils.IsGroupSnapshotReady(groupSnapshotObj) && utils.IsGroupSnapshotReady(groupSnapshotClone) {
msg := fmt.Sprintf("GroupSnapshot %s is ready to use.", utils.GroupSnapshotKey(groupSnapshot))
ctrl.eventRecorder.Event(groupSnapshot, v1.EventTypeNormal, "GroupSnapshotReady", msg)
createAndReadyOperation := metrics.NewOperationKey(metrics.CreateGroupSnapshotAndReadyOperationName, groupSnapshot.UID)
ctrl.metricsManager.RecordMetrics(createAndReadyOperation, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName)
}

newGroupSnapshotObj, err := ctrl.clientset.GroupsnapshotV1alpha1().VolumeGroupSnapshots(groupSnapshotClone.Namespace).UpdateStatus(context.TODO(), groupSnapshotClone, metav1.UpdateOptions{})
Expand Down Expand Up @@ -1126,6 +1164,21 @@ func (ctrl *csiSnapshotCommonController) addGroupSnapshotFinalizer(groupSnapshot
func (ctrl *csiSnapshotCommonController) processGroupSnapshotWithDeletionTimestamp(groupSnapshot *crdv1alpha1.VolumeGroupSnapshot) error {
klog.V(5).Infof("processGroupSnapshotWithDeletionTimestamp VolumeGroupSnapshot[%s]: %s", utils.GroupSnapshotKey(groupSnapshot), utils.GetGroupSnapshotStatusForLogging(groupSnapshot))

driverName, err := ctrl.getGroupSnapshotDriverName(groupSnapshot)
if err != nil {
klog.Errorf("failed to getGroupSnapshotDriverName while recording metrics for group snapshot %q: %v", utils.GroupSnapshotKey(groupSnapshot), err)
}

groupSnapshotProvisionType := metrics.DynamicGroupSnapshotType
if groupSnapshot.Spec.Source.VolumeGroupSnapshotContentName != nil {
groupSnapshotProvisionType = metrics.PreProvisionedGroupSnapshotType
}

// Processing delete, start operation metric
deleteOperationKey := metrics.NewOperationKey(metrics.DeleteGroupSnapshotOperationName, groupSnapshot.UID)
deleteOperationValue := metrics.NewOperationValue(driverName, groupSnapshotProvisionType)
ctrl.metricsManager.OperationStart(deleteOperationKey, deleteOperationValue)

var groupSnapshotContentName string
if groupSnapshot.Status != nil && groupSnapshot.Status.BoundVolumeGroupSnapshotContentName != nil {
groupSnapshotContentName = *groupSnapshot.Status.BoundVolumeGroupSnapshotContentName
Expand Down Expand Up @@ -1297,3 +1350,42 @@ func (ctrl *csiSnapshotCommonController) removeGroupSnapshotFinalizer(groupSnaps
klog.V(5).Infof("Removed protection finalizer from volume group snapshot %s", utils.GroupSnapshotKey(groupSnapshot))
return nil
}

// getGroupSnapshotDriverName is a helper function to get the driver name for a VolumeGroupSnapshot.
// We try to get the driverName in multiple ways, as snapshot controller metrics depend on the correct driverName:
//  1. from the VolumeGroupSnapshotContent named in vgs.Spec.Source (pre-provisioned group snapshots);
//  2. from the class named in vgs.Spec.VolumeGroupSnapshotClassName (dynamic group snapshots).
//
// Lookup failures are logged and otherwise ignored, because the result is only
// used on a best-effort basis. The returned driver name may therefore be
// empty, and the returned error is currently always nil.
func (ctrl *csiSnapshotCommonController) getGroupSnapshotDriverName(vgs *crdv1alpha1.VolumeGroupSnapshot) (string, error) {
	klog.V(5).Infof("getGroupSnapshotDriverName: VolumeGroupSnapshot[%s]", vgs.Name)
	var driverName string

	// Pre-Provisioned groupsnapshots have contentName as source
	var contentName string
	if vgs.Spec.Source.VolumeGroupSnapshotContentName != nil {
		contentName = *vgs.Spec.Source.VolumeGroupSnapshotContentName
	}

	// Get Driver name from GroupSnapshotContent if we found a contentName
	if contentName != "" {
		content, err := ctrl.groupSnapshotContentLister.Get(contentName)
		if err != nil {
			// Include the cause so a failed lookup can be diagnosed from the log.
			klog.Errorf("getGroupSnapshotDriverName: failed to get groupSnapshotContent %q: %v", contentName, err)
		} else {
			driverName = content.Spec.Driver
		}

		if driverName != "" {
			return driverName, nil
		}
	}

	// Dynamic groupsnapshots will have a groupsnapshotclass with a driver
	if vgs.Spec.VolumeGroupSnapshotClassName != nil {
		className := *vgs.Spec.VolumeGroupSnapshotClassName
		// NOTE(review): getSnapshotClass looks like the helper used for
		// VolumeSnapshotClass objects — confirm it resolves
		// VolumeGroupSnapshotClass names, otherwise this lookup cannot succeed.
		class, err := ctrl.getSnapshotClass(className)
		if err != nil {
			klog.Errorf("getGroupSnapshotDriverName: failed to get groupsnapshotClass %q: %v", className, err)
		} else {
			driverName = class.Driver
		}
	}

	return driverName, nil
}
4 changes: 4 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,10 @@ type MetricsManager interface {
// "Unknown" status of the passed-in operation is assumed.
RecordMetrics(op OperationKey, status OperationStatus, driverName string)

// RecordVolumeGroupSnapshotMetrics records a metric for operations related to
// VolumeGroupSnapshot
RecordVolumeGroupSnapshotMetrics(op OperationKey, status OperationStatus, driverName string)

// GetRegistry() returns the metrics.KubeRegistry used by this metrics manager.
GetRegistry() k8smetrics.KubeRegistry
}
Expand Down
92 changes: 92 additions & 0 deletions pkg/metrics/metrics_group.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"time"
)

// Names of the VolumeGroupSnapshot operations tracked by the metrics manager.
const (
	// CreateGroupSnapshotOperationName tracks how long the controller takes to
	// create a groupsnapshot.
	//   - Start_time: the controller first notices a new VolumeGroupSnapshot CR
	//     that requires a groupsnapshot to be dynamically provisioned.
	//   - End_time: the controller notices that the CR has a status whose
	//     CreationTime field is non-nil.
	CreateGroupSnapshotOperationName = "CreateGroupSnapshot"

	// CreateGroupSnapshotAndReadyOperationName tracks how long the controller
	// takes to create a groupsnapshot and for it to become ready.
	//   - Start_time: the controller first notices a new VolumeGroupSnapshot CR
	//     (both dynamic and pre-provisioned cases).
	//   - End_time: the controller notices that the CR has a status whose
	//     Ready To Use field is true.
	CreateGroupSnapshotAndReadyOperationName = "CreateGroupSnapshotAndReady"

	// DeleteGroupSnapshotOperationName tracks how long a groupsnapshot deletion
	// takes.
	//   - Start_time: the controller first notices a deletion timestamp on the
	//     VolumeGroupSnapshot CR and the CR is ready to be deleted. If the CR is
	//     being used by a PVC for rehydration, the controller should *NOT* set
	//     the start_time.
	//   - End_time: the controller has removed all finalizers from the
	//     VolumeGroupSnapshot CR, so the CR can be removed in the API server.
	DeleteGroupSnapshotOperationName = "DeleteGroupSnapshot"
)

// Provisioning types reported for VolumeGroupSnapshot operations.
const (
	// DynamicGroupSnapshotType represents a groupsnapshot that is being
	// dynamically provisioned.
	DynamicGroupSnapshotType snapshotProvisionType = "dynamic"

	// PreProvisionedGroupSnapshotType represents a groupsnapshot that is
	// pre-provisioned.
	PreProvisionedGroupSnapshotType snapshotProvisionType = "pre-provisioned"
)

// RecordVolumeGroupSnapshotMetrics emits the latency metric for a cached
// VolumeGroupSnapshot operation and then drops that operation from the cache.
//
// If opKey was never cached, the call is a no-op. A nil opStatus is reported
// with the "unknown" status. When driverName is empty or equal to
// unknownDriverName, the driver stored with the cached operation is used
// instead. Recording a DeleteGroupSnapshot operation additionally reports a
// cancel metric for any CreateGroupSnapshot / CreateGroupSnapshotAndReady
// operation still cached for the same resource.
func (opMgr *operationMetricsManager) RecordVolumeGroupSnapshotMetrics(opKey OperationKey, opStatus OperationStatus, driverName string) {
	opMgr.mu.Lock()
	defer opMgr.mu.Unlock()

	cached, found := opMgr.cache[opKey]
	if !found {
		// Nothing was started for this key, so there is nothing to report.
		return
	}

	var statusLabel string
	if opStatus == nil {
		statusLabel = string(SnapshotStatusTypeUnknown)
	} else {
		statusLabel = opStatus.String()
	}

	// Fall back to the driver remembered at OperationStart when the caller
	// could not determine it.
	driverLabel := driverName
	if driverLabel == "" || driverLabel == unknownDriverName {
		driverLabel = cached.Driver
	}

	elapsed := time.Since(cached.startTime).Seconds()
	opMgr.opLatencyMetrics.
		WithLabelValues(driverLabel, opKey.Name, cached.SnapshotType, statusLabel).
		Observe(elapsed)

	// Deleting an unfinished VolumeGroupSnapshot cancels its pending creates.
	if opKey.Name == DeleteGroupSnapshotOperationName {
		cancelIfPending := func(pendingKey OperationKey) {
			if pendingVal, ok := opMgr.cache[pendingKey]; ok {
				opMgr.recordCancelMetricLocked(pendingVal, pendingKey, elapsed)
			}
		}
		cancelIfPending(NewOperationKey(CreateGroupSnapshotOperationName, opKey.ResourceID))
		cancelIfPending(NewOperationKey(CreateGroupSnapshotAndReadyOperationName, opKey.ResourceID))
	}

	delete(opMgr.cache, opKey)
	opMgr.opInFlight.Set(float64(len(opMgr.cache)))
}
70 changes: 70 additions & 0 deletions pkg/metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -738,3 +738,73 @@ func TestProcessStartTimeMetricExist(t *testing.T) {

t.Fatalf("Metrics does not contain %v. Scraped content: %v", processStartTimeMetric, metricsFamilies)
}

// TestRecordVolumeGroupSnapshotMetrics starts two operations on a fresh metrics
// manager, records one with a success status and one with a failure status via
// RecordVolumeGroupSnapshotMetrics, and then checks the exposed
// snapshot_controller_operation_total_seconds histogram with verifyMetric.
func TestRecordVolumeGroupSnapshotMetrics(t *testing.T) {
mgr, srv := initMgr()
srvAddr := "http://" + srv.Addr + httpPattern
defer shutdown(srv)
// add an operation
opKey := OperationKey{
Name: "op1",
ResourceID: types.UID("uid1"),
}
opVal := NewOperationValue("driver", DynamicSnapshotType)
mgr.OperationStart(opKey, opVal)
// should create a Success data point with latency ~ 1100ms
time.Sleep(1100 * time.Millisecond)
success := &fakeOpStatus{
statusCode: 0,
}
mgr.RecordVolumeGroupSnapshotMetrics(opKey, success, "driver")

// add another operation metric
opKey.Name = "op2"
opKey.ResourceID = types.UID("uid2")
mgr.OperationStart(opKey, opVal)
// should create a Failure data point with latency ~ 100ms
time.Sleep(100 * time.Millisecond)
failure := &fakeOpStatus{
statusCode: 1,
}
mgr.RecordVolumeGroupSnapshotMetrics(opKey, failure, "driver")

// NOTE(review): both operations above are started and recorded with driver
// "driver" and DynamicSnapshotType, while the expectation below lists
// driver_name="driver1"/"driver2" and snapshot_type="". Confirm that
// verifyMetric tolerates this (e.g. does not compare label values);
// otherwise the fixture should be aligned with the recorded labels.
expected :=
`# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end
# TYPE snapshot_controller_operation_total_seconds histogram
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.25"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.5"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type=""} 1.1
snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type=""} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.1"} 0
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.25"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="1"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="2.5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="5"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="10"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="15"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="30"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="60"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="120"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="300"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="600"} 1
snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="+Inf"} 1
snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type=""} 0.1
snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type=""} 1
`
// Scrape the metrics endpoint and compare it against the expectation above.
if err := verifyMetric(expected, srvAddr); err != nil {
t.Errorf("failed testing [%v]", err)
}
}

0 comments on commit 7e803ef

Please sign in to comment.