Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add group snapshot controller metrics #1107

Merged
merged 1 commit into from
Jun 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 95 additions & 3 deletions pkg/common-controller/groupsnapshot_controller_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (

crdv1alpha1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumegroupsnapshot/v1alpha1"
crdv1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"
"github.com/kubernetes-csi/external-snapshotter/v8/pkg/metrics"
"github.com/kubernetes-csi/external-snapshotter/v8/pkg/utils"
)

Expand Down Expand Up @@ -263,6 +264,14 @@ func (ctrl *csiSnapshotCommonController) deleteGroupSnapshot(groupSnapshot *crdv
_ = ctrl.snapshotStore.Delete(groupSnapshot)
klog.V(4).Infof("group snapshot %q deleted", utils.GroupSnapshotKey(groupSnapshot))

driverName, err := ctrl.getGroupSnapshotDriverName(groupSnapshot)
if err != nil {
klog.Errorf("failed to getGroupSnapshotDriverName while recording metrics for group snapshot %q: %v", utils.GroupSnapshotKey(groupSnapshot), err)
} else {
deleteOperationKey := metrics.NewOperationKey(metrics.DeleteGroupSnapshotOperationName, groupSnapshot.UID)
ctrl.metricsManager.RecordMetrics(deleteOperationKey, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName)
}

groupSnapshotContentName := ""
if groupSnapshot.Status != nil && groupSnapshot.Status.BoundVolumeGroupSnapshotContentName != nil {
groupSnapshotContentName = *groupSnapshot.Status.BoundVolumeGroupSnapshotContentName
Expand Down Expand Up @@ -376,9 +385,28 @@ func (ctrl *csiSnapshotCommonController) getGroupSnapshotContentFromStore(conten
func (ctrl *csiSnapshotCommonController) syncUnreadyGroupSnapshot(groupSnapshot *crdv1alpha1.VolumeGroupSnapshot) error {
uniqueGroupSnapshotName := utils.GroupSnapshotKey(groupSnapshot)
klog.V(5).Infof("syncUnreadyGroupSnapshot %s", uniqueGroupSnapshotName)
/*
TODO: Add metrics
*/
driverName, err := ctrl.getGroupSnapshotDriverName(groupSnapshot)
if err != nil {
klog.Errorf("failed to getGroupSnapshotDriverName while recording metrics for groupsnapshot %q: %s", utils.GroupSnapshotKey(groupSnapshot), err)
}

groupSnapshotProvisionType := metrics.DynamicGroupSnapshotType
if groupSnapshot.Spec.Source.VolumeGroupSnapshotContentName != nil {
groupSnapshotProvisionType = metrics.PreProvisionedGroupSnapshotType
}

// Start metrics operations for volumegroupsnapshot
if !utils.IsGroupSnapshotCreated(groupSnapshot) {
// Only start CreateGroupSnapshot operation if the groupsnapshot has not been cut
ctrl.metricsManager.OperationStart(
metrics.NewOperationKey(metrics.CreateGroupSnapshotOperationName, groupSnapshot.UID),
metrics.NewOperationValue(driverName, groupSnapshotProvisionType),
)
}
ctrl.metricsManager.OperationStart(
metrics.NewOperationKey(metrics.CreateGroupSnapshotAndReadyOperationName, groupSnapshot.UID),
metrics.NewOperationValue(driverName, groupSnapshotProvisionType),
)

// Pre-provisioned snapshot
if groupSnapshot.Spec.Source.VolumeGroupSnapshotContentName != nil {
Expand Down Expand Up @@ -664,12 +692,20 @@ func (ctrl *csiSnapshotCommonController) updateGroupSnapshotStatus(groupSnapshot
groupSnapshotClone := groupSnapshotObj.DeepCopy()
groupSnapshotClone.Status = newStatus

// We need to record metrics before updating the status due to a bug causing cache entries after a failed UpdateStatus call.
// Must meet the following criteria to emit a successful CreateGroupSnapshot status
// 1. Previous status was nil OR Previous status had a nil CreationTime
// 2. New status must be non-nil with a non-nil CreationTime
driverName := groupSnapshotContent.Spec.Driver
createOperationKey := metrics.NewOperationKey(metrics.CreateGroupSnapshotOperationName, groupSnapshot.UID)

// Must meet the following criteria to emit a successful CreateGroupSnapshot status
// 1. Previous status was nil OR Previous status had a nil CreationTime
// 2. New status must be non-nil with a non-nil CreationTime
if !utils.IsGroupSnapshotCreated(groupSnapshotObj) && utils.IsGroupSnapshotCreated(groupSnapshotClone) {
msg := fmt.Sprintf("GroupSnapshot %s was successfully created by the CSI driver.", utils.GroupSnapshotKey(groupSnapshot))
ctrl.eventRecorder.Event(groupSnapshot, v1.EventTypeNormal, "GroupSnapshotCreated", msg)
ctrl.metricsManager.RecordVolumeGroupSnapshotMetrics(createOperationKey, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName)
}

// Must meet the following criteria to emit a successful CreateGroupSnapshotAndReady status
Expand All @@ -678,6 +714,8 @@ func (ctrl *csiSnapshotCommonController) updateGroupSnapshotStatus(groupSnapshot
if !utils.IsGroupSnapshotReady(groupSnapshotObj) && utils.IsGroupSnapshotReady(groupSnapshotClone) {
msg := fmt.Sprintf("GroupSnapshot %s is ready to use.", utils.GroupSnapshotKey(groupSnapshot))
ctrl.eventRecorder.Event(groupSnapshot, v1.EventTypeNormal, "GroupSnapshotReady", msg)
createAndReadyOperation := metrics.NewOperationKey(metrics.CreateGroupSnapshotAndReadyOperationName, groupSnapshot.UID)
ctrl.metricsManager.RecordMetrics(createAndReadyOperation, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName)
}

newGroupSnapshotObj, err := ctrl.clientset.GroupsnapshotV1alpha1().VolumeGroupSnapshots(groupSnapshotClone.Namespace).UpdateStatus(context.TODO(), groupSnapshotClone, metav1.UpdateOptions{})
Expand Down Expand Up @@ -1126,6 +1164,21 @@ func (ctrl *csiSnapshotCommonController) addGroupSnapshotFinalizer(groupSnapshot
func (ctrl *csiSnapshotCommonController) processGroupSnapshotWithDeletionTimestamp(groupSnapshot *crdv1alpha1.VolumeGroupSnapshot) error {
klog.V(5).Infof("processGroupSnapshotWithDeletionTimestamp VolumeGroupSnapshot[%s]: %s", utils.GroupSnapshotKey(groupSnapshot), utils.GetGroupSnapshotStatusForLogging(groupSnapshot))

driverName, err := ctrl.getGroupSnapshotDriverName(groupSnapshot)
if err != nil {
klog.Errorf("failed to getGroupSnapshotDriverName while recording metrics for group snapshot %q: %v", utils.GroupSnapshotKey(groupSnapshot), err)
}

groupSnapshotProvisionType := metrics.DynamicGroupSnapshotType
if groupSnapshot.Spec.Source.VolumeGroupSnapshotContentName != nil {
groupSnapshotProvisionType = metrics.PreProvisionedGroupSnapshotType
}

// Processing delete, start operation metric
deleteOperationKey := metrics.NewOperationKey(metrics.DeleteGroupSnapshotOperationName, groupSnapshot.UID)
deleteOperationValue := metrics.NewOperationValue(driverName, groupSnapshotProvisionType)
ctrl.metricsManager.OperationStart(deleteOperationKey, deleteOperationValue)

var groupSnapshotContentName string
if groupSnapshot.Status != nil && groupSnapshot.Status.BoundVolumeGroupSnapshotContentName != nil {
groupSnapshotContentName = *groupSnapshot.Status.BoundVolumeGroupSnapshotContentName
Expand Down Expand Up @@ -1297,3 +1350,42 @@ func (ctrl *csiSnapshotCommonController) removeGroupSnapshotFinalizer(groupSnaps
klog.V(5).Infof("Removed protection finalizer from volume group snapshot %s", utils.GroupSnapshotKey(groupSnapshot))
return nil
}

// getGroupSnapshotDriverName is a helper function to get the driver name for a
// VolumeGroupSnapshot. Snapshot controller metrics are labelled by driver name,
// so the name is looked up in more than one place, in this order:
//
//  1. the VolumeGroupSnapshotContent referenced by
//     Spec.Source.VolumeGroupSnapshotContentName (pre-provisioned case);
//  2. the class referenced by Spec.VolumeGroupSnapshotClassName (dynamic case).
//
// Lookup failures are logged and otherwise ignored; in that case the returned
// driver name may be empty. The returned error is currently always nil.
func (ctrl *csiSnapshotCommonController) getGroupSnapshotDriverName(vgs *crdv1alpha1.VolumeGroupSnapshot) (string, error) {
	klog.V(5).Infof("getGroupSnapshotDriverName: VolumeGroupSnapshot[%s]", vgs.Name)
	var driverName string

	// Pre-provisioned group snapshots have a content name as their source.
	var contentName string
	if vgs.Spec.Source.VolumeGroupSnapshotContentName != nil {
		contentName = *vgs.Spec.Source.VolumeGroupSnapshotContentName
	}

	// Get the driver name from the VolumeGroupSnapshotContent if we found a contentName.
	if contentName != "" {
		content, err := ctrl.groupSnapshotContentLister.Get(contentName)
		if err != nil {
			// Include the underlying error so the failure can be diagnosed from the log.
			klog.Errorf("getGroupSnapshotDriverName: failed to get groupSnapshotContent %q: %v", contentName, err)
		} else {
			driverName = content.Spec.Driver
		}

		if driverName != "" {
			return driverName, nil
		}
	}

	// Dynamic group snapshots will have a group snapshot class with a driver.
	if vgs.Spec.VolumeGroupSnapshotClassName != nil {
		className := *vgs.Spec.VolumeGroupSnapshotClassName
		// NOTE(review): this resolves a VolumeGroupSnapshotClass name through
		// getSnapshotClass — confirm this is the intended lookup helper for
		// group snapshot classes.
		class, err := ctrl.getSnapshotClass(className)
		if err != nil {
			klog.Errorf("getGroupSnapshotDriverName: failed to get groupsnapshotClass %q: %v", className, err)
		} else {
			driverName = class.Driver
		}
	}

	return driverName, nil
}
4 changes: 4 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,10 @@ type MetricsManager interface {
// "Unknown" status of the passed-in operation is assumed.
RecordMetrics(op OperationKey, status OperationStatus, driverName string)

// RecordVolumeGroupSnapshotMetrics records a metric for operations related to
// VolumeGroupSnapshot
RecordVolumeGroupSnapshotMetrics(op OperationKey, status OperationStatus, driverName string)

// GetRegistry() returns the metrics.KubeRegistry used by this metrics manager.
GetRegistry() k8smetrics.KubeRegistry
}
Expand Down
92 changes: 92 additions & 0 deletions pkg/metrics/metrics_group.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"time"
)

// Operation names and provision types used when recording metrics for
// VolumeGroupSnapshot objects.
const (
// CreateGroupSnapshotOperationName is the operation that tracks how long the controller takes to create a groupsnapshot.
// Specifically, the operation metric is emitted based on the following timestamps:
// - Start_time: controller notices the first time that there is a new VolumeGroupSnapshot CR to dynamically provision a groupsnapshot
// - End_time: controller notices that the CR has a status with CreationTime field set to be non-nil
CreateGroupSnapshotOperationName = "CreateGroupSnapshot"

// CreateGroupSnapshotAndReadyOperationName is the operation that tracks how long the controller takes to create a groupsnapshot and for it to be ready.
// Specifically, the operation metric is emitted based on the following timestamps:
// - Start_time: controller notices the first time that there is a new VolumeGroupSnapshot CR (both dynamic and pre-provisioned cases)
// - End_time: controller notices that the CR has a status with Ready To Use field set to be true
CreateGroupSnapshotAndReadyOperationName = "CreateGroupSnapshotAndReady"

// DeleteGroupSnapshotOperationName is the operation that tracks how long a groupsnapshot deletion takes.
// Specifically, the operation metric is emitted based on the following timestamps:
// - Start_time: controller notices the first time that there is a deletion timestamp placed on the VolumeGroupSnapshot CR and the CR is ready to be deleted.
// Note that if the CR is being used by a PVC for rehydration, the controller should *NOT* set the start_time.
// - End_time: controller removed all finalizers on the VolumeGroupSnapshot CR such that the CR is ready to be removed in the API server.
DeleteGroupSnapshotOperationName = "DeleteGroupSnapshot"
// DynamicGroupSnapshotType is the snapshotProvisionType value ("dynamic") that represents
// a groupsnapshot that is being dynamically provisioned.
DynamicGroupSnapshotType = snapshotProvisionType("dynamic")
xing-yang marked this conversation as resolved.
Show resolved Hide resolved
// PreProvisionedGroupSnapshotType is the snapshotProvisionType value ("pre-provisioned") that
// represents a groupsnapshot that is pre-provisioned.
PreProvisionedGroupSnapshotType = snapshotProvisionType("pre-provisioned")
xing-yang marked this conversation as resolved.
Show resolved Hide resolved
)

// RecordVolumeGroupSnapshotMetrics emits the latency metric for a cached
// VolumeGroupSnapshot operation and then drops it from the cache.
//
// If opKey is not in the cache, nothing is recorded. A nil opStatus is
// reported with the "unknown" status label. When driverName is empty or equal
// to unknownDriverName, the driver stored with the cached operation is used
// instead. When opKey is a DeleteGroupSnapshot operation, any create
// operations still cached for the same resource are reported as cancelled.
func (opMgr *operationMetricsManager) RecordVolumeGroupSnapshotMetrics(opKey OperationKey, opStatus OperationStatus, driverName string) {
	opMgr.mu.Lock()
	defer opMgr.mu.Unlock()

	cached, found := opMgr.cache[opKey]
	if !found {
		// Nothing was started for this key, so there is nothing to report.
		return
	}

	statusLabel := string(SnapshotStatusTypeUnknown)
	if opStatus != nil {
		statusLabel = opStatus.String()
	}

	// Fall back to the driver remembered at operation start when the caller
	// could not determine one.
	if driverName == "" || driverName == unknownDriverName {
		driverName = cached.Driver
	}

	elapsedSeconds := time.Since(cached.startTime).Seconds()
	opMgr.opLatencyMetrics.
		WithLabelValues(driverName, opKey.Name, cached.SnapshotType, statusLabel).
		Observe(elapsedSeconds)

	// Deleting a VolumeGroupSnapshot whose creation never finished: report a
	// cancel metric for each create operation that is still pending.
	if opKey.Name == DeleteGroupSnapshotOperationName {
		pendingCreateOps := []string{
			CreateGroupSnapshotOperationName,
			CreateGroupSnapshotAndReadyOperationName,
		}
		for _, opName := range pendingCreateOps {
			pendingKey := NewOperationKey(opName, opKey.ResourceID)
			if pendingVal, pending := opMgr.cache[pendingKey]; pending {
				opMgr.recordCancelMetricLocked(pendingVal, pendingKey, elapsedSeconds)
			}
		}
	}

	delete(opMgr.cache, opKey)
	opMgr.opInFlight.Set(float64(len(opMgr.cache)))
}
Loading