Skip to content

Commit e38efc3

Browse files
committed
feat(core): auto-cleanup for dvcr storage
- Add maintenance mode for DVCR Deployment. - Add auto-cleanup schedule setting in ModuleConfig. - Add 'auto-cleanup check' command to dvcr-cleaner to get all images eligible for cleanup. Internals: - Add hook to switch DVCR into maintenance mode depending on the Secret created by the controller. - Refactor cron source, make it independent of the gc manager. Start maintenance mode via the cron source. - Add condition on Deployment/dvcr to expose the maintenance mode state, i.e. the auto-cleanup state. - Maintenance mode keeps dvcr in RO mode, so VMs with mounted images should be able to reboot. - Also, postpone importer and uploader Pod creation for new cvi/vi/vd until auto-cleanup finishes. Signed-off-by: Ivan Mikheykin <[email protected]>
1 parent 85d9410 commit e38efc3

File tree

56 files changed

+2063
-76
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+2063
-76
lines changed
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
/*
2+
Copyright 2025 Flant JSC
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package dvcr_deployment_condition
18+
19+
// Type represents the various condition types for the `ClusterVirtualImage`.
20+
type Type string
21+
22+
func (s Type) String() string {
23+
return string(s)
24+
}
25+
26+
const (
27+
// MaintenanceType indicates whether the deployment/dvcr is in maintenance mode.
28+
MaintenanceType Type = "Maintenance"
29+
)
30+
31+
type (
32+
// MaintenanceReason represents the various reasons for the DVCRMaintenance condition type.
33+
MaintenanceReason string
34+
)
35+
36+
func (s MaintenanceReason) String() string {
37+
return string(s)
38+
}
39+
40+
const (
41+
// PrepareAutoCleanup indicates that the maintenance is prepared: create secret, wait for vi/cvi/vd to stop uploading.
42+
PrepareAutoCleanup MaintenanceReason = "PrepareAutoCleanup"
43+
// MaintenanceAutoCleanupInProgress indicates that deployment is in the maintenance mode.
44+
MaintenanceAutoCleanupInProgress MaintenanceReason = "AutoCleanupInProgress"
45+
// MaintenanceAutoCleanupScheduled indicates that the deployment is in the normal mode, and the maintenance is scheduled for some time in the future
46+
MaintenanceAutoCleanupScheduled MaintenanceReason = "AutoCleanupScheduled"
47+
)

api/core/v1alpha2/events.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,9 @@ const (
104104
// ReasonDataSourceQuotaExceeded is event reason that DataSource sync is failed because quota exceed.
105105
ReasonDataSourceQuotaExceeded = "DataSourceQuotaExceed"
106106

107+
// ReasonImageOperationPostponedDueToDVCRMaintenance is event reason that operation is postponed until the end of DVCR maintenance mode.
108+
ReasonImageOperationPostponedDueToDVCRMaintenance = "ImageOperationPostponedDueToDVCRMaintenance"
109+
107110
// ReasonDataSourceDiskProvisioningFailed is event reason that DataSource disk provisioning is failed.
108111
ReasonDataSourceDiskProvisioningFailed = "DataSourceImportDiskProvisioningFailed"
109112

images/dvcr-artifact/cmd/dvcr-cleaner/cmd/gc.go

Lines changed: 305 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,24 @@ limitations under the License.
1717
package cmd
1818

1919
import (
20+
"context"
21+
"encoding/json"
2022
"errors"
2123
"fmt"
2224
"os"
23-
"os/exec"
25+
"sort"
26+
"strings"
2427

28+
"github.com/hashicorp/go-multierror"
2529
"github.com/manifoldco/promptui"
2630
"github.com/spf13/cobra"
31+
"k8s.io/apimachinery/pkg/api/resource"
32+
33+
"github.com/deckhouse/virtualization-controller/dvcr-importers/pkg/cleaner/kubernetes"
34+
"github.com/deckhouse/virtualization-controller/dvcr-importers/pkg/cleaner/registry"
35+
"github.com/deckhouse/virtualization-controller/dvcr-importers/pkg/cleaner/signal"
36+
"github.com/deckhouse/virtualization-controller/dvcr-importers/pkg/cleaner/storage"
37+
"github.com/deckhouse/virtualization/api/core/v1alpha2"
2738
)
2839

2940
var GcCmd = &cobra.Command{
@@ -51,8 +62,7 @@ var gcRunCmd = &cobra.Command{
5162
return fmt.Errorf("cache data cannot be deleted: %w", err)
5263
}
5364

54-
execCmd := exec.Command("registry", "garbage-collect", "/etc/docker/registry/config.yml", "--delete-untagged")
55-
stdout, err := execCmd.Output()
65+
stdout, err := registry.ExecGarbageCollect()
5666
if err != nil {
5767
fmt.Println(err.Error())
5868
return nil
@@ -65,6 +75,38 @@ var gcRunCmd = &cobra.Command{
6575
SilenceErrors: true,
6676
}
6777

78+
var (
79+
MaintenanceSecretName string
80+
)
81+
82+
var autoCleanupCmd = &cobra.Command{
83+
Use: "auto-cleanup [--maintenance-secret-name secret]",
84+
Short: "`auto-cleanup` deletes all stale images that have no corresponding resource in the cluster and then runs garbage-collect to remove underlying blobs (Note: not for manual run unless you 100% sure what are you doing)",
85+
Args: cobra.OnlyValidArgs,
86+
RunE: autoCleanupHandler,
87+
SilenceUsage: true,
88+
SilenceErrors: true,
89+
}
90+
91+
var checkCmd = &cobra.Command{
92+
Use: "check",
93+
Short: "`check` reports stale images that have no corresponding resource in the cluster",
94+
Args: cobra.OnlyValidArgs,
95+
RunE: checkCleanupHandler,
96+
SilenceUsage: true,
97+
SilenceErrors: true,
98+
}
99+
100+
func init() {
101+
GcCmd.AddCommand(gcRunCmd)
102+
103+
// Add 'run' command.
104+
GcCmd.AddCommand(autoCleanupCmd)
105+
autoCleanupCmd.Flags().StringVar(&MaintenanceSecretName, "maintenance-secret-name", "", "update secret with result and annotation after the cleanup")
106+
// Add 'check' command.
107+
GcCmd.AddCommand(checkCmd)
108+
}
109+
68110
func Confirm() (bool, error) {
69111
prompt := promptui.Prompt{
70112
Label: "Confirm",
@@ -82,6 +124,264 @@ func Confirm() (bool, error) {
82124
return true, nil
83125
}
84126

85-
func init() {
86-
GcCmd.AddCommand(gcRunCmd)
127+
func autoCleanupHandler(cmd *cobra.Command, args []string) error {
128+
fsInfoBeforeCleanup, err := registry.StorageStats()
129+
if err != nil {
130+
return fmt.Errorf("get repositories filesystem info before cleanup: %w", err)
131+
}
132+
133+
var errs *multierror.Error
134+
135+
cleanupErr := performAutoCleanup()
136+
if cleanupErr != nil {
137+
errs = multierror.Append(errs, cleanupErr)
138+
}
139+
140+
// Report disk usage.
141+
fsInfoAfterCleanup, errFSInfo := registry.StorageStats()
142+
if errFSInfo != nil {
143+
errs = multierror.Append(errs, fmt.Errorf("get repositories filesystem info after cleanup: %w", errFSInfo))
144+
}
145+
freedSpace := ""
146+
availableSpace := ""
147+
usedSpace := ""
148+
totalSpace := ""
149+
if errFSInfo == nil {
150+
// Available space after cleanup should be greater than available space before cleanup.
151+
// The difference is the freed space. Format it with GiB/MiB suffix.
152+
freedSpaceRaw := fsInfoAfterCleanup.Available - fsInfoBeforeCleanup.Available
153+
freedSpace = storage.HumanizeQuantity(freedSpaceRaw) + "B"
154+
availableSpace = storage.HumanizeQuantity(fsInfoAfterCleanup.Available) + "B"
155+
usedSpace = storage.HumanizeQuantity(fsInfoAfterCleanup.Total-fsInfoAfterCleanup.Available) + "B"
156+
totalSpace = storage.HumanizeQuantity(fsInfoAfterCleanup.Total) + "B"
157+
}
158+
fmt.Printf("Freed space during cleanup: %s, available space now: %s\n", freedSpace, availableSpace)
159+
fmt.Printf("%7s %7s %7s\n", "Total", "Used", "Avail")
160+
fmt.Printf("%7s %7s %7s\n", totalSpace, usedSpace, availableSpace)
161+
162+
// Terminate without waiting if no secret name was provided.
163+
if MaintenanceSecretName == "" {
164+
return errs.ErrorOrNil()
165+
}
166+
167+
// Update maintenance secret and wait for termination signal.
168+
result := map[string]string{
169+
"result": "success",
170+
"freedSpace": freedSpace,
171+
"availableSpace": availableSpace,
172+
}
173+
if cleanupErr != nil {
174+
result["result"] = "fail"
175+
result["error"] = cleanupErr.Error()
176+
}
177+
178+
secretErr := annotateMaintenanceSecretOnCleanupDone(context.Background(), result)
179+
if secretErr != nil {
180+
errs = multierror.Append(errs, secretErr)
181+
}
182+
183+
// Return previous errors, so Pod will be restarted without waiting.
184+
err = errs.ErrorOrNil()
185+
if err != nil {
186+
return err
187+
}
188+
189+
// Wait until termination.
190+
fmt.Println("Wait for signal before terminate.")
191+
signal.WaitForTermination()
192+
return nil
193+
}
194+
195+
func performAutoCleanup() error {
196+
absentImages, err := getAbsentImages()
197+
if err != nil {
198+
return err
199+
}
200+
201+
// Delete manifests for absent images.
202+
if len(absentImages) == 0 {
203+
fmt.Println("No images eligible for cleanup.")
204+
return nil
205+
}
206+
207+
err = registry.RemoveImages(absentImages)
208+
if err != nil {
209+
return fmt.Errorf("remove manifests: %w", err)
210+
}
211+
212+
// Run 'registry garbage-collect' to remove blobs.
213+
stdout, err := registry.ExecGarbageCollect()
214+
if err != nil {
215+
return err
216+
}
217+
218+
fmt.Println(string(stdout))
219+
return nil
220+
}
221+
222+
func checkCleanupHandler(_ *cobra.Command, _ []string) error {
223+
fsInfo, err := registry.StorageStats()
224+
if err != nil {
225+
return fmt.Errorf("get repositories filesystem info before cleanup: %w", err)
226+
}
227+
228+
absentImages, err := getAbsentImages()
229+
if err != nil {
230+
return err
231+
}
232+
233+
availableSpace := resource.NewQuantity(int64(fsInfo.Available), resource.BinarySI).String() + "B"
234+
235+
fmt.Printf("Available space: %s\n", availableSpace)
236+
237+
if len(absentImages) == 0 {
238+
fmt.Println("No images eligible for auto-cleanup.")
239+
}
240+
241+
sort.SliceStable(absentImages, func(i, j int) bool {
242+
return absentImages[i].Path < absentImages[j].Path
243+
})
244+
245+
fmt.Println("Images eligible for cleanup:")
246+
for _, image := range absentImages {
247+
img := strings.TrimPrefix(image.Path, registry.RepoDir)
248+
img = strings.TrimPrefix(image.Path, "/")
249+
fmt.Println(img)
250+
}
251+
252+
return nil
253+
}
254+
255+
func getAbsentImages() ([]registry.Image, error) {
256+
// List all images created for all ClusterVirtualImage and VirtualImage resources.
257+
images, err := registry.ListImagesAll()
258+
if err != nil {
259+
return nil, fmt.Errorf("list all images: %w", err)
260+
}
261+
262+
// Get all VirtualImages and ClusterImages
263+
virtClient, err := kubernetes.NewVirtualizationClient()
264+
if err != nil {
265+
return nil, fmt.Errorf("initialize Kubernetes client: %w", err)
266+
}
267+
268+
kubeImages, err := virtClient.ListAllPossibleImages(context.Background())
269+
if err == nil {
270+
return nil, fmt.Errorf("list images in cluster: %w", err)
271+
}
272+
273+
// Compare lists, return images absent in the cluster.
274+
return compareRegistryAndClusterImages(images, kubeImages), nil
275+
}
276+
277+
// compareRegistryAndClusterImages returns images that has no corresponding resource in the cluster.
278+
// VirtualDisks in Ready phase are considered for cleanup.
279+
func compareRegistryAndClusterImages(images []registry.Image, kubeImages []kubernetes.ImageInfo) []registry.Image {
280+
// Create indexes for all resources found in cluster.
281+
// A map for ClusterImages. Keys are names.
282+
clusterVirtualImages := make(map[string]struct{})
283+
// A map for virtualImages: namespace -> name
284+
virtualImages := make(map[string]map[string]struct{})
285+
// A map for virtualDisks: namespace -> name -> disk phase
286+
virtualDisks := make(map[string]map[string]v1alpha2.DiskPhase)
287+
for _, kubeImage := range kubeImages {
288+
switch kubeImage.Type {
289+
case v1alpha2.ClusterVirtualImageKind:
290+
clusterVirtualImages[kubeImage.Name] = struct{}{}
291+
case v1alpha2.VirtualImageKind:
292+
if _, ok := virtualImages[kubeImage.Namespace]; !ok {
293+
virtualImages[kubeImage.Namespace] = make(map[string]struct{})
294+
}
295+
virtualImages[kubeImage.Namespace][kubeImage.Name] = struct{}{}
296+
case v1alpha2.VirtualDiskKind:
297+
if _, ok := virtualDisks[kubeImage.Namespace]; !ok {
298+
virtualDisks[kubeImage.Namespace] = make(map[string]v1alpha2.DiskPhase)
299+
}
300+
virtualDisks[kubeImage.Namespace][kubeImage.Name] = kubeImage.Phase
301+
}
302+
}
303+
304+
absentImages := make([]registry.Image, 0)
305+
for _, image := range images {
306+
switch image.Type {
307+
case v1alpha2.ClusterVirtualImageKind:
308+
if _, ok := clusterVirtualImages[image.Name]; !ok {
309+
absentImages = append(absentImages, image)
310+
}
311+
case v1alpha2.VirtualImageKind:
312+
if _, ok := virtualImages[image.Namespace]; !ok {
313+
absentImages = append(absentImages, image)
314+
continue
315+
}
316+
if _, ok := virtualImages[image.Namespace][image.Name]; !ok {
317+
absentImages = append(absentImages, image)
318+
}
319+
case v1alpha2.VirtualDiskKind:
320+
if _, ok := virtualDisks[image.Namespace]; !ok {
321+
absentImages = append(absentImages, image)
322+
continue
323+
}
324+
if _, ok := virtualDisks[image.Namespace][image.Name]; !ok {
325+
absentImages = append(absentImages, image)
326+
continue
327+
}
328+
// Images for disks in Ready phase are eligible for cleanup.
329+
if virtualDisks[image.Namespace][image.Name] == v1alpha2.DiskReady {
330+
absentImages = append(absentImages, image)
331+
}
332+
}
333+
}
334+
335+
return absentImages
336+
}
337+
338+
func writeTerminationMessage(err error, extra map[string]string) error {
339+
report := map[string]string{
340+
"result": "success",
341+
}
342+
if err != nil {
343+
report["result"] = "fail"
344+
}
345+
return kubernetes.ReportTerminationMessage(err, report, extra)
346+
}
347+
348+
const (
349+
cleanupDoneAnno = "virtualization.deckhouse.io/dvcr-deployment-cleanup-done"
350+
switchToMaintenanceAnno = "virtualization.deckhouse.io/dvcr-deployment-switch-to-maintenance-mode"
351+
)
352+
353+
func annotateMaintenanceSecretOnCleanupDone(ctx context.Context, result map[string]string) error {
354+
resultBytes, err := json.Marshal(result)
355+
if err != nil {
356+
return fmt.Errorf("marshal result to json: %w", err)
357+
}
358+
359+
// Get all VirtualImages and ClusterImages
360+
virtClient, err := kubernetes.NewVirtualizationClient()
361+
if err != nil {
362+
return fmt.Errorf("initialize Kubernetes client: %w", err)
363+
}
364+
365+
secret, err := virtClient.GetMaintenanceSecret(ctx)
366+
if err != nil {
367+
return err
368+
}
369+
370+
if secret.Annotations == nil {
371+
secret.Annotations = make(map[string]string)
372+
}
373+
secret.Annotations[cleanupDoneAnno] = ""
374+
delete(secret.Annotations, switchToMaintenanceAnno)
375+
376+
if secret.Data == nil {
377+
secret.Data = make(map[string][]byte)
378+
}
379+
secret.Data["result"] = resultBytes
380+
381+
err = virtClient.UpdateMaintenanceSecret(ctx, secret)
382+
if err != nil {
383+
return fmt.Errorf("update secret on cleanup done: %w", err)
384+
}
385+
386+
return nil
87387
}

0 commit comments

Comments
 (0)