diff --git a/README.md b/README.md index 6fde201eb96..25ab15603a7 100644 --- a/README.md +++ b/README.md @@ -317,6 +317,8 @@ spec: managementState: Managed workbenches: managementState: Managed + modelregistry: + managementState: Managed ``` 2. Enable only Dashboard and Workbenches diff --git a/apis/datasciencecluster/v1/datasciencecluster_types.go b/apis/datasciencecluster/v1/datasciencecluster_types.go index b24633ad56b..dcdf4b9b057 100644 --- a/apis/datasciencecluster/v1/datasciencecluster_types.go +++ b/apis/datasciencecluster/v1/datasciencecluster_types.go @@ -31,6 +31,7 @@ import ( "github.com/opendatahub-io/opendatahub-operator/v2/components/kserve" "github.com/opendatahub-io/opendatahub-operator/v2/components/kueue" "github.com/opendatahub-io/opendatahub-operator/v2/components/modelmeshserving" + "github.com/opendatahub-io/opendatahub-operator/v2/components/modelregistry" "github.com/opendatahub-io/opendatahub-operator/v2/components/ray" "github.com/opendatahub-io/opendatahub-operator/v2/components/trainingoperator" "github.com/opendatahub-io/opendatahub-operator/v2/components/trustyai" @@ -80,6 +81,9 @@ type Components struct { //Training Operator component configuration. TrainingOperator trainingoperator.TrainingOperator `json:"trainingoperator,omitempty"` + + // ModelRegistry component configuration. + ModelRegistry modelregistry.ModelRegistry `json:"modelregistry,omitempty"` } // DataScienceClusterStatus defines the observed state of DataScienceCluster. 
diff --git a/apis/datasciencecluster/v1/zz_generated.deepcopy.go b/apis/datasciencecluster/v1/zz_generated.deepcopy.go index ccb61f7b60b..38fae1c3f09 100644 --- a/apis/datasciencecluster/v1/zz_generated.deepcopy.go +++ b/apis/datasciencecluster/v1/zz_generated.deepcopy.go @@ -40,6 +40,7 @@ func (in *Components) DeepCopyInto(out *Components) { in.Ray.DeepCopyInto(&out.Ray) in.TrustyAI.DeepCopyInto(&out.TrustyAI) in.TrainingOperator.DeepCopyInto(&out.TrainingOperator) + in.ModelRegistry.DeepCopyInto(&out.ModelRegistry) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Components. diff --git a/bundle/manifests/datasciencecluster.opendatahub.io_datascienceclusters.yaml b/bundle/manifests/datasciencecluster.opendatahub.io_datascienceclusters.yaml index 6972bf35135..0a9ba701939 100644 --- a/bundle/manifests/datasciencecluster.opendatahub.io_datascienceclusters.yaml +++ b/bundle/manifests/datasciencecluster.opendatahub.io_datascienceclusters.yaml @@ -385,6 +385,49 @@ spec: pattern: ^(Managed|Unmanaged|Force|Removed)$ type: string type: object + modelregistry: + description: ModelRegistry component configuration. + properties: + devFlags: + description: Add developer fields + properties: + manifests: + description: List of custom manifests for the given component + items: + properties: + contextDir: + default: "" + description: contextDir is the relative path to + the folder containing manifests in a repository + type: string + sourcePath: + default: "" + description: 'sourcePath is the subpath within contextDir + where kustomize builds start. Examples include + any sub-folder or path: `base`, `overlays/dev`, + `default`, `odh` etc' + type: string + uri: + default: "" + description: uri is the URI point to a git repo + with tag/branch. 
e.g https://github.com/org/repo/tarball/ + type: string + type: object + type: array + type: object + managementState: + description: "Set to one of the following values: \n - \"Managed\" + : the operator is actively managing the component and trying + to keep it active. It will only upgrade the component if + it is safe to do so \n - \"Removed\" : the operator is actively + managing the component and will not install it, or if it + is installed, the operator will try to remove it" + enum: + - Managed + - Removed + pattern: ^(Managed|Unmanaged|Force|Removed)$ + type: string + type: object ray: description: Ray component configuration. properties: diff --git a/bundle/manifests/rhods-operator.clusterserviceversion.yaml b/bundle/manifests/rhods-operator.clusterserviceversion.yaml index 3df62a61064..b35ff2da79a 100644 --- a/bundle/manifests/rhods-operator.clusterserviceversion.yaml +++ b/bundle/manifests/rhods-operator.clusterserviceversion.yaml @@ -46,6 +46,9 @@ metadata: "modelmeshserving": { "managementState": "Managed" }, + "modelregistry": { + "managementState": "Removed" + }, "ray": { "managementState": "Managed" }, @@ -1032,6 +1035,32 @@ spec: - update - use - watch + - apiGroups: + - modelregistry.opendatahub.io + resources: + - modelregistries + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - modelregistry.opendatahub.io + resources: + - modelregistries/finalizers + verbs: + - update + - apiGroups: + - modelregistry.opendatahub.io + resources: + - modelregistries/status + verbs: + - get + - patch + - update - apiGroups: - monitoring.coreos.com resources: diff --git a/components/component.go b/components/component.go index 39eb85a08d9..c8fe864ec12 100644 --- a/components/component.go +++ b/components/component.go @@ -133,6 +133,8 @@ func (c *Component) UpdatePrometheusConfig(_ client.Client, enable bool, compone TrustyAIARules string `yaml:"trustyai-alerting.rules"` KserveRRules string 
`yaml:"kserve-recording.rules"` KserveARules string `yaml:"kserve-alerting.rules"` + ModelRegistryRRules string `yaml:"model-registry-operator-recording.rules"` + ModelRegistryARules string `yaml:"model-registry-operator-alerting.rules"` } `yaml:"data"` } var configMap ConfigMap diff --git a/components/modelregistry/modelregistry.go b/components/modelregistry/modelregistry.go new file mode 100644 index 00000000000..89ded5e9fa2 --- /dev/null +++ b/components/modelregistry/modelregistry.go @@ -0,0 +1,211 @@ +// Package modelregistry provides utility functions to config ModelRegistry, an ML Model metadata repository service +// +groupName=datasciencecluster.opendatahub.io +package modelregistry + +import ( + "context" + "errors" + "fmt" + "path/filepath" + "strings" + "text/template" + + "github.com/go-logr/logr" + operatorv1 "github.com/openshift/api/operator/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1" + infrav1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/infrastructure/v1" + "github.com/opendatahub-io/opendatahub-operator/v2/components" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/conversion" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy" + + _ "embed" +) + +const DefaultModelRegistryCert = "default-modelregistry-cert" + +var ( + ComponentName = "model-registry-operator" + Path = deploy.DefaultManifestPath + "/" + ComponentName + "/overlays/odh" + // we should not apply this label to the namespace, as it triggered namespace deletion during operator uninstall + // modelRegistryLabels = cluster.WithLabels( + // labels.ODH.OwnedNamespace, "true", + // ). + ModelRegistriesNamespace = "odh-model-registries" +) + +// Verifies that ModelRegistry implements ComponentInterface. 
+var _ components.ComponentInterface = (*ModelRegistry)(nil) + +// ModelRegistry struct holds the configuration for the ModelRegistry component. +// +kubebuilder:object:generate=true +type ModelRegistry struct { + components.Component `json:""` +} + +func (m *ModelRegistry) OverrideManifests(ctx context.Context, _ cluster.Platform) error { + // If devflags are set, update default manifests path + if len(m.DevFlags.Manifests) != 0 { + manifestConfig := m.DevFlags.Manifests[0] + if err := deploy.DownloadManifests(ctx, ComponentName, manifestConfig); err != nil { + return err + } + // If overlay is defined, update paths + defaultKustomizePath := "overlays/odh" + if manifestConfig.SourcePath != "" { + defaultKustomizePath = manifestConfig.SourcePath + } + Path = filepath.Join(deploy.DefaultManifestPath, ComponentName, defaultKustomizePath) + } + + return nil +} + +func (m *ModelRegistry) GetComponentName() string { + return ComponentName +} + +func (m *ModelRegistry) ReconcileComponent(ctx context.Context, cli client.Client, logger logr.Logger, + owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, platform cluster.Platform, _ bool) error { + l := m.ConfigComponentLogger(logger, ComponentName, dscispec) + var imageParamMap = map[string]string{ + "IMAGES_MODELREGISTRY_OPERATOR": "RELATED_IMAGE_ODH_MODEL_REGISTRY_OPERATOR_IMAGE", + "IMAGES_GRPC_SERVICE": "RELATED_IMAGE_ODH_MLMD_GRPC_SERVER_IMAGE", + "IMAGES_REST_SERVICE": "RELATED_IMAGE_ODH_MODEL_REGISTRY_IMAGE", + } + enabled := m.GetManagementState() == operatorv1.Managed + monitoringEnabled := dscispec.Monitoring.ManagementState == operatorv1.Managed + + if enabled { + // return error if ServiceMesh is not enabled, as it's a required feature + if dscispec.ServiceMesh == nil || dscispec.ServiceMesh.ManagementState != operatorv1.Managed { + return errors.New("ServiceMesh needs to be set to 'Managed' in DSCI CR, it is required by Model Registry") + } + + if err := m.createDependencies(ctx, cli, dscispec); err != 
nil { + return err + } + + if m.DevFlags != nil { + // Download manifests and update paths + if err := m.OverrideManifests(ctx, platform); err != nil { + return err + } + } + + // Update image parameters only when we do not have customized manifests set + if (dscispec.DevFlags == nil || dscispec.DevFlags.ManifestsUri == "") && (m.DevFlags == nil || len(m.DevFlags.Manifests) == 0) { + extraParamsMap := map[string]string{ + "DEFAULT_CERT": DefaultModelRegistryCert, + } + if err := deploy.ApplyParams(Path, imageParamMap, false, extraParamsMap); err != nil { + return fmt.Errorf("failed to update image from %s : %w", Path, err) + } + } + + // Create model registries namespace + // We do not delete this namespace even when ModelRegistry is Removed or when operator is uninstalled. + ns, err := cluster.CreateNamespace(ctx, cli, ModelRegistriesNamespace) + if err != nil { + return err + } + l.Info("created model registry namespace", "namespace", ModelRegistriesNamespace) + // create servicemeshmember here, for now until post MVP solution + err = enrollToServiceMesh(ctx, cli, dscispec, ns) + if err != nil { + return err + } + l.Info("created model registry servicemesh member", "namespace", ModelRegistriesNamespace) + } else { + err := m.removeDependencies(ctx, cli, dscispec) + if err != nil { + return err + } + } + + // Deploy ModelRegistry Operator + if err := deploy.DeployManifestsFromPath(ctx, cli, owner, Path, dscispec.ApplicationsNamespace, m.GetComponentName(), enabled); err != nil { + return err + } + l.Info("apply manifests done") + + // Create additional model registry resources, componentEnabled=true because these extras are never deleted! 
+ if err := deploy.DeployManifestsFromPath(ctx, cli, owner, Path+"/extras", dscispec.ApplicationsNamespace, m.GetComponentName(), true); err != nil { + return err + } + l.Info("apply extra manifests done") + + // CloudService Monitoring handling + if platform == cluster.ManagedRhods { + if enabled { + if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 10, 1); err != nil { + return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err) + } + l.Info("deployment is done, updating monitoring rules") + } + if err := m.UpdatePrometheusConfig(cli, enabled && monitoringEnabled, ComponentName); err != nil { + return err + } + if err := deploy.DeployManifestsFromPath(ctx, cli, owner, + filepath.Join(deploy.DefaultManifestPath, "monitoring", "prometheus", "apps"), + dscispec.Monitoring.Namespace, + "prometheus", true); err != nil { + return err + } + l.Info("updating SRE monitoring done") + } + return nil +} + +func (m *ModelRegistry) createDependencies(ctx context.Context, cli client.Client, dscispec *dsciv1.DSCInitializationSpec) error { + // create DefaultModelRegistryCert + if err := cluster.PropagateDefaultIngressCertificate(ctx, cli, DefaultModelRegistryCert, dscispec.ServiceMesh.ControlPlane.Namespace); err != nil { + return err + } + return nil +} + +func (m *ModelRegistry) removeDependencies(ctx context.Context, cli client.Client, dscispec *dsciv1.DSCInitializationSpec) error { + // delete DefaultModelRegistryCert + certSecret := corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: DefaultModelRegistryCert, + Namespace: dscispec.ServiceMesh.ControlPlane.Namespace, + }, + } + // ignore error if the secret has already been removed + if err := cli.Delete(ctx, &certSecret); client.IgnoreNotFound(err) != nil { + return err + } + return nil +} + +//go:embed resources/servicemesh-member.tmpl.yaml +var smmTemplate string + +func enrollToServiceMesh(ctx context.Context, cli client.Client, dscispec 
*dsciv1.DSCInitializationSpec, namespace *corev1.Namespace) error { + tmpl, err := template.New("servicemeshmember").Parse(smmTemplate) + if err != nil { + return fmt.Errorf("error parsing servicemeshmember template: %w", err) + } + builder := strings.Builder{} + controlPlaneData := struct { + Namespace string + ControlPlane *infrav1.ControlPlaneSpec + }{Namespace: namespace.Name, ControlPlane: &dscispec.ServiceMesh.ControlPlane} + + if err = tmpl.Execute(&builder, controlPlaneData); err != nil { + return fmt.Errorf("error executing servicemeshmember template: %w", err) + } + + unstrObj, err := conversion.StrToUnstructured(builder.String()) + if err != nil || len(unstrObj) != 1 { + return fmt.Errorf("error converting servicemeshmember template: %w", err) + } + + return client.IgnoreAlreadyExists(cli.Create(ctx, unstrObj[0])) +} diff --git a/components/modelregistry/resources/servicemesh-member.tmpl.yaml b/components/modelregistry/resources/servicemesh-member.tmpl.yaml new file mode 100644 index 00000000000..8665f2ba54f --- /dev/null +++ b/components/modelregistry/resources/servicemesh-member.tmpl.yaml @@ -0,0 +1,9 @@ +apiVersion: maistra.io/v1 +kind: ServiceMeshMember +metadata: + name: default + namespace: {{.Namespace}} +spec: + controlPlaneRef: + namespace: {{ .ControlPlane.Namespace }} + name: {{ .ControlPlane.Name }} diff --git a/components/modelregistry/zz_generated.deepcopy.go b/components/modelregistry/zz_generated.deepcopy.go new file mode 100644 index 00000000000..3ed241dd7f1 --- /dev/null +++ b/components/modelregistry/zz_generated.deepcopy.go @@ -0,0 +1,40 @@ +//go:build !ignore_autogenerated +// +build !ignore_autogenerated + +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by controller-gen. DO NOT EDIT. + +package modelregistry + +import () + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ModelRegistry) DeepCopyInto(out *ModelRegistry) { + *out = *in + in.Component.DeepCopyInto(&out.Component) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelRegistry. +func (in *ModelRegistry) DeepCopy() *ModelRegistry { + if in == nil { + return nil + } + out := new(ModelRegistry) + in.DeepCopyInto(out) + return out +} diff --git a/config/crd/bases/datasciencecluster.opendatahub.io_datascienceclusters.yaml b/config/crd/bases/datasciencecluster.opendatahub.io_datascienceclusters.yaml index 601118bb015..a50f2d24dd2 100644 --- a/config/crd/bases/datasciencecluster.opendatahub.io_datascienceclusters.yaml +++ b/config/crd/bases/datasciencecluster.opendatahub.io_datascienceclusters.yaml @@ -386,6 +386,50 @@ spec: pattern: ^(Managed|Unmanaged|Force|Removed)$ type: string type: object + modelregistry: + description: ModelRegistry component configuration. 
+ properties: + devFlags: + description: Add developer fields + properties: + manifests: + description: List of custom manifests for the given component + items: + properties: + contextDir: + default: manifests + description: contextDir is the relative path to + the folder containing manifests in a repository, + default value "manifests" + type: string + sourcePath: + default: "" + description: 'sourcePath is the subpath within contextDir + where kustomize builds start. Examples include + any sub-folder or path: `base`, `overlays/dev`, + `default`, `odh` etc.' + type: string + uri: + default: "" + description: uri is the URI point to a git repo + with tag/branch. e.g. https://github.com/org/repo/tarball/ + type: string + type: object + type: array + type: object + managementState: + description: "Set to one of the following values: \n - \"Managed\" + : the operator is actively managing the component and trying + to keep it active. It will only upgrade the component if + it is safe to do so \n - \"Removed\" : the operator is actively + managing the component and will not install it, or if it + is installed, the operator will try to remove it" + enum: + - Managed + - Removed + pattern: ^(Managed|Unmanaged|Force|Removed)$ + type: string + type: object ray: description: Ray component configuration. 
properties: diff --git a/config/monitoring/prometheus/apps/prometheus-configs.yaml b/config/monitoring/prometheus/apps/prometheus-configs.yaml index b19ec0394ba..933421afca7 100644 --- a/config/monitoring/prometheus/apps/prometheus-configs.yaml +++ b/config/monitoring/prometheus/apps/prometheus-configs.yaml @@ -372,6 +372,31 @@ data: target_label: __address__ replacement: ${1}:8080 + - job_name: 'Model Registry Operator' + honor_labels: true + metrics_path: /metrics + scheme: https + tls_config: + insecure_skip_verify: true + params: + module: [http_2xx] + authorization: + credentials_file: /run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - <odh_application_namespace> + relabel_configs: + - source_labels: [__meta_kubernetes_service_name] + regex: ^(model-registry-operator-controller-manager-metrics-service)$ + target_label: kubernetes_name + action: keep + - source_labels: [__address__] + regex: (.+):(\d+) + target_label: __address__ + replacement: ${1}:8443 + - job_name: 'RHOAI Metrics' honor_labels: true scheme: http @@ -1577,3 +1602,92 @@ data: labels: severity: warning instance: trustyai-service-operator-controller-manager + + model-registry-operator-recording.rules: | + groups: + - name: SLOs - Model Registry Operator + rules: + - expr: | + absent(up{job=~'Model Registry Operator'}) * 0 or vector(1) + labels: + instance: model-registry-operator + record: probe_success + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[1d])) + labels: + instance: model-registry-operator + record: probe_success:burnrate1d + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[1h])) + labels: + instance: model-registry-operator + record: probe_success:burnrate1h + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[2h])) + labels: + instance: model-registry-operator + record: probe_success:burnrate2h + - expr: | + 1 - 
min(avg_over_time(probe_success{instance="model-registry-operator"}[30m])) + labels: + instance: model-registry-operator + record: probe_success:burnrate30m + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[3d])) + labels: + instance: model-registry-operator + record: probe_success:burnrate3d + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[5m])) + labels: + instance: model-registry-operator + record: probe_success:burnrate5m + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[6h])) + labels: + instance: model-registry-operator + record: probe_success:burnrate6h + + model-registry-operator-alerting.rules: | + groups: + - name: SLOs-probe_success_model_controller + rules: + - alert: Model Registry Operator Probe Success Burn Rate + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' + triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-model-registry-operator-probe-success-burn-rate.md" + summary: Model Registry Operator Probe Success Burn Rate + expr: | + sum(probe_success:burnrate5m{instance=~"model-registry-operator"}) by (instance) > (14.40 * (1-0.98000)) + and + sum(probe_success:burnrate1h{instance=~"model-registry-operator"}) by (instance) > (14.40 * (1-0.98000)) + for: 2m + labels: + severity: critical + namespace: redhat-ods-applications + - alert: Model Registry Operator Probe Success Burn Rate + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
+ triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-model-registry-operator-probe-success-burn-rate.md" + summary: Model Registry Operator Probe Success Burn Rate + expr: | + sum(probe_success:burnrate30m{instance=~"model-registry-operator"}) by (instance) > (6.00 * (1-0.98000)) + and + sum(probe_success:burnrate6h{instance=~"model-registry-operator"}) by (instance) > (6.00 * (1-0.98000)) + for: 15m + labels: + severity: critical + namespace: redhat-ods-applications + - alert: Model Registry Operator Probe Success Burn Rate + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' + triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-model-registry-operator-probe-success-burn-rate.md" + summary: Model Registry Operator Probe Success Burn Rate + expr: | + sum(probe_success:burnrate2h{instance=~"model-registry-operator"}) by (instance) > (3.00 * (1-0.98000)) + and + sum(probe_success:burnrate1d{instance=~"model-registry-operator"}) by (instance) > (3.00 * (1-0.98000)) + for: 1h + labels: + severity: warning + namespace: redhat-ods-applications diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index eeac23e42d1..dc61fbb68e1 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -785,6 +785,32 @@ rules: - update - use - watch +- apiGroups: + - modelregistry.opendatahub.io + resources: + - modelregistries + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - modelregistry.opendatahub.io + resources: + - modelregistries/finalizers + verbs: + - update +- apiGroups: + - modelregistry.opendatahub.io + resources: + - modelregistries/status + verbs: + - get + - patch + - update - apiGroups: - monitoring.coreos.com resources: diff --git a/config/samples/datasciencecluster_v1_datasciencecluster.yaml 
b/config/samples/datasciencecluster_v1_datasciencecluster.yaml index 4ffd8b417e8..7838ccfcda6 100644 --- a/config/samples/datasciencecluster_v1_datasciencecluster.yaml +++ b/config/samples/datasciencecluster_v1_datasciencecluster.yaml @@ -39,4 +39,6 @@ spec: workbenches: managementState: "Managed" trustyai: + managementState: "Removed" + modelregistry: managementState: "Removed" diff --git a/controllers/datasciencecluster/kubebuilder_rbac.go b/controllers/datasciencecluster/kubebuilder_rbac.go index d23cd070851..017c752f7f7 100644 --- a/controllers/datasciencecluster/kubebuilder_rbac.go +++ b/controllers/datasciencecluster/kubebuilder_rbac.go @@ -122,6 +122,10 @@ package datasciencecluster // +kubebuilder:rbac:groups="monitoring.coreos.com",resources=probes,verbs=get;create;patch;delete;deletecollection // +kubebuilder:rbac:groups="monitoring.coreos.com",resources=prometheusrules,verbs=get;create;patch;delete;deletecollection +// +kubebuilder:rbac:groups=modelregistry.opendatahub.io,resources=modelregistries,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=modelregistry.opendatahub.io,resources=modelregistries/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=modelregistry.opendatahub.io,resources=modelregistries/finalizers,verbs=update + // +kubebuilder:rbac:groups="monitoring.coreos.com",resources=prometheuses/finalizers,verbs=get;create;patch;delete;deletecollection // +kubebuilder:rbac:groups="monitoring.coreos.com",resources=prometheuses/status,verbs=get;create;patch;delete;deletecollection diff --git a/docs/api-overview.md b/docs/api-overview.md index 57c93b9c08f..ead3204d4fb 100644 --- a/docs/api-overview.md +++ b/docs/api-overview.md @@ -49,6 +49,7 @@ _Appears in:_ - [Kserve](#kserve) - [Kueue](#kueue) - [ModelMeshServing](#modelmeshserving) +- [ModelRegistry](#modelregistry) - [Ray](#ray) - [TrainingOperator](#trainingoperator) - [TrustyAI](#trustyai) @@ -230,6 +231,29 @@ _Appears in:_ +## 
datasciencecluster.opendatahub.io/modelregistry + +Package modelregistry provides utility functions to config ModelRegistry, an ML Model metadata repository service + + + +#### ModelRegistry + + + +ModelRegistry struct holds the configuration for the ModelRegistry component. + + + +_Appears in:_ +- [Components](#components) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `Component` _[Component](#component)_ | | | | + + + ## datasciencecluster.opendatahub.io/ray Package ray provides utility functions to config Ray as part of the stack @@ -356,6 +380,7 @@ _Appears in:_ | `ray` _[Ray](#ray)_ | Ray component configuration. | | | | `trustyai` _[TrustyAI](#trustyai)_ | TrustyAI component configuration. | | | | `trainingoperator` _[TrainingOperator](#trainingoperator)_ | Training Operator component configuration. | | | +| `modelregistry` _[ModelRegistry](#modelregistry)_ | ModelRegistry component configuration. | | | #### ControlPlaneSpec diff --git a/get_all_manifests.sh b/get_all_manifests.sh index a1f81ec925e..548adf9ac1e 100755 --- a/get_all_manifests.sh +++ b/get_all_manifests.sh @@ -5,7 +5,7 @@ GITHUB_URL="https://github.com/" # update to use different git repo for legacy manifests MANIFEST_ORG="red-hat-data-services" -# component: notebook, dsp, kserve, dashbaord, cf/ray, trustyai, modelmesh. +# component: notebook, dsp, kserve, dashboard, cf/ray, trustyai, modelmesh, modelregistry. # in the format of "repo-org:repo-name:branch-name:source-folder:target-folder". 
declare -A COMPONENT_MANIFESTS=( ["codeflare"]="red-hat-data-services:codeflare-operator:rhoai-2.12:config:codeflare" @@ -21,6 +21,7 @@ declare -A COMPONENT_MANIFESTS=( ["kserve"]="red-hat-data-services:kserve:rhoai-2.12:config:kserve" ["odh-dashboard"]="red-hat-data-services:odh-dashboard:rhoai-2.12:manifests:dashboard" ["trainingoperator"]="red-hat-data-services:training-operator:rhoai-2.12:manifests:trainingoperator" + ["modelregistry"]="opendatahub-io:model-registry-operator:main:config:model-registry-operator" ) # Allow overwriting repo using flags component=repo diff --git a/pkg/upgrade/upgrade.go b/pkg/upgrade/upgrade.go index eba6c148123..9f073bdcdc6 100644 --- a/pkg/upgrade/upgrade.go +++ b/pkg/upgrade/upgrade.go @@ -34,6 +34,7 @@ import ( "github.com/opendatahub-io/opendatahub-operator/v2/components/kserve" "github.com/opendatahub-io/opendatahub-operator/v2/components/kueue" "github.com/opendatahub-io/opendatahub-operator/v2/components/modelmeshserving" + "github.com/opendatahub-io/opendatahub-operator/v2/components/modelregistry" "github.com/opendatahub-io/opendatahub-operator/v2/components/ray" "github.com/opendatahub-io/opendatahub-operator/v2/components/trainingoperator" "github.com/opendatahub-io/opendatahub-operator/v2/components/trustyai" @@ -95,6 +96,9 @@ func CreateDefaultDSC(ctx context.Context, cli client.Client) error { TrustyAI: trustyai.TrustyAI{ Component: components.Component{ManagementState: operatorv1.Removed}, }, + ModelRegistry: modelregistry.ModelRegistry{ + Component: components.Component{ManagementState: operatorv1.Removed}, + }, }, }, } diff --git a/tests/e2e/dsc_creation_test.go b/tests/e2e/dsc_creation_test.go index cd90a259538..71b7ff1ab02 100644 --- a/tests/e2e/dsc_creation_test.go +++ b/tests/e2e/dsc_creation_test.go @@ -15,14 +15,17 @@ import ( autoscalingv1 "k8s.io/api/autoscaling/v1" k8serr "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + 
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/util/retry" + "sigs.k8s.io/controller-runtime/pkg/client" dscv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/datasciencecluster/v1" dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1" infrav1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/infrastructure/v1" "github.com/opendatahub-io/opendatahub-operator/v2/components" + "github.com/opendatahub-io/opendatahub-operator/v2/components/modelregistry" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/feature/serverless" ) @@ -75,6 +78,14 @@ func creationTestSuite(t *testing.T) { err = testCtx.testDefaultCertsAvailable() require.NoError(t, err, "error getting default cert secrets for Kserve") }) + t.Run("Validate default model registry cert available", func(t *testing.T) { + err = testCtx.testDefaultModelRegistryCertAvailable() + require.NoError(t, err, "error getting default cert secret for ModelRegistry") + }) + t.Run("Validate model registry servicemeshmember available", func(t *testing.T) { + err = testCtx.testMRServiceMeshMember() + require.NoError(t, err, "error getting servicemeshmember for Model Registry") + }) t.Run("Validate Controller reconcile", func(t *testing.T) { // only test Dashboard component for now err = testCtx.testUpdateComponentReconcile() @@ -417,6 +428,64 @@ func (tc *testContext) testDefaultCertsAvailable() error { return nil } +func (tc *testContext) testDefaultModelRegistryCertAvailable() error { + // return if MR is not set to Managed + if tc.testDsc.Spec.Components.ModelRegistry.ManagementState != operatorv1.Managed { + return nil + } + + // Get expected cert secrets + defaultIngressCtrl, err := cluster.FindAvailableIngressController(tc.ctx, tc.customClient) + if err != nil { + return fmt.Errorf("failed to get ingress controller: %w", 
err) + } + + defaultIngressCertName := cluster.GetDefaultIngressCertSecretName(defaultIngressCtrl) + + defaultIngressSecret, err := cluster.GetSecret(tc.ctx, tc.customClient, "openshift-ingress", defaultIngressCertName) + if err != nil { + return err + } + + // Verify secret from Control Plane namespace matches the default MR cert secret + defaultMRSecretName := modelregistry.DefaultModelRegistryCert + defaultMRSecret, err := cluster.GetSecret(tc.ctx, tc.customClient, tc.testDSCI.Spec.ServiceMesh.ControlPlane.Namespace, + defaultMRSecretName) + if err != nil { + return err + } + + if defaultMRSecret.Type != defaultIngressSecret.Type { + return fmt.Errorf("wrong type of MR cert secret is created for %v. Expected %v, Got %v", defaultMRSecretName, defaultIngressSecret.Type, defaultMRSecret.Type) + } + + if string(defaultIngressSecret.Data["tls.crt"]) != string(defaultMRSecret.Data["tls.crt"]) { + return fmt.Errorf("default MR cert secret not expected. Epected %v, Got %v", defaultIngressSecret.Data["tls.crt"], defaultMRSecret.Data["tls.crt"]) + } + + if string(defaultIngressSecret.Data["tls.key"]) != string(defaultMRSecret.Data["tls.key"]) { + return fmt.Errorf("default MR cert secret not expected. 
Epected %v, Got %v", defaultIngressSecret.Data["tls.crt"], defaultMRSecret.Data["tls.crt"]) + } + return nil +} + +func (tc *testContext) testMRServiceMeshMember() error { + if tc.testDsc.Spec.Components.ModelRegistry.ManagementState != operatorv1.Managed { + return nil + } + + // Get unstructured ServiceMeshMember + smm := unstructured.Unstructured{} + smm.SetAPIVersion("maistra.io/v1") + smm.SetKind("ServiceMeshMember") + err := tc.customClient.Get(tc.ctx, + client.ObjectKey{Namespace: modelregistry.ModelRegistriesNamespace, Name: "default"}, &smm) + if err != nil { + return fmt.Errorf("failed to get servicemesh member: %w", err) + } + return nil +} + func (tc *testContext) testUpdateComponentReconcile() error { // Test Updating Dashboard Replicas diff --git a/tests/e2e/helper_test.go b/tests/e2e/helper_test.go index a16866b83f2..c313fa33432 100644 --- a/tests/e2e/helper_test.go +++ b/tests/e2e/helper_test.go @@ -30,6 +30,7 @@ import ( "github.com/opendatahub-io/opendatahub-operator/v2/components/kserve" "github.com/opendatahub-io/opendatahub-operator/v2/components/kueue" "github.com/opendatahub-io/opendatahub-operator/v2/components/modelmeshserving" + "github.com/opendatahub-io/opendatahub-operator/v2/components/modelregistry" "github.com/opendatahub-io/opendatahub-operator/v2/components/ray" "github.com/opendatahub-io/opendatahub-operator/v2/components/trainingoperator" "github.com/opendatahub-io/opendatahub-operator/v2/components/trustyai" @@ -156,6 +157,11 @@ func setupDSCInstance(name string) *dscv1.DataScienceCluster { ManagementState: operatorv1.Removed, }, }, + ModelRegistry: modelregistry.ModelRegistry{ + Component: components.Component{ + ManagementState: operatorv1.Managed, + }, + }, }, }, }