Skip to content

Commit

Permalink
MGMT-17217: Add AlarmDictionary to resourceTypes API
Browse files Browse the repository at this point in the history
This PR includes the following:
* Added alarms/definitions.json file.
* Added AlarmDefinitionHandler that returns objects from definitions.json
  (with mappings according to spec)
* Created AlarmDictionary for each ResourceType
  * Added to ResourceTypeHandler
  * Filter AlarmDefinitions according to ResourceClass
    E.g. for Nodes -> filter by 'COMPUTE'
  • Loading branch information
danielerez committed Apr 7, 2024
1 parent b64dba1 commit 2540dcc
Show file tree
Hide file tree
Showing 12 changed files with 644 additions and 27 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -237,4 +237,4 @@ $ curl -s http://localhost:8003/o2ims-infrastructureMonitoring/v1/alarmProbableC
Notes:
* This API is not defined by O2ims Interface Specification.
* The server supports the `alarmProbableCauses` endpoint for exposing a custom list of probable causes.
* The list is available in [data folder](data/alarms/probable_causes.json). Can be customized and maintained as required.
* The list is available in [data folder](internal/files/alarms/probable_causes.json). Can be customized and maintained as required.
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ require (
github.com/coreos/go-semver v0.3.1
github.com/go-logr/logr v1.2.4
github.com/golang-jwt/jwt/v4 v4.5.0
github.com/google/uuid v1.3.0
github.com/gorilla/mux v1.8.1
github.com/imdario/mergo v0.3.11
github.com/itchyny/gojq v0.12.14
Expand All @@ -17,6 +18,7 @@ require (
github.com/spf13/pflag v1.0.6-0.20210604193023-d5e0c0615ace
github.com/thoas/go-funk v0.9.3
go.uber.org/mock v0.4.0
golang.org/x/exp v0.0.0-20230118134722-a68e582fa157
golang.org/x/net v0.18.0
gopkg.in/yaml.v3 v3.0.1
k8s.io/api v0.28.4
Expand Down Expand Up @@ -47,7 +49,6 @@ require (
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/itchyny/timefmt-go v0.1.5 // indirect
github.com/josharian/intern v1.0.0 // indirect
Expand All @@ -63,7 +64,6 @@ require (
github.com/prometheus/procfs v0.10.1 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.25.0 // indirect
golang.org/x/exp v0.0.0-20230118134722-a68e582fa157 // indirect
golang.org/x/oauth2 v0.8.0 // indirect
golang.org/x/sys v0.15.0 // indirect
golang.org/x/term v0.14.0 // indirect
Expand Down
62 changes: 62 additions & 0 deletions internal/files/alarms/definitions.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
[
{
"alarmDefinitionId": "Watchdog",
"alarmName": "An alert that should always be firing to certify that Alertmanager is working properly.",
"alarmDescription": "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty.\n"
},
{
"alarmDefinitionId": "UpdateAvailable",
"alarmName": "Your upstream update recommendation service recommends you update your cluster.",
"alarmDescription": "For more information refer to 'oc adm upgrade'"
},
{
"alarmDefinitionId": "ClusterNotUpgradeable",
"alarmName": "One or more cluster operators have been blocking minor version cluster upgrades for at least an hour.",
"alarmDescription": "In most cases, you will still be able to apply patch releases. Reason AdminAckRequired.",
"proposedRepairActions": "For more information refer to 'oc adm upgrade' or https://console-openshift-console.apps.<cluster_domain>/settings/cluster/."
},
{
"alarmDefinitionId": "AlertmanagerReceiversNotConfigured",
"alarmName": "Receivers (notification integrations) are not configured on Alertmanager",
"alarmDescription": "Alerts are not configured to be sent to a notification system, meaning that you may not be notified in a timely fashion when important failures occur.",
"proposedRepairActions": "Check the OpenShift documentation to learn how to configure notifications with Alertmanager."
},
{
"alarmDefinitionId": "HighOverallControlPlaneMemory",
"alarmName": "Memory utilization across all control plane nodes is high, and could impact responsiveness and stability.",
"alarmDescription": "Given three control plane nodes, the overall memory utilization may only be about 2/3 of all available capacity. This is because if a single control plane node fails, the kube-apiserver and etcd my be slow to respond.",
"proposedRepairActions": "To fix this, increase memory of the control plane nodes."
},
{
"alarmDefinitionId": "NodeClockNotSynchronising",
"alarmName": "Clock not synchronising.",
"alarmDescription": "Clock on host is not synchronising. Ensure NTP is configured on this host.",
"alarmAdditionalFields": {
"resourceClass": "COMPUTE"
}
},
{
"alarmDefinitionId": "NodeClockSkewDetected",
"alarmName": "Clock skew detected.",
"alarmDescription": "Clock is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.",
"alarmAdditionalFields": {
"resourceClass": "COMPUTE"
}
},
{
"alarmDefinitionId": "IngressWithoutClassName",
"alarmName": "Ingress without IngressClassName for 1 day",
"alarmDescription": "This alert fires when there is an Ingress with an unset IngressClassName for longer than one day.",
"alarmAdditionalFields": {
"resourceClass": "COMPUTE"
}
},
{
"alarmDefinitionId": "NodeMemoryHighUtilization",
"alarmName": "Host is running out of memory.",
"alarmDescription": "Memory is filling up, has been above memory high utilization threshold for the last 15 minutes",
"alarmAdditionalFields": {
"resourceClass": "COMPUTE"
}
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -7,26 +7,41 @@
{
"probableCauseId": "UpdateAvailable",
"name": "Your upstream update recommendation service recommends you update your cluster.",
"description": "For more information refer to 'oc adm upgrade' or https://console-openshift-console.apps.spoke1.redhat.com/settings/cluster/."
},
{
"probableCauseId": "NodeClockNotSynchronising",
"name": "Clock not synchronising.",
"description": "Clock on ostest-extraworker-1 is not synchronising. Ensure NTP is configured on this host."
"description": "For more information refer to 'oc adm upgrade'"
},
{
"probableCauseId": "ClusterNotUpgradeable",
"name": "One or more cluster operators have been blocking minor version cluster upgrades for at least an hour.",
"description": "In most cases, you will still be able to apply patch releases. Reason AdminAckRequired. For more information refer to 'oc adm upgrade' or https://console-openshift-console.apps.spoke1.redhat.com/settings/cluster/."
"description": "In most cases, you will still be able to apply patch releases. Reason AdminAckRequired."
},
{
"probableCauseId": "AlertmanagerReceiversNotConfigured",
"name": "Receivers (notification integrations) are not configured on Alertmanager",
"description": "Alerts are not configured to be sent to a notification system, meaning that you may not be notified in a timely fashion when important failures occur. Check the OpenShift documentation to learn how to configure notifications with Alertmanager."
"description": "Alerts are not configured to be sent to a notification system, meaning that you may not be notified in a timely fashion when important failures occur."
},
{
"probableCauseId": "HighOverallControlPlaneMemory",
"name": "Memory utilization across all control plane nodes is high, and could impact responsiveness and stability.",
"description": "Given three control plane nodes, the overall memory utilization may only be about 2/3 of all available capacity. This is because if a single control plane node fails, the kube-apiserver and etcd my be slow to respond. To fix this, increase memory of the control plane nodes."
"description": "Given three control plane nodes, the overall memory utilization may only be about 2/3 of all available capacity. This is because if a single control plane node fails, the kube-apiserver and etcd my be slow to respond."
},
{
"probableCauseId": "NodeClockNotSynchronising",
"name": "Clock not synchronising.",
"description": "Clock on host is not synchronising. Ensure NTP is configured on this host."
},
{
"probableCauseId": "NodeClockSkewDetected",
"name": "Clock skew detected.",
"description": "Clock is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host."
},
{
"probableCauseId": "IngressWithoutClassName",
"name": "Ingress without IngressClassName for 1 day",
"description": "This alert fires when there is an Ingress with an unset IngressClassName for longer than one day."
},
{
"probableCauseId": "NodeMemoryHighUtilization",
"name": "Host is running out of memory.",
"description": "Memory is filling up, has been above memory high utilization threshold for the last 15 minutes"
}
]
12 changes: 12 additions & 0 deletions internal/files/embed.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package files

import "embed"

var (
//go:embed alarms
Alarms embed.FS
)

// AlarmDictionaryVersion is the version string for the alarm dictionary.
const AlarmDictionaryVersion = "v1"
18 changes: 18 additions & 0 deletions internal/openapi/spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,24 @@ components:
resourceClass:
type: string
example: "COMPUTE"
alarmDictionary:
type: object
example:
alarmDictionaryVersion: "v1"
managementInterfaceId: "O2IMS"
alarmDefinition: [
{
"alarmName": "Host is running out of memory.",
"alarmDescription": "Memory is filling up, has been above memory high utilization threshold for the last 15 minutes",
"proposedRepairActions": "",
"managementInterfaceId": "O2IMS",
"pkNotificationField": "alarmDefinitionID",
"alarmAdditionalFields": {
"resourceClass": "COMPUTE"
},
"alarmDefinitionId": "NodeMemoryHighUtilization"
}
]

ResourceTypes:
description: |
Expand Down
205 changes: 205 additions & 0 deletions internal/service/alarm_definition_handler.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
/*
Copyright 2023 Red Hat Inc.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is
distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied. See the License for the specific language governing permissions and limitations under the
License.
*/

package service

import (
"bytes"
"context"
"errors"
"fmt"
"log/slog"

jsoniter "github.com/json-iterator/go"

"github.com/openshift-kni/oran-o2ims/internal/data"
"github.com/openshift-kni/oran-o2ims/internal/files"
"github.com/openshift-kni/oran-o2ims/internal/k8s"
)

// alarmsDefinitionsPath is the path of the alarm definitions file inside the
// embedded alarms file system.
const alarmsDefinitionsPath = "alarms/definitions.json"

// alarmsProbableCausesPath is the path of the probable causes file inside the
// embedded alarms file system.
const alarmsProbableCausesPath = "alarms/probable_causes.json"

// AlarmDefinitionHandlerBuilder collects the settings needed to create an
// AlarmDefinitionHandler. Don't create instances of this type directly: obtain one
// with NewAlarmDefinitionHandler, configure it with SetLogger and then call Build.
type AlarmDefinitionHandlerBuilder struct {
// logger is handed over to the handler by Build; Build fails when it is nil.
logger *slog.Logger
}

// AlarmDefinitionHandler knows how to respond to requests to list alarm definitions
// and to get a single alarm definition. The definitions are read from the embedded
// alarms/definitions.json file. Don't create instances of this type directly, use the
// NewAlarmDefinitionHandler function instead.
type AlarmDefinitionHandler struct {
// logger receives the log output of the handler.
logger *slog.Logger
// jsonAPI is the JSON iterator API that Build freezes with an indentation step of 2.
jsonAPI jsoniter.API
}

// NewAlarmDefinitionHandler returns an empty builder. Configure it with SetLogger
// and then call Build to obtain the handler for the collection of alarm definitions.
func NewAlarmDefinitionHandler() *AlarmDefinitionHandlerBuilder {
	builder := new(AlarmDefinitionHandlerBuilder)
	return builder
}

// SetLogger stores the logger that the handler will write its log output to and
// returns the builder so that calls can be chained. Setting a logger is mandatory:
// Build fails when none has been provided.
func (b *AlarmDefinitionHandlerBuilder) SetLogger(value *slog.Logger) *AlarmDefinitionHandlerBuilder {
	b.logger = value
	return b
}

// Build validates the builder settings and creates the handler from them. It
// returns an error, and no handler, when the logger has not been set.
func (b *AlarmDefinitionHandlerBuilder) Build() (
	result *AlarmDefinitionHandler, err error) {
	// The logger is the only mandatory setting:
	if b.logger == nil {
		return nil, errors.New("logger is mandatory")
	}

	// Freeze a JSON iterator API that indents its output by two spaces:
	cfg := jsoniter.Config{
		IndentionStep: 2,
	}
	api := cfg.Froze()

	// Assemble the handler:
	handler := &AlarmDefinitionHandler{
		logger:  b.logger,
		jsonAPI: api,
	}
	return handler, nil
}

// List is part of the implementation of the collection handler interface. It
// returns a response whose items are the stream of alarm definitions produced by
// fetchItems, or the error reported by fetchItems.
func (h *AlarmDefinitionHandler) List(ctx context.Context,
	request *ListRequest) (response *ListResponse, err error) {
	items, err := h.fetchItems()
	if err != nil {
		return nil, err
	}
	return &ListResponse{Items: items}, nil
}

// Get is part of the implementation of the object handler interface. It uses the
// first request variable as the identifier of the alarm definition and returns a
// response containing the object produced by fetchItem, or the error it reports.
func (h *AlarmDefinitionHandler) Get(ctx context.Context,
	request *GetRequest) (response *GetResponse, err error) {
	// The identifier is the first variable of the request:
	id := request.Variables[0]

	object, err := h.fetchItem(ctx, id)
	if err != nil {
		return nil, err
	}
	return &GetResponse{Object: object}, nil
}

// fetchItems reads the embedded alarm definitions file and returns its contents as
// a stream of objects, each one transformed by mapItem. It returns a nil stream and
// the error when the file can't be read or when the stream can't be built.
func (h *AlarmDefinitionHandler) fetchItems() (result data.Stream, err error) {
	jsonFile, err := files.Alarms.ReadFile(alarmsDefinitionsPath)
	if err != nil {
		return nil, err
	}

	definitions, err := k8s.NewStream().
		SetLogger(h.logger).
		SetReader(bytes.NewReader(jsonFile)).
		Build()
	if err != nil {
		// Don't wrap a stream that failed to build: return the error alone.
		return nil, err
	}

	// Transform to AlarmDefinitions objects:
	return data.Map(definitions, h.mapItem), nil
}

// fetchItem returns the first alarm definition whose "alarmDefinitionId" is equal
// to the given identifier. The candidates are the objects produced by fetchItems,
// so the comparison uses the key that mapItem writes. The error is the one reported
// by fetchItems, or the one returned by the filtered stream when asked for its
// first item.
func (h *AlarmDefinitionHandler) fetchItem(ctx context.Context,
	id string) (definition data.Object, err error) {

	definitions, err := h.fetchItems()
	if err != nil {
		return nil, err
	}

	// Filter by ID:
	matching := data.Select(
		definitions,
		func(ctx context.Context, item data.Object) (result bool, err error) {
			result = item["alarmDefinitionId"] == id
			return
		},
	)

	// Get first result:
	return matching.Next(ctx)
}

// mapItem maps a definition read from the definitions file to an O2
// AlarmDefinitions object.
//
// The "alarmDefinitionId", "alarmName" and "alarmDescription" properties are
// mandatory: if any of them can't be read as a string the error is returned and no
// object is produced. The "proposedRepairActions" and "alarmAdditionalFields"
// properties are optional: a failure to read them is logged at debug level and the
// value obtained from the failed lookup is used. The "managementInterfaceId" and
// "pkNotificationField" properties are set to fixed values.
func (h *AlarmDefinitionHandler) mapItem(ctx context.Context,
	from data.Object) (to data.Object, err error) {

	alarmDefinitionId, err := data.GetString(from, "alarmDefinitionId")
	if err != nil {
		return nil, err
	}

	alarmName, err := data.GetString(from, "alarmName")
	if err != nil {
		return nil, err
	}

	alarmDescription, err := data.GetString(from, "alarmDescription")
	if err != nil {
		return nil, err
	}

	proposedRepairActions, err := data.GetString(from, "proposedRepairActions")
	if err != nil {
		// Property is optional, so clear the error explicitly instead of relying
		// on a later statement to overwrite it.
		h.logger.Debug(fmt.Sprintf("'%s' is missing from alarm definition (optional)", "proposedRepairActions"))
		err = nil
	}

	alarmAdditionalFields, err := data.GetObj(from, "alarmAdditionalFields")
	if err != nil {
		// Property is optional
		h.logger.Debug(fmt.Sprintf("'%s' is missing from alarm definition (optional)", "alarmAdditionalFields"))
		err = nil
	}

	// NOTE(review): the value of "pkNotificationField" is spelled
	// "alarmDefinitionID" while the key written above is "alarmDefinitionId";
	// confirm which spelling consumers expect.
	to = data.Object{
		"alarmDefinitionId":     alarmDefinitionId,
		"alarmName":             alarmName,
		"alarmDescription":      alarmDescription,
		"proposedRepairActions": proposedRepairActions,
		"managementInterfaceId": "O2IMS",
		"pkNotificationField":   "alarmDefinitionID",
		"alarmAdditionalFields": alarmAdditionalFields,
	}

	return to, err
}
Loading

0 comments on commit 2540dcc

Please sign in to comment.