Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions RELEASES.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ The plugin version is unchanged at `44` and is compatible with version `v1.14.0`
- Added:
- `--system-tracker-disk-required-available-space-percentage`
- `--system-tracker-disk-warning-available-space-percentage`
- `--system-tracker-memory-warning-available-percentage` - Warning threshold (a percentage between 0 and 50) for available system memory; the node reports unhealthy when available memory drops below it. Disabled by default (the default value `0` turns the check off). Supports both bare-metal and containerized (K8s/Docker) environments via cgroup detection.

- Deprecated:
- `--system-tracker-disk-required-available-space`
- `--system-tracker-disk-warning-threshold-available-space`
Expand Down
19 changes: 18 additions & 1 deletion config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ const (
chainUpgradeFileName = "upgrade"
subnetConfigFileExt = ".json"

maxDiskSpaceThreshold = 50
maxDiskSpaceThreshold = 50
maxMemorySpaceThreshold = 50
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can change this if we'd like. I just copied the disk values over for memory, but I understand that memory is not disk!

)

var (
Expand Down Expand Up @@ -88,6 +89,7 @@ var (
errInvalidSignerConfig = fmt.Errorf("only one of the following flags can be set: %s, %s, %s, %s", StakingEphemeralSignerEnabledKey, StakingSignerKeyContentKey, StakingSignerKeyPathKey, StakingRPCSignerEndpointKey)
errDiskSpaceOutOfRange = fmt.Errorf("out of range [0,%d]", maxDiskSpaceThreshold)
errDiskWarnAfterFatal = errors.New("warning disk space threshold cannot be greater than fatal threshold")
errMemorySpaceOutOfRange = fmt.Errorf("out of range [0,%d]", maxMemorySpaceThreshold)
)

func getConsensusConfig(v *viper.Viper) snowball.Parameters {
Expand Down Expand Up @@ -1159,6 +1161,16 @@ func getDiskSpaceConfig(v *viper.Viper) (
}
}

// getMemoryConfig reads the available-memory warning threshold (a percentage)
// from v and validates it.
//
// It returns the configured value when it does not exceed
// maxMemorySpaceThreshold. Otherwise it returns 0 together with an error that
// wraps errMemorySpaceOutOfRange and names the offending key and value.
func getMemoryConfig(v *viper.Viper) (uint64, error) {
	threshold := v.GetUint64(SystemTrackerWarningAvailableMemoryPercentageKey)
	if threshold <= maxMemorySpaceThreshold {
		return threshold, nil
	}
	return 0, fmt.Errorf(
		"%w: %q (%d)",
		errMemorySpaceOutOfRange,
		SystemTrackerWarningAvailableMemoryPercentageKey,
		threshold,
	)
}

func getDiskTargeterConfig(v *viper.Viper) (tracker.TargeterConfig, error) {
vdrAlloc := v.GetFloat64(DiskVdrAllocKey)
maxNonVdrUsage := v.GetFloat64(DiskMaxNonVdrUsageKey)
Expand Down Expand Up @@ -1416,6 +1428,11 @@ func GetNodeConfig(v *viper.Viper) (node.Config, error) {
return node.Config{}, err
}

nodeConfig.WarningAvailableMemoryPercentage, err = getMemoryConfig(v)
if err != nil {
return node.Config{}, err
}

nodeConfig.CPUTargeterConfig, err = getCPUTargeterConfig(v)
if err != nil {
return node.Config{}, err
Expand Down
41 changes: 41 additions & 0 deletions config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,47 @@ func TestGetDiskSpaceConfig(t *testing.T) {
}
}

// TestGetMemoryConfig verifies how GetNodeConfig handles the memory warning
// threshold: values up to maxMemorySpaceThreshold are accepted and stored in
// WarningAvailableMemoryPercentage, and larger values are rejected with
// errMemorySpaceOutOfRange.
func TestGetMemoryConfig(t *testing.T) {
	tests := []struct {
		name   string
		config map[string]uint64
		// expected is the threshold the resulting node config must carry.
		// It stays 0 for error cases because GetNodeConfig returns a
		// zero-value config on failure.
		expected    uint64
		expectedErr error
	}{
		{
			// Relies on the flag default of 0 (check disabled).
			name:   "empty config",
			config: map[string]uint64{},
		},
		{
			name: "valid config",
			config: map[string]uint64{
				SystemTrackerWarningAvailableMemoryPercentageKey: maxMemorySpaceThreshold,
			},
			expected: maxMemorySpaceThreshold,
		},
		{
			name: "invalid config - warning too big",
			config: map[string]uint64{
				SystemTrackerWarningAvailableMemoryPercentageKey: maxMemorySpaceThreshold + 1,
			},
			expectedErr: errMemorySpaceOutOfRange,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			require := require.New(t)
			v := setupViperFlags()

			for key, value := range tt.config {
				v.Set(key, value)
			}

			nodeConfig, err := GetNodeConfig(v)

			require.ErrorIs(err, tt.expectedErr)
			// Checking the error alone would not catch a threshold that is
			// validated but never copied into the node config.
			require.Equal(tt.expected, nodeConfig.WarningAvailableMemoryPercentage)
		})
	}
}

// setups config json file and writes content
func setupConfigJSON(t *testing.T, rootPath string, value string) string {
configFilePath := filepath.Join(rootPath, "config.json")
Expand Down
1 change: 1 addition & 0 deletions config/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,7 @@ func addNodeFlags(fs *pflag.FlagSet) {
fs.Uint64(SystemTrackerRequiredAvailableDiskSpacePercentageKey, 3, "Minimum percentage (between 0 and 50) of available disk space, under which the node will shutdown.")
fs.Uint64(SystemTrackerWarningThresholdAvailableDiskSpaceKey, 0, fmt.Sprintf("DEPRECATED: Warning threshold for the number of available bytes on disk, under which the node will be considered unhealthy. Must be >= [%s]", SystemTrackerRequiredAvailableDiskSpaceKey))
fs.Uint64(SystemTrackerWarningAvailableDiskSpacePercentageKey, 10, fmt.Sprintf("Warning threshold for the percentage (between 0 and 50) of available disk space, under which the node will be considered unhealthy. Must be >= [%s]", SystemTrackerRequiredAvailableDiskSpacePercentageKey))
fs.Uint64(SystemTrackerWarningAvailableMemoryPercentageKey, 0, "Warning threshold percentage (between 0 and 50) for available system memory. When available memory drops below this, the node will report unhealthy. Set to 0 to disable, disabled by default.")

// CPU management
fs.Float64(CPUVdrAllocKey, float64(runtime.NumCPU()), "Maximum number of CPUs to allocate for use by validators. Value should be in range [0, total core count]")
Expand Down
1 change: 1 addition & 0 deletions config/keys.go
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ const (
SystemTrackerRequiredAvailableDiskSpacePercentageKey = "system-tracker-disk-required-available-space-percentage"
SystemTrackerWarningThresholdAvailableDiskSpaceKey = "system-tracker-disk-warning-threshold-available-space"
SystemTrackerWarningAvailableDiskSpacePercentageKey = "system-tracker-disk-warning-available-space-percentage"
SystemTrackerWarningAvailableMemoryPercentageKey = "system-tracker-memory-warning-available-percentage"
DiskVdrAllocKey = "throttler-inbound-disk-validator-alloc"
DiskMaxNonVdrUsageKey = "throttler-inbound-disk-max-non-validator-usage"
DiskMaxNonVdrNodeUsageKey = "throttler-inbound-disk-max-non-validator-node-usage"
Expand Down
2 changes: 2 additions & 0 deletions config/node/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,8 @@ type Config struct {
RequiredAvailableDiskSpacePercentage uint64 `json:"requiredAvailableDiskSpacePercentage"`
WarningAvailableDiskSpacePercentage uint64 `json:"warningAvailableDiskSpacePercentage"`

WarningAvailableMemoryPercentage uint64 `json:"warningAvailableMemoryPercentage"`

TraceConfig trace.Config `json:"traceConfig"`

// See comment on [UseCurrentHeight] in platformvm.Config
Expand Down
22 changes: 22 additions & 0 deletions node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -1485,6 +1485,28 @@ func (n *Node) initHealthAPI() error {
return fmt.Errorf("couldn't register resource health check: %w", err)
}

memoryCheck := health.CheckerFunc(func(context.Context) (interface{}, error) {
// Monitor available memory and report unhealthy if below threshold.

availableMemoryBytes := n.resourceTracker.MemoryTracker().AvailableMemoryBytes()
availableMemoryPercentage := n.resourceTracker.MemoryTracker().AvailableMemoryPercentage()

var err error
if n.Config.WarningAvailableMemoryPercentage > 0 && availableMemoryPercentage < n.Config.WarningAvailableMemoryPercentage {
err = fmt.Errorf("remaining available memory percentage (%d%%) is below warning threshold (%d%%)", availableMemoryPercentage, n.Config.WarningAvailableMemoryPercentage)
}

return map[string]interface{}{
"availableMemoryBytes": availableMemoryBytes,
"availableMemoryPercentage": availableMemoryPercentage,
}, err
})

err = n.health.RegisterHealthCheck("memory", memoryCheck, health.ApplicationTag)
if err != nil {
return fmt.Errorf("couldn't register memory health check: %w", err)
}

wrongBLSKeyCheck := health.CheckerFunc(func(context.Context) (interface{}, error) {
vdr, ok := n.vdrs.GetValidator(constants.PrimaryNetworkID, n.ID)
if !ok {
Expand Down
62 changes: 54 additions & 8 deletions snow/networking/tracker/resource_tracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,16 @@ type DiskTracker interface {
AvailableDiskPercentage() uint64
}

// MemoryTracker reports how much memory is available.
type MemoryTracker interface {
	// AvailableMemoryBytes returns the amount of available memory, in bytes.
	AvailableMemoryBytes() uint64
	// AvailableMemoryPercentage returns the available memory as a
	// percentage.
	AvailableMemoryPercentage() uint64
}

// ResourceTracker is an interface for tracking peers' usage of resources
type ResourceTracker interface {
CPUTracker() Tracker
DiskTracker() DiskTracker
MemoryTracker() MemoryTracker
// Registers that the given node started processing at the given time.
StartProcessing(ids.NodeID, time.Time)
// Registers that the given node stopped processing at the given time.
Expand Down Expand Up @@ -198,6 +204,30 @@ func (t *diskResourceTracker) TimeUntilUsage(nodeID ids.NodeID, now time.Time, v
return m.TimeUntil(now, value/scale)
}

// memoryResourceTracker exposes the memory readings of the resourceTracker it
// wraps.
type memoryResourceTracker struct {
	// t is the wrapped tracker; its lock, resources and metrics are used by
	// the methods of this type.
	t *resourceTracker
}

// AvailableMemoryBytes returns the available memory, in bytes, as reported by
// the wrapped tracker's resources, and records the same value in the
// memoryAvailable gauge. The wrapped tracker's lock is held for the whole
// call.
func (t *memoryResourceTracker) AvailableMemoryBytes() uint64 {
	t.t.lock.Lock()
	defer t.t.lock.Unlock()

	available := t.t.resources.AvailableMemoryBytes()
	t.t.metrics.memoryAvailable.Set(float64(available))
	return available
}

// AvailableMemoryPercentage returns the percentage of memory available, as
// reported by the wrapped tracker's resources, and records the same value in
// the memoryPercentageAvailable gauge. The wrapped tracker's lock is held for
// the whole call.
func (t *memoryResourceTracker) AvailableMemoryPercentage() uint64 {
	t.t.lock.Lock()
	defer t.t.lock.Unlock()

	pct := t.t.resources.AvailableMemoryPercentage()
	t.t.metrics.memoryPercentageAvailable.Set(float64(pct))
	return pct
}

type resourceTracker struct {
lock sync.RWMutex

Expand Down Expand Up @@ -244,6 +274,10 @@ func (rt *resourceTracker) DiskTracker() DiskTracker {
return &diskResourceTracker{t: rt}
}

// MemoryTracker returns a new MemoryTracker view backed by rt.
func (rt *resourceTracker) MemoryTracker() MemoryTracker {
	view := &memoryResourceTracker{t: rt}
	return view
}

func (rt *resourceTracker) StartProcessing(nodeID ids.NodeID, now time.Time) {
rt.lock.Lock()
defer rt.lock.Unlock()
Expand Down Expand Up @@ -297,12 +331,14 @@ func (rt *resourceTracker) prune(now time.Time) {
}

// trackerMetrics groups the Prometheus gauges published by the resource
// tracker. Each field is declared exactly once; Go rejects duplicate field
// names in a struct.
type trackerMetrics struct {
	// Usage gauges.
	processingTimeMetric prometheus.Gauge
	cpuMetric            prometheus.Gauge
	diskReadsMetric      prometheus.Gauge
	diskWritesMetric     prometheus.Gauge

	// Availability gauges: remaining bytes and remaining percentage, for the
	// database volume and for system memory respectively.
	diskSpaceAvailable        prometheus.Gauge
	diskPercentageAvailable   prometheus.Gauge
	memoryAvailable           prometheus.Gauge
	memoryPercentageAvailable prometheus.Gauge
}

func newCPUTrackerMetrics(reg prometheus.Registerer) (*trackerMetrics, error) {
Expand All @@ -324,13 +360,21 @@ func newCPUTrackerMetrics(reg prometheus.Registerer) (*trackerMetrics, error) {
Help: "Disk writes (bytes/sec) tracked by the resource manager",
}),
diskSpaceAvailable: prometheus.NewGauge(prometheus.GaugeOpts{
Name: "disk_available_space",
Name: "disk_available",
Help: "Available space remaining (bytes) on the database volume",
}),
diskPercentageAvailable: prometheus.NewGauge(prometheus.GaugeOpts{
Name: "disk_available_percentage",
Name: "disk_percentage_available",
Help: "Percentage of database volume available",
}),
memoryAvailable: prometheus.NewGauge(prometheus.GaugeOpts{
Name: "memory_available",
Help: "Available memory remaining (bytes) on the system",
}),
memoryPercentageAvailable: prometheus.NewGauge(prometheus.GaugeOpts{
Name: "memory_percentage_available",
Help: "Percentage of system memory available",
}),
}
err := errors.Join(
reg.Register(m.processingTimeMetric),
Expand All @@ -339,6 +383,8 @@ func newCPUTrackerMetrics(reg prometheus.Registerer) (*trackerMetrics, error) {
reg.Register(m.diskWritesMetric),
reg.Register(m.diskSpaceAvailable),
reg.Register(m.diskPercentageAvailable),
reg.Register(m.memoryAvailable),
reg.Register(m.memoryPercentageAvailable),
)
return m, err
}
5 changes: 3 additions & 2 deletions tests/fixture/tmpnet/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,11 @@ func DefaultTmpnetFlags() FlagsMap {
config.HealthCheckFreqKey: "2s",
config.AdminAPIEnabledKey: "true",
config.IndexEnabledKey: "true",
// Disable disk checks by default since temporary networks often run in
// resource-constrained environments that commonly have low disk space.
// Disable disk and memory checks by default since temporary networks often run in
// resource-constrained environments that commonly have low disk space and memory.
config.SystemTrackerRequiredAvailableDiskSpacePercentageKey: "0",
config.SystemTrackerWarningAvailableDiskSpacePercentageKey: "0",
config.SystemTrackerWarningAvailableMemoryPercentageKey: "0",
}
}

Expand Down
8 changes: 8 additions & 0 deletions utils/resource/no_usage.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,11 @@ func (noUsage) DiskUsage() (float64, float64) {
// AvailableDiskBytes always returns math.MaxUint64.
func (noUsage) AvailableDiskBytes() uint64 {
	const unlimited uint64 = math.MaxUint64
	return unlimited
}

// AvailableMemoryBytes always returns math.MaxUint64.
func (noUsage) AvailableMemoryBytes() uint64 {
	const unlimited uint64 = math.MaxUint64
	return unlimited
}

// AvailableMemoryPercentage always returns 100.
func (noUsage) AvailableMemoryPercentage() uint64 {
	const fullyAvailable uint64 = 100
	return fullyAvailable
}
28 changes: 28 additions & 0 deletions utils/resource/resourcemock/user.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading