Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions RELEASES.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ The plugin version is unchanged at `44` and is compatible with version `v1.14.0`
- Added:
- `--system-tracker-disk-required-available-space-percentage`
- `--system-tracker-disk-warning-available-space-percentage`
- `--system-tracker-memory-required-available-percentage`
- `--system-tracker-memory-warning-available-percentage`

- Deprecated:
- `--system-tracker-disk-required-available-space`
- `--system-tracker-disk-warning-threshold-available-space`
Expand Down
32 changes: 31 additions & 1 deletion config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ const (
chainUpgradeFileName = "upgrade"
subnetConfigFileExt = ".json"

maxDiskSpaceThreshold = 50
maxDiskSpaceThreshold = 50
maxMemorySpaceThreshold = 50
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can change this if we'd like. I just copied all the values for memory, but I understand memory is not disk!

)

var (
Expand Down Expand Up @@ -88,6 +89,8 @@ var (
errInvalidSignerConfig = fmt.Errorf("only one of the following flags can be set: %s, %s, %s, %s", StakingEphemeralSignerEnabledKey, StakingSignerKeyContentKey, StakingSignerKeyPathKey, StakingRPCSignerEndpointKey)
errDiskSpaceOutOfRange = fmt.Errorf("out of range [0,%d]", maxDiskSpaceThreshold)
errDiskWarnAfterFatal = errors.New("warning disk space threshold cannot be greater than fatal threshold")
errMemorySpaceOutOfRange = fmt.Errorf("out of range [0,%d]", maxMemorySpaceThreshold)
errMemoryWarnAfterFatal = errors.New("warning memory threshold cannot be greater than fatal threshold")
)

func getConsensusConfig(v *viper.Viper) snowball.Parameters {
Expand Down Expand Up @@ -1167,6 +1170,28 @@ func getDiskSpaceConfig(v *viper.Viper) (
}
}

// getMemoryConfig reads the required and warning available-memory percentage
// thresholds from v and validates them.
//
// It returns an error wrapping errMemorySpaceOutOfRange when the warning
// threshold exceeds maxMemorySpaceThreshold, and an error wrapping
// errMemoryWarnAfterFatal when the warning threshold is lower than the
// required threshold. Because the warning threshold must be at least the
// required one, the upper bound applies to both values. On success the
// thresholds are returned as (required, warning, nil).
func getMemoryConfig(v *viper.Viper) (
	requiredAvailableMemoryPercentage uint64,
	warningAvailableMemoryPercentage uint64,
	err error,
) {
	warningKey := SystemTrackerWarningAvailableMemoryPercentageKey
	fatalKey := SystemTrackerRequiredAvailableMemoryPercentageKey

	warningPct := v.GetUint64(warningKey)
	fatalPct := v.GetUint64(fatalKey)

	if warningPct > maxMemorySpaceThreshold {
		return 0, 0, fmt.Errorf("%w: %q (%d)", errMemorySpaceOutOfRange, warningKey, warningPct)
	}
	if warningPct < fatalPct {
		return 0, 0, fmt.Errorf("%w: %d < %d", errMemoryWarnAfterFatal, warningPct, fatalPct)
	}
	return fatalPct, warningPct, nil
}

func getDiskTargeterConfig(v *viper.Viper) (tracker.TargeterConfig, error) {
vdrAlloc := v.GetFloat64(DiskVdrAllocKey)
maxNonVdrUsage := v.GetFloat64(DiskMaxNonVdrUsageKey)
Expand Down Expand Up @@ -1423,6 +1448,11 @@ func GetNodeConfig(v *viper.Viper) (node.Config, error) {
return node.Config{}, err
}

nodeConfig.RequiredAvailableMemoryPercentage, nodeConfig.WarningAvailableMemoryPercentage, err = getMemoryConfig(v)
if err != nil {
return node.Config{}, err
}

nodeConfig.CPUTargeterConfig, err = getCPUTargeterConfig(v)
if err != nil {
return node.Config{}, err
Expand Down
51 changes: 51 additions & 0 deletions config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,57 @@ func TestGetDiskSpaceConfig(t *testing.T) {
}
}

// TestGetMemoryConfig verifies that GetNodeConfig accepts valid memory
// threshold flags and rejects out-of-range or misordered thresholds with the
// matching sentinel error.
func TestGetMemoryConfig(t *testing.T) {
	type testCase struct {
		name        string
		config      map[string]uint64
		expectedErr error
	}

	cases := []testCase{
		{
			name:   "empty config",
			config: map[string]uint64{},
		},
		{
			name: "valid config",
			config: map[string]uint64{
				SystemTrackerWarningAvailableMemoryPercentageKey:  maxMemorySpaceThreshold,
				SystemTrackerRequiredAvailableMemoryPercentageKey: 1,
			},
		},
		{
			name: "invalid config - warning less than required",
			config: map[string]uint64{
				SystemTrackerWarningAvailableMemoryPercentageKey:  25,
				SystemTrackerRequiredAvailableMemoryPercentageKey: 30,
			},
			expectedErr: errMemoryWarnAfterFatal,
		},
		{
			name: "invalid config - warning too big",
			config: map[string]uint64{
				SystemTrackerWarningAvailableMemoryPercentageKey:  maxMemorySpaceThreshold + 1,
				SystemTrackerRequiredAvailableMemoryPercentageKey: 15,
			},
			expectedErr: errMemorySpaceOutOfRange,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			v := setupViperFlags()

			// Apply the per-case overrides on top of the default flag values.
			for flag, pct := range tc.config {
				v.Set(flag, pct)
			}

			_, err := GetNodeConfig(v)

			// A nil expectedErr asserts that the configuration is accepted.
			require.ErrorIs(t, err, tc.expectedErr)
		})
	}
}

// setups config json file and writes content
func setupConfigJSON(t *testing.T, rootPath string, value string) string {
configFilePath := filepath.Join(rootPath, "config.json")
Expand Down
2 changes: 2 additions & 0 deletions config/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,8 @@ func addNodeFlags(fs *pflag.FlagSet) {
fs.Uint64(SystemTrackerRequiredAvailableDiskSpacePercentageKey, 3, "Minimum percentage (between 0 and 50) of available disk space, under which the node will shutdown.")
fs.Uint64(SystemTrackerWarningThresholdAvailableDiskSpaceKey, 0, fmt.Sprintf("DEPRECATED: Warning threshold for the number of available bytes on disk, under which the node will be considered unhealthy. Must be >= [%s]", SystemTrackerRequiredAvailableDiskSpaceKey))
fs.Uint64(SystemTrackerWarningAvailableDiskSpacePercentageKey, 10, fmt.Sprintf("Warning threshold for the percentage (between 0 and 50) of available disk space, under which the node will be considered unhealthy. Must be >= [%s]", SystemTrackerRequiredAvailableDiskSpacePercentageKey))
fs.Uint64(SystemTrackerRequiredAvailableMemoryPercentageKey, 5, "Minimum percentage (between 0 and 50) of available system memory, under which the node will shutdown.")
fs.Uint64(SystemTrackerWarningAvailableMemoryPercentageKey, 10, fmt.Sprintf("Warning threshold for the percentage (between 0 and 50) of available system memory, under which the node will be considered unhealthy. Must be >= [%s]", SystemTrackerRequiredAvailableMemoryPercentageKey))

// CPU management
fs.Float64(CPUVdrAllocKey, float64(runtime.NumCPU()), "Maximum number of CPUs to allocate for use by validators. Value should be in range [0, total core count]")
Expand Down
2 changes: 2 additions & 0 deletions config/keys.go
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,8 @@ const (
SystemTrackerRequiredAvailableDiskSpacePercentageKey = "system-tracker-disk-required-available-space-percentage"
SystemTrackerWarningThresholdAvailableDiskSpaceKey = "system-tracker-disk-warning-threshold-available-space"
SystemTrackerWarningAvailableDiskSpacePercentageKey = "system-tracker-disk-warning-available-space-percentage"
SystemTrackerRequiredAvailableMemoryPercentageKey = "system-tracker-memory-required-available-percentage"
SystemTrackerWarningAvailableMemoryPercentageKey = "system-tracker-memory-warning-available-percentage"
DiskVdrAllocKey = "throttler-inbound-disk-validator-alloc"
DiskMaxNonVdrUsageKey = "throttler-inbound-disk-max-non-validator-usage"
DiskMaxNonVdrNodeUsageKey = "throttler-inbound-disk-max-non-validator-node-usage"
Expand Down
3 changes: 3 additions & 0 deletions config/node/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,9 @@ type Config struct {
RequiredAvailableDiskSpacePercentage uint64 `json:"requiredAvailableDiskSpacePercentage"`
WarningAvailableDiskSpacePercentage uint64 `json:"warningAvailableDiskSpacePercentage"`

RequiredAvailableMemoryPercentage uint64 `json:"requiredAvailableMemoryPercentage"`
WarningAvailableMemoryPercentage uint64 `json:"warningAvailableMemoryPercentage"`

TraceConfig trace.Config `json:"traceConfig"`

// See comment on [UseCurrentHeight] in platformvm.Config
Expand Down
32 changes: 32 additions & 0 deletions node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -1484,6 +1484,38 @@ func (n *Node) initHealthAPI() error {
return fmt.Errorf("couldn't register resource health check: %w", err)
}

memoryCheck := health.CheckerFunc(func(context.Context) (interface{}, error) {
// confirm that the node has enough memory to continue operating
// if there is too little memory remaining, first report unhealthy and then shutdown the node

availableMemoryBytes := n.resourceTracker.MemoryTracker().AvailableMemoryBytes()
availableMemoryPercentage := n.resourceTracker.MemoryTracker().AvailableMemoryPercentage()

var err error

if availableMemoryPercentage < n.Config.RequiredAvailableMemoryPercentage {
n.Log.Fatal("low on available memory. Shutting down...",
zap.Uint64("availableMemoryBytes", availableMemoryBytes),
zap.Uint64("remainingMemoryPercentage", availableMemoryPercentage),
zap.Uint64("requiredMemoryPercentage", n.Config.RequiredAvailableMemoryPercentage),
)
go n.Shutdown(1)
err = fmt.Errorf("remaining available memory percentage (%d%%) is below minimum required available memory percentage (%d%%)", availableMemoryPercentage, n.Config.RequiredAvailableMemoryPercentage)
} else if availableMemoryPercentage < n.Config.WarningAvailableMemoryPercentage {
err = fmt.Errorf("remaining available memory percentage (%d%%) is below warning threshold available memory percentage (%d%%)", availableMemoryPercentage, n.Config.WarningAvailableMemoryPercentage)
}

return map[string]interface{}{
"availableMemoryBytes": availableMemoryBytes,
"availableMemoryPercentage": availableMemoryPercentage,
}, err
})

err = n.health.RegisterHealthCheck("memory", memoryCheck, health.ApplicationTag)
if err != nil {
return fmt.Errorf("couldn't register memory health check: %w", err)
}

wrongBLSKeyCheck := health.CheckerFunc(func(context.Context) (interface{}, error) {
vdr, ok := n.vdrs.GetValidator(constants.PrimaryNetworkID, n.ID)
if !ok {
Expand Down
58 changes: 52 additions & 6 deletions snow/networking/tracker/resource_tracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,16 @@ type DiskTracker interface {
AvailableDiskPercentage() uint64
}

// MemoryTracker reports how much memory is available.
type MemoryTracker interface {
// AvailableMemoryBytes returns the amount of available memory, in bytes.
AvailableMemoryBytes() uint64
// AvailableMemoryPercentage returns the available memory as a percentage.
AvailableMemoryPercentage() uint64
}

// ResourceTracker is an interface for tracking peers' usage of resources
type ResourceTracker interface {
CPUTracker() Tracker
DiskTracker() DiskTracker
MemoryTracker() MemoryTracker
// Registers that the given node started processing at the given time.
StartProcessing(ids.NodeID, time.Time)
// Registers that the given node stopped processing at the given time.
Expand Down Expand Up @@ -198,6 +204,30 @@ func (t *diskResourceTracker) TimeUntilUsage(nodeID ids.NodeID, now time.Time, v
return m.TimeUntil(now, value/scale)
}

// memoryResourceTracker exposes the memory readings of the resourceTracker it
// wraps.
type memoryResourceTracker struct {
	t *resourceTracker
}

// AvailableMemoryBytes reads the available memory in bytes from the wrapped
// tracker's resources, records it in the memoryAvailable gauge, and returns
// it. The wrapped tracker's lock is held for the duration of the call.
func (t *memoryResourceTracker) AvailableMemoryBytes() uint64 {
	t.t.lock.Lock()
	defer t.t.lock.Unlock()

	available := t.t.resources.AvailableMemoryBytes()
	t.t.metrics.memoryAvailable.Set(float64(available))
	return available
}

// AvailableMemoryPercentage reads the available memory percentage from the
// wrapped tracker's resources, records it in the memoryPercentageAvailable
// gauge, and returns it. The wrapped tracker's lock is held for the duration
// of the call.
func (t *memoryResourceTracker) AvailableMemoryPercentage() uint64 {
	t.t.lock.Lock()
	defer t.t.lock.Unlock()

	available := t.t.resources.AvailableMemoryPercentage()
	t.t.metrics.memoryPercentageAvailable.Set(float64(available))
	return available
}

type resourceTracker struct {
lock sync.RWMutex

Expand Down Expand Up @@ -244,6 +274,10 @@ func (rt *resourceTracker) DiskTracker() DiskTracker {
return &diskResourceTracker{t: rt}
}

// MemoryTracker returns a MemoryTracker backed by rt. A new wrapper value is
// created on every call.
func (rt *resourceTracker) MemoryTracker() MemoryTracker {
return &memoryResourceTracker{t: rt}
}

func (rt *resourceTracker) StartProcessing(nodeID ids.NodeID, now time.Time) {
rt.lock.Lock()
defer rt.lock.Unlock()
Expand Down Expand Up @@ -297,12 +331,14 @@ func (rt *resourceTracker) prune(now time.Time) {
}

type trackerMetrics struct {
processingTimeMetric prometheus.Gauge
cpuMetric prometheus.Gauge
diskReadsMetric prometheus.Gauge
diskWritesMetric prometheus.Gauge
diskSpaceAvailable prometheus.Gauge
diskPercentageAvailable prometheus.Gauge
processingTimeMetric prometheus.Gauge
cpuMetric prometheus.Gauge
diskReadsMetric prometheus.Gauge
diskWritesMetric prometheus.Gauge
diskSpaceAvailable prometheus.Gauge
diskPercentageAvailable prometheus.Gauge
memoryAvailable prometheus.Gauge
memoryPercentageAvailable prometheus.Gauge
}

func newCPUTrackerMetrics(reg prometheus.Registerer) (*trackerMetrics, error) {
Expand Down Expand Up @@ -331,6 +367,14 @@ func newCPUTrackerMetrics(reg prometheus.Registerer) (*trackerMetrics, error) {
Name: "disk_available_percentage",
Help: "Percentage of database volume available",
}),
memoryAvailable: prometheus.NewGauge(prometheus.GaugeOpts{
Name: "memory_available_space",
Help: "Available memory remaining (bytes) on the system",
}),
memoryPercentageAvailable: prometheus.NewGauge(prometheus.GaugeOpts{
Name: "memory_available_percentage",
Help: "Percentage of system memory available",
}),
}
err := errors.Join(
reg.Register(m.processingTimeMetric),
Expand All @@ -339,6 +383,8 @@ func newCPUTrackerMetrics(reg prometheus.Registerer) (*trackerMetrics, error) {
reg.Register(m.diskWritesMetric),
reg.Register(m.diskSpaceAvailable),
reg.Register(m.diskPercentageAvailable),
reg.Register(m.memoryAvailable),
reg.Register(m.memoryPercentageAvailable),
)
return m, err
}
6 changes: 4 additions & 2 deletions tests/fixture/tmpnet/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,12 @@ func DefaultTmpnetFlags() FlagsMap {
config.HealthCheckFreqKey: "2s",
config.AdminAPIEnabledKey: "true",
config.IndexEnabledKey: "true",
// Disable disk checks by default since temporary networks often run in
// resource-constrained environments that commonly have low disk space.
// Disable disk and memory checks by default since temporary networks often run in
// resource-constrained environments that commonly have low disk space and memory.
config.SystemTrackerRequiredAvailableDiskSpacePercentageKey: "0",
config.SystemTrackerWarningAvailableDiskSpacePercentageKey: "0",
config.SystemTrackerRequiredAvailableMemoryPercentageKey: "0",
config.SystemTrackerWarningAvailableMemoryPercentageKey: "0",
}
}

Expand Down
8 changes: 8 additions & 0 deletions utils/resource/no_usage.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,11 @@ func (noUsage) DiskUsage() (float64, float64) {
func (noUsage) AvailableDiskBytes() uint64 {
return math.MaxUint64
}

// AvailableMemoryBytes always returns math.MaxUint64.
func (noUsage) AvailableMemoryBytes() uint64 {
return math.MaxUint64
}

// AvailableMemoryPercentage always returns 100.
func (noUsage) AvailableMemoryPercentage() uint64 {
return 100
}
28 changes: 28 additions & 0 deletions utils/resource/resourcemock/user.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading