| title | Prometheus Metrics |
|---|---|
| description | Monitor Cbox Init processes with comprehensive Prometheus metrics and alerting |
| weight | 40 |
Comprehensive Prometheus metrics for monitoring Cbox Init and managed processes.
Enable metrics in your cbox-init.yaml:
global:
  metrics_enabled: true
  metrics_port: 9090
  metrics_path: /metrics
Type: Gauge
Labels: name, instance
Description: Process status (1=running, 0=stopped)
# Query running instances of php-fpm
cbox_init_process_up{name="php-fpm"}
Type: Counter
Labels: name, reason
Description: Total number of process restarts by reason (crash, health_check, normal_exit)
# Total restarts for all processes
sum(cbox_init_process_restarts_total) by (name)
# Restarts due to health check failures
cbox_init_process_restarts_total{reason="health_check"}
Type: Gauge
Labels: name, instance
Description: Unix timestamp when process instance started
# Process uptime in seconds
time() - cbox_init_process_start_time_seconds
Type: Gauge
Labels: name, instance
Description: Last exit code of process instance
# Non-zero exit codes (errors)
cbox_init_process_last_exit_code != 0
Type: Gauge
Labels: name, type
Description: Health check status (1=healthy, 0=unhealthy)
# Unhealthy processes
cbox_init_health_check_status == 0
Type: Histogram
Labels: name, type
Description: Health check duration in seconds
# 95th percentile health check latency
histogram_quantile(0.95,
sum(rate(cbox_init_health_check_duration_seconds_bucket[5m])) by (le, name)
)
Type: Counter
Labels: name, type, status
Description: Total number of health checks performed
# Health check failure rate
rate(cbox_init_health_check_total{status="failure"}[5m])
Type: Gauge
Labels: name
Description: Current consecutive health check failures
# Processes with multiple consecutive failures
cbox_init_health_check_consecutive_fails > 1
Type: Gauge
Labels: name
Description: Desired number of process instances
# Desired scale configuration
cbox_init_process_desired_scale
Type: Gauge
Labels: name
Description: Current number of running instances
# Scale drift (actual vs desired)
cbox_init_process_current_scale - cbox_init_process_desired_scale
Type: Counter
Labels: name, type, status
Description: Total hook executions by type and status
# Failed pre-start hooks
cbox_init_hook_executions_total{type="pre_start", status="failure"}
Type: Histogram
Labels: name, type
Description: Hook execution duration in seconds
# 99th percentile hook duration
histogram_quantile(0.99,
sum(rate(cbox_init_hook_duration_seconds_bucket[5m])) by (le, type)
)
Type: Gauge
Description: Total number of managed processes
# Total processes under management
cbox_init_manager_process_count
Type: Gauge
Description: Unix timestamp when manager started
# Manager uptime in seconds
time() - cbox_init_manager_start_time_seconds
Type: Gauge
Labels: version, go_version
Description: Cbox Init build information
# Version information
cbox_init_build_info
# Count of running instances per process
sum(cbox_init_process_up) by (name)
# Count of processes with health check failures
count(cbox_init_health_check_status == 0) by (name)
# Restart rate (per second, averaged over the last minute)
rate(cbox_init_process_restarts_total[1m])
# Processes restarting frequently (>5/hour)
sum(increase(cbox_init_process_restarts_total[1h])) by (name) > 5
# Instances not matching desired scale
abs(cbox_init_process_current_scale - cbox_init_process_desired_scale) > 0
# Slow hooks (99th percentile duration > 30s)
histogram_quantile(0.99, sum(rate(cbox_init_hook_duration_seconds_bucket[5m])) by (le, name, type)) > 30
# Hook failure rate
rate(cbox_init_hook_executions_total{status="failure"}[5m])
groups:
  - name: cbox_init
    rules:
      # Process down
      - alert: ProcessDown
        expr: cbox_init_process_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Process {{ $labels.name }} instance {{ $labels.instance }} is down"
      # Frequent restarts
      - alert: FrequentRestarts
        expr: rate(cbox_init_process_restarts_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Process {{ $labels.name }} restarting frequently"
      # Health check failures
      - alert: HealthCheckFailing
        expr: cbox_init_health_check_status == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Health check failing for {{ $labels.name }}"
      # Scale drift
      - alert: ScaleDrift
        expr: abs(cbox_init_process_current_scale - cbox_init_process_desired_scale) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.name }} scale drift detected"
      # Hook failures
      - alert: HookFailures
        expr: rate(cbox_init_hook_executions_total{status="failure"}[5m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Hook {{ $labels.name }} failing"
{
"dashboard": {
"title": "Cbox Init Overview",
"panels": [
{
"title": "Process Status",
"targets": [
{
"expr": "cbox_init_process_up"
}
]
},
{
"title": "Restart Rate",
"targets": [
{
"expr": "rate(cbox_init_process_restarts_total[5m])"
}
]
},
{
"title": "Health Check Status",
"targets": [
{
"expr": "cbox_init_health_check_status"
}
]
},
{
"title": "Scale Status",
"targets": [
{
"expr": "cbox_init_process_current_scale",
"legendFormat": "Current"
},
{
"expr": "cbox_init_process_desired_scale",
"legendFormat": "Desired"
}
]
}
]
}
}
scrape_configs:
  - job_name: 'cbox-init'
    static_configs:
      - targets: ['localhost:9090']
    scrape_interval: 15s
services:
  cbox-init:
    image: cboxdk/init:latest
    environment:
      - CBOX_INIT_GLOBAL_METRICS_ENABLED=true
      - CBOX_INIT_GLOBAL_METRICS_PORT=9090
    ports:
      - "9090:9090"
  prometheus:
    image: prom/prometheus:latest
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    ports:
      - "9091:9090"