From f74ff589eb136164c28039d9e3679a92646b5109 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 21:44:50 -0700 Subject: [PATCH 01/60] Add host metrics receiver --- .../crates/core-nodes/Cargo.toml | 1 + .../receivers/host_metrics_receiver/mod.rs | 737 ++++++++ .../receivers/host_metrics_receiver/procfs.rs | 1595 +++++++++++++++++ .../crates/core-nodes/src/receivers/mod.rs | 3 + 4 files changed, 2336 insertions(+) create mode 100644 rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs create mode 100644 rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs diff --git a/rust/otap-dataflow/crates/core-nodes/Cargo.toml b/rust/otap-dataflow/crates/core-nodes/Cargo.toml index 2d75be447d..554b2a4c98 100644 --- a/rust/otap-dataflow/crates/core-nodes/Cargo.toml +++ b/rust/otap-dataflow/crates/core-nodes/Cargo.toml @@ -42,6 +42,7 @@ object_store = {workspace = true, features = ["fs"]} parquet.workspace = true prost.workspace = true rand.workspace = true +regex.workspace = true serde.workspace = true serde_json.workspace = true slotmap.workspace = true diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs new file mode 100644 index 0000000000..f5f32cb123 --- /dev/null +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -0,0 +1,737 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Host metrics receiver. 
+ +use async_trait::async_trait; +use bytes::Bytes; +use linkme::distributed_slice; +use otap_df_config::node::NodeUserConfig; +use otap_df_engine::MessageSourceLocalEffectHandlerExtension; +use otap_df_engine::ReceiverFactory; +use otap_df_engine::config::ReceiverConfig; +use otap_df_engine::context::PipelineContext; +use otap_df_engine::control::NodeControlMsg; +use otap_df_engine::error::{Error, ReceiverErrorKind, TypedError}; +use otap_df_engine::local::receiver as local; +use otap_df_engine::node::NodeId; +use otap_df_engine::receiver::ReceiverWrapper; +use otap_df_engine::terminal_state::TerminalState; +use otap_df_otap::OTAP_RECEIVER_FACTORIES; +use otap_df_otap::pdata::{Context, OtapPdata}; +use otap_df_pdata::OtlpProtoBytes; +use otap_df_pdata::otap::OtapArrowRecords; +use otap_df_telemetry::metrics::MetricSetSnapshot; +use otap_df_telemetry::{otel_info, otel_warn}; +use prost::Message as _; +use regex::Regex; +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use std::collections::HashSet; +use std::path::{Component, Path, PathBuf}; +use std::sync::Arc; +use std::sync::{LazyLock, Mutex}; +use std::time::Duration; +use tokio::time::interval; + +mod procfs; + +use procfs::{HostSnapshot, ProcfsConfig, ProcfsSource}; + +/// The URN for the host metrics receiver. +pub const HOST_METRICS_RECEIVER_URN: &str = "urn:otel:receiver:host_metrics"; + +fn default_collection_interval() -> Duration { + Duration::from_secs(10) +} + +/// Configuration for the host metrics receiver. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(deny_unknown_fields)] +pub struct Config { + /// Collection interval. + #[serde(default = "default_collection_interval", with = "humantime_serde")] + pub collection_interval: Duration, + + /// Optional host root path. In Kubernetes this is commonly `/host`. + #[serde(default)] + pub root_path: Option, + + /// Metric family configuration. 
+ #[serde(default)] + pub families: FamiliesConfig, +} + +impl Default for Config { + fn default() -> Self { + Self { + collection_interval: default_collection_interval(), + root_path: None, + families: FamiliesConfig::default(), + } + } +} + +/// Metric family configuration. +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct FamiliesConfig { + /// CPU metrics. + pub cpu: FamilyConfig, + /// Memory metrics. + pub memory: FamilyConfig, + /// Paging metrics. + pub paging: FamilyConfig, + /// System metrics. + pub system: FamilyConfig, + /// Disk metrics. + pub disk: DiskFamilyConfig, + /// Network metrics. + pub network: NetworkFamilyConfig, + /// Process summary metrics. + pub processes: ProcessesFamilyConfig, +} + +impl FamiliesConfig { + fn enabled_count(&self) -> usize { + usize::from(self.cpu.enabled) + + usize::from(self.memory.enabled) + + usize::from(self.paging.enabled) + + usize::from(self.system.enabled) + + usize::from(self.disk.enabled) + + usize::from(self.network.enabled) + + usize::from(self.processes.enabled) + } +} + +/// Common family config. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct FamilyConfig { + /// Enable this family. + pub enabled: bool, +} + +impl Default for FamilyConfig { + fn default() -> Self { + Self { enabled: true } + } +} + +/// Disk family config. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct DiskFamilyConfig { + /// Enable disk metrics. + pub enabled: bool, + /// Device include filter. + pub include: Option, + /// Device exclude filter. + pub exclude: Option, +} + +impl Default for DiskFamilyConfig { + fn default() -> Self { + Self { + enabled: true, + include: None, + exclude: None, + } + } +} + +/// Network family config. 
+#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct NetworkFamilyConfig { + /// Enable network metrics. + pub enabled: bool, + /// Interface include filter. + pub include: Option, + /// Interface exclude filter. + pub exclude: Option, + /// Connection count is not supported in v1. + pub include_connection_count: bool, +} + +impl Default for NetworkFamilyConfig { + fn default() -> Self { + Self { + enabled: true, + include: None, + exclude: None, + include_connection_count: false, + } + } +} + +/// Process family config. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct ProcessesFamilyConfig { + /// Enable process summary metrics. + pub enabled: bool, + /// Only `summary` is supported in v1. + pub mode: ProcessMode, +} + +impl Default for ProcessesFamilyConfig { + fn default() -> Self { + Self { + enabled: true, + mode: ProcessMode::Summary, + } + } +} + +/// Process collection mode. +#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ProcessMode { + /// Aggregate host process summary. + Summary, +} + +impl Default for ProcessMode { + fn default() -> Self { + Self::Summary + } +} + +/// Disk device filter. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(deny_unknown_fields)] +pub struct DeviceFilterConfig { + /// Device names. + pub devices: Vec, + /// Match type. + #[serde(default)] + pub match_type: MatchType, +} + +/// Network interface filter. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(deny_unknown_fields)] +pub struct InterfaceFilterConfig { + /// Interface names. + pub interfaces: Vec, + /// Match type. + #[serde(default)] + pub match_type: MatchType, +} + +/// Filter match type. +#[derive(Clone, Copy, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum MatchType { + /// Exact string match. 
+ #[default] + Strict, + /// Glob match with `*` and `?`. + Glob, + /// Regular expression match. + Regexp, +} + +#[derive(Clone)] +struct RuntimeConfig { + root_path: Option, + collection_interval: Duration, + families: RuntimeFamilies, +} + +#[derive(Clone)] +struct RuntimeFamilies { + cpu: bool, + memory: bool, + paging: bool, + system: bool, + disk: RuntimeDiskFamily, + network: RuntimeNetworkFamily, + processes: bool, +} + +#[derive(Clone)] +struct RuntimeDiskFamily { + enabled: bool, + include: Option, + exclude: Option, +} + +#[derive(Clone)] +struct RuntimeNetworkFamily { + enabled: bool, + include: Option, + exclude: Option, +} + +#[derive(Clone)] +pub(crate) struct CompiledFilter { + match_type: MatchType, + values: Vec, + regexes: Vec, +} + +impl CompiledFilter { + fn compile( + match_type: MatchType, + values: Vec, + ) -> Result, otap_df_config::error::Error> { + if values.is_empty() { + return Ok(None); + } + let regexes = if match_type == MatchType::Regexp { + let mut regexes = Vec::with_capacity(values.len()); + for value in &values { + regexes.push(Regex::new(value).map_err(|err| { + otap_df_config::error::Error::InvalidUserConfig { + error: format!("invalid host metrics regexp filter {value:?}: {err}"), + } + })?); + } + regexes + } else { + Vec::new() + }; + Ok(Some(Self { + match_type, + values, + regexes, + })) + } + + pub(crate) fn matches(&self, value: &str) -> bool { + match self.match_type { + MatchType::Strict => self.values.iter().any(|candidate| candidate == value), + MatchType::Glob => self + .values + .iter() + .any(|candidate| glob_matches(candidate.as_bytes(), value.as_bytes())), + MatchType::Regexp => self + .regexes + .iter() + .any(|candidate| candidate.is_match(value)), + } + } +} + +fn glob_matches(pattern: &[u8], value: &[u8]) -> bool { + let (mut p, mut v) = (0, 0); + let mut star = None; + let mut star_value = 0; + + while v < value.len() { + if p < pattern.len() && (pattern[p] == b'?' 
|| pattern[p] == value[v]) { + p += 1; + v += 1; + } else if p < pattern.len() && pattern[p] == b'*' { + star = Some(p); + p += 1; + star_value = v; + } else if let Some(star_pos) = star { + p = star_pos + 1; + star_value += 1; + v = star_value; + } else { + return false; + } + } + + while p < pattern.len() && pattern[p] == b'*' { + p += 1; + } + p == pattern.len() +} + +/// Host metrics receiver. +pub struct HostMetricsReceiver { + config: RuntimeConfig, + _lease: HostMetricsLease, +} + +#[allow(unsafe_code)] +#[distributed_slice(OTAP_RECEIVER_FACTORIES)] +/// Declares the host metrics receiver as a local receiver factory. +pub static HOST_METRICS_RECEIVER: ReceiverFactory = ReceiverFactory { + name: HOST_METRICS_RECEIVER_URN, + create: |pipeline: PipelineContext, + node: NodeId, + node_config: Arc, + receiver_config: &ReceiverConfig| { + if pipeline.num_cores() > 1 { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: "host-wide collection must run in a one-core source pipeline; use receiver:host_metrics -> exporter:topic and fan out downstream".to_owned(), + }); + } + Ok(ReceiverWrapper::local( + HostMetricsReceiver::from_config(&node_config.config)?, + node, + node_config, + receiver_config, + )) + }, + wiring_contract: otap_df_engine::wiring_contract::WiringContract::UNRESTRICTED, + validate_config: |config| { + let config: Config = serde_json::from_value(config.clone()).map_err(|e| { + otap_df_config::error::Error::InvalidUserConfig { + error: e.to_string(), + } + })?; + validate_config(&config) + }, +}; + +impl HostMetricsReceiver { + /// Creates a new host metrics receiver. + pub fn new(config: Config) -> Result { + let root_path = normalized_root_path(config.root_path.as_deref())?; + let lease = HostMetricsLease::acquire(root_path)?; + let config = RuntimeConfig::try_from(config)?; + Ok(Self { + config, + _lease: lease, + }) + } + + /// Creates a host metrics receiver from JSON config. 
+ pub fn from_config(config: &Value) -> Result { + let config: Config = serde_json::from_value(config.clone()).map_err(|e| { + otap_df_config::error::Error::InvalidUserConfig { + error: e.to_string(), + } + })?; + validate_config(&config)?; + Self::new(config) + } +} + +fn validate_config(config: &Config) -> Result<(), otap_df_config::error::Error> { + if config.collection_interval.is_zero() { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: "collection_interval must be greater than zero".to_owned(), + }); + } + if config.families.enabled_count() == 0 { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: "at least one host metrics family must be enabled".to_owned(), + }); + } + if config.families.network.include_connection_count { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: "network include_connection_count is not supported in v1".to_owned(), + }); + } + let _ = normalized_root_path(config.root_path.as_deref())?; + Ok(()) +} + +impl TryFrom for RuntimeConfig { + type Error = otap_df_config::error::Error; + + fn try_from(config: Config) -> Result { + validate_config(&config)?; + let disk_include = config + .families + .disk + .include + .map(|filter| CompiledFilter::compile(filter.match_type, filter.devices)) + .transpose()? + .flatten(); + let disk_exclude = config + .families + .disk + .exclude + .map(|filter| CompiledFilter::compile(filter.match_type, filter.devices)) + .transpose()? + .flatten(); + let network_include = config + .families + .network + .include + .map(|filter| CompiledFilter::compile(filter.match_type, filter.interfaces)) + .transpose()? + .flatten(); + let network_exclude = config + .families + .network + .exclude + .map(|filter| CompiledFilter::compile(filter.match_type, filter.interfaces)) + .transpose()? 
+ .flatten(); + + Ok(Self { + root_path: config.root_path, + collection_interval: config.collection_interval, + families: RuntimeFamilies { + cpu: config.families.cpu.enabled, + memory: config.families.memory.enabled, + paging: config.families.paging.enabled, + system: config.families.system.enabled, + disk: RuntimeDiskFamily { + enabled: config.families.disk.enabled, + include: disk_include, + exclude: disk_exclude, + }, + network: RuntimeNetworkFamily { + enabled: config.families.network.enabled, + include: network_include, + exclude: network_exclude, + }, + processes: config.families.processes.enabled, + }, + }) + } +} + +static HOST_METRICS_LEASES: LazyLock>> = + LazyLock::new(|| Mutex::new(HashSet::new())); + +struct HostMetricsLease { + key: PathBuf, +} + +impl HostMetricsLease { + fn acquire(key: PathBuf) -> Result { + let mut leases = HOST_METRICS_LEASES.lock().map_err(|_| { + otap_df_config::error::Error::InvalidUserConfig { + error: "host metrics lease registry is unavailable".to_owned(), + } + })?; + if !leases.insert(key.clone()) { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: format!( + "another host_metrics receiver already collects host root {}", + key.display() + ), + }); + } + Ok(Self { key }) + } +} + +impl Drop for HostMetricsLease { + fn drop(&mut self) { + if let Ok(mut leases) = HOST_METRICS_LEASES.lock() { + let _ = leases.remove(&self.key); + } + } +} + +fn normalized_root_path(root_path: Option<&Path>) -> Result { + let path = root_path.unwrap_or_else(|| Path::new("/")); + if !path.is_absolute() { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: format!("root_path must be absolute: {}", path.display()), + }); + } + + let mut normalized = PathBuf::from("/"); + for component in path.components() { + match component { + Component::RootDir => {} + Component::CurDir => {} + Component::Normal(part) => normalized.push(part), + Component::ParentDir => { + let _ = normalized.pop(); + } + 
Component::Prefix(_) => { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: format!("root_path must be a Unix absolute path: {}", path.display()), + }); + } + } + } + Ok(normalized) +} + +#[async_trait(?Send)] +impl local::Receiver for HostMetricsReceiver { + async fn start( + self: Box, + mut ctrl_msg_recv: local::ControlChannel, + effect_handler: local::EffectHandler, + ) -> Result { + let mut source = ProcfsSource::new( + self.config.root_path.as_deref(), + ProcfsConfig { + cpu: self.config.families.cpu, + memory: self.config.families.memory, + paging: self.config.families.paging, + system: self.config.families.system, + disk: self.config.families.disk.enabled, + network: self.config.families.network.enabled, + processes: self.config.families.processes, + disk_include: self.config.families.disk.include.clone(), + disk_exclude: self.config.families.disk.exclude.clone(), + network_include: self.config.families.network.include.clone(), + network_exclude: self.config.families.network.exclude.clone(), + }, + ) + .map_err(|err| Error::ReceiverError { + receiver: effect_handler.receiver_id(), + kind: ReceiverErrorKind::Configuration, + error: format!("failed to validate host metrics procfs sources: {err}"), + source_detail: String::new(), + })?; + let mut ticker = interval(self.config.collection_interval); + + let _ = effect_handler + .start_periodic_telemetry(Duration::from_secs(1)) + .await?; + + loop { + tokio::select! { + biased; + + msg = ctrl_msg_recv.recv() => { + match msg { + Ok(NodeControlMsg::CollectTelemetry { .. }) => {} + Ok(NodeControlMsg::DrainIngress { deadline, .. }) => { + otel_info!("host_metrics_receiver.drain_ingress"); + effect_handler.notify_receiver_drained().await?; + return Ok(TerminalState::new::<[MetricSetSnapshot; 0]>(deadline, [])); + } + Ok(NodeControlMsg::Shutdown { deadline, .. 
}) => { + otel_info!("host_metrics_receiver.shutdown"); + return Ok(TerminalState::new::<[MetricSetSnapshot; 0]>(deadline, [])); + } + Err(e) => return Err(Error::ChannelRecvError(e)), + _ => {} + } + } + + _ = ticker.tick() => { + match source.scrape() { + Ok(snapshot) => { + let pdata = encode_snapshot(snapshot).map_err(|err| Error::ReceiverError { + receiver: effect_handler.receiver_id(), + kind: ReceiverErrorKind::Other, + error: format!("failed to encode host metrics: {err}"), + source_detail: String::new(), + })?; + if let Err(err) = effect_handler.try_send_message_with_source_node(pdata) { + match err { + TypedError::ChannelSendError(_) => { + otel_warn!("host metrics dropped due to downstream backpressure"); + } + other => { + return Err(Error::ReceiverError { + receiver: effect_handler.receiver_id(), + kind: ReceiverErrorKind::Other, + error: format!("failed to send host metrics: {other}"), + source_detail: String::new(), + }); + } + } + } + } + Err(err) => { + return Err(Error::ReceiverError { + receiver: effect_handler.receiver_id(), + kind: ReceiverErrorKind::Other, + error: format!("failed to collect host metrics: {err}"), + source_detail: String::new(), + }); + } + } + } + } + } + } +} + +fn encode_snapshot(snapshot: HostSnapshot) -> Result { + let request = snapshot.into_export_request(); + let mut buf = Vec::with_capacity(request.encoded_len()); + request + .encode(&mut buf) + .expect("encoding metrics request to Vec cannot fail"); + let records: OtapArrowRecords = + OtlpProtoBytes::ExportMetricsRequest(Bytes::from(buf)).try_into()?; + Ok(OtapPdata::new(Context::default(), records.into())) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn rejects_zero_collection_interval() { + let mut config = Config::default(); + config.collection_interval = Duration::ZERO; + + assert!(matches!( + validate_config(&config), + Err(otap_df_config::error::Error::InvalidUserConfig { .. 
}) + )); + } + + #[test] + fn rejects_all_families_disabled() { + let config = Config { + families: FamiliesConfig { + cpu: FamilyConfig { enabled: false }, + memory: FamilyConfig { enabled: false }, + paging: FamilyConfig { enabled: false }, + system: FamilyConfig { enabled: false }, + disk: DiskFamilyConfig { + enabled: false, + ..DiskFamilyConfig::default() + }, + network: NetworkFamilyConfig { + enabled: false, + ..NetworkFamilyConfig::default() + }, + processes: ProcessesFamilyConfig { + enabled: false, + ..ProcessesFamilyConfig::default() + }, + }, + ..Config::default() + }; + + assert!(matches!( + validate_config(&config), + Err(otap_df_config::error::Error::InvalidUserConfig { .. }) + )); + } + + #[test] + fn rejects_v1_network_connection_counts() { + let config = Config { + families: FamiliesConfig { + network: NetworkFamilyConfig { + include_connection_count: true, + ..NetworkFamilyConfig::default() + }, + ..FamiliesConfig::default() + }, + ..Config::default() + }; + + assert!(matches!( + validate_config(&config), + Err(otap_df_config::error::Error::InvalidUserConfig { .. 
}) + )); + } + + #[test] + fn glob_filter_matches_without_regex_allocations() { + let filter = CompiledFilter::compile(MatchType::Glob, vec!["loop*".to_owned()]) + .expect("valid filter") + .expect("non-empty filter"); + + assert!(filter.matches("loop0")); + assert!(!filter.matches("nvme0n1")); + } + + #[test] + fn regexp_filter_is_compiled_once() { + let filter = CompiledFilter::compile(MatchType::Regexp, vec![r"^eth[0-9]+$".to_owned()]) + .expect("valid filter") + .expect("non-empty filter"); + + assert!(filter.matches("eth0")); + assert!(!filter.matches("lo")); + } +} diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs new file mode 100644 index 0000000000..a6def64f40 --- /dev/null +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -0,0 +1,1595 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Linux procfs-backed host metric source. + +use crate::receivers::host_metrics_receiver::CompiledFilter; +use otap_df_pdata::proto::opentelemetry::collector::metrics::v1::ExportMetricsServiceRequest; +use otap_df_pdata::proto::opentelemetry::common::v1::{ + AnyValue, InstrumentationScope, KeyValue, any_value, +}; +use otap_df_pdata::proto::opentelemetry::metrics::v1::{ + AggregationTemporality, Gauge, Metric, NumberDataPoint, ResourceMetrics, ScopeMetrics, Sum, + metric, number_data_point, +}; +use otap_df_pdata::proto::opentelemetry::resource::v1::Resource; +use std::collections::HashSet; +use std::fs::File; +use std::io::{self, Read}; +use std::path::{Path, PathBuf}; +use std::time::{SystemTime, UNIX_EPOCH}; + +const NANOS_PER_SEC: u64 = 1_000_000_000; +const BYTES_PER_KIB: u64 = 1024; +const DISKSTAT_SECTOR_BYTES: u64 = 512; + +/// Procfs-backed source for host metrics. 
+pub struct ProcfsSource { + paths: ProcfsPaths, + config: ProcfsConfig, + buf: String, + clk_tck: f64, +} + +/// Procfs collection config. +pub struct ProcfsConfig { + /// CPU metrics. + pub cpu: bool, + /// Memory metrics. + pub memory: bool, + /// Paging metrics. + pub paging: bool, + /// System metrics. + pub system: bool, + /// Disk metrics. + pub disk: bool, + /// Network metrics. + pub network: bool, + /// Process summary metrics. + pub processes: bool, + /// Disk include filter. + pub disk_include: Option, + /// Disk exclude filter. + pub disk_exclude: Option, + /// Network include filter. + pub network_include: Option, + /// Network exclude filter. + pub network_exclude: Option, +} + +impl ProcfsSource { + /// Creates a procfs source rooted at `/` or at a host root bind mount. + pub fn new(root_path: Option<&Path>, config: ProcfsConfig) -> io::Result { + let source = Self { + paths: ProcfsPaths::new(root_path), + config, + buf: String::with_capacity(16 * 1024), + clk_tck: clock_ticks_per_second(), + }; + source.validate_selected_paths()?; + Ok(source) + } + + /// Collects one host snapshot. 
+ pub fn scrape(&mut self) -> io::Result { + let now_unix_nano = now_unix_nano(); + let clk_tck = self.clk_tck; + let needs_stat = self.config.cpu || self.config.system || self.config.processes; + let stat = if needs_stat { + let proc_stat = self.read_path(PathKind::Stat)?; + parse_stat(proc_stat, clk_tck) + } else { + StatSnapshot::default() + }; + + let cpuinfo = if self.config.cpu { + let cpuinfo = self.read_path(PathKind::Cpuinfo)?; + parse_cpuinfo(cpuinfo) + } else { + CpuInfo::default() + }; + + let memory = if self.config.memory { + let meminfo = self.read_path(PathKind::Meminfo)?; + parse_meminfo(meminfo) + } else { + None + }; + + let uptime_seconds = if self.config.system { + let uptime = self.read_path(PathKind::Uptime)?; + parse_uptime(uptime) + } else { + None + }; + + let paging = if self.config.paging { + let vmstat = self.read_path(PathKind::Vmstat)?; + Some(parse_vmstat(vmstat)) + } else { + None + }; + + let swaps = if self.config.paging { + let swaps = self.read_path(PathKind::Swaps)?; + parse_swaps(swaps) + } else { + Vec::new() + }; + + let disks = if self.config.disk { + let disk_include = self.config.disk_include.clone(); + let disk_exclude = self.config.disk_exclude.clone(); + let diskstats = self.read_path(PathKind::Diskstats)?; + parse_diskstats(diskstats, disk_include.as_ref(), disk_exclude.as_ref()) + } else { + Vec::new() + }; + + let networks = if self.config.network { + let network_include = self.config.network_include.clone(); + let network_exclude = self.config.network_exclude.clone(); + let netdev = self.read_path(PathKind::NetDev)?; + parse_netdev(netdev, network_include.as_ref(), network_exclude.as_ref()) + } else { + Vec::new() + }; + + let resource = self.read_resource(); + + Ok(HostSnapshot { + now_unix_nano, + start_time_unix_nano: stat.boot_time_unix_nano, + cpu: stat.cpu, + cpuinfo, + memory, + uptime_seconds, + paging, + swaps, + processes: self.config.processes.then_some(stat.processes), + disks, + networks, + resource, + 
}) + } + + fn validate_selected_paths(&self) -> io::Result<()> { + if self.config.cpu || self.config.system || self.config.processes { + let _ = File::open(self.paths.path(PathKind::Stat))?; + } + if self.config.cpu { + let _ = File::open(self.paths.path(PathKind::Cpuinfo))?; + } + if self.config.memory { + let _ = File::open(self.paths.path(PathKind::Meminfo))?; + } + if self.config.system { + let _ = File::open(self.paths.path(PathKind::Uptime))?; + } + if self.config.paging { + let _ = File::open(self.paths.path(PathKind::Vmstat))?; + let _ = File::open(self.paths.path(PathKind::Swaps))?; + } + if self.config.disk { + let _ = File::open(self.paths.path(PathKind::Diskstats))?; + } + if self.config.network { + let _ = File::open(self.paths.path(PathKind::NetDev))?; + } + Ok(()) + } + + fn read_path(&mut self, kind: PathKind) -> io::Result<&str> { + self.buf.clear(); + let mut file = File::open(self.paths.path(kind))?; + let _ = file.read_to_string(&mut self.buf)?; + Ok(self.buf.as_str()) + } + + fn read_resource(&mut self) -> HostResource { + HostResource { + host_id: self + .read_trimmed_optional(PathKind::MachineId) + .or_else(|| self.read_trimmed_optional(PathKind::DbusMachineId)), + host_name: self.read_trimmed_optional(PathKind::Hostname), + } + } + + fn read_trimmed_optional(&mut self, kind: PathKind) -> Option { + self.read_path(kind) + .ok() + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(str::to_owned) + } +} + +#[derive(Clone, Debug)] +struct ProcfsPaths { + stat: PathBuf, + cpuinfo: PathBuf, + meminfo: PathBuf, + uptime: PathBuf, + vmstat: PathBuf, + swaps: PathBuf, + diskstats: PathBuf, + net_dev: PathBuf, + machine_id: PathBuf, + dbus_machine_id: PathBuf, + hostname: PathBuf, +} + +impl ProcfsPaths { + fn new(root_path: Option<&Path>) -> Self { + let root = root_path.unwrap_or_else(|| Path::new("/")); + let host_root = root_path.is_some_and(|path| path != Path::new("/")); + Self { + stat: root.join("proc/stat"), + cpuinfo: 
root.join("proc/cpuinfo"), + meminfo: root.join("proc/meminfo"), + uptime: root.join("proc/uptime"), + vmstat: root.join("proc/vmstat"), + swaps: root.join("proc/swaps"), + diskstats: root.join("proc/diskstats"), + machine_id: root.join("etc/machine-id"), + dbus_machine_id: root.join("var/lib/dbus/machine-id"), + hostname: root.join("proc/sys/kernel/hostname"), + net_dev: if host_root { + root.join("proc/1/net/dev") + } else { + root.join("proc/net/dev") + }, + } + } + + fn path(&self, kind: PathKind) -> &Path { + match kind { + PathKind::Stat => &self.stat, + PathKind::Cpuinfo => &self.cpuinfo, + PathKind::Meminfo => &self.meminfo, + PathKind::Uptime => &self.uptime, + PathKind::Vmstat => &self.vmstat, + PathKind::Swaps => &self.swaps, + PathKind::Diskstats => &self.diskstats, + PathKind::NetDev => &self.net_dev, + PathKind::MachineId => &self.machine_id, + PathKind::DbusMachineId => &self.dbus_machine_id, + PathKind::Hostname => &self.hostname, + } + } +} + +#[derive(Copy, Clone)] +enum PathKind { + Stat, + Cpuinfo, + Meminfo, + Uptime, + Vmstat, + Swaps, + Diskstats, + NetDev, + MachineId, + DbusMachineId, + Hostname, +} + +/// One host metrics snapshot. +#[derive(Default)] +pub struct HostSnapshot { + now_unix_nano: u64, + start_time_unix_nano: u64, + cpu: Option, + cpuinfo: CpuInfo, + memory: Option, + uptime_seconds: Option, + paging: Option, + swaps: Vec, + processes: Option, + disks: Vec, + networks: Vec, + resource: HostResource, +} + +impl HostSnapshot { + /// Converts a snapshot into an OTLP metrics request. 
+ pub fn into_export_request(self) -> ExportMetricsServiceRequest { + let mut metrics = Vec::with_capacity(64); + let now = self.now_unix_nano; + let start = self.start_time_unix_nano; + + if let Some(cpu) = self.cpu { + push_sum_f64( + &mut metrics, + "system.cpu.time", + "s", + start, + now, + &[ + ("user", cpu.user), + ("nice", cpu.nice), + ("system", cpu.system), + ("idle", cpu.idle), + ("wait", cpu.wait), + ("interrupt", cpu.interrupt), + ("steal", cpu.steal), + ], + "cpu.mode", + ); + } + + if self.cpuinfo.logical_count != 0 { + push_gauge_single_u64( + &mut metrics, + "system.cpu.logical.count", + "{cpu}", + now, + self.cpuinfo.logical_count, + ); + } + if self.cpuinfo.physical_count != 0 { + push_gauge_single_u64( + &mut metrics, + "system.cpu.physical.count", + "{cpu}", + now, + self.cpuinfo.physical_count, + ); + } + push_cpu_frequency(&mut metrics, now, &self.cpuinfo.frequencies_hz); + + if let Some(memory) = self.memory { + push_gauge_u64( + &mut metrics, + "system.memory.usage", + "By", + now, + &[ + ("used", memory.used), + ("free", memory.free), + ("cached", memory.cached), + ("buffers", memory.buffered), + ], + "system.memory.state", + ); + push_gauge_ratio( + &mut metrics, + "system.memory.utilization", + "1", + now, + memory.total, + &[ + ("used", memory.used), + ("free", memory.free), + ("cached", memory.cached), + ("buffers", memory.buffered), + ], + "system.memory.state", + ); + push_gauge_single_u64( + &mut metrics, + "system.memory.linux.available", + "By", + now, + memory.available, + ); + push_gauge_u64( + &mut metrics, + "system.memory.linux.slab.usage", + "By", + now, + &[ + ("reclaimable", memory.slab_reclaimable), + ("unreclaimable", memory.slab_unreclaimable), + ], + "system.memory.linux.slab.state", + ); + } + + if let Some(uptime_seconds) = self.uptime_seconds { + push_gauge_f64(&mut metrics, "system.uptime", "s", now, uptime_seconds); + } + + if let Some(paging) = self.paging { + push_sum_u64( + &mut metrics, + 
"system.paging.faults", + "{fault}", + start, + now, + &[ + ("minor", paging.minor_faults), + ("major", paging.major_faults), + ], + "system.paging.fault.type", + ); + push_sum_u64( + &mut metrics, + "system.paging.operations", + "{operation}", + start, + now, + &[("in", paging.swap_in), ("out", paging.swap_out)], + "system.paging.direction", + ); + } + + for swap in self.swaps { + push_gauge_u64_with_device( + &mut metrics, + "system.paging.usage", + "By", + now, + &swap.name, + &[("used", swap.used), ("free", swap.free)], + "system.paging.state", + ); + push_gauge_ratio_with_device( + &mut metrics, + "system.paging.utilization", + "1", + now, + &swap.name, + swap.size, + &[("used", swap.used), ("free", swap.free)], + "system.paging.state", + ); + } + + if let Some(processes) = self.processes { + push_gauge_u64( + &mut metrics, + "system.process.count", + "{process}", + now, + &[ + ("running", processes.running), + ("blocked", processes.blocked), + ], + "process.state", + ); + push_sum_single_u64( + &mut metrics, + "system.process.created", + "{process}", + start, + now, + processes.created, + ); + } + + for disk in self.disks { + push_disk_sum( + &mut metrics, + "system.disk.io", + "By", + start, + now, + &disk, + DiskProjection::Bytes, + ); + push_disk_sum( + &mut metrics, + "system.disk.operations", + "{operation}", + start, + now, + &disk, + DiskProjection::Operations, + ); + push_disk_sum( + &mut metrics, + "system.disk.io_time", + "s", + start, + now, + &disk, + DiskProjection::IoTime, + ); + push_disk_sum( + &mut metrics, + "system.disk.operation_time", + "s", + start, + now, + &disk, + DiskProjection::OperationTime, + ); + push_disk_sum( + &mut metrics, + "system.disk.merged", + "{operation}", + start, + now, + &disk, + DiskProjection::Merged, + ); + } + + for network in self.networks { + push_network_sum( + &mut metrics, + "system.network.io", + "By", + start, + now, + &network, + NetworkProjection::Bytes, + ); + push_network_sum( + &mut metrics, + 
"system.network.packet.count", + "{packet}", + start, + now, + &network, + NetworkProjection::Packets, + ); + push_network_sum( + &mut metrics, + "system.network.packet.dropped", + "{packet}", + start, + now, + &network, + NetworkProjection::Dropped, + ); + push_network_sum( + &mut metrics, + "system.network.errors", + "{error}", + start, + now, + &network, + NetworkProjection::Errors, + ); + } + + ExportMetricsServiceRequest { + resource_metrics: vec![ResourceMetrics { + resource: Some(Resource { + attributes: self.resource.into_attributes(), + dropped_attributes_count: 0, + entity_refs: Vec::new(), + }), + scope_metrics: vec![ScopeMetrics { + scope: Some(InstrumentationScope { + name: "otap-df-core-nodes/host-metrics".to_owned(), + version: env!("CARGO_PKG_VERSION").to_owned(), + attributes: Vec::new(), + dropped_attributes_count: 0, + }), + metrics, + schema_url: String::new(), + }], + schema_url: String::new(), + }], + } + } +} + +#[derive(Default)] +struct HostResource { + host_id: Option, + host_name: Option, +} + +impl HostResource { + fn into_attributes(self) -> Vec { + let mut attributes = Vec::with_capacity(4); + attributes.push(kv_str("os.type", "linux")); + if let Some(host_id) = self.host_id { + attributes.push(kv_str("host.id", &host_id)); + } + if let Some(host_name) = self.host_name { + attributes.push(kv_str("host.name", &host_name)); + } + attributes + } +} + +#[derive(Copy, Clone, Default)] +struct CpuTimes { + user: f64, + nice: f64, + system: f64, + idle: f64, + wait: f64, + interrupt: f64, + steal: f64, +} + +#[derive(Clone, Default)] +struct CpuInfo { + logical_count: u64, + physical_count: u64, + frequencies_hz: Vec, +} + +#[derive(Copy, Clone, Default)] +struct StatSnapshot { + boot_time_unix_nano: u64, + cpu: Option, + processes: ProcessStats, +} + +#[derive(Copy, Clone, Default)] +struct MemoryStats { + total: u64, + used: u64, + free: u64, + available: u64, + cached: u64, + buffered: u64, + slab_reclaimable: u64, + slab_unreclaimable: 
u64, +} + +#[derive(Copy, Clone, Default)] +struct PagingStats { + minor_faults: u64, + major_faults: u64, + swap_in: u64, + swap_out: u64, +} + +#[derive(Default)] +struct SwapStats { + name: String, + size: u64, + used: u64, + free: u64, +} + +#[derive(Copy, Clone, Default)] +struct ProcessStats { + running: u64, + blocked: u64, + created: u64, +} + +#[derive(Default)] +struct DiskStats { + name: String, + read_bytes: u64, + write_bytes: u64, + read_ops: u64, + write_ops: u64, + read_merged: u64, + write_merged: u64, + read_time_seconds: f64, + write_time_seconds: f64, + io_time_seconds: f64, +} + +#[derive(Default)] +struct NetworkStats { + name: String, + rx_bytes: u64, + tx_bytes: u64, + rx_packets: u64, + tx_packets: u64, + rx_errors: u64, + tx_errors: u64, + rx_dropped: u64, + tx_dropped: u64, +} + +fn parse_stat(input: &str, clk_tck: f64) -> StatSnapshot { + let mut snapshot = StatSnapshot::default(); + for line in input.lines() { + if let Some(rest) = line.strip_prefix("cpu ") { + snapshot.cpu = parse_cpu_total(rest, clk_tck); + } else if let Some(value) = line.strip_prefix("btime ") { + snapshot.boot_time_unix_nano = parse_u64(value).saturating_mul(NANOS_PER_SEC); + } else if let Some(value) = line.strip_prefix("procs_running ") { + snapshot.processes.running = parse_u64(value); + } else if let Some(value) = line.strip_prefix("procs_blocked ") { + snapshot.processes.blocked = parse_u64(value); + } else if let Some(value) = line.strip_prefix("processes ") { + snapshot.processes.created = parse_u64(value); + } + } + snapshot +} + +fn parse_cpu_total(input: &str, clk_tck: f64) -> Option { + let mut fields = [0_u64; 10]; + let mut count = 0; + for (idx, token) in input.split_whitespace().take(fields.len()).enumerate() { + fields[idx] = parse_u64(token); + count += 1; + } + if count < 4 { + return None; + } + + let user = fields[0].saturating_sub(fields[8]); + let nice = fields[1].saturating_sub(fields[9]); + Some(CpuTimes { + user: ticks_to_seconds(user, 
clk_tck), + nice: ticks_to_seconds(nice, clk_tck), + system: ticks_to_seconds(fields[2], clk_tck), + idle: ticks_to_seconds(fields[3], clk_tck), + wait: ticks_to_seconds(fields[4], clk_tck), + interrupt: ticks_to_seconds(fields[5].saturating_add(fields[6]), clk_tck), + steal: ticks_to_seconds(fields[7], clk_tck), + }) +} + +fn parse_cpuinfo(input: &str) -> CpuInfo { + let mut logical_count = 0; + let mut frequencies_hz = Vec::new(); + let mut physical_cores = HashSet::new(); + let mut physical_id = None; + let mut core_id = None; + + for line in input.lines() { + let Some((key, value)) = line.split_once(':') else { + continue; + }; + let key = key.trim(); + let value = value.trim(); + match key { + "processor" => { + logical_count += 1; + if let (Some(physical), Some(core)) = (physical_id.take(), core_id.take()) { + let _ = physical_cores.insert((physical, core)); + } + } + "physical id" => physical_id = Some(parse_u64(value)), + "core id" => core_id = Some(parse_u64(value)), + "cpu MHz" => { + if let Ok(mhz) = value.parse::() { + frequencies_hz.push(mhz * 1_000_000.0); + } + } + _ => {} + } + } + if let (Some(physical), Some(core)) = (physical_id, core_id) { + let _ = physical_cores.insert((physical, core)); + } + + let physical_count = u64::try_from(physical_cores.len()) + .ok() + .filter(|count| *count != 0) + .unwrap_or(logical_count); + CpuInfo { + logical_count, + physical_count, + frequencies_hz, + } +} + +fn parse_meminfo(input: &str) -> Option { + let mut total = 0; + let mut free = 0; + let mut available = None; + let mut buffers = 0; + let mut cached = 0; + let mut slab_reclaimable = 0; + let mut slab_unreclaimable = 0; + + for line in input.lines() { + let mut fields = line.split_whitespace(); + let Some(key) = fields.next() else { + continue; + }; + let value = fields.next().map(parse_u64).unwrap_or_default() * BYTES_PER_KIB; + match key.trim_end_matches(':') { + "MemTotal" => total = value, + "MemFree" => free = value, + "MemAvailable" => available = 
Some(value), + "Buffers" => buffers = value, + "Cached" => cached = value, + "SReclaimable" => slab_reclaimable = value, + "SUnreclaim" => slab_unreclaimable = value, + _ => {} + } + } + + if total == 0 { + return None; + } + let available = + available.unwrap_or_else(|| free.saturating_add(buffers).saturating_add(cached)); + Some(MemoryStats { + total, + used: total.saturating_sub(available), + free, + available, + cached, + buffered: buffers, + slab_reclaimable, + slab_unreclaimable, + }) +} + +fn parse_uptime(input: &str) -> Option { + input.split_whitespace().next()?.parse().ok() +} + +fn parse_vmstat(input: &str) -> PagingStats { + let mut total_faults = 0; + let mut major_faults = 0; + let mut swap_in = 0; + let mut swap_out = 0; + + for line in input.lines() { + let mut fields = line.split_whitespace(); + let Some(key) = fields.next() else { + continue; + }; + let value = fields.next().map(parse_u64).unwrap_or_default(); + match key { + "pgfault" => total_faults = value, + "pgmajfault" => major_faults = value, + "pswpin" => swap_in = value, + "pswpout" => swap_out = value, + _ => {} + } + } + + PagingStats { + minor_faults: total_faults.saturating_sub(major_faults), + major_faults, + swap_in, + swap_out, + } +} + +fn parse_swaps(input: &str) -> Vec { + let mut swaps = Vec::new(); + for line in input.lines().skip(1) { + let mut fields = line.split_whitespace(); + let Some(name) = fields.next() else { + continue; + }; + let _kind = fields.next(); + let Some(size_kib) = fields.next() else { + continue; + }; + let Some(used_kib) = fields.next() else { + continue; + }; + let size = parse_u64(size_kib).saturating_mul(BYTES_PER_KIB); + let used = parse_u64(used_kib).saturating_mul(BYTES_PER_KIB); + swaps.push(SwapStats { + name: name.to_owned(), + size, + used, + free: size.saturating_sub(used), + }); + } + swaps +} + +fn parse_diskstats( + input: &str, + include: Option<&CompiledFilter>, + exclude: Option<&CompiledFilter>, +) -> Vec { + let mut disks = Vec::new(); 
+ for line in input.lines() { + let mut fields = line.split_whitespace(); + let _major = fields.next(); + let _minor = fields.next(); + let Some(name) = fields.next() else { + continue; + }; + if !filter_allows(name, include, exclude) { + continue; + } + let Some(read_ops) = fields.next() else { + continue; + }; + let Some(read_merged) = fields.next() else { + continue; + }; + let Some(read_sectors) = fields.next() else { + continue; + }; + let Some(read_ms) = fields.next() else { + continue; + }; + let Some(write_ops) = fields.next() else { + continue; + }; + let Some(write_merged) = fields.next() else { + continue; + }; + let Some(write_sectors) = fields.next() else { + continue; + }; + let Some(write_ms) = fields.next() else { + continue; + }; + let _in_progress = fields.next(); + let Some(io_ms) = fields.next() else { + continue; + }; + disks.push(DiskStats { + name: name.to_owned(), + read_ops: parse_u64(read_ops), + read_bytes: parse_u64(read_sectors).saturating_mul(DISKSTAT_SECTOR_BYTES), + write_ops: parse_u64(write_ops), + write_bytes: parse_u64(write_sectors).saturating_mul(DISKSTAT_SECTOR_BYTES), + read_merged: parse_u64(read_merged), + write_merged: parse_u64(write_merged), + read_time_seconds: millis_to_seconds(parse_u64(read_ms)), + write_time_seconds: millis_to_seconds(parse_u64(write_ms)), + io_time_seconds: millis_to_seconds(parse_u64(io_ms)), + }); + } + disks +} + +fn parse_netdev( + input: &str, + include: Option<&CompiledFilter>, + exclude: Option<&CompiledFilter>, +) -> Vec { + let mut interfaces = Vec::new(); + for line in input.lines().skip(2) { + let Some((name, values)) = line.split_once(':') else { + continue; + }; + let name = name.trim(); + if !filter_allows(name, include, exclude) { + continue; + } + let mut fields = values.split_whitespace(); + let Some(rx_bytes) = fields.next() else { + continue; + }; + let Some(rx_packets) = fields.next() else { + continue; + }; + let Some(rx_errors) = fields.next() else { + continue; + }; + let 
Some(rx_dropped) = fields.next() else { + continue; + }; + let _rx_fifo = fields.next(); + let _rx_frame = fields.next(); + let _rx_compressed = fields.next(); + let _rx_multicast = fields.next(); + let Some(tx_bytes) = fields.next() else { + continue; + }; + let Some(tx_packets) = fields.next() else { + continue; + }; + let Some(tx_errors) = fields.next() else { + continue; + }; + let Some(tx_dropped) = fields.next() else { + continue; + }; + interfaces.push(NetworkStats { + name: name.to_owned(), + rx_bytes: parse_u64(rx_bytes), + rx_packets: parse_u64(rx_packets), + tx_bytes: parse_u64(tx_bytes), + tx_packets: parse_u64(tx_packets), + rx_errors: parse_u64(rx_errors), + tx_errors: parse_u64(tx_errors), + rx_dropped: parse_u64(rx_dropped), + tx_dropped: parse_u64(tx_dropped), + }); + } + interfaces +} + +fn filter_allows( + value: &str, + include: Option<&CompiledFilter>, + exclude: Option<&CompiledFilter>, +) -> bool { + include.is_none_or(|filter| filter.matches(value)) + && !exclude.is_some_and(|filter| filter.matches(value)) +} + +fn push_gauge_f64( + metrics: &mut Vec, + name: &'static str, + unit: &'static str, + now: u64, + value: f64, +) { + metrics.push(Metric { + name: name.to_owned(), + description: String::new(), + unit: unit.to_owned(), + metadata: Vec::new(), + data: Some(metric::Data::Gauge(Gauge { + data_points: vec![number_point_f64(Vec::new(), 0, now, value)], + })), + }); +} + +fn push_gauge_u64( + metrics: &mut Vec, + name: &'static str, + unit: &'static str, + now: u64, + values: &[(&'static str, u64)], + attr_name: &'static str, +) { + let mut points = Vec::with_capacity(values.len()); + for (state, value) in values { + points.push(number_point_i64( + vec![kv_str(attr_name, state)], + 0, + now, + saturating_i64(*value), + )); + } + metrics.push(Metric { + name: name.to_owned(), + description: String::new(), + unit: unit.to_owned(), + metadata: Vec::new(), + data: Some(metric::Data::Gauge(Gauge { + data_points: points, + })), + }); +} + +fn 
push_gauge_u64_with_device( + metrics: &mut Vec, + name: &'static str, + unit: &'static str, + now: u64, + device: &str, + values: &[(&'static str, u64)], + attr_name: &'static str, +) { + let mut points = Vec::with_capacity(values.len()); + for (state, value) in values { + points.push(number_point_i64( + vec![kv_str("system.device", device), kv_str(attr_name, state)], + 0, + now, + saturating_i64(*value), + )); + } + metrics.push(Metric { + name: name.to_owned(), + description: String::new(), + unit: unit.to_owned(), + metadata: Vec::new(), + data: Some(metric::Data::Gauge(Gauge { + data_points: points, + })), + }); +} + +fn push_gauge_single_u64( + metrics: &mut Vec, + name: &'static str, + unit: &'static str, + now: u64, + value: u64, +) { + metrics.push(Metric { + name: name.to_owned(), + description: String::new(), + unit: unit.to_owned(), + metadata: Vec::new(), + data: Some(metric::Data::Gauge(Gauge { + data_points: vec![number_point_i64(Vec::new(), 0, now, saturating_i64(value))], + })), + }); +} + +fn push_cpu_frequency(metrics: &mut Vec, now: u64, frequencies_hz: &[f64]) { + if frequencies_hz.is_empty() { + return; + } + let mut points = Vec::with_capacity(frequencies_hz.len()); + for (idx, frequency) in frequencies_hz.iter().enumerate() { + points.push(number_point_f64( + vec![kv_str("cpu.logical_number", &idx.to_string())], + 0, + now, + *frequency, + )); + } + metrics.push(Metric { + name: "system.cpu.frequency".to_owned(), + description: String::new(), + unit: "Hz".to_owned(), + metadata: Vec::new(), + data: Some(metric::Data::Gauge(Gauge { + data_points: points, + })), + }); +} + +fn push_gauge_ratio( + metrics: &mut Vec, + name: &'static str, + unit: &'static str, + now: u64, + total: u64, + values: &[(&'static str, u64)], + attr_name: &'static str, +) { + if total == 0 { + return; + } + let total = total as f64; + let mut points = Vec::with_capacity(values.len()); + for (state, value) in values { + points.push(number_point_f64( + 
vec![kv_str(attr_name, state)], + 0, + now, + *value as f64 / total, + )); + } + metrics.push(Metric { + name: name.to_owned(), + description: String::new(), + unit: unit.to_owned(), + metadata: Vec::new(), + data: Some(metric::Data::Gauge(Gauge { + data_points: points, + })), + }); +} + +fn push_gauge_ratio_with_device( + metrics: &mut Vec, + name: &'static str, + unit: &'static str, + now: u64, + device: &str, + total: u64, + values: &[(&'static str, u64)], + attr_name: &'static str, +) { + if total == 0 { + return; + } + let total = total as f64; + let mut points = Vec::with_capacity(values.len()); + for (state, value) in values { + points.push(number_point_f64( + vec![kv_str("system.device", device), kv_str(attr_name, state)], + 0, + now, + *value as f64 / total, + )); + } + metrics.push(Metric { + name: name.to_owned(), + description: String::new(), + unit: unit.to_owned(), + metadata: Vec::new(), + data: Some(metric::Data::Gauge(Gauge { + data_points: points, + })), + }); +} + +fn push_sum_f64( + metrics: &mut Vec, + name: &'static str, + unit: &'static str, + start: u64, + now: u64, + values: &[(&'static str, f64)], + attr_name: &'static str, +) { + let mut points = Vec::with_capacity(values.len()); + for (state, value) in values { + points.push(number_point_f64( + vec![kv_str(attr_name, state)], + start, + now, + *value, + )); + } + push_sum_metric(metrics, name, unit, points); +} + +fn push_sum_u64( + metrics: &mut Vec, + name: &'static str, + unit: &'static str, + start: u64, + now: u64, + values: &[(&'static str, u64)], + attr_name: &'static str, +) { + let mut points = Vec::with_capacity(values.len()); + for (state, value) in values { + points.push(number_point_i64( + vec![kv_str(attr_name, state)], + start, + now, + saturating_i64(*value), + )); + } + push_sum_metric(metrics, name, unit, points); +} + +fn push_sum_single_u64( + metrics: &mut Vec, + name: &'static str, + unit: &'static str, + start: u64, + now: u64, + value: u64, +) { + push_sum_metric( 
+ metrics, + name, + unit, + vec![number_point_i64( + Vec::new(), + start, + now, + saturating_i64(value), + )], + ); +} + +fn push_disk_sum( + metrics: &mut Vec, + name: &'static str, + unit: &'static str, + start: u64, + now: u64, + disk: &DiskStats, + projection: DiskProjection, +) { + if let DiskProjection::IoTime = projection { + push_sum_metric( + metrics, + name, + unit, + vec![number_point_f64( + vec![kv_str("system.device", &disk.name)], + start, + now, + disk.io_time_seconds, + )], + ); + return; + } + + let (read, write) = match projection { + DiskProjection::Bytes => ( + DiskValue::Integer(disk.read_bytes), + DiskValue::Integer(disk.write_bytes), + ), + DiskProjection::Operations => ( + DiskValue::Integer(disk.read_ops), + DiskValue::Integer(disk.write_ops), + ), + DiskProjection::OperationTime => ( + DiskValue::Float(disk.read_time_seconds), + DiskValue::Float(disk.write_time_seconds), + ), + DiskProjection::Merged => ( + DiskValue::Integer(disk.read_merged), + DiskValue::Integer(disk.write_merged), + ), + DiskProjection::IoTime => unreachable!(), + }; + let points = vec![ + disk_number_point(&disk.name, "read", start, now, read), + disk_number_point(&disk.name, "write", start, now, write), + ]; + push_sum_metric(metrics, name, unit, points); +} + +#[derive(Copy, Clone)] +enum DiskProjection { + Bytes, + Operations, + IoTime, + OperationTime, + Merged, +} + +#[derive(Copy, Clone)] +enum DiskValue { + Integer(u64), + Float(f64), +} + +fn disk_number_point( + device: &str, + direction: &'static str, + start: u64, + now: u64, + value: DiskValue, +) -> NumberDataPoint { + let attributes = vec![ + kv_str("system.device", device), + kv_str("disk.io.direction", direction), + ]; + match value { + DiskValue::Integer(value) => { + number_point_i64(attributes, start, now, saturating_i64(value)) + } + DiskValue::Float(value) => number_point_f64(attributes, start, now, value), + } +} + +fn push_network_sum( + metrics: &mut Vec, + name: &'static str, + unit: 
&'static str, + start: u64, + now: u64, + network: &NetworkStats, + projection: NetworkProjection, +) { + let (rx, tx, interface_attr) = match projection { + NetworkProjection::Bytes => (network.rx_bytes, network.tx_bytes, "network.interface.name"), + NetworkProjection::Packets => (network.rx_packets, network.tx_packets, "system.device"), + NetworkProjection::Dropped => ( + network.rx_dropped, + network.tx_dropped, + "network.interface.name", + ), + NetworkProjection::Errors => ( + network.rx_errors, + network.tx_errors, + "network.interface.name", + ), + }; + let points = vec![ + number_point_i64( + vec![ + kv_str(interface_attr, &network.name), + kv_str("network.io.direction", "receive"), + ], + start, + now, + saturating_i64(rx), + ), + number_point_i64( + vec![ + kv_str(interface_attr, &network.name), + kv_str("network.io.direction", "transmit"), + ], + start, + now, + saturating_i64(tx), + ), + ]; + push_sum_metric(metrics, name, unit, points); +} + +#[derive(Copy, Clone)] +enum NetworkProjection { + Bytes, + Packets, + Dropped, + Errors, +} + +fn push_sum_metric( + metrics: &mut Vec, + name: &'static str, + unit: &'static str, + points: Vec, +) { + metrics.push(Metric { + name: name.to_owned(), + description: String::new(), + unit: unit.to_owned(), + metadata: Vec::new(), + data: Some(metric::Data::Sum(Sum { + data_points: points, + aggregation_temporality: AggregationTemporality::Cumulative.into(), + is_monotonic: true, + })), + }); +} + +fn number_point_f64( + attributes: Vec, + start_time_unix_nano: u64, + time_unix_nano: u64, + value: f64, +) -> NumberDataPoint { + NumberDataPoint { + attributes, + start_time_unix_nano, + time_unix_nano, + exemplars: Vec::new(), + flags: 0, + value: Some(number_data_point::Value::AsDouble(value)), + } +} + +fn number_point_i64( + attributes: Vec, + start_time_unix_nano: u64, + time_unix_nano: u64, + value: i64, +) -> NumberDataPoint { + NumberDataPoint { + attributes, + start_time_unix_nano, + time_unix_nano, + exemplars: 
Vec::new(), + flags: 0, + value: Some(number_data_point::Value::AsInt(value)), + } +} + +fn kv_str(key: &str, value: &str) -> KeyValue { + KeyValue { + key: key.to_owned(), + value: Some(AnyValue { + value: Some(any_value::Value::StringValue(value.to_owned())), + }), + } +} + +fn parse_u64(input: &str) -> u64 { + input.parse().unwrap_or_default() +} + +fn ticks_to_seconds(ticks: u64, clk_tck: f64) -> f64 { + ticks as f64 / clk_tck +} + +fn millis_to_seconds(ms: u64) -> f64 { + ms as f64 / 1_000.0 +} + +fn clock_ticks_per_second() -> f64 { + // Linux exposes CPU counters in USER_HZ. 100 is the common Linux value and + // keeps this receiver dependency-light until a platform helper is added. + 100.0 +} + +fn now_unix_nano() -> u64 { + let Ok(duration) = SystemTime::now().duration_since(UNIX_EPOCH) else { + return 0; + }; + duration.as_secs().saturating_mul(NANOS_PER_SEC) + u64::from(duration.subsec_nanos()) +} + +fn saturating_i64(value: u64) -> i64 { + i64::try_from(value).unwrap_or(i64::MAX) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn cpu_parser_accepts_missing_newer_fields() { + let cpu = parse_cpu_total("10 20 30 40", 10.0).expect("cpu row"); + assert_eq!(cpu.user, 1.0); + assert_eq!(cpu.nice, 2.0); + assert_eq!(cpu.system, 3.0); + assert_eq!(cpu.idle, 4.0); + assert_eq!(cpu.steal, 0.0); + } + + #[test] + fn cpu_parser_removes_guest_from_user_and_nice() { + let cpu = parse_cpu_total("100 50 30 40 5 2 3 7 10 4", 10.0).expect("cpu row"); + assert_eq!(cpu.user, 9.0); + assert_eq!(cpu.nice, 4.6); + assert_eq!(cpu.interrupt, 0.5); + } + + #[test] + fn memavailable_fallback_uses_free_buffers_cached() { + let memory = + parse_meminfo("MemTotal: 1000 kB\nMemFree: 100 kB\nBuffers: 20 kB\nCached: 30 kB\n") + .expect("memory"); + assert_eq!(memory.available, 150 * BYTES_PER_KIB); + assert_eq!(memory.used, 850 * BYTES_PER_KIB); + } + + #[test] + fn uptime_parser_reads_first_field() { + assert_eq!(parse_uptime("123.45 67.89"), Some(123.45)); + } + + 
#[test] + fn vmstat_parser_derives_minor_faults() { + let paging = parse_vmstat("pgfault 100\npgmajfault 7\npswpin 3\npswpout 4\n"); + assert_eq!(paging.minor_faults, 93); + assert_eq!(paging.major_faults, 7); + assert_eq!(paging.swap_in, 3); + assert_eq!(paging.swap_out, 4); + } + + #[test] + fn swaps_parser_reads_device_usage() { + let swaps = + parse_swaps("Filename Type Size Used Priority\n/dev/sda2 partition 200 50 -2\n"); + assert_eq!(swaps.len(), 1); + assert_eq!(swaps[0].name, "/dev/sda2"); + assert_eq!(swaps[0].used, 50 * BYTES_PER_KIB); + assert_eq!(swaps[0].free, 150 * BYTES_PER_KIB); + } + + #[test] + fn diskstats_parser_accepts_flush_columns() { + let disks = parse_diskstats("8 0 sda 1 0 2 3 4 0 5 6 0 0 0 0 0 0 0 0\n", None, None); + assert_eq!(disks.len(), 1); + assert_eq!(disks[0].name, "sda"); + assert_eq!(disks[0].read_bytes, 1024); + assert_eq!(disks[0].write_bytes, 2560); + } + + #[test] + fn diskstats_parser_applies_filters_before_parsing_values() { + let exclude = CompiledFilter::compile( + crate::receivers::host_metrics_receiver::MatchType::Glob, + vec!["loop*".to_owned()], + ) + .expect("valid") + .expect("filter"); + let disks = parse_diskstats( + "7 0 loop0 broken row\n8 0 sda 1 0 2 3 4 0 5 6 0 0 0 0 0 0 0 0\n", + None, + Some(&exclude), + ); + + assert_eq!(disks.len(), 1); + assert_eq!(disks[0].name, "sda"); + } + + #[test] + fn netdev_parser_reads_device_counters() { + let interfaces = parse_netdev( + "Inter-| Receive | Transmit\n face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n eth0: 10 2 0 0 0 0 0 0 30 4 0 0 0 0 0 0\n", + None, + None, + ); + assert_eq!(interfaces.len(), 1); + assert_eq!(interfaces[0].name, "eth0"); + assert_eq!(interfaces[0].rx_bytes, 10); + assert_eq!(interfaces[0].tx_packets, 4); + } + + #[test] + fn netdev_parser_applies_interface_filters() { + let include = CompiledFilter::compile( + crate::receivers::host_metrics_receiver::MatchType::Strict, + 
vec!["eth0".to_owned()], + ) + .expect("valid") + .expect("filter"); + let interfaces = parse_netdev( + "Inter-| Receive | Transmit\n face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n lo: 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0\n eth0: 10 2 3 4 0 0 0 0 30 4 5 6 0 0 0 0\n", + Some(&include), + None, + ); + + assert_eq!(interfaces.len(), 1); + assert_eq!(interfaces[0].name, "eth0"); + assert_eq!(interfaces[0].rx_errors, 3); + assert_eq!(interfaces[0].tx_dropped, 6); + } + + #[test] + fn root_path_uses_host_pid_one_netdev() { + let paths = ProcfsPaths::new(Some(Path::new("/host"))); + assert_eq!(paths.net_dev, PathBuf::from("/host/proc/1/net/dev")); + } + + #[test] + fn root_slash_uses_current_proc_netdev() { + let paths = ProcfsPaths::new(Some(Path::new("/"))); + assert_eq!(paths.net_dev, PathBuf::from("/proc/net/dev")); + } +} diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/mod.rs index f05f61a992..79af5c183e 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/mod.rs @@ -19,3 +19,6 @@ pub mod otap_receiver; /// OTLP receiver. pub mod otlp_receiver; + +/// Host metrics receiver. 
+pub mod host_metrics_receiver; From 1b894033b6b878ae060a090e1c54c87380c1880e Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 22:23:08 -0700 Subject: [PATCH 02/60] Add host metrics family scheduler --- .../receivers/host_metrics_receiver/mod.rs | 344 ++++++++++++++++-- .../receivers/host_metrics_receiver/procfs.rs | 41 ++- 2 files changed, 347 insertions(+), 38 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index f5f32cb123..8875185b7f 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -32,11 +32,11 @@ use std::path::{Component, Path, PathBuf}; use std::sync::Arc; use std::sync::{LazyLock, Mutex}; use std::time::Duration; -use tokio::time::interval; +use tokio::time::{Instant, sleep_until}; mod procfs; -use procfs::{HostSnapshot, ProcfsConfig, ProcfsSource}; +use procfs::{HostSnapshot, ProcfsConfig, ProcfsFamilies, ProcfsSource}; /// The URN for the host metrics receiver. pub const HOST_METRICS_RECEIVER_URN: &str = "urn:otel:receiver:host_metrics"; @@ -53,6 +53,10 @@ pub struct Config { #[serde(default = "default_collection_interval", with = "humantime_serde")] pub collection_interval: Duration, + /// Delay before the first scrape. + #[serde(default, with = "humantime_serde")] + pub initial_delay: Duration, + /// Optional host root path. In Kubernetes this is commonly `/host`. #[serde(default)] pub root_path: Option, @@ -66,6 +70,7 @@ impl Default for Config { fn default() -> Self { Self { collection_interval: default_collection_interval(), + initial_delay: Duration::ZERO, root_path: None, families: FamiliesConfig::default(), } @@ -110,11 +115,17 @@ impl FamiliesConfig { pub struct FamilyConfig { /// Enable this family. pub enabled: bool, + /// Family collection interval. 
Defaults to top-level `collection_interval`. + #[serde(default, with = "humantime_serde::option")] + pub interval: Option, } impl Default for FamilyConfig { fn default() -> Self { - Self { enabled: true } + Self { + enabled: true, + interval: None, + } } } @@ -124,6 +135,9 @@ impl Default for FamilyConfig { pub struct DiskFamilyConfig { /// Enable disk metrics. pub enabled: bool, + /// Family collection interval. Defaults to top-level `collection_interval`. + #[serde(default, with = "humantime_serde::option")] + pub interval: Option, /// Device include filter. pub include: Option, /// Device exclude filter. @@ -134,6 +148,7 @@ impl Default for DiskFamilyConfig { fn default() -> Self { Self { enabled: true, + interval: None, include: None, exclude: None, } @@ -146,6 +161,9 @@ impl Default for DiskFamilyConfig { pub struct NetworkFamilyConfig { /// Enable network metrics. pub enabled: bool, + /// Family collection interval. Defaults to top-level `collection_interval`. + #[serde(default, with = "humantime_serde::option")] + pub interval: Option, /// Interface include filter. pub include: Option, /// Interface exclude filter. @@ -158,6 +176,7 @@ impl Default for NetworkFamilyConfig { fn default() -> Self { Self { enabled: true, + interval: None, include: None, exclude: None, include_connection_count: false, @@ -171,6 +190,9 @@ impl Default for NetworkFamilyConfig { pub struct ProcessesFamilyConfig { /// Enable process summary metrics. pub enabled: bool, + /// Family collection interval. Defaults to top-level `collection_interval`. + #[serde(default, with = "humantime_serde::option")] + pub interval: Option, /// Only `summary` is supported in v1. 
pub mode: ProcessMode, } @@ -179,6 +201,7 @@ impl Default for ProcessesFamilyConfig { fn default() -> Self { Self { enabled: true, + interval: None, mode: ProcessMode::Summary, } } @@ -236,24 +259,31 @@ pub enum MatchType { #[derive(Clone)] struct RuntimeConfig { root_path: Option, - collection_interval: Duration, + initial_delay: Duration, families: RuntimeFamilies, } #[derive(Clone)] struct RuntimeFamilies { - cpu: bool, - memory: bool, - paging: bool, - system: bool, + cpu: RuntimeFamily, + memory: RuntimeFamily, + paging: RuntimeFamily, + system: RuntimeFamily, disk: RuntimeDiskFamily, network: RuntimeNetworkFamily, - processes: bool, + processes: RuntimeFamily, +} + +#[derive(Clone)] +struct RuntimeFamily { + enabled: bool, + interval: Duration, } #[derive(Clone)] struct RuntimeDiskFamily { enabled: bool, + interval: Duration, include: Option, exclude: Option, } @@ -261,6 +291,7 @@ struct RuntimeDiskFamily { #[derive(Clone)] struct RuntimeNetworkFamily { enabled: bool, + interval: Duration, include: Option, exclude: Option, } @@ -421,10 +452,58 @@ fn validate_config(config: &Config) -> Result<(), otap_df_config::error::Error> error: "network include_connection_count is not supported in v1".to_owned(), }); } + validate_family_interval( + "cpu", + config.families.cpu.enabled, + config.families.cpu.interval, + )?; + validate_family_interval( + "memory", + config.families.memory.enabled, + config.families.memory.interval, + )?; + validate_family_interval( + "paging", + config.families.paging.enabled, + config.families.paging.interval, + )?; + validate_family_interval( + "system", + config.families.system.enabled, + config.families.system.interval, + )?; + validate_family_interval( + "disk", + config.families.disk.enabled, + config.families.disk.interval, + )?; + validate_family_interval( + "network", + config.families.network.enabled, + config.families.network.interval, + )?; + validate_family_interval( + "processes", + config.families.processes.enabled, + 
config.families.processes.interval, + )?; let _ = normalized_root_path(config.root_path.as_deref())?; Ok(()) } +fn validate_family_interval( + family: &'static str, + enabled: bool, + interval: Option, +) -> Result<(), otap_df_config::error::Error> { + if enabled && interval.is_some_and(|interval| interval.is_zero()) { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: format!("{family} interval must be greater than zero"), + }); + } + Ok(()) +} + impl TryFrom for RuntimeConfig { type Error = otap_df_config::error::Error; @@ -461,28 +540,171 @@ impl TryFrom for RuntimeConfig { Ok(Self { root_path: config.root_path, - collection_interval: config.collection_interval, + initial_delay: config.initial_delay, families: RuntimeFamilies { - cpu: config.families.cpu.enabled, - memory: config.families.memory.enabled, - paging: config.families.paging.enabled, - system: config.families.system.enabled, + cpu: RuntimeFamily::new(&config.families.cpu, config.collection_interval), + memory: RuntimeFamily::new(&config.families.memory, config.collection_interval), + paging: RuntimeFamily::new(&config.families.paging, config.collection_interval), + system: RuntimeFamily::new(&config.families.system, config.collection_interval), disk: RuntimeDiskFamily { enabled: config.families.disk.enabled, + interval: config + .families + .disk + .interval + .unwrap_or(config.collection_interval), include: disk_include, exclude: disk_exclude, }, network: RuntimeNetworkFamily { enabled: config.families.network.enabled, + interval: config + .families + .network + .interval + .unwrap_or(config.collection_interval), include: network_include, exclude: network_exclude, }, - processes: config.families.processes.enabled, + processes: RuntimeFamily { + enabled: config.families.processes.enabled, + interval: config + .families + .processes + .interval + .unwrap_or(config.collection_interval), + }, }, }) } } +impl RuntimeFamily { + fn new(config: &FamilyConfig, default_interval: Duration) 
-> Self { + Self { + enabled: config.enabled, + interval: config.interval.unwrap_or(default_interval), + } + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum ScheduledFamilyKind { + Cpu, + Memory, + Paging, + System, + Disk, + Network, + Processes, +} + +struct ScheduledFamily { + kind: ScheduledFamilyKind, + interval: Duration, + next_due: Instant, +} + +struct FamilyScheduler { + entries: Vec, +} + +impl FamilyScheduler { + fn new(config: &RuntimeConfig, now: Instant) -> Self { + let first_due = now + config.initial_delay; + let mut entries = Vec::with_capacity(7); + push_scheduled( + &mut entries, + ScheduledFamilyKind::Cpu, + &config.families.cpu, + first_due, + ); + push_scheduled( + &mut entries, + ScheduledFamilyKind::Memory, + &config.families.memory, + first_due, + ); + push_scheduled( + &mut entries, + ScheduledFamilyKind::Paging, + &config.families.paging, + first_due, + ); + push_scheduled( + &mut entries, + ScheduledFamilyKind::System, + &config.families.system, + first_due, + ); + if config.families.disk.enabled { + entries.push(ScheduledFamily { + kind: ScheduledFamilyKind::Disk, + interval: config.families.disk.interval, + next_due: first_due, + }); + } + if config.families.network.enabled { + entries.push(ScheduledFamily { + kind: ScheduledFamilyKind::Network, + interval: config.families.network.interval, + next_due: first_due, + }); + } + push_scheduled( + &mut entries, + ScheduledFamilyKind::Processes, + &config.families.processes, + first_due, + ); + Self { entries } + } + + fn next_due(&self) -> Instant { + self.entries + .iter() + .map(|entry| entry.next_due) + .min() + .expect("scheduler has at least one enabled family") + } + + fn mark_due(&mut self, now: Instant) -> ProcfsFamilies { + let mut due = ProcfsFamilies::default(); + for entry in &mut self.entries { + if entry.next_due <= now { + match entry.kind { + ScheduledFamilyKind::Cpu => due.cpu = true, + ScheduledFamilyKind::Memory => due.memory = true, + 
ScheduledFamilyKind::Paging => due.paging = true, + ScheduledFamilyKind::System => due.system = true, + ScheduledFamilyKind::Disk => due.disk = true, + ScheduledFamilyKind::Network => due.network = true, + ScheduledFamilyKind::Processes => due.processes = true, + } + while entry.next_due <= now { + entry.next_due += entry.interval; + } + } + } + due + } +} + +fn push_scheduled( + entries: &mut Vec, + kind: ScheduledFamilyKind, + family: &RuntimeFamily, + first_due: Instant, +) { + if family.enabled { + entries.push(ScheduledFamily { + kind, + interval: family.interval, + next_due: first_due, + }); + } +} + static HOST_METRICS_LEASES: LazyLock>> = LazyLock::new(|| Mutex::new(HashSet::new())); @@ -554,13 +776,13 @@ impl local::Receiver for HostMetricsReceiver { let mut source = ProcfsSource::new( self.config.root_path.as_deref(), ProcfsConfig { - cpu: self.config.families.cpu, - memory: self.config.families.memory, - paging: self.config.families.paging, - system: self.config.families.system, + cpu: self.config.families.cpu.enabled, + memory: self.config.families.memory.enabled, + paging: self.config.families.paging.enabled, + system: self.config.families.system.enabled, disk: self.config.families.disk.enabled, network: self.config.families.network.enabled, - processes: self.config.families.processes, + processes: self.config.families.processes.enabled, disk_include: self.config.families.disk.include.clone(), disk_exclude: self.config.families.disk.exclude.clone(), network_include: self.config.families.network.include.clone(), @@ -573,7 +795,7 @@ impl local::Receiver for HostMetricsReceiver { error: format!("failed to validate host metrics procfs sources: {err}"), source_detail: String::new(), })?; - let mut ticker = interval(self.config.collection_interval); + let mut scheduler = FamilyScheduler::new(&self.config, Instant::now()); let _ = effect_handler .start_periodic_telemetry(Duration::from_secs(1)) @@ -600,8 +822,9 @@ impl local::Receiver for HostMetricsReceiver 
{ } } - _ = ticker.tick() => { - match source.scrape() { + _ = sleep_until(scheduler.next_due()) => { + let due = scheduler.mark_due(Instant::now()); + match source.scrape_due(due) { Ok(snapshot) => { let pdata = encode_snapshot(snapshot).map_err(|err| Error::ReceiverError { receiver: effect_handler.receiver_id(), @@ -670,10 +893,22 @@ mod tests { fn rejects_all_families_disabled() { let config = Config { families: FamiliesConfig { - cpu: FamilyConfig { enabled: false }, - memory: FamilyConfig { enabled: false }, - paging: FamilyConfig { enabled: false }, - system: FamilyConfig { enabled: false }, + cpu: FamilyConfig { + enabled: false, + ..FamilyConfig::default() + }, + memory: FamilyConfig { + enabled: false, + ..FamilyConfig::default() + }, + paging: FamilyConfig { + enabled: false, + ..FamilyConfig::default() + }, + system: FamilyConfig { + enabled: false, + ..FamilyConfig::default() + }, disk: DiskFamilyConfig { enabled: false, ..DiskFamilyConfig::default() @@ -715,6 +950,61 @@ mod tests { )); } + #[test] + fn rejects_zero_enabled_family_interval() { + let config = Config { + families: FamiliesConfig { + cpu: FamilyConfig { + interval: Some(Duration::ZERO), + ..FamilyConfig::default() + }, + ..FamiliesConfig::default() + }, + ..Config::default() + }; + + assert!(matches!( + validate_config(&config), + Err(otap_df_config::error::Error::InvalidUserConfig { .. 
}) + )); + } + + #[test] + fn scheduler_honors_initial_delay_and_family_intervals() { + let config = Config { + collection_interval: Duration::from_secs(10), + initial_delay: Duration::from_secs(1), + families: FamiliesConfig { + cpu: FamilyConfig { + interval: Some(Duration::from_secs(5)), + ..FamilyConfig::default() + }, + ..FamiliesConfig::default() + }, + ..Config::default() + }; + let config = RuntimeConfig::try_from(config).expect("valid config"); + let now = Instant::now(); + let mut scheduler = FamilyScheduler::new(&config, now); + + assert_eq!(scheduler.next_due(), now + Duration::from_secs(1)); + assert_eq!( + scheduler.mark_due(now), + ProcfsFamilies::default(), + "nothing is due before initial_delay" + ); + + let first_due = scheduler.mark_due(now + Duration::from_secs(1)); + assert!(first_due.cpu); + assert!(first_due.memory); + assert!(first_due.disk); + + let second_due = scheduler.mark_due(now + Duration::from_secs(6)); + assert!(second_due.cpu); + assert!(!second_due.memory); + assert!(!second_due.disk); + } + #[test] fn glob_filter_matches_without_regex_allocations() { let filter = CompiledFilter::compile(MatchType::Glob, vec!["loop*".to_owned()]) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index a6def64f40..3ad78ac3b1 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -57,6 +57,25 @@ pub struct ProcfsConfig { pub network_exclude: Option, } +/// Families due for one scrape. +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub struct ProcfsFamilies { + /// CPU metrics. + pub cpu: bool, + /// Memory metrics. + pub memory: bool, + /// Paging metrics. + pub paging: bool, + /// System metrics. + pub system: bool, + /// Disk metrics. + pub disk: bool, + /// Network metrics. 
+ pub network: bool, + /// Process summary metrics. + pub processes: bool, +} + impl ProcfsSource { /// Creates a procfs source rooted at `/` or at a host root bind mount. pub fn new(root_path: Option<&Path>, config: ProcfsConfig) -> io::Result { @@ -70,11 +89,11 @@ impl ProcfsSource { Ok(source) } - /// Collects one host snapshot. - pub fn scrape(&mut self) -> io::Result { + /// Collects one host snapshot for the due family set. + pub fn scrape_due(&mut self, due: ProcfsFamilies) -> io::Result { let now_unix_nano = now_unix_nano(); let clk_tck = self.clk_tck; - let needs_stat = self.config.cpu || self.config.system || self.config.processes; + let needs_stat = due.cpu || due.system || due.processes; let stat = if needs_stat { let proc_stat = self.read_path(PathKind::Stat)?; parse_stat(proc_stat, clk_tck) @@ -82,42 +101,42 @@ impl ProcfsSource { StatSnapshot::default() }; - let cpuinfo = if self.config.cpu { + let cpuinfo = if due.cpu { let cpuinfo = self.read_path(PathKind::Cpuinfo)?; parse_cpuinfo(cpuinfo) } else { CpuInfo::default() }; - let memory = if self.config.memory { + let memory = if due.memory { let meminfo = self.read_path(PathKind::Meminfo)?; parse_meminfo(meminfo) } else { None }; - let uptime_seconds = if self.config.system { + let uptime_seconds = if due.system { let uptime = self.read_path(PathKind::Uptime)?; parse_uptime(uptime) } else { None }; - let paging = if self.config.paging { + let paging = if due.paging { let vmstat = self.read_path(PathKind::Vmstat)?; Some(parse_vmstat(vmstat)) } else { None }; - let swaps = if self.config.paging { + let swaps = if due.paging { let swaps = self.read_path(PathKind::Swaps)?; parse_swaps(swaps) } else { Vec::new() }; - let disks = if self.config.disk { + let disks = if due.disk { let disk_include = self.config.disk_include.clone(); let disk_exclude = self.config.disk_exclude.clone(); let diskstats = self.read_path(PathKind::Diskstats)?; @@ -126,7 +145,7 @@ impl ProcfsSource { Vec::new() }; - let networks = 
if self.config.network { + let networks = if due.network { let network_include = self.config.network_include.clone(); let network_exclude = self.config.network_exclude.clone(); let netdev = self.read_path(PathKind::NetDev)?; @@ -146,7 +165,7 @@ impl ProcfsSource { uptime_seconds, paging, swaps, - processes: self.config.processes.then_some(stat.processes), + processes: due.processes.then_some(stat.processes), disks, networks, resource, From 8f99645178200b50ad76b29f8cd4a42ac9222452 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 22:35:54 -0700 Subject: [PATCH 03/60] Add host view validation modes --- .../receivers/host_metrics_receiver/mod.rs | 105 +++++++++++++++++- .../receivers/host_metrics_receiver/procfs.rs | 62 ++++++++++- 2 files changed, 158 insertions(+), 9 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 8875185b7f..c916ef95dd 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -45,6 +45,10 @@ fn default_collection_interval() -> Duration { Duration::from_secs(10) } +fn default_root_path() -> PathBuf { + PathBuf::from("/") +} + /// Configuration for the host metrics receiver. #[derive(Clone, Debug, Deserialize, Serialize)] #[serde(deny_unknown_fields)] @@ -57,10 +61,14 @@ pub struct Config { #[serde(default, with = "humantime_serde")] pub initial_delay: Duration, - /// Optional host root path. In Kubernetes this is commonly `/host`. + /// Optional legacy host root path. Prefer `host_view.root_path`. #[serde(default)] pub root_path: Option, + /// Host filesystem view. + #[serde(default)] + pub host_view: HostViewConfig, + /// Metric family configuration. 
#[serde(default)] pub families: FamiliesConfig, @@ -72,11 +80,45 @@ impl Default for Config { collection_interval: default_collection_interval(), initial_delay: Duration::ZERO, root_path: None, + host_view: HostViewConfig::default(), families: FamiliesConfig::default(), } } } +/// Host filesystem view configuration. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct HostViewConfig { + /// Root path for the observed host filesystem. + #[serde(default = "default_root_path")] + pub root_path: PathBuf, + /// Startup validation mode. + pub validation: HostViewValidationMode, +} + +impl Default for HostViewConfig { + fn default() -> Self { + Self { + root_path: default_root_path(), + validation: HostViewValidationMode::FailSelected, + } + } +} + +/// Host view startup validation mode. +#[derive(Clone, Copy, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum HostViewValidationMode { + /// Fail startup if selected sources are unavailable. + #[default] + FailSelected, + /// Start and disable unavailable selected sources. + WarnSelected, + /// Skip startup validation. + None, +} + /// Metric family configuration. #[derive(Clone, Debug, Default, Deserialize, Serialize)] #[serde(default, deny_unknown_fields)] @@ -258,7 +300,8 @@ pub enum MatchType { #[derive(Clone)] struct RuntimeConfig { - root_path: Option, + root_path: PathBuf, + validation: HostViewValidationMode, initial_delay: Duration, families: RuntimeFamilies, } @@ -415,7 +458,7 @@ pub static HOST_METRICS_RECEIVER: ReceiverFactory = ReceiverFactory { impl HostMetricsReceiver { /// Creates a new host metrics receiver. 
pub fn new(config: Config) -> Result { - let root_path = normalized_root_path(config.root_path.as_deref())?; + let root_path = normalized_root_path(Some(effective_root_path(&config)?))?; let lease = HostMetricsLease::acquire(root_path)?; let config = RuntimeConfig::try_from(config)?; Ok(Self { @@ -487,10 +530,25 @@ fn validate_config(config: &Config) -> Result<(), otap_df_config::error::Error> config.families.processes.enabled, config.families.processes.interval, )?; - let _ = normalized_root_path(config.root_path.as_deref())?; + let _ = normalized_root_path(Some(effective_root_path(config)?))?; Ok(()) } +fn effective_root_path(config: &Config) -> Result<&Path, otap_df_config::error::Error> { + if let Some(root_path) = config.root_path.as_deref() { + let host_view_root = config.host_view.root_path.as_path(); + if host_view_root != Path::new("/") && root_path != host_view_root { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: "root_path and host_view.root_path cannot both be set to different values" + .to_owned(), + }); + } + Ok(root_path) + } else { + Ok(config.host_view.root_path.as_path()) + } +} + fn validate_family_interval( family: &'static str, enabled: bool, @@ -509,6 +567,7 @@ impl TryFrom for RuntimeConfig { fn try_from(config: Config) -> Result { validate_config(&config)?; + let root_path = normalized_root_path(Some(effective_root_path(&config)?))?; let disk_include = config .families .disk @@ -539,7 +598,8 @@ impl TryFrom for RuntimeConfig { .flatten(); Ok(Self { - root_path: config.root_path, + root_path, + validation: config.host_view.validation, initial_delay: config.initial_delay, families: RuntimeFamilies { cpu: RuntimeFamily::new(&config.families.cpu, config.collection_interval), @@ -774,7 +834,7 @@ impl local::Receiver for HostMetricsReceiver { effect_handler: local::EffectHandler, ) -> Result { let mut source = ProcfsSource::new( - self.config.root_path.as_deref(), + Some(self.config.root_path.as_path()), ProcfsConfig { 
cpu: self.config.families.cpu.enabled, memory: self.config.families.memory.enabled, @@ -787,6 +847,7 @@ impl local::Receiver for HostMetricsReceiver { disk_exclude: self.config.families.disk.exclude.clone(), network_include: self.config.families.network.include.clone(), network_exclude: self.config.families.network.exclude.clone(), + validation: self.config.validation, }, ) .map_err(|err| Error::ReceiverError { @@ -969,6 +1030,38 @@ mod tests { )); } + #[test] + fn rejects_conflicting_root_paths() { + let config = Config { + root_path: Some(PathBuf::from("/host")), + host_view: HostViewConfig { + root_path: PathBuf::from("/container"), + ..HostViewConfig::default() + }, + ..Config::default() + }; + + assert!(matches!( + validate_config(&config), + Err(otap_df_config::error::Error::InvalidUserConfig { .. }) + )); + } + + #[test] + fn accepts_host_view_validation_modes() { + let config: Config = serde_json::from_value(serde_json::json!({ + "host_view": { + "validation": "warn_selected" + } + })) + .expect("valid host view config"); + + assert_eq!( + config.host_view.validation, + HostViewValidationMode::WarnSelected + ); + } + #[test] fn scheduler_honors_initial_delay_and_family_intervals() { let config = Config { diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 3ad78ac3b1..cffcece61c 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -3,7 +3,7 @@ //! Linux procfs-backed host metric source. 
-use crate::receivers::host_metrics_receiver::CompiledFilter; +use crate::receivers::host_metrics_receiver::{CompiledFilter, HostViewValidationMode}; use otap_df_pdata::proto::opentelemetry::collector::metrics::v1::ExportMetricsServiceRequest; use otap_df_pdata::proto::opentelemetry::common::v1::{ AnyValue, InstrumentationScope, KeyValue, any_value, @@ -55,6 +55,8 @@ pub struct ProcfsConfig { pub network_include: Option, /// Network exclude filter. pub network_exclude: Option, + /// Startup validation mode. + pub validation: HostViewValidationMode, } /// Families due for one scrape. @@ -79,18 +81,27 @@ pub struct ProcfsFamilies { impl ProcfsSource { /// Creates a procfs source rooted at `/` or at a host root bind mount. pub fn new(root_path: Option<&Path>, config: ProcfsConfig) -> io::Result { - let source = Self { + let mut source = Self { paths: ProcfsPaths::new(root_path), config, buf: String::with_capacity(16 * 1024), clk_tck: clock_ticks_per_second(), }; - source.validate_selected_paths()?; + source.apply_startup_validation()?; Ok(source) } /// Collects one host snapshot for the due family set. 
pub fn scrape_due(&mut self, due: ProcfsFamilies) -> io::Result { + let due = ProcfsFamilies { + cpu: due.cpu && self.config.cpu, + memory: due.memory && self.config.memory, + paging: due.paging && self.config.paging, + system: due.system && self.config.system, + disk: due.disk && self.config.disk, + network: due.network && self.config.network, + processes: due.processes && self.config.processes, + }; let now_unix_nano = now_unix_nano(); let clk_tck = self.clk_tck; let needs_stat = due.cpu || due.system || due.processes; @@ -172,6 +183,17 @@ impl ProcfsSource { }) } + fn apply_startup_validation(&mut self) -> io::Result<()> { + match self.config.validation { + HostViewValidationMode::None => Ok(()), + HostViewValidationMode::FailSelected => self.validate_selected_paths(), + HostViewValidationMode::WarnSelected => { + self.disable_unavailable_sources(); + Ok(()) + } + } + } + fn validate_selected_paths(&self) -> io::Result<()> { if self.config.cpu || self.config.system || self.config.processes { let _ = File::open(self.paths.path(PathKind::Stat))?; @@ -198,6 +220,40 @@ impl ProcfsSource { Ok(()) } + fn disable_unavailable_sources(&mut self) { + if (self.config.cpu || self.config.system || self.config.processes) + && !self.source_available(PathKind::Stat) + { + self.config.cpu = false; + self.config.system = false; + self.config.processes = false; + } + if self.config.cpu && !self.source_available(PathKind::Cpuinfo) { + self.config.cpu = false; + } + if self.config.memory && !self.source_available(PathKind::Meminfo) { + self.config.memory = false; + } + if self.config.system && !self.source_available(PathKind::Uptime) { + self.config.system = false; + } + if self.config.paging + && (!self.source_available(PathKind::Vmstat) || !self.source_available(PathKind::Swaps)) + { + self.config.paging = false; + } + if self.config.disk && !self.source_available(PathKind::Diskstats) { + self.config.disk = false; + } + if self.config.network && 
!self.source_available(PathKind::NetDev) { + self.config.network = false; + } + } + + fn source_available(&self, kind: PathKind) -> bool { + File::open(self.paths.path(kind)).is_ok() + } + fn read_path(&mut self, kind: PathKind) -> io::Result<&str> { self.buf.clear(); let mut file = File::open(self.paths.path(kind))?; From 6985bbed3a98bb1c25600972aef9df3a871a8898 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 22:42:06 -0700 Subject: [PATCH 04/60] Add host metrics receiver telemetry --- .../receivers/host_metrics_receiver/mod.rs | 179 ++++++++++++++---- 1 file changed, 142 insertions(+), 37 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index c916ef95dd..bee0462e8c 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -21,8 +21,10 @@ use otap_df_otap::OTAP_RECEIVER_FACTORIES; use otap_df_otap::pdata::{Context, OtapPdata}; use otap_df_pdata::OtlpProtoBytes; use otap_df_pdata::otap::OtapArrowRecords; -use otap_df_telemetry::metrics::MetricSetSnapshot; +use otap_df_telemetry::instrument::{Counter, Mmsc}; +use otap_df_telemetry::metrics::{MetricSet, MetricSetSnapshot}; use otap_df_telemetry::{otel_info, otel_warn}; +use otap_df_telemetry_macros::metric_set; use prost::Message as _; use regex::Regex; use serde::{Deserialize, Serialize}; @@ -31,7 +33,7 @@ use std::collections::HashSet; use std::path::{Component, Path, PathBuf}; use std::sync::Arc; use std::sync::{LazyLock, Mutex}; -use std::time::Duration; +use std::time::{Duration, Instant as StdInstant}; use tokio::time::{Instant, sleep_until}; mod procfs; @@ -41,6 +43,36 @@ use procfs::{HostSnapshot, ProcfsConfig, ProcfsFamilies, ProcfsSource}; /// The URN for the host metrics receiver. 
pub const HOST_METRICS_RECEIVER_URN: &str = "urn:otel:receiver:host_metrics"; +/// Telemetry metrics for the host metrics receiver. +#[metric_set(name = "host_metrics.receiver.metrics")] +#[derive(Debug, Default, Clone)] +pub struct HostMetricsReceiverMetrics { + /// Number of scrape ticks started. + #[metric(unit = "{scrape}")] + pub scrapes_started: Counter, + /// Number of scrape ticks that built and sent a metrics batch. + #[metric(unit = "{scrape}")] + pub scrapes_completed: Counter, + /// Number of fatal scrape failures. + #[metric(unit = "{scrape}")] + pub scrapes_failed: Counter, + /// Number of due metric families processed. + #[metric(unit = "{family}")] + pub families_scraped: Counter, + /// Wall-clock scrape duration. + #[metric(unit = "ns")] + pub scrape_duration_ns: Mmsc, + /// Delay between scheduled and actual scrape start. + #[metric(unit = "ns")] + pub scrape_lag_ns: Mmsc, + /// Number of batches sent downstream. + #[metric(unit = "{batch}")] + pub batches_sent: Counter, + /// Number of downstream send failures. 
+ #[metric(unit = "{error}")] + pub send_failures: Counter, +} + fn default_collection_interval() -> Duration { Duration::from_secs(10) } @@ -421,6 +453,7 @@ fn glob_matches(pattern: &[u8], value: &[u8]) -> bool { pub struct HostMetricsReceiver { config: RuntimeConfig, _lease: HostMetricsLease, + metrics: Option>, } #[allow(unsafe_code)] @@ -437,8 +470,10 @@ pub static HOST_METRICS_RECEIVER: ReceiverFactory = ReceiverFactory { error: "host-wide collection must run in a one-core source pipeline; use receiver:host_metrics -> exporter:topic and fan out downstream".to_owned(), }); } + let mut receiver = HostMetricsReceiver::from_config(&node_config.config)?; + receiver.metrics = Some(pipeline.register_metrics::()); Ok(ReceiverWrapper::local( - HostMetricsReceiver::from_config(&node_config.config)?, + receiver, node, node_config, receiver_config, @@ -464,6 +499,7 @@ impl HostMetricsReceiver { Ok(Self { config, _lease: lease, + metrics: None, }) } @@ -562,6 +598,35 @@ fn validate_family_interval( Ok(()) } +fn duration_nanos(duration: Duration) -> f64 { + duration.as_secs_f64() * 1e9 +} + +fn elapsed_nanos(start: StdInstant) -> f64 { + duration_nanos(start.elapsed()) +} + +fn terminal_state( + deadline: StdInstant, + metrics: &Option>, +) -> TerminalState { + if let Some(metrics) = metrics { + TerminalState::new(deadline, [metrics.snapshot()]) + } else { + TerminalState::new::<[MetricSetSnapshot; 0]>(deadline, []) + } +} + +fn due_family_count(due: ProcfsFamilies) -> u64 { + u64::from(due.cpu) + + u64::from(due.memory) + + u64::from(due.paging) + + u64::from(due.system) + + u64::from(due.disk) + + u64::from(due.network) + + u64::from(due.processes) +} + impl TryFrom for RuntimeConfig { type Error = otap_df_config::error::Error; @@ -833,21 +898,26 @@ impl local::Receiver for HostMetricsReceiver { mut ctrl_msg_recv: local::ControlChannel, effect_handler: local::EffectHandler, ) -> Result { + let HostMetricsReceiver { + config, + _lease, + mut metrics, + } = *self; let mut 
source = ProcfsSource::new( - Some(self.config.root_path.as_path()), + Some(config.root_path.as_path()), ProcfsConfig { - cpu: self.config.families.cpu.enabled, - memory: self.config.families.memory.enabled, - paging: self.config.families.paging.enabled, - system: self.config.families.system.enabled, - disk: self.config.families.disk.enabled, - network: self.config.families.network.enabled, - processes: self.config.families.processes.enabled, - disk_include: self.config.families.disk.include.clone(), - disk_exclude: self.config.families.disk.exclude.clone(), - network_include: self.config.families.network.include.clone(), - network_exclude: self.config.families.network.exclude.clone(), - validation: self.config.validation, + cpu: config.families.cpu.enabled, + memory: config.families.memory.enabled, + paging: config.families.paging.enabled, + system: config.families.system.enabled, + disk: config.families.disk.enabled, + network: config.families.network.enabled, + processes: config.families.processes.enabled, + disk_include: config.families.disk.include.clone(), + disk_exclude: config.families.disk.exclude.clone(), + network_include: config.families.network.include.clone(), + network_exclude: config.families.network.exclude.clone(), + validation: config.validation, }, ) .map_err(|err| Error::ReceiverError { @@ -856,7 +926,7 @@ impl local::Receiver for HostMetricsReceiver { error: format!("failed to validate host metrics procfs sources: {err}"), source_detail: String::new(), })?; - let mut scheduler = FamilyScheduler::new(&self.config, Instant::now()); + let mut scheduler = FamilyScheduler::new(&config, Instant::now()); let _ = effect_handler .start_periodic_telemetry(Duration::from_secs(1)) @@ -872,11 +942,11 @@ impl local::Receiver for HostMetricsReceiver { Ok(NodeControlMsg::DrainIngress { deadline, .. 
}) => { otel_info!("host_metrics_receiver.drain_ingress"); effect_handler.notify_receiver_drained().await?; - return Ok(TerminalState::new::<[MetricSetSnapshot; 0]>(deadline, [])); + return Ok(terminal_state(deadline, &metrics)); } Ok(NodeControlMsg::Shutdown { deadline, .. }) => { otel_info!("host_metrics_receiver.shutdown"); - return Ok(TerminalState::new::<[MetricSetSnapshot; 0]>(deadline, [])); + return Ok(terminal_state(deadline, &metrics)); } Err(e) => return Err(Error::ChannelRecvError(e)), _ => {} @@ -884,32 +954,67 @@ impl local::Receiver for HostMetricsReceiver { } _ = sleep_until(scheduler.next_due()) => { - let due = scheduler.mark_due(Instant::now()); + let scheduled_due = scheduler.next_due(); + let now = Instant::now(); + let due = scheduler.mark_due(now); + let scrape_start = StdInstant::now(); + if let Some(metrics) = metrics.as_mut() { + metrics.scrapes_started.add(1); + metrics.families_scraped.add(due_family_count(due)); + metrics.scrape_lag_ns.record(duration_nanos(now.saturating_duration_since(scheduled_due))); + } match source.scrape_due(due) { Ok(snapshot) => { - let pdata = encode_snapshot(snapshot).map_err(|err| Error::ReceiverError { - receiver: effect_handler.receiver_id(), - kind: ReceiverErrorKind::Other, - error: format!("failed to encode host metrics: {err}"), - source_detail: String::new(), - })?; - if let Err(err) = effect_handler.try_send_message_with_source_node(pdata) { - match err { - TypedError::ChannelSendError(_) => { - otel_warn!("host metrics dropped due to downstream backpressure"); + let pdata = match encode_snapshot(snapshot) { + Ok(pdata) => pdata, + Err(err) => { + if let Some(metrics) = metrics.as_mut() { + metrics.scrapes_failed.add(1); + metrics.scrape_duration_ns.record(elapsed_nanos(scrape_start)); } - other => { - return Err(Error::ReceiverError { - receiver: effect_handler.receiver_id(), - kind: ReceiverErrorKind::Other, - error: format!("failed to send host metrics: {other}"), - source_detail: String::new(), - 
}); + return Err(Error::ReceiverError { + receiver: effect_handler.receiver_id(), + kind: ReceiverErrorKind::Other, + error: format!("failed to encode host metrics: {err}"), + source_detail: String::new(), + }); + } + }; + match effect_handler.try_send_message_with_source_node(pdata) { + Ok(()) => { + if let Some(metrics) = metrics.as_mut() { + metrics.batches_sent.add(1); + metrics.scrapes_completed.add(1); + metrics.scrape_duration_ns.record(elapsed_nanos(scrape_start)); } } + Err(TypedError::ChannelSendError(_)) => { + if let Some(metrics) = metrics.as_mut() { + metrics.send_failures.add(1); + metrics.scrape_duration_ns.record(elapsed_nanos(scrape_start)); + } + otel_warn!("host metrics dropped due to downstream backpressure"); + } + Err(other) => { + if let Some(metrics) = metrics.as_mut() { + metrics.send_failures.add(1); + metrics.scrapes_failed.add(1); + metrics.scrape_duration_ns.record(elapsed_nanos(scrape_start)); + } + return Err(Error::ReceiverError { + receiver: effect_handler.receiver_id(), + kind: ReceiverErrorKind::Other, + error: format!("failed to send host metrics: {other}"), + source_detail: String::new(), + }); + } } } Err(err) => { + if let Some(metrics) = metrics.as_mut() { + metrics.scrapes_failed.add(1); + metrics.scrape_duration_ns.record(elapsed_nanos(scrape_start)); + } return Err(Error::ReceiverError { receiver: effect_handler.receiver_id(), kind: ReceiverErrorKind::Other, From 204a41d709fbaaf5e23e578919cc6c9a18d01f88 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 22:43:38 -0700 Subject: [PATCH 05/60] Add host architecture resource attribute --- .../receivers/host_metrics_receiver/procfs.rs | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index cffcece61c..9509e6f494 100644 --- 
a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -267,6 +267,7 @@ impl ProcfsSource { .read_trimmed_optional(PathKind::MachineId) .or_else(|| self.read_trimmed_optional(PathKind::DbusMachineId)), host_name: self.read_trimmed_optional(PathKind::Hostname), + host_arch: host_arch(), } } @@ -647,6 +648,7 @@ impl HostSnapshot { struct HostResource { host_id: Option, host_name: Option, + host_arch: Option<&'static str>, } impl HostResource { @@ -659,10 +661,25 @@ impl HostResource { if let Some(host_name) = self.host_name { attributes.push(kv_str("host.name", &host_name)); } + if let Some(host_arch) = self.host_arch { + attributes.push(kv_str("host.arch", host_arch)); + } attributes } } +fn host_arch() -> Option<&'static str> { + match std::env::consts::ARCH { + "aarch64" => Some("arm64"), + "arm" => Some("arm32"), + "powerpc" => Some("ppc32"), + "powerpc64" => Some("ppc64"), + "x86" => Some("x86"), + "x86_64" => Some("amd64"), + _ => None, + } +} + #[derive(Copy, Clone, Default)] struct CpuTimes { user: f64, @@ -1667,4 +1684,14 @@ mod tests { let paths = ProcfsPaths::new(Some(Path::new("/"))); assert_eq!(paths.net_dev, PathBuf::from("/proc/net/dev")); } + + #[test] + fn host_arch_uses_semconv_values() { + if let Some(arch) = host_arch() { + assert!(matches!( + arch, + "amd64" | "arm32" | "arm64" | "ppc32" | "ppc64" | "x86" + )); + } + } } From a709c89dca2c7bb02d82eb3bce18de94d5cf32a4 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 22:49:04 -0700 Subject: [PATCH 06/60] Reject host metrics on unsupported platforms --- .../src/receivers/host_metrics_receiver/mod.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index bee0462e8c..2a56533814 100644 --- 
a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -465,6 +465,7 @@ pub static HOST_METRICS_RECEIVER: ReceiverFactory = ReceiverFactory { node: NodeId, node_config: Arc, receiver_config: &ReceiverConfig| { + validate_supported_platform()?; if pipeline.num_cores() > 1 { return Err(otap_df_config::error::Error::InvalidUserConfig { error: "host-wide collection must run in a one-core source pipeline; use receiver:host_metrics -> exporter:topic and fan out downstream".to_owned(), @@ -493,6 +494,7 @@ pub static HOST_METRICS_RECEIVER: ReceiverFactory = ReceiverFactory { impl HostMetricsReceiver { /// Creates a new host metrics receiver. pub fn new(config: Config) -> Result { + validate_supported_platform()?; let root_path = normalized_root_path(Some(effective_root_path(&config)?))?; let lease = HostMetricsLease::acquire(root_path)?; let config = RuntimeConfig::try_from(config)?; @@ -570,6 +572,16 @@ fn validate_config(config: &Config) -> Result<(), otap_df_config::error::Error> Ok(()) } +fn validate_supported_platform() -> Result<(), otap_df_config::error::Error> { + if cfg!(target_os = "linux") { + Ok(()) + } else { + Err(otap_df_config::error::Error::InvalidUserConfig { + error: "host_metrics receiver is supported only on Linux".to_owned(), + }) + } +} + fn effective_root_path(config: &Config) -> Result<&Path, otap_df_config::error::Error> { if let Some(root_path) = config.root_path.as_deref() { let host_view_root = config.host_view.root_path.as_path(); From ad0c9736b3d2077c662b9f5c14069eb8ecc82f7b Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 22:53:53 -0700 Subject: [PATCH 07/60] Add host metrics semantic shape test --- .../receivers/host_metrics_receiver/procfs.rs | 218 ++++++++++++++++++ 1 file changed, 218 insertions(+) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs 
b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 9509e6f494..9a799cfb60 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -1562,6 +1562,153 @@ fn saturating_i64(value: u64) -> i64 { mod tests { use super::*; + #[test] + fn projection_uses_expected_metric_shapes() { + let request = HostSnapshot { + now_unix_nano: 2_000, + start_time_unix_nano: 1_000, + cpu: Some(CpuTimes { + user: 1.0, + nice: 2.0, + system: 3.0, + idle: 4.0, + wait: 5.0, + interrupt: 6.0, + steal: 7.0, + }), + cpuinfo: CpuInfo { + logical_count: 2, + physical_count: 1, + frequencies_hz: vec![2_400_000_000.0], + }, + memory: Some(MemoryStats { + total: 100, + used: 80, + free: 10, + available: 20, + cached: 5, + buffered: 5, + slab_reclaimable: 3, + slab_unreclaimable: 2, + }), + uptime_seconds: Some(42.0), + paging: Some(PagingStats { + minor_faults: 9, + major_faults: 1, + swap_in: 2, + swap_out: 3, + }), + swaps: vec![SwapStats { + name: "/dev/swap".to_owned(), + size: 100, + used: 25, + free: 75, + }], + processes: Some(ProcessStats { + running: 4, + blocked: 1, + created: 99, + }), + disks: vec![DiskStats { + name: "sda".to_owned(), + read_bytes: 10, + write_bytes: 20, + read_ops: 1, + write_ops: 2, + read_merged: 3, + write_merged: 4, + read_time_seconds: 0.5, + write_time_seconds: 0.6, + io_time_seconds: 0.7, + }], + networks: vec![NetworkStats { + name: "eth0".to_owned(), + rx_bytes: 10, + tx_bytes: 20, + rx_packets: 1, + tx_packets: 2, + rx_errors: 3, + tx_errors: 4, + rx_dropped: 5, + tx_dropped: 6, + }], + resource: HostResource { + host_id: Some("host-id".to_owned()), + host_name: Some("host-name".to_owned()), + host_arch: Some("amd64"), + }, + } + .into_export_request(); + + let resource_metrics = request.resource_metrics.first().expect("resource metrics"); + let resource = 
resource_metrics.resource.as_ref().expect("resource"); + assert_has_attr(&resource.attributes, "os.type", "linux"); + assert_has_attr(&resource.attributes, "host.id", "host-id"); + assert_has_attr(&resource.attributes, "host.name", "host-name"); + assert_has_attr(&resource.attributes, "host.arch", "amd64"); + + let metrics = &resource_metrics.scope_metrics[0].metrics; + assert_metric_shape(metrics, "system.cpu.time", "s", true); + assert_first_point_attr(metrics, "system.cpu.time", "cpu.mode", "user"); + assert_metric_shape(metrics, "system.cpu.logical.count", "{cpu}", false); + assert_metric_shape(metrics, "system.cpu.physical.count", "{cpu}", false); + assert_metric_shape(metrics, "system.cpu.frequency", "Hz", false); + assert_first_point_attr(metrics, "system.cpu.frequency", "cpu.logical_number", "0"); + assert_metric_shape(metrics, "system.memory.usage", "By", false); + assert_first_point_attr( + metrics, + "system.memory.usage", + "system.memory.state", + "used", + ); + assert_metric_shape(metrics, "system.memory.utilization", "1", false); + assert_metric_shape(metrics, "system.memory.linux.available", "By", false); + assert_metric_shape(metrics, "system.memory.linux.slab.usage", "By", false); + assert_metric_shape(metrics, "system.uptime", "s", false); + assert_metric_shape(metrics, "system.paging.faults", "{fault}", true); + assert_first_point_attr( + metrics, + "system.paging.faults", + "system.paging.fault.type", + "minor", + ); + assert_metric_shape(metrics, "system.paging.operations", "{operation}", true); + assert_metric_shape(metrics, "system.paging.usage", "By", false); + assert_first_point_attr(metrics, "system.paging.usage", "system.device", "/dev/swap"); + assert_metric_shape(metrics, "system.paging.utilization", "1", false); + assert_metric_shape(metrics, "system.process.count", "{process}", false); + assert_metric_shape(metrics, "system.process.created", "{process}", true); + assert_metric_shape(metrics, "system.disk.io", "By", true); + 
assert_first_point_attr(metrics, "system.disk.io", "disk.io.direction", "read"); + assert_metric_shape(metrics, "system.disk.operations", "{operation}", true); + assert_metric_shape(metrics, "system.disk.io_time", "s", true); + assert_first_point_attr(metrics, "system.disk.io_time", "system.device", "sda"); + assert_metric_shape(metrics, "system.disk.operation_time", "s", true); + assert_metric_shape(metrics, "system.disk.merged", "{operation}", true); + assert_metric_shape(metrics, "system.network.io", "By", true); + assert_first_point_attr( + metrics, + "system.network.io", + "network.interface.name", + "eth0", + ); + assert_metric_shape(metrics, "system.network.packet.count", "{packet}", true); + assert_first_point_attr( + metrics, + "system.network.packet.count", + "system.device", + "eth0", + ); + assert_metric_shape(metrics, "system.network.packet.dropped", "{packet}", true); + assert_first_point_attr( + metrics, + "system.network.packet.dropped", + "network.interface.name", + "eth0", + ); + assert_metric_shape(metrics, "system.network.errors", "{error}", true); + } + #[test] fn cpu_parser_accepts_missing_newer_fields() { let cpu = parse_cpu_total("10 20 30 40", 10.0).expect("cpu row"); @@ -1694,4 +1841,75 @@ mod tests { )); } } + + fn assert_metric_shape( + metrics: &[Metric], + name: &'static str, + unit: &'static str, + is_sum: bool, + ) { + let metric = metric_by_name(metrics, name); + assert_eq!(metric.unit, unit); + match metric.data.as_ref().expect("metric data") { + metric::Data::Sum(sum) => { + assert!(is_sum, "{name} should be a gauge"); + assert_eq!( + sum.aggregation_temporality, + AggregationTemporality::Cumulative as i32 + ); + assert!(sum.is_monotonic); + assert!( + sum.data_points + .iter() + .all(|point| point.start_time_unix_nano == 1_000) + ); + } + metric::Data::Gauge(gauge) => { + assert!(!is_sum, "{name} should be a cumulative sum"); + assert!( + gauge + .data_points + .iter() + .all(|point| point.start_time_unix_nano == 0) + ); + } + _ 
=> panic!("unexpected data kind for {name}"), + } + } + + fn assert_first_point_attr( + metrics: &[Metric], + name: &'static str, + key: &'static str, + value: &'static str, + ) { + let metric = metric_by_name(metrics, name); + let point = match metric.data.as_ref().expect("metric data") { + metric::Data::Sum(sum) => sum.data_points.first(), + metric::Data::Gauge(gauge) => gauge.data_points.first(), + _ => None, + } + .expect("data point"); + assert_has_attr(&point.attributes, key, value); + } + + fn metric_by_name<'a>(metrics: &'a [Metric], name: &'static str) -> &'a Metric { + metrics + .iter() + .find(|metric| metric.name == name) + .unwrap_or_else(|| panic!("missing metric {name}")) + } + + fn assert_has_attr(attributes: &[KeyValue], key: &'static str, value: &'static str) { + assert!( + attributes.iter().any(|attr| { + attr.key == key + && matches!( + attr.value.as_ref().and_then(|value| value.value.as_ref()), + Some(any_value::Value::StringValue(actual)) if actual == value + ) + }), + "missing attribute {key}={value}" + ); + } } From 1b943c16d4ac98bfc54c204501c6e8f41c7766d9 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 22:55:20 -0700 Subject: [PATCH 08/60] Handle CPU per-core config option --- .../receivers/host_metrics_receiver/mod.rs | 85 +++++++++++++++++-- 1 file changed, 77 insertions(+), 8 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 2a56533814..d7383ba8aa 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -156,7 +156,7 @@ pub enum HostViewValidationMode { #[serde(default, deny_unknown_fields)] pub struct FamiliesConfig { /// CPU metrics. - pub cpu: FamilyConfig, + pub cpu: CpuFamilyConfig, /// Memory metrics. pub memory: FamilyConfig, /// Paging metrics. 
@@ -183,6 +183,29 @@ impl FamiliesConfig { } } +/// CPU family config. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct CpuFamilyConfig { + /// Enable CPU metrics. + pub enabled: bool, + /// Family collection interval. Defaults to top-level `collection_interval`. + #[serde(default, with = "humantime_serde::option")] + pub interval: Option, + /// Emit per-logical-CPU time series. Not supported in v1. + pub per_cpu: bool, +} + +impl Default for CpuFamilyConfig { + fn default() -> Self { + Self { + enabled: true, + interval: None, + per_cpu: false, + } + } +} + /// Common family config. #[derive(Clone, Debug, Deserialize, Serialize)] #[serde(default, deny_unknown_fields)] @@ -533,6 +556,11 @@ fn validate_config(config: &Config) -> Result<(), otap_df_config::error::Error> error: "network include_connection_count is not supported in v1".to_owned(), }); } + if config.families.cpu.per_cpu { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: "cpu per_cpu is not supported in v1".to_owned(), + }); + } validate_family_interval( "cpu", config.families.cpu.enabled, @@ -679,7 +707,7 @@ impl TryFrom for RuntimeConfig { validation: config.host_view.validation, initial_delay: config.initial_delay, families: RuntimeFamilies { - cpu: RuntimeFamily::new(&config.families.cpu, config.collection_interval), + cpu: RuntimeFamily::new_cpu(&config.families.cpu, config.collection_interval), memory: RuntimeFamily::new(&config.families.memory, config.collection_interval), paging: RuntimeFamily::new(&config.families.paging, config.collection_interval), system: RuntimeFamily::new(&config.families.system, config.collection_interval), @@ -723,6 +751,13 @@ impl RuntimeFamily { interval: config.interval.unwrap_or(default_interval), } } + + fn new_cpu(config: &CpuFamilyConfig, default_interval: Duration) -> Self { + Self { + enabled: config.enabled, + interval: config.interval.unwrap_or(default_interval), + } + } } 
#[derive(Clone, Copy, Debug, Eq, PartialEq)] @@ -1071,9 +1106,9 @@ mod tests { fn rejects_all_families_disabled() { let config = Config { families: FamiliesConfig { - cpu: FamilyConfig { + cpu: CpuFamilyConfig { enabled: false, - ..FamilyConfig::default() + ..CpuFamilyConfig::default() }, memory: FamilyConfig { enabled: false, @@ -1132,9 +1167,43 @@ mod tests { fn rejects_zero_enabled_family_interval() { let config = Config { families: FamiliesConfig { - cpu: FamilyConfig { + cpu: CpuFamilyConfig { interval: Some(Duration::ZERO), - ..FamilyConfig::default() + ..CpuFamilyConfig::default() + }, + ..FamiliesConfig::default() + }, + ..Config::default() + }; + + assert!(matches!( + validate_config(&config), + Err(otap_df_config::error::Error::InvalidUserConfig { .. }) + )); + } + + #[test] + fn accepts_disabled_cpu_per_cpu_flag() { + let config: Config = serde_json::from_value(serde_json::json!({ + "families": { + "cpu": { + "per_cpu": false + } + } + })) + .expect("valid cpu config"); + + assert!(!config.families.cpu.per_cpu); + validate_config(&config).expect("valid config"); + } + + #[test] + fn rejects_v1_cpu_per_cpu() { + let config = Config { + families: FamiliesConfig { + cpu: CpuFamilyConfig { + per_cpu: true, + ..CpuFamilyConfig::default() }, ..FamiliesConfig::default() }, @@ -1185,9 +1254,9 @@ mod tests { collection_interval: Duration::from_secs(10), initial_delay: Duration::from_secs(1), families: FamiliesConfig { - cpu: FamilyConfig { + cpu: CpuFamilyConfig { interval: Some(Duration::from_secs(5)), - ..FamilyConfig::default() + ..CpuFamilyConfig::default() }, ..FamiliesConfig::default() }, From cc0d813524f04ce1b0d66225a32fa8b2db4b502d Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 22:57:13 -0700 Subject: [PATCH 09/60] Test host metrics duplicate lease --- .../src/receivers/host_metrics_receiver/mod.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git 
a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index d7383ba8aa..ed3e1ef09a 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -1216,6 +1216,20 @@ mod tests { )); } + #[test] + fn duplicate_lease_rejects_same_root_until_drop() { + let root = PathBuf::from("/tmp/otap-host-metrics-lease-test"); + let lease = HostMetricsLease::acquire(root.clone()).expect("first lease"); + + assert!(matches!( + HostMetricsLease::acquire(root.clone()), + Err(otap_df_config::error::Error::InvalidUserConfig { .. }) + )); + + drop(lease); + let _lease = HostMetricsLease::acquire(root).expect("lease released on drop"); + } + #[test] fn rejects_conflicting_root_paths() { let config = Config { From 502e1a23d3ee67c50f9c0b8123bdb028386cc601 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 23:00:37 -0700 Subject: [PATCH 10/60] Allow partial host metrics scrapes --- .../receivers/host_metrics_receiver/mod.rs | 10 +- .../receivers/host_metrics_receiver/procfs.rs | 238 +++++++++++++++--- 2 files changed, 210 insertions(+), 38 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index ed3e1ef09a..250a26134e 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -56,6 +56,9 @@ pub struct HostMetricsReceiverMetrics { /// Number of fatal scrape failures. #[metric(unit = "{scrape}")] pub scrapes_failed: Counter, + /// Number of source read errors skipped because other families succeeded. 
+ #[metric(unit = "{error}")] + pub partial_errors: Counter, /// Number of due metric families processed. #[metric(unit = "{family}")] pub families_scraped: Counter, @@ -1011,8 +1014,11 @@ impl local::Receiver for HostMetricsReceiver { metrics.scrape_lag_ns.record(duration_nanos(now.saturating_duration_since(scheduled_due))); } match source.scrape_due(due) { - Ok(snapshot) => { - let pdata = match encode_snapshot(snapshot) { + Ok(scrape) => { + if let Some(metrics) = metrics.as_mut() { + metrics.partial_errors.add(scrape.partial_errors); + } + let pdata = match encode_snapshot(scrape.snapshot) { Ok(pdata) => pdata, Err(err) => { if let Some(metrics) = metrics.as_mut() { diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 9a799cfb60..b284039457 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -92,7 +92,7 @@ impl ProcfsSource { } /// Collects one host snapshot for the due family set. 
- pub fn scrape_due(&mut self, due: ProcfsFamilies) -> io::Result { + pub fn scrape_due(&mut self, due: ProcfsFamilies) -> io::Result { let due = ProcfsFamilies { cpu: due.cpu && self.config.cpu, memory: due.memory && self.config.memory, @@ -104,54 +104,98 @@ impl ProcfsSource { }; let now_unix_nano = now_unix_nano(); let clk_tck = self.clk_tck; + let mut partial_errors = 0; + let mut first_error = None; let needs_stat = due.cpu || due.system || due.processes; - let stat = if needs_stat { - let proc_stat = self.read_path(PathKind::Stat)?; - parse_stat(proc_stat, clk_tck) - } else { - StatSnapshot::default() + let stat = match needs_stat + .then(|| self.read_path(PathKind::Stat)) + .transpose() + { + Ok(Some(proc_stat)) => parse_stat(proc_stat, clk_tck), + Ok(None) => StatSnapshot::default(), + Err(err) => { + record_partial_error(&mut partial_errors, &mut first_error, err); + StatSnapshot::default() + } }; - let cpuinfo = if due.cpu { - let cpuinfo = self.read_path(PathKind::Cpuinfo)?; - parse_cpuinfo(cpuinfo) - } else { - CpuInfo::default() + let cpuinfo = match due + .cpu + .then(|| self.read_path(PathKind::Cpuinfo)) + .transpose() + { + Ok(Some(cpuinfo)) => parse_cpuinfo(cpuinfo), + Ok(None) => CpuInfo::default(), + Err(err) => { + record_partial_error(&mut partial_errors, &mut first_error, err); + CpuInfo::default() + } }; - let memory = if due.memory { - let meminfo = self.read_path(PathKind::Meminfo)?; - parse_meminfo(meminfo) - } else { - None + let memory = match due + .memory + .then(|| self.read_path(PathKind::Meminfo)) + .transpose() + { + Ok(Some(meminfo)) => parse_meminfo(meminfo), + Ok(None) => None, + Err(err) => { + record_partial_error(&mut partial_errors, &mut first_error, err); + None + } }; - let uptime_seconds = if due.system { - let uptime = self.read_path(PathKind::Uptime)?; - parse_uptime(uptime) - } else { - None + let uptime_seconds = match due + .system + .then(|| self.read_path(PathKind::Uptime)) + .transpose() + { + Ok(Some(uptime)) => 
parse_uptime(uptime), + Ok(None) => None, + Err(err) => { + record_partial_error(&mut partial_errors, &mut first_error, err); + None + } }; - let paging = if due.paging { - let vmstat = self.read_path(PathKind::Vmstat)?; - Some(parse_vmstat(vmstat)) - } else { - None + let paging = match due + .paging + .then(|| self.read_path(PathKind::Vmstat)) + .transpose() + { + Ok(Some(vmstat)) => Some(parse_vmstat(vmstat)), + Ok(None) => None, + Err(err) => { + record_partial_error(&mut partial_errors, &mut first_error, err); + None + } }; - let swaps = if due.paging { - let swaps = self.read_path(PathKind::Swaps)?; - parse_swaps(swaps) - } else { - Vec::new() + let swaps = match due + .paging + .then(|| self.read_path(PathKind::Swaps)) + .transpose() + { + Ok(Some(swaps)) => parse_swaps(swaps), + Ok(None) => Vec::new(), + Err(err) => { + record_partial_error(&mut partial_errors, &mut first_error, err); + Vec::new() + } }; let disks = if due.disk { let disk_include = self.config.disk_include.clone(); let disk_exclude = self.config.disk_exclude.clone(); - let diskstats = self.read_path(PathKind::Diskstats)?; - parse_diskstats(diskstats, disk_include.as_ref(), disk_exclude.as_ref()) + match self.read_path(PathKind::Diskstats) { + Ok(diskstats) => { + parse_diskstats(diskstats, disk_include.as_ref(), disk_exclude.as_ref()) + } + Err(err) => { + record_partial_error(&mut partial_errors, &mut first_error, err); + Vec::new() + } + } } else { Vec::new() }; @@ -159,15 +203,22 @@ impl ProcfsSource { let networks = if due.network { let network_include = self.config.network_include.clone(); let network_exclude = self.config.network_exclude.clone(); - let netdev = self.read_path(PathKind::NetDev)?; - parse_netdev(netdev, network_include.as_ref(), network_exclude.as_ref()) + match self.read_path(PathKind::NetDev) { + Ok(netdev) => { + parse_netdev(netdev, network_include.as_ref(), network_exclude.as_ref()) + } + Err(err) => { + record_partial_error(&mut partial_errors, &mut first_error, 
err); + Vec::new() + } + } } else { Vec::new() }; let resource = self.read_resource(); - Ok(HostSnapshot { + let snapshot = HostSnapshot { now_unix_nano, start_time_unix_nano: stat.boot_time_unix_nano, cpu: stat.cpu, @@ -180,6 +231,14 @@ impl ProcfsSource { disks, networks, resource, + }; + if !snapshot.has_metrics() { + return Err(first_error + .unwrap_or_else(|| io::Error::other("host metrics scrape produced no metrics"))); + } + Ok(HostScrape { + snapshot, + partial_errors, }) } @@ -350,6 +409,14 @@ enum PathKind { Hostname, } +/// Result of one host metrics scrape. +pub struct HostScrape { + /// Collected host snapshot. + pub snapshot: HostSnapshot, + /// Number of source read errors skipped because other families succeeded. + pub partial_errors: u64, +} + /// One host metrics snapshot. #[derive(Default)] pub struct HostSnapshot { @@ -368,6 +435,20 @@ pub struct HostSnapshot { } impl HostSnapshot { + fn has_metrics(&self) -> bool { + self.cpu.is_some() + || self.cpuinfo.logical_count != 0 + || self.cpuinfo.physical_count != 0 + || !self.cpuinfo.frequencies_hz.is_empty() + || self.memory.is_some() + || self.uptime_seconds.is_some() + || self.paging.is_some() + || !self.swaps.is_empty() + || self.processes.is_some() + || !self.disks.is_empty() + || !self.networks.is_empty() + } + /// Converts a snapshot into an OTLP metrics request. 
pub fn into_export_request(self) -> ExportMetricsServiceRequest { let mut metrics = Vec::with_capacity(64); @@ -1084,6 +1165,17 @@ fn filter_allows( && !exclude.is_some_and(|filter| filter.matches(value)) } +fn record_partial_error( + partial_errors: &mut u64, + first_error: &mut Option, + err: io::Error, +) { + *partial_errors = partial_errors.saturating_add(1); + if first_error.is_none() { + *first_error = Some(err); + } +} + fn push_gauge_f64( metrics: &mut Vec, name: &'static str, @@ -1709,6 +1801,80 @@ mod tests { assert_metric_shape(metrics, "system.network.errors", "{error}", true); } + #[test] + fn scrape_due_emits_successful_families_after_partial_read_error() { + let root = tempfile::tempdir().expect("tempdir"); + let proc = root.path().join("proc"); + std::fs::create_dir(&proc).expect("proc dir"); + std::fs::write( + proc.join("meminfo"), + "MemTotal: 1000 kB\nMemFree: 100 kB\nMemAvailable: 200 kB\n", + ) + .expect("meminfo"); + let mut source = ProcfsSource::new( + Some(root.path()), + ProcfsConfig { + cpu: false, + memory: true, + paging: false, + system: false, + disk: true, + network: false, + processes: false, + disk_include: None, + disk_exclude: None, + network_include: None, + network_exclude: None, + validation: HostViewValidationMode::None, + }, + ) + .expect("source"); + + let scrape = source + .scrape_due(ProcfsFamilies { + memory: true, + disk: true, + ..ProcfsFamilies::default() + }) + .expect("partial scrape"); + + assert_eq!(scrape.partial_errors, 1); + assert!(scrape.snapshot.memory.is_some()); + assert!(scrape.snapshot.disks.is_empty()); + } + + #[test] + fn scrape_due_fails_when_all_due_families_fail() { + let root = tempfile::tempdir().expect("tempdir"); + let mut source = ProcfsSource::new( + Some(root.path()), + ProcfsConfig { + cpu: false, + memory: true, + paging: false, + system: false, + disk: false, + network: false, + processes: false, + disk_include: None, + disk_exclude: None, + network_include: None, + network_exclude: 
None, + validation: HostViewValidationMode::None, + }, + ) + .expect("source"); + + assert!( + source + .scrape_due(ProcfsFamilies { + memory: true, + ..ProcfsFamilies::default() + }) + .is_err() + ); + } + #[test] fn cpu_parser_accepts_missing_newer_fields() { let cpu = parse_cpu_total("10 20 30 40", 10.0).expect("cpu row"); From 5abf68eb1abd643f2b783c92e14b8ee0e863ea93 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 23:02:28 -0700 Subject: [PATCH 11/60] Track host metrics source read errors --- .../core-nodes/src/receivers/host_metrics_receiver/mod.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 250a26134e..3bd0997d60 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -59,6 +59,9 @@ pub struct HostMetricsReceiverMetrics { /// Number of source read errors skipped because other families succeeded. #[metric(unit = "{error}")] pub partial_errors: Counter, + /// Number of source read errors seen during scrapes. + #[metric(unit = "{error}")] + pub source_read_errors: Counter, /// Number of due metric families processed. 
#[metric(unit = "{family}")] pub families_scraped: Counter, @@ -1017,6 +1020,7 @@ impl local::Receiver for HostMetricsReceiver { Ok(scrape) => { if let Some(metrics) = metrics.as_mut() { metrics.partial_errors.add(scrape.partial_errors); + metrics.source_read_errors.add(scrape.partial_errors); } let pdata = match encode_snapshot(scrape.snapshot) { Ok(pdata) => pdata, @@ -1066,6 +1070,7 @@ impl local::Receiver for HostMetricsReceiver { Err(err) => { if let Some(metrics) = metrics.as_mut() { metrics.scrapes_failed.add(1); + metrics.source_read_errors.add(1); metrics.scrape_duration_ns.record(elapsed_nanos(scrape_start)); } return Err(Error::ReceiverError { From f2a06b8dfb0fbf2e700aad8476fdf7d33a007420 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 23:04:39 -0700 Subject: [PATCH 12/60] Derive host metrics process mode default --- .../src/receivers/host_metrics_receiver/mod.rs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 3bd0997d60..9b053169cd 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -311,19 +311,14 @@ impl Default for ProcessesFamilyConfig { } /// Process collection mode. -#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Serialize)] +#[derive(Clone, Copy, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] #[serde(rename_all = "snake_case")] pub enum ProcessMode { /// Aggregate host process summary. + #[default] Summary, } -impl Default for ProcessMode { - fn default() -> Self { - Self::Summary - } -} - /// Disk device filter. 
#[derive(Clone, Debug, Deserialize, Serialize)] #[serde(deny_unknown_fields)] From fc6f3f22def62850da91ec176541789baf1fe68b Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 23:08:55 -0700 Subject: [PATCH 13/60] Use system clock ticks for CPU metrics --- rust/otap-dataflow/crates/core-nodes/Cargo.toml | 1 + .../src/receivers/host_metrics_receiver/procfs.rs | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/Cargo.toml b/rust/otap-dataflow/crates/core-nodes/Cargo.toml index 554b2a4c98..dff437166c 100644 --- a/rust/otap-dataflow/crates/core-nodes/Cargo.toml +++ b/rust/otap-dataflow/crates/core-nodes/Cargo.toml @@ -38,6 +38,7 @@ futures.workspace = true futures-timer.workspace = true humantime-serde.workspace = true linkme.workspace = true +nix = { workspace = true, features = ["feature"] } object_store = {workspace = true, features = ["fs"]} parquet.workspace = true prost.workspace = true diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index b284039457..1dab55b7c7 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -1634,9 +1634,12 @@ fn millis_to_seconds(ms: u64) -> f64 { } fn clock_ticks_per_second() -> f64 { - // Linux exposes CPU counters in USER_HZ. 100 is the common Linux value and - // keeps this receiver dependency-light until a platform helper is added. 
- 100.0 + nix::unistd::sysconf(nix::unistd::SysconfVar::CLK_TCK) + .ok() + .flatten() + .filter(|ticks| *ticks > 0) + .map(|ticks| ticks as f64) + .unwrap_or(100.0) } fn now_unix_nano() -> u64 { @@ -1893,6 +1896,11 @@ mod tests { assert_eq!(cpu.interrupt, 0.5); } + #[test] + fn clock_ticks_per_second_uses_positive_system_value() { + assert!(clock_ticks_per_second() > 0.0); + } + #[test] fn memavailable_fallback_uses_free_buffers_cached() { let memory = From 26e4d9bcc1b54a13842ee7674c3f6459a9a1eba4 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 23:12:04 -0700 Subject: [PATCH 14/60] Add opt-in CPU utilization metric --- .../receivers/host_metrics_receiver/mod.rs | 21 +++ .../receivers/host_metrics_receiver/procfs.rs | 137 ++++++++++++++++++ 2 files changed, 158 insertions(+) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 9b053169cd..c3e61029ad 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -200,6 +200,8 @@ pub struct CpuFamilyConfig { pub interval: Option, /// Emit per-logical-CPU time series. Not supported in v1. pub per_cpu: bool, + /// Emit aggregate CPU utilization derived from CPU time deltas. 
+ pub utilization: bool, } impl Default for CpuFamilyConfig { @@ -208,6 +210,7 @@ impl Default for CpuFamilyConfig { enabled: true, interval: None, per_cpu: false, + utilization: false, } } } @@ -359,6 +362,7 @@ struct RuntimeConfig { root_path: PathBuf, validation: HostViewValidationMode, initial_delay: Duration, + cpu_utilization: bool, families: RuntimeFamilies, } @@ -707,6 +711,7 @@ impl TryFrom for RuntimeConfig { root_path, validation: config.host_view.validation, initial_delay: config.initial_delay, + cpu_utilization: config.families.cpu.utilization, families: RuntimeFamilies { cpu: RuntimeFamily::new_cpu(&config.families.cpu, config.collection_interval), memory: RuntimeFamily::new(&config.families.memory, config.collection_interval), @@ -961,6 +966,7 @@ impl local::Receiver for HostMetricsReceiver { disk: config.families.disk.enabled, network: config.families.network.enabled, processes: config.families.processes.enabled, + cpu_utilization: config.cpu_utilization, disk_include: config.families.disk.include.clone(), disk_exclude: config.families.disk.exclude.clone(), network_include: config.families.network.include.clone(), @@ -1203,6 +1209,21 @@ mod tests { validate_config(&config).expect("valid config"); } + #[test] + fn accepts_cpu_utilization_opt_in() { + let config: Config = serde_json::from_value(serde_json::json!({ + "families": { + "cpu": { + "utilization": true + } + } + })) + .expect("valid cpu config"); + + assert!(config.families.cpu.utilization); + validate_config(&config).expect("valid config"); + } + #[test] fn rejects_v1_cpu_per_cpu() { let config = Config { diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 1dab55b7c7..8c1acf16ec 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ 
b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -29,6 +29,7 @@ pub struct ProcfsSource { config: ProcfsConfig, buf: String, clk_tck: f64, + previous_cpu: Option, } /// Procfs collection config. @@ -47,6 +48,8 @@ pub struct ProcfsConfig { pub network: bool, /// Process summary metrics. pub processes: bool, + /// Derived aggregate CPU utilization. + pub cpu_utilization: bool, /// Disk include filter. pub disk_include: Option, /// Disk exclude filter. @@ -86,6 +89,7 @@ impl ProcfsSource { config, buf: String::with_capacity(16 * 1024), clk_tck: clock_ticks_per_second(), + previous_cpu: None, }; source.apply_startup_validation()?; Ok(source) @@ -118,6 +122,16 @@ impl ProcfsSource { StatSnapshot::default() } }; + let cpu_utilization = if due.cpu && self.config.cpu_utilization { + let utilization = stat.cpu.and_then(|current| { + self.previous_cpu + .and_then(|previous| cpu_utilization(previous, current)) + }); + self.previous_cpu = stat.cpu; + utilization + } else { + None + }; let cpuinfo = match due .cpu @@ -222,6 +236,7 @@ impl ProcfsSource { now_unix_nano, start_time_unix_nano: stat.boot_time_unix_nano, cpu: stat.cpu, + cpu_utilization, cpuinfo, memory, uptime_seconds, @@ -423,6 +438,7 @@ pub struct HostSnapshot { now_unix_nano: u64, start_time_unix_nano: u64, cpu: Option, + cpu_utilization: Option, cpuinfo: CpuInfo, memory: Option, uptime_seconds: Option, @@ -437,6 +453,7 @@ pub struct HostSnapshot { impl HostSnapshot { fn has_metrics(&self) -> bool { self.cpu.is_some() + || self.cpu_utilization.is_some() || self.cpuinfo.logical_count != 0 || self.cpuinfo.physical_count != 0 || !self.cpuinfo.frequencies_hz.is_empty() @@ -474,6 +491,24 @@ impl HostSnapshot { "cpu.mode", ); } + if let Some(cpu) = self.cpu_utilization { + push_gauge_f64_by_attr( + &mut metrics, + "system.cpu.utilization", + "1", + now, + &[ + ("user", cpu.user), + ("nice", cpu.nice), + ("system", cpu.system), + ("idle", cpu.idle), + ("wait", cpu.wait), + 
("interrupt", cpu.interrupt), + ("steal", cpu.steal), + ], + "cpu.mode", + ); + } if self.cpuinfo.logical_count != 0 { push_gauge_single_u64( @@ -890,6 +925,30 @@ fn parse_cpu_total(input: &str, clk_tck: f64) -> Option { }) } +fn cpu_utilization(previous: CpuTimes, current: CpuTimes) -> Option { + let user = counter_delta(previous.user, current.user)?; + let nice = counter_delta(previous.nice, current.nice)?; + let system = counter_delta(previous.system, current.system)?; + let idle = counter_delta(previous.idle, current.idle)?; + let wait = counter_delta(previous.wait, current.wait)?; + let interrupt = counter_delta(previous.interrupt, current.interrupt)?; + let steal = counter_delta(previous.steal, current.steal)?; + let total = user + nice + system + idle + wait + interrupt + steal; + (total > 0.0).then(|| CpuTimes { + user: user / total, + nice: nice / total, + system: system / total, + idle: idle / total, + wait: wait / total, + interrupt: interrupt / total, + steal: steal / total, + }) +} + +fn counter_delta(previous: f64, current: f64) -> Option { + (current >= previous).then_some(current - previous) +} + fn parse_cpuinfo(input: &str) -> CpuInfo { let mut logical_count = 0; let mut frequencies_hz = Vec::new(); @@ -1194,6 +1253,34 @@ fn push_gauge_f64( }); } +fn push_gauge_f64_by_attr( + metrics: &mut Vec, + name: &'static str, + unit: &'static str, + now: u64, + values: &[(&'static str, f64)], + attr_name: &'static str, +) { + let mut points = Vec::with_capacity(values.len()); + for (state, value) in values { + points.push(number_point_f64( + vec![kv_str(attr_name, state)], + 0, + now, + *value, + )); + } + metrics.push(Metric { + name: name.to_owned(), + description: String::new(), + unit: unit.to_owned(), + metadata: Vec::new(), + data: Some(metric::Data::Gauge(Gauge { + data_points: points, + })), + }); +} + fn push_gauge_u64( metrics: &mut Vec, name: &'static str, @@ -1671,6 +1758,15 @@ mod tests { interrupt: 6.0, steal: 7.0, }), + cpu_utilization: 
Some(CpuTimes { + user: 0.1, + nice: 0.1, + system: 0.2, + idle: 0.3, + wait: 0.1, + interrupt: 0.1, + steal: 0.1, + }), cpuinfo: CpuInfo { logical_count: 2, physical_count: 1, @@ -1745,6 +1841,8 @@ mod tests { let metrics = &resource_metrics.scope_metrics[0].metrics; assert_metric_shape(metrics, "system.cpu.time", "s", true); assert_first_point_attr(metrics, "system.cpu.time", "cpu.mode", "user"); + assert_metric_shape(metrics, "system.cpu.utilization", "1", false); + assert_first_point_attr(metrics, "system.cpu.utilization", "cpu.mode", "user"); assert_metric_shape(metrics, "system.cpu.logical.count", "{cpu}", false); assert_metric_shape(metrics, "system.cpu.physical.count", "{cpu}", false); assert_metric_shape(metrics, "system.cpu.frequency", "Hz", false); @@ -1824,6 +1922,7 @@ mod tests { disk: true, network: false, processes: false, + cpu_utilization: false, disk_include: None, disk_exclude: None, network_include: None, @@ -1859,6 +1958,7 @@ mod tests { disk: false, network: false, processes: false, + cpu_utilization: false, disk_include: None, disk_exclude: None, network_include: None, @@ -1896,6 +1996,43 @@ mod tests { assert_eq!(cpu.interrupt, 0.5); } + #[test] + fn cpu_utilization_uses_counter_deltas() { + let utilization = cpu_utilization( + CpuTimes { + user: 1.0, + idle: 1.0, + ..CpuTimes::default() + }, + CpuTimes { + user: 3.0, + idle: 2.0, + ..CpuTimes::default() + }, + ) + .expect("utilization"); + + assert_eq!(utilization.user, 2.0 / 3.0); + assert_eq!(utilization.idle, 1.0 / 3.0); + } + + #[test] + fn cpu_utilization_skips_counter_resets() { + assert!( + cpu_utilization( + CpuTimes { + user: 2.0, + ..CpuTimes::default() + }, + CpuTimes { + user: 1.0, + ..CpuTimes::default() + }, + ) + .is_none() + ); + } + #[test] fn clock_ticks_per_second_uses_positive_system_value() { assert!(clock_ticks_per_second() > 0.0); From 38e0ee69976883fe068136a3f0c2ce32514addcd Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 23:16:34 -0700 Subject: 
[PATCH 15/60] Project host updown metrics as sums --- .../receivers/host_metrics_receiver/procfs.rs | 169 +++++++++++------- 1 file changed, 100 insertions(+), 69 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 8c1acf16ec..29b29e96d6 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -511,19 +511,21 @@ impl HostSnapshot { } if self.cpuinfo.logical_count != 0 { - push_gauge_single_u64( + push_updown_single_u64( &mut metrics, "system.cpu.logical.count", "{cpu}", + start, now, self.cpuinfo.logical_count, ); } if self.cpuinfo.physical_count != 0 { - push_gauge_single_u64( + push_updown_single_u64( &mut metrics, "system.cpu.physical.count", "{cpu}", + start, now, self.cpuinfo.physical_count, ); @@ -531,10 +533,11 @@ impl HostSnapshot { push_cpu_frequency(&mut metrics, now, &self.cpuinfo.frequencies_hz); if let Some(memory) = self.memory { - push_gauge_u64( + push_updown_u64( &mut metrics, "system.memory.usage", "By", + start, now, &[ ("used", memory.used), @@ -558,17 +561,19 @@ impl HostSnapshot { ], "system.memory.state", ); - push_gauge_single_u64( + push_updown_single_u64( &mut metrics, "system.memory.linux.available", "By", + start, now, memory.available, ); - push_gauge_u64( + push_updown_u64( &mut metrics, "system.memory.linux.slab.usage", "By", + start, now, &[ ("reclaimable", memory.slab_reclaimable), @@ -607,10 +612,11 @@ impl HostSnapshot { } for swap in self.swaps { - push_gauge_u64_with_device( + push_updown_u64_with_device( &mut metrics, "system.paging.usage", "By", + start, now, &swap.name, &[("used", swap.used), ("free", swap.free)], @@ -629,10 +635,11 @@ impl HostSnapshot { } if let Some(processes) = self.processes { - push_gauge_u64( + push_updown_u64( &mut metrics, 
"system.process.count", "{process}", + start, now, &[ ("running", processes.running), @@ -1281,10 +1288,11 @@ fn push_gauge_f64_by_attr( }); } -fn push_gauge_u64( +fn push_updown_u64( metrics: &mut Vec, name: &'static str, unit: &'static str, + start: u64, now: u64, values: &[(&'static str, u64)], attr_name: &'static str, @@ -1293,26 +1301,19 @@ fn push_gauge_u64( for (state, value) in values { points.push(number_point_i64( vec![kv_str(attr_name, state)], - 0, + start, now, saturating_i64(*value), )); } - metrics.push(Metric { - name: name.to_owned(), - description: String::new(), - unit: unit.to_owned(), - metadata: Vec::new(), - data: Some(metric::Data::Gauge(Gauge { - data_points: points, - })), - }); + push_updown_metric(metrics, name, unit, points); } -fn push_gauge_u64_with_device( +fn push_updown_u64_with_device( metrics: &mut Vec, name: &'static str, unit: &'static str, + start: u64, now: u64, device: &str, values: &[(&'static str, u64)], @@ -1322,38 +1323,33 @@ fn push_gauge_u64_with_device( for (state, value) in values { points.push(number_point_i64( vec![kv_str("system.device", device), kv_str(attr_name, state)], - 0, + start, now, saturating_i64(*value), )); } - metrics.push(Metric { - name: name.to_owned(), - description: String::new(), - unit: unit.to_owned(), - metadata: Vec::new(), - data: Some(metric::Data::Gauge(Gauge { - data_points: points, - })), - }); + push_updown_metric(metrics, name, unit, points); } -fn push_gauge_single_u64( +fn push_updown_single_u64( metrics: &mut Vec, name: &'static str, unit: &'static str, + start: u64, now: u64, value: u64, ) { - metrics.push(Metric { - name: name.to_owned(), - description: String::new(), - unit: unit.to_owned(), - metadata: Vec::new(), - data: Some(metric::Data::Gauge(Gauge { - data_points: vec![number_point_i64(Vec::new(), 0, now, saturating_i64(value))], - })), - }); + push_updown_metric( + metrics, + name, + unit, + vec![number_point_i64( + Vec::new(), + start, + now, + saturating_i64(value), + 
)], + ); } fn push_cpu_frequency(metrics: &mut Vec, now: u64, frequencies_hz: &[f64]) { @@ -1653,6 +1649,25 @@ fn push_sum_metric( name: &'static str, unit: &'static str, points: Vec, +) { + push_sum_metric_with_monotonic(metrics, name, unit, points, true); +} + +fn push_updown_metric( + metrics: &mut Vec, + name: &'static str, + unit: &'static str, + points: Vec, +) { + push_sum_metric_with_monotonic(metrics, name, unit, points, false); +} + +fn push_sum_metric_with_monotonic( + metrics: &mut Vec, + name: &'static str, + unit: &'static str, + points: Vec, + is_monotonic: bool, ) { metrics.push(Metric { name: name.to_owned(), @@ -1662,7 +1677,7 @@ fn push_sum_metric( data: Some(metric::Data::Sum(Sum { data_points: points, aggregation_temporality: AggregationTemporality::Cumulative.into(), - is_monotonic: true, + is_monotonic, })), }); } @@ -1839,67 +1854,82 @@ mod tests { assert_has_attr(&resource.attributes, "host.arch", "amd64"); let metrics = &resource_metrics.scope_metrics[0].metrics; - assert_metric_shape(metrics, "system.cpu.time", "s", true); + assert_metric_shape(metrics, "system.cpu.time", "s", Some(true)); assert_first_point_attr(metrics, "system.cpu.time", "cpu.mode", "user"); - assert_metric_shape(metrics, "system.cpu.utilization", "1", false); + assert_metric_shape(metrics, "system.cpu.utilization", "1", None); assert_first_point_attr(metrics, "system.cpu.utilization", "cpu.mode", "user"); - assert_metric_shape(metrics, "system.cpu.logical.count", "{cpu}", false); - assert_metric_shape(metrics, "system.cpu.physical.count", "{cpu}", false); - assert_metric_shape(metrics, "system.cpu.frequency", "Hz", false); + assert_metric_shape(metrics, "system.cpu.logical.count", "{cpu}", Some(false)); + assert_metric_shape(metrics, "system.cpu.physical.count", "{cpu}", Some(false)); + assert_metric_shape(metrics, "system.cpu.frequency", "Hz", None); assert_first_point_attr(metrics, "system.cpu.frequency", "cpu.logical_number", "0"); - assert_metric_shape(metrics, 
"system.memory.usage", "By", false); + assert_metric_shape(metrics, "system.memory.usage", "By", Some(false)); assert_first_point_attr( metrics, "system.memory.usage", "system.memory.state", "used", ); - assert_metric_shape(metrics, "system.memory.utilization", "1", false); - assert_metric_shape(metrics, "system.memory.linux.available", "By", false); - assert_metric_shape(metrics, "system.memory.linux.slab.usage", "By", false); - assert_metric_shape(metrics, "system.uptime", "s", false); - assert_metric_shape(metrics, "system.paging.faults", "{fault}", true); + assert_metric_shape(metrics, "system.memory.utilization", "1", None); + assert_metric_shape(metrics, "system.memory.linux.available", "By", Some(false)); + assert_metric_shape(metrics, "system.memory.linux.slab.usage", "By", Some(false)); + assert_metric_shape(metrics, "system.uptime", "s", None); + assert_metric_shape(metrics, "system.paging.faults", "{fault}", Some(true)); assert_first_point_attr( metrics, "system.paging.faults", "system.paging.fault.type", "minor", ); - assert_metric_shape(metrics, "system.paging.operations", "{operation}", true); - assert_metric_shape(metrics, "system.paging.usage", "By", false); + assert_metric_shape( + metrics, + "system.paging.operations", + "{operation}", + Some(true), + ); + assert_metric_shape(metrics, "system.paging.usage", "By", Some(false)); assert_first_point_attr(metrics, "system.paging.usage", "system.device", "/dev/swap"); - assert_metric_shape(metrics, "system.paging.utilization", "1", false); - assert_metric_shape(metrics, "system.process.count", "{process}", false); - assert_metric_shape(metrics, "system.process.created", "{process}", true); - assert_metric_shape(metrics, "system.disk.io", "By", true); + assert_metric_shape(metrics, "system.paging.utilization", "1", None); + assert_metric_shape(metrics, "system.process.count", "{process}", Some(false)); + assert_metric_shape(metrics, "system.process.created", "{process}", Some(true)); + 
assert_metric_shape(metrics, "system.disk.io", "By", Some(true)); assert_first_point_attr(metrics, "system.disk.io", "disk.io.direction", "read"); - assert_metric_shape(metrics, "system.disk.operations", "{operation}", true); - assert_metric_shape(metrics, "system.disk.io_time", "s", true); + assert_metric_shape(metrics, "system.disk.operations", "{operation}", Some(true)); + assert_metric_shape(metrics, "system.disk.io_time", "s", Some(true)); assert_first_point_attr(metrics, "system.disk.io_time", "system.device", "sda"); - assert_metric_shape(metrics, "system.disk.operation_time", "s", true); - assert_metric_shape(metrics, "system.disk.merged", "{operation}", true); - assert_metric_shape(metrics, "system.network.io", "By", true); + assert_metric_shape(metrics, "system.disk.operation_time", "s", Some(true)); + assert_metric_shape(metrics, "system.disk.merged", "{operation}", Some(true)); + assert_metric_shape(metrics, "system.network.io", "By", Some(true)); assert_first_point_attr( metrics, "system.network.io", "network.interface.name", "eth0", ); - assert_metric_shape(metrics, "system.network.packet.count", "{packet}", true); + assert_metric_shape( + metrics, + "system.network.packet.count", + "{packet}", + Some(true), + ); assert_first_point_attr( metrics, "system.network.packet.count", "system.device", "eth0", ); - assert_metric_shape(metrics, "system.network.packet.dropped", "{packet}", true); + assert_metric_shape( + metrics, + "system.network.packet.dropped", + "{packet}", + Some(true), + ); assert_first_point_attr( metrics, "system.network.packet.dropped", "network.interface.name", "eth0", ); - assert_metric_shape(metrics, "system.network.errors", "{error}", true); + assert_metric_shape(metrics, "system.network.errors", "{error}", Some(true)); } #[test] @@ -2157,18 +2187,19 @@ mod tests { metrics: &[Metric], name: &'static str, unit: &'static str, - is_sum: bool, + monotonic_sum: Option, ) { let metric = metric_by_name(metrics, name); 
assert_eq!(metric.unit, unit); match metric.data.as_ref().expect("metric data") { metric::Data::Sum(sum) => { - assert!(is_sum, "{name} should be a gauge"); + let expected_monotonic = + monotonic_sum.unwrap_or_else(|| panic!("{name} should be a gauge")); assert_eq!( sum.aggregation_temporality, AggregationTemporality::Cumulative as i32 ); - assert!(sum.is_monotonic); + assert_eq!(sum.is_monotonic, expected_monotonic); assert!( sum.data_points .iter() @@ -2176,7 +2207,7 @@ mod tests { ); } metric::Data::Gauge(gauge) => { - assert!(!is_sum, "{name} should be a cumulative sum"); + assert!(monotonic_sum.is_none(), "{name} should be a cumulative sum"); assert!( gauge .data_points From 6d8801c2d7da0162a304971e99da6915d684792b Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 23:22:49 -0700 Subject: [PATCH 16/60] Add opt-in disk limit metric --- .../receivers/host_metrics_receiver/mod.rs | 21 ++++ .../receivers/host_metrics_receiver/procfs.rs | 109 +++++++++++++++++- 2 files changed, 129 insertions(+), 1 deletion(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index c3e61029ad..5b85643c71 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -244,6 +244,8 @@ pub struct DiskFamilyConfig { /// Family collection interval. Defaults to top-level `collection_interval`. #[serde(default, with = "humantime_serde::option")] pub interval: Option, + /// Enable disk limit metrics. + pub limit: bool, /// Device include filter. pub include: Option, /// Device exclude filter. 
@@ -255,6 +257,7 @@ impl Default for DiskFamilyConfig { Self { enabled: true, interval: None, + limit: false, include: None, exclude: None, } @@ -387,6 +390,7 @@ struct RuntimeFamily { struct RuntimeDiskFamily { enabled: bool, interval: Duration, + limit: bool, include: Option, exclude: Option, } @@ -724,6 +728,7 @@ impl TryFrom for RuntimeConfig { .disk .interval .unwrap_or(config.collection_interval), + limit: config.families.disk.limit, include: disk_include, exclude: disk_exclude, }, @@ -967,6 +972,7 @@ impl local::Receiver for HostMetricsReceiver { network: config.families.network.enabled, processes: config.families.processes.enabled, cpu_utilization: config.cpu_utilization, + disk_limit: config.families.disk.limit, disk_include: config.families.disk.include.clone(), disk_exclude: config.families.disk.exclude.clone(), network_include: config.families.network.include.clone(), @@ -1224,6 +1230,21 @@ mod tests { validate_config(&config).expect("valid config"); } + #[test] + fn accepts_disk_limit_opt_in() { + let config: Config = serde_json::from_value(serde_json::json!({ + "families": { + "disk": { + "limit": true + } + } + })) + .expect("valid disk config"); + + assert!(config.families.disk.limit); + validate_config(&config).expect("valid config"); + } + #[test] fn rejects_v1_cpu_per_cpu() { let config = Config { diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 29b29e96d6..874d764366 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -50,6 +50,8 @@ pub struct ProcfsConfig { pub processes: bool, /// Derived aggregate CPU utilization. pub cpu_utilization: bool, + /// Derived disk limit from sysfs block device size. + pub disk_limit: bool, /// Disk include filter. 
pub disk_include: Option, /// Disk exclude filter. @@ -203,7 +205,14 @@ impl ProcfsSource { let disk_exclude = self.config.disk_exclude.clone(); match self.read_path(PathKind::Diskstats) { Ok(diskstats) => { - parse_diskstats(diskstats, disk_include.as_ref(), disk_exclude.as_ref()) + let mut disks = + parse_diskstats(diskstats, disk_include.as_ref(), disk_exclude.as_ref()); + if self.config.disk_limit { + for disk in &mut disks { + disk.limit_bytes = self.read_disk_limit_bytes(&disk.name).ok(); + } + } + disks } Err(err) => { record_partial_error(&mut partial_errors, &mut first_error, err); @@ -335,6 +344,14 @@ impl ProcfsSource { Ok(self.buf.as_str()) } + fn read_disk_limit_bytes(&mut self, disk_name: &str) -> io::Result { + self.buf.clear(); + let mut file = File::open(self.paths.sys_block.join(disk_name).join("size"))?; + let _ = file.read_to_string(&mut self.buf)?; + let sectors = parse_u64(self.buf.trim()); + Ok(sectors.saturating_mul(DISKSTAT_SECTOR_BYTES)) + } + fn read_resource(&mut self) -> HostResource { HostResource { host_id: self @@ -363,6 +380,7 @@ struct ProcfsPaths { vmstat: PathBuf, swaps: PathBuf, diskstats: PathBuf, + sys_block: PathBuf, net_dev: PathBuf, machine_id: PathBuf, dbus_machine_id: PathBuf, @@ -381,6 +399,7 @@ impl ProcfsPaths { vmstat: root.join("proc/vmstat"), swaps: root.join("proc/swaps"), diskstats: root.join("proc/diskstats"), + sys_block: root.join("sys/block"), machine_id: root.join("etc/machine-id"), dbus_machine_id: root.join("var/lib/dbus/machine-id"), hostname: root.join("proc/sys/kernel/hostname"), @@ -658,6 +677,17 @@ impl HostSnapshot { } for disk in self.disks { + if let Some(limit_bytes) = disk.limit_bytes { + push_updown_single_u64_with_device( + &mut metrics, + "system.disk.limit", + "By", + start, + now, + &disk.name, + limit_bytes, + ); + } push_disk_sum( &mut metrics, "system.disk.io", @@ -866,6 +896,7 @@ struct ProcessStats { #[derive(Default)] struct DiskStats { name: String, + limit_bytes: Option, read_bytes: 
u64, write_bytes: u64, read_ops: u64, @@ -1150,6 +1181,7 @@ fn parse_diskstats( }; disks.push(DiskStats { name: name.to_owned(), + limit_bytes: None, read_ops: parse_u64(read_ops), read_bytes: parse_u64(read_sectors).saturating_mul(DISKSTAT_SECTOR_BYTES), write_ops: parse_u64(write_ops), @@ -1352,6 +1384,28 @@ fn push_updown_single_u64( ); } +fn push_updown_single_u64_with_device( + metrics: &mut Vec, + name: &'static str, + unit: &'static str, + start: u64, + now: u64, + device: &str, + value: u64, +) { + push_updown_metric( + metrics, + name, + unit, + vec![number_point_i64( + vec![kv_str("system.device", device)], + start, + now, + saturating_i64(value), + )], + ); +} + fn push_cpu_frequency(metrics: &mut Vec, now: u64, frequencies_hz: &[f64]) { if frequencies_hz.is_empty() { return; @@ -1817,6 +1871,7 @@ mod tests { }), disks: vec![DiskStats { name: "sda".to_owned(), + limit_bytes: Some(123), read_bytes: 10, write_bytes: 20, read_ops: 1, @@ -1898,6 +1953,8 @@ mod tests { assert_first_point_attr(metrics, "system.disk.io_time", "system.device", "sda"); assert_metric_shape(metrics, "system.disk.operation_time", "s", Some(true)); assert_metric_shape(metrics, "system.disk.merged", "{operation}", Some(true)); + assert_metric_shape(metrics, "system.disk.limit", "By", Some(false)); + assert_first_point_attr(metrics, "system.disk.limit", "system.device", "sda"); assert_metric_shape(metrics, "system.network.io", "By", Some(true)); assert_first_point_attr( metrics, @@ -1953,6 +2010,7 @@ mod tests { network: false, processes: false, cpu_utilization: false, + disk_limit: false, disk_include: None, disk_exclude: None, network_include: None, @@ -1989,6 +2047,7 @@ mod tests { network: false, processes: false, cpu_utilization: false, + disk_limit: false, disk_include: None, disk_exclude: None, network_include: None, @@ -2008,6 +2067,54 @@ mod tests { ); } + #[test] + fn scrape_due_reads_opt_in_disk_limit_from_sysfs() { + let root = tempfile::tempdir().expect("tempdir"); + let 
proc = root.path().join("proc"); + let sys_sda = root.path().join("sys/block/sda"); + std::fs::create_dir(&proc).expect("proc dir"); + std::fs::create_dir_all(&sys_sda).expect("sys block dir"); + std::fs::write( + proc.join("diskstats"), + "8 0 sda 1 0 2 3 4 0 5 6 0 0 0 0 0 0 0 0\n", + ) + .expect("diskstats"); + std::fs::write(sys_sda.join("size"), "4096\n").expect("disk size"); + let mut source = ProcfsSource::new( + Some(root.path()), + ProcfsConfig { + cpu: false, + memory: false, + paging: false, + system: false, + disk: true, + network: false, + processes: false, + cpu_utilization: false, + disk_limit: true, + disk_include: None, + disk_exclude: None, + network_include: None, + network_exclude: None, + validation: HostViewValidationMode::None, + }, + ) + .expect("source"); + + let scrape = source + .scrape_due(ProcfsFamilies { + disk: true, + ..ProcfsFamilies::default() + }) + .expect("disk scrape"); + + assert_eq!(scrape.snapshot.disks.len(), 1); + assert_eq!( + scrape.snapshot.disks[0].limit_bytes, + Some(4096 * DISKSTAT_SECTOR_BYTES) + ); + } + #[test] fn cpu_parser_accepts_missing_newer_fields() { let cpu = parse_cpu_total("10 20 30 40", 10.0).expect("cpu row"); From 8e747e43118eefb04222cb66744d2a0e1ebc59ef Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 23:30:46 -0700 Subject: [PATCH 17/60] Add host filesystem metrics --- .../receivers/host_metrics_receiver/mod.rs | 98 +++- .../receivers/host_metrics_receiver/procfs.rs | 503 +++++++++++++++++- 2 files changed, 599 insertions(+), 2 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 5b85643c71..f61a9ef5af 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -171,6 +171,8 @@ pub struct FamiliesConfig { pub system: 
FamilyConfig, /// Disk metrics. pub disk: DiskFamilyConfig, + /// Filesystem metrics. + pub filesystem: FilesystemFamilyConfig, /// Network metrics. pub network: NetworkFamilyConfig, /// Process summary metrics. @@ -184,6 +186,7 @@ impl FamiliesConfig { + usize::from(self.paging.enabled) + usize::from(self.system.enabled) + usize::from(self.disk.enabled) + + usize::from(self.filesystem.enabled) + usize::from(self.network.enabled) + usize::from(self.processes.enabled) } @@ -264,6 +267,32 @@ impl Default for DiskFamilyConfig { } } +/// Filesystem family config. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct FilesystemFamilyConfig { + /// Enable filesystem metrics. + pub enabled: bool, + /// Family collection interval. Defaults to top-level `collection_interval`. + #[serde(default, with = "humantime_serde::option")] + pub interval: Option, + /// Include virtual filesystems. + pub include_virtual_filesystems: bool, + /// Enable filesystem limit metrics. + pub limit: bool, +} + +impl Default for FilesystemFamilyConfig { + fn default() -> Self { + Self { + enabled: true, + interval: None, + include_virtual_filesystems: false, + limit: false, + } + } +} + /// Network family config. 
#[derive(Clone, Debug, Deserialize, Serialize)] #[serde(default, deny_unknown_fields)] @@ -376,6 +405,7 @@ struct RuntimeFamilies { paging: RuntimeFamily, system: RuntimeFamily, disk: RuntimeDiskFamily, + filesystem: RuntimeFilesystemFamily, network: RuntimeNetworkFamily, processes: RuntimeFamily, } @@ -395,6 +425,14 @@ struct RuntimeDiskFamily { exclude: Option, } +#[derive(Clone)] +struct RuntimeFilesystemFamily { + enabled: bool, + interval: Duration, + include_virtual_filesystems: bool, + limit: bool, +} + #[derive(Clone)] struct RuntimeNetworkFamily { enabled: bool, @@ -595,6 +633,11 @@ fn validate_config(config: &Config) -> Result<(), otap_df_config::error::Error> config.families.disk.enabled, config.families.disk.interval, )?; + validate_family_interval( + "filesystem", + config.families.filesystem.enabled, + config.families.filesystem.interval, + )?; validate_family_interval( "network", config.families.network.enabled, @@ -732,6 +775,19 @@ impl TryFrom for RuntimeConfig { include: disk_include, exclude: disk_exclude, }, + filesystem: RuntimeFilesystemFamily { + enabled: config.families.filesystem.enabled, + interval: config + .families + .filesystem + .interval + .unwrap_or(config.collection_interval), + include_virtual_filesystems: config + .families + .filesystem + .include_virtual_filesystems, + limit: config.families.filesystem.limit, + }, network: RuntimeNetworkFamily { enabled: config.families.network.enabled, interval: config @@ -778,6 +834,7 @@ enum ScheduledFamilyKind { Paging, System, Disk, + Filesystem, Network, Processes, } @@ -795,7 +852,7 @@ struct FamilyScheduler { impl FamilyScheduler { fn new(config: &RuntimeConfig, now: Instant) -> Self { let first_due = now + config.initial_delay; - let mut entries = Vec::with_capacity(7); + let mut entries = Vec::with_capacity(8); push_scheduled( &mut entries, ScheduledFamilyKind::Cpu, @@ -827,6 +884,13 @@ impl FamilyScheduler { next_due: first_due, }); } + if config.families.filesystem.enabled { + 
entries.push(ScheduledFamily { + kind: ScheduledFamilyKind::Filesystem, + interval: config.families.filesystem.interval, + next_due: first_due, + }); + } if config.families.network.enabled { entries.push(ScheduledFamily { kind: ScheduledFamilyKind::Network, @@ -861,6 +925,7 @@ impl FamilyScheduler { ScheduledFamilyKind::Paging => due.paging = true, ScheduledFamilyKind::System => due.system = true, ScheduledFamilyKind::Disk => due.disk = true, + ScheduledFamilyKind::Filesystem => due.filesystem = true, ScheduledFamilyKind::Network => due.network = true, ScheduledFamilyKind::Processes => due.processes = true, } @@ -969,10 +1034,13 @@ impl local::Receiver for HostMetricsReceiver { paging: config.families.paging.enabled, system: config.families.system.enabled, disk: config.families.disk.enabled, + filesystem: config.families.filesystem.enabled, network: config.families.network.enabled, processes: config.families.processes.enabled, cpu_utilization: config.cpu_utilization, disk_limit: config.families.disk.limit, + filesystem_include_virtual: config.families.filesystem.include_virtual_filesystems, + filesystem_limit: config.families.filesystem.limit, disk_include: config.families.disk.include.clone(), disk_exclude: config.families.disk.exclude.clone(), network_include: config.families.network.include.clone(), @@ -1144,6 +1212,10 @@ mod tests { enabled: false, ..DiskFamilyConfig::default() }, + filesystem: FilesystemFamilyConfig { + enabled: false, + ..FilesystemFamilyConfig::default() + }, network: NetworkFamilyConfig { enabled: false, ..NetworkFamilyConfig::default() @@ -1245,6 +1317,28 @@ mod tests { validate_config(&config).expect("valid config"); } + #[test] + fn accepts_filesystem_options() { + let config: Config = serde_json::from_value(serde_json::json!({ + "families": { + "filesystem": { + "interval": "5m", + "include_virtual_filesystems": true, + "limit": true + } + } + })) + .expect("valid filesystem config"); + + assert_eq!( + 
config.families.filesystem.interval, + Some(Duration::from_secs(300)) + ); + assert!(config.families.filesystem.include_virtual_filesystems); + assert!(config.families.filesystem.limit); + validate_config(&config).expect("valid config"); + } + #[test] fn rejects_v1_cpu_per_cpu() { let config = Config { @@ -1339,11 +1433,13 @@ mod tests { assert!(first_due.cpu); assert!(first_due.memory); assert!(first_due.disk); + assert!(first_due.filesystem); let second_due = scheduler.mark_due(now + Duration::from_secs(6)); assert!(second_due.cpu); assert!(!second_due.memory); assert!(!second_due.disk); + assert!(!second_due.filesystem); } #[test] diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 874d764366..932fba26fb 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -17,11 +17,13 @@ use std::collections::HashSet; use std::fs::File; use std::io::{self, Read}; use std::path::{Path, PathBuf}; -use std::time::{SystemTime, UNIX_EPOCH}; +use std::sync::mpsc; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; const NANOS_PER_SEC: u64 = 1_000_000_000; const BYTES_PER_KIB: u64 = 1024; const DISKSTAT_SECTOR_BYTES: u64 = 512; +const FILESYSTEM_STAT_TIMEOUT: Duration = Duration::from_millis(100); /// Procfs-backed source for host metrics. pub struct ProcfsSource { @@ -30,6 +32,7 @@ pub struct ProcfsSource { buf: String, clk_tck: f64, previous_cpu: Option, + filesystem_worker: FilesystemStatWorker, } /// Procfs collection config. @@ -44,6 +47,8 @@ pub struct ProcfsConfig { pub system: bool, /// Disk metrics. pub disk: bool, + /// Filesystem metrics. + pub filesystem: bool, /// Network metrics. pub network: bool, /// Process summary metrics. 
@@ -52,6 +57,10 @@ pub struct ProcfsConfig { pub cpu_utilization: bool, /// Derived disk limit from sysfs block device size. pub disk_limit: bool, + /// Include virtual filesystems. + pub filesystem_include_virtual: bool, + /// Emit filesystem limit metric. + pub filesystem_limit: bool, /// Disk include filter. pub disk_include: Option, /// Disk exclude filter. @@ -77,6 +86,8 @@ pub struct ProcfsFamilies { pub system: bool, /// Disk metrics. pub disk: bool, + /// Filesystem metrics. + pub filesystem: bool, /// Network metrics. pub network: bool, /// Process summary metrics. @@ -92,6 +103,7 @@ impl ProcfsSource { buf: String::with_capacity(16 * 1024), clk_tck: clock_ticks_per_second(), previous_cpu: None, + filesystem_worker: FilesystemStatWorker::new(), }; source.apply_startup_validation()?; Ok(source) @@ -105,6 +117,7 @@ impl ProcfsSource { paging: due.paging && self.config.paging, system: due.system && self.config.system, disk: due.disk && self.config.disk, + filesystem: due.filesystem && self.config.filesystem, network: due.network && self.config.network, processes: due.processes && self.config.processes, }; @@ -239,6 +252,23 @@ impl ProcfsSource { Vec::new() }; + let filesystems = if due.filesystem { + let include_virtual = self.config.filesystem_include_virtual; + let emit_limit = self.config.filesystem_limit; + match self.read_path(PathKind::Mountinfo) { + Ok(mountinfo) => { + let mounts = parse_mountinfo(mountinfo, include_virtual, emit_limit); + self.read_filesystems(mounts) + } + Err(err) => { + record_partial_error(&mut partial_errors, &mut first_error, err); + Vec::new() + } + } + } else { + Vec::new() + }; + let resource = self.read_resource(); let snapshot = HostSnapshot { @@ -253,6 +283,7 @@ impl ProcfsSource { swaps, processes: due.processes.then_some(stat.processes), disks, + filesystems, networks, resource, }; @@ -297,6 +328,9 @@ impl ProcfsSource { if self.config.disk { let _ = File::open(self.paths.path(PathKind::Diskstats))?; } + if 
self.config.filesystem { + let _ = File::open(self.paths.path(PathKind::Mountinfo))?; + } if self.config.network { let _ = File::open(self.paths.path(PathKind::NetDev))?; } @@ -328,6 +362,9 @@ impl ProcfsSource { if self.config.disk && !self.source_available(PathKind::Diskstats) { self.config.disk = false; } + if self.config.filesystem && !self.source_available(PathKind::Mountinfo) { + self.config.filesystem = false; + } if self.config.network && !self.source_available(PathKind::NetDev) { self.config.network = false; } @@ -352,6 +389,33 @@ impl ProcfsSource { Ok(sectors.saturating_mul(DISKSTAT_SECTOR_BYTES)) } + fn read_filesystems(&mut self, mounts: Vec) -> Vec { + let mut filesystems = Vec::with_capacity(mounts.len()); + for mount in mounts { + let path = self.paths.host_path(&mount.mountpoint); + let Ok(stat) = self + .filesystem_worker + .statvfs(path, FILESYSTEM_STAT_TIMEOUT) + else { + continue; + }; + let free = stat.available_bytes; + let reserved = stat.free_bytes.saturating_sub(stat.available_bytes); + let used = stat.total_bytes.saturating_sub(stat.free_bytes); + filesystems.push(FilesystemStats { + device: mount.device, + mountpoint: mount.mountpoint, + fs_type: mount.fs_type, + mode: mount.mode, + used, + free, + reserved, + limit_bytes: mount.emit_limit.then_some(stat.total_bytes), + }); + } + filesystems + } + fn read_resource(&mut self) -> HostResource { HostResource { host_id: self @@ -373,6 +437,7 @@ impl ProcfsSource { #[derive(Clone, Debug)] struct ProcfsPaths { + root: PathBuf, stat: PathBuf, cpuinfo: PathBuf, meminfo: PathBuf, @@ -380,6 +445,7 @@ struct ProcfsPaths { vmstat: PathBuf, swaps: PathBuf, diskstats: PathBuf, + mountinfo: PathBuf, sys_block: PathBuf, net_dev: PathBuf, machine_id: PathBuf, @@ -392,6 +458,7 @@ impl ProcfsPaths { let root = root_path.unwrap_or_else(|| Path::new("/")); let host_root = root_path.is_some_and(|path| path != Path::new("/")); Self { + root: root.to_path_buf(), stat: root.join("proc/stat"), cpuinfo: 
root.join("proc/cpuinfo"), meminfo: root.join("proc/meminfo"), @@ -399,6 +466,11 @@ impl ProcfsPaths { vmstat: root.join("proc/vmstat"), swaps: root.join("proc/swaps"), diskstats: root.join("proc/diskstats"), + mountinfo: if host_root { + root.join("proc/1/mountinfo") + } else { + root.join("proc/self/mountinfo") + }, sys_block: root.join("sys/block"), machine_id: root.join("etc/machine-id"), dbus_machine_id: root.join("var/lib/dbus/machine-id"), @@ -420,12 +492,20 @@ impl ProcfsPaths { PathKind::Vmstat => &self.vmstat, PathKind::Swaps => &self.swaps, PathKind::Diskstats => &self.diskstats, + PathKind::Mountinfo => &self.mountinfo, PathKind::NetDev => &self.net_dev, PathKind::MachineId => &self.machine_id, PathKind::DbusMachineId => &self.dbus_machine_id, PathKind::Hostname => &self.hostname, } } + + fn host_path(&self, host_absolute_path: &str) -> PathBuf { + let relative = host_absolute_path + .strip_prefix('/') + .unwrap_or(host_absolute_path); + self.root.join(relative) + } } #[derive(Copy, Clone)] @@ -437,6 +517,7 @@ enum PathKind { Vmstat, Swaps, Diskstats, + Mountinfo, NetDev, MachineId, DbusMachineId, @@ -465,6 +546,7 @@ pub struct HostSnapshot { swaps: Vec, processes: Option, disks: Vec, + filesystems: Vec, networks: Vec, resource: HostResource, } @@ -482,6 +564,7 @@ impl HostSnapshot { || !self.swaps.is_empty() || self.processes.is_some() || !self.disks.is_empty() + || !self.filesystems.is_empty() || !self.networks.is_empty() } @@ -735,6 +818,14 @@ impl HostSnapshot { ); } + for filesystem in self.filesystems { + push_filesystem_usage(&mut metrics, start, now, &filesystem); + push_filesystem_utilization(&mut metrics, now, &filesystem); + if let Some(limit_bytes) = filesystem.limit_bytes { + push_filesystem_limit(&mut metrics, start, now, &filesystem, limit_bytes); + } + } + for network in self.networks { push_network_sum( &mut metrics, @@ -908,6 +999,67 @@ struct DiskStats { io_time_seconds: f64, } +#[derive(Default)] +struct FilesystemStats { + device: 
String, + mountpoint: String, + fs_type: String, + mode: &'static str, + used: u64, + free: u64, + reserved: u64, + limit_bytes: Option, +} + +struct FilesystemStatWorker { + tx: mpsc::SyncSender, +} + +struct FilesystemStatRequest { + path: PathBuf, + response: mpsc::Sender>, +} + +struct FilesystemStat { + total_bytes: u64, + free_bytes: u64, + available_bytes: u64, +} + +impl FilesystemStatWorker { + fn new() -> Self { + let (tx, rx) = mpsc::sync_channel::(1); + let _ = std::thread::Builder::new() + .name("host-metrics-statvfs".to_owned()) + .spawn(move || { + while let Ok(request) = rx.recv() { + let result = statvfs_bytes(&request.path); + let _ = request.response.send(result); + } + }); + Self { tx } + } + + fn statvfs(&self, path: PathBuf, timeout: Duration) -> io::Result { + let (response, rx) = mpsc::channel(); + self.tx + .try_send(FilesystemStatRequest { path, response }) + .map_err(|_| io::Error::new(io::ErrorKind::TimedOut, "statvfs worker is busy"))?; + rx.recv_timeout(timeout) + .map_err(|_| io::Error::new(io::ErrorKind::TimedOut, "statvfs timed out"))? 
+ } +} + +fn statvfs_bytes(path: &Path) -> io::Result { + let stat = nix::sys::statvfs::statvfs(path).map_err(io::Error::other)?; + let block_size = stat.fragment_size(); + Ok(FilesystemStat { + total_bytes: u64::from(stat.blocks()).saturating_mul(block_size), + free_bytes: u64::from(stat.blocks_free()).saturating_mul(block_size), + available_bytes: u64::from(stat.blocks_available()).saturating_mul(block_size), + }) +} + #[derive(Default)] struct NetworkStats { name: String, @@ -1196,6 +1348,124 @@ fn parse_diskstats( disks } +struct FilesystemMount { + device: String, + mountpoint: String, + fs_type: String, + mode: &'static str, + emit_limit: bool, +} + +fn parse_mountinfo( + input: &str, + include_virtual_filesystems: bool, + emit_limit: bool, +) -> Vec { + let mut mounts = Vec::new(); + for line in input.lines() { + let Some(separator) = line.find(" - ") else { + continue; + }; + let mut pre_fields = line[..separator].split_whitespace(); + let _mount_id = pre_fields.next(); + let _parent_id = pre_fields.next(); + let _major_minor = pre_fields.next(); + let _root = pre_fields.next(); + let Some(mountpoint) = pre_fields.next() else { + continue; + }; + let Some(options) = pre_fields.next() else { + continue; + }; + + let mut post_fields = line[separator + 3..].split_whitespace(); + let Some(fs_type) = post_fields.next() else { + continue; + }; + let Some(device) = post_fields.next() else { + continue; + }; + if !include_virtual_filesystems && is_skipped_filesystem_type(fs_type) { + continue; + } + mounts.push(FilesystemMount { + device: unescape_mountinfo(device), + mountpoint: unescape_mountinfo(mountpoint), + fs_type: fs_type.to_owned(), + mode: filesystem_mode(options), + emit_limit, + }); + } + mounts +} + +fn filesystem_mode(options: &str) -> &'static str { + if options.split(',').any(|option| option == "ro") { + "ro" + } else { + "rw" + } +} + +fn is_skipped_filesystem_type(fs_type: &str) -> bool { + matches!( + fs_type, + "autofs" + | "bpf" + | 
"binfmt_misc" + | "cgroup" + | "cgroup2" + | "debugfs" + | "devtmpfs" + | "fusectl" + | "mqueue" + | "nsfs" + | "overlay" + | "proc" + | "pstore" + | "squashfs" + | "sysfs" + | "tmpfs" + | "tracefs" + | "nfs" + | "nfs4" + | "cifs" + | "smb3" + | "9p" + ) +} + +fn unescape_mountinfo(input: &str) -> String { + let bytes = input.as_bytes(); + let mut escaped = None; + for idx in 0..bytes.len() { + if bytes[idx] == b'\\' && idx + 3 < bytes.len() { + escaped = Some(idx); + break; + } + } + let Some(first_escape) = escaped else { + return input.to_owned(); + }; + + let mut output = String::with_capacity(input.len()); + output.push_str(&input[..first_escape]); + let mut idx = first_escape; + while idx < bytes.len() { + if bytes[idx] == b'\\' && idx + 3 < bytes.len() { + let octal = &input[idx + 1..idx + 4]; + if let Ok(value) = u8::from_str_radix(octal, 8) { + output.push(value as char); + idx += 4; + continue; + } + } + output.push(bytes[idx] as char); + idx += 1; + } + output +} + fn parse_netdev( input: &str, include: Option<&CompiledFilter>, @@ -1644,6 +1914,130 @@ fn disk_number_point( } } +fn push_filesystem_usage( + metrics: &mut Vec, + start: u64, + now: u64, + filesystem: &FilesystemStats, +) { + let points = vec![ + filesystem_number_point( + filesystem, + "used", + start, + now, + FilesystemValue::Integer(filesystem.used), + ), + filesystem_number_point( + filesystem, + "free", + start, + now, + FilesystemValue::Integer(filesystem.free), + ), + filesystem_number_point( + filesystem, + "reserved", + start, + now, + FilesystemValue::Integer(filesystem.reserved), + ), + ]; + push_updown_metric(metrics, "system.filesystem.usage", "By", points); +} + +fn push_filesystem_utilization(metrics: &mut Vec, now: u64, filesystem: &FilesystemStats) { + let total = filesystem + .used + .saturating_add(filesystem.free) + .saturating_add(filesystem.reserved); + if total == 0 { + return; + } + let total = total as f64; + let points = vec![ + filesystem_number_point( + 
filesystem, + "used", + 0, + now, + FilesystemValue::Float(filesystem.used as f64 / total), + ), + filesystem_number_point( + filesystem, + "free", + 0, + now, + FilesystemValue::Float(filesystem.free as f64 / total), + ), + filesystem_number_point( + filesystem, + "reserved", + 0, + now, + FilesystemValue::Float(filesystem.reserved as f64 / total), + ), + ]; + metrics.push(Metric { + name: "system.filesystem.utilization".to_owned(), + description: String::new(), + unit: "1".to_owned(), + metadata: Vec::new(), + data: Some(metric::Data::Gauge(Gauge { + data_points: points, + })), + }); +} + +fn push_filesystem_limit( + metrics: &mut Vec, + start: u64, + now: u64, + filesystem: &FilesystemStats, + limit_bytes: u64, +) { + push_updown_metric( + metrics, + "system.filesystem.limit", + "By", + vec![filesystem_number_point( + filesystem, + "limit", + start, + now, + FilesystemValue::Integer(limit_bytes), + )], + ); +} + +#[derive(Copy, Clone)] +enum FilesystemValue { + Integer(u64), + Float(f64), +} + +fn filesystem_number_point( + filesystem: &FilesystemStats, + state: &'static str, + start: u64, + now: u64, + value: FilesystemValue, +) -> NumberDataPoint { + let attributes = vec![ + kv_str("system.device", &filesystem.device), + kv_str("system.filesystem.state", state), + kv_str("system.filesystem.type", &filesystem.fs_type), + kv_str("system.filesystem.mode", filesystem.mode), + kv_str("system.filesystem.mountpoint", &filesystem.mountpoint), + ]; + match value { + FilesystemValue::Integer(value) => { + number_point_i64(attributes, start, now, saturating_i64(value)) + } + FilesystemValue::Float(value) => number_point_f64(attributes, start, now, value), + } +} + fn push_network_sum( metrics: &mut Vec, name: &'static str, @@ -1882,6 +2276,16 @@ mod tests { write_time_seconds: 0.6, io_time_seconds: 0.7, }], + filesystems: vec![FilesystemStats { + device: "/dev/sda1".to_owned(), + mountpoint: "/".to_owned(), + fs_type: "ext4".to_owned(), + mode: "rw", + used: 60, + free: 
30, + reserved: 10, + limit_bytes: Some(100), + }], networks: vec![NetworkStats { name: "eth0".to_owned(), rx_bytes: 10, @@ -1955,6 +2359,15 @@ mod tests { assert_metric_shape(metrics, "system.disk.merged", "{operation}", Some(true)); assert_metric_shape(metrics, "system.disk.limit", "By", Some(false)); assert_first_point_attr(metrics, "system.disk.limit", "system.device", "sda"); + assert_metric_shape(metrics, "system.filesystem.usage", "By", Some(false)); + assert_first_point_attr( + metrics, + "system.filesystem.usage", + "system.filesystem.state", + "used", + ); + assert_metric_shape(metrics, "system.filesystem.utilization", "1", None); + assert_metric_shape(metrics, "system.filesystem.limit", "By", Some(false)); assert_metric_shape(metrics, "system.network.io", "By", Some(true)); assert_first_point_attr( metrics, @@ -2007,10 +2420,13 @@ mod tests { paging: false, system: false, disk: true, + filesystem: false, network: false, processes: false, cpu_utilization: false, disk_limit: false, + filesystem_include_virtual: false, + filesystem_limit: false, disk_include: None, disk_exclude: None, network_include: None, @@ -2044,10 +2460,13 @@ mod tests { paging: false, system: false, disk: false, + filesystem: false, network: false, processes: false, cpu_utilization: false, disk_limit: false, + filesystem_include_virtual: false, + filesystem_limit: false, disk_include: None, disk_exclude: None, network_include: None, @@ -2088,10 +2507,13 @@ mod tests { paging: false, system: false, disk: true, + filesystem: false, network: false, processes: false, cpu_utilization: false, disk_limit: true, + filesystem_include_virtual: false, + filesystem_limit: false, disk_include: None, disk_exclude: None, network_include: None, @@ -2115,6 +2537,54 @@ mod tests { ); } + #[test] + fn scrape_due_reads_filesystem_usage_from_mountinfo() { + let root = tempfile::tempdir().expect("tempdir"); + let proc_one = root.path().join("proc/1"); + std::fs::create_dir_all(&proc_one).expect("proc one 
dir"); + std::fs::write( + proc_one.join("mountinfo"), + "36 25 8:1 / / rw,relatime - ext4 /dev/sda1 rw\n", + ) + .expect("mountinfo"); + let mut source = ProcfsSource::new( + Some(root.path()), + ProcfsConfig { + cpu: false, + memory: false, + paging: false, + system: false, + disk: false, + filesystem: true, + network: false, + processes: false, + cpu_utilization: false, + disk_limit: false, + filesystem_include_virtual: false, + filesystem_limit: true, + disk_include: None, + disk_exclude: None, + network_include: None, + network_exclude: None, + validation: HostViewValidationMode::None, + }, + ) + .expect("source"); + + let scrape = source + .scrape_due(ProcfsFamilies { + filesystem: true, + ..ProcfsFamilies::default() + }) + .expect("filesystem scrape"); + + assert_eq!(scrape.snapshot.filesystems.len(), 1); + assert_eq!(scrape.snapshot.filesystems[0].device, "/dev/sda1"); + assert_eq!(scrape.snapshot.filesystems[0].mountpoint, "/"); + assert_eq!(scrape.snapshot.filesystems[0].fs_type, "ext4"); + assert!(scrape.snapshot.filesystems[0].limit_bytes.is_some()); + } + #[test] fn cpu_parser_accepts_missing_newer_fields() { let cpu = parse_cpu_total("10 20 30 40", 10.0).expect("cpu row"); @@ -2235,6 +2705,35 @@ mod tests { assert_eq!(disks[0].name, "sda"); } + #[test] + fn mountinfo_parser_skips_virtual_filesystems_by_default() { + let mounts = parse_mountinfo( + "36 25 8:1 / / rw,relatime - ext4 /dev/sda1 rw\n37 25 0:32 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw\n", + false, + true, + ); + + assert_eq!(mounts.len(), 1); + assert_eq!(mounts[0].device, "/dev/sda1"); + assert_eq!(mounts[0].mountpoint, "/"); + assert_eq!(mounts[0].fs_type, "ext4"); + assert_eq!(mounts[0].mode, "rw"); + assert!(mounts[0].emit_limit); + } + + #[test] + fn mountinfo_parser_unescapes_paths() { + let mounts = parse_mountinfo( + "36 25 8:1 / /mnt/data\\040disk rw,relatime - ext4 /dev/disk\\040one rw\n", + false, + false, + ); + + assert_eq!(mounts.len(), 1); + 
assert_eq!(mounts[0].device, "/dev/disk one"); + assert_eq!(mounts[0].mountpoint, "/mnt/data disk"); + } + #[test] fn netdev_parser_reads_device_counters() { let interfaces = parse_netdev( @@ -2272,12 +2771,14 @@ mod tests { fn root_path_uses_host_pid_one_netdev() { let paths = ProcfsPaths::new(Some(Path::new("/host"))); assert_eq!(paths.net_dev, PathBuf::from("/host/proc/1/net/dev")); + assert_eq!(paths.mountinfo, PathBuf::from("/host/proc/1/mountinfo")); } #[test] fn root_slash_uses_current_proc_netdev() { let paths = ProcfsPaths::new(Some(Path::new("/"))); assert_eq!(paths.net_dev, PathBuf::from("/proc/net/dev")); + assert_eq!(paths.mountinfo, PathBuf::from("/proc/self/mountinfo")); } #[test] From 494fdaae8b5eae264638fcece74f1e207d34b3d7 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 23:34:15 -0700 Subject: [PATCH 18/60] Add filesystem filters --- .../receivers/host_metrics_receiver/mod.rs | 155 +++++++++++++++++- .../receivers/host_metrics_receiver/procfs.rs | 114 ++++++++++++- 2 files changed, 265 insertions(+), 4 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index f61a9ef5af..bd61ace35c 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -280,6 +280,18 @@ pub struct FilesystemFamilyConfig { pub include_virtual_filesystems: bool, /// Enable filesystem limit metrics. pub limit: bool, + /// Device include filter. + pub include_devices: Option, + /// Device exclude filter. + pub exclude_devices: Option, + /// Filesystem type include filter. + pub include_fs_types: Option, + /// Filesystem type exclude filter. + pub exclude_fs_types: Option, + /// Mount point include filter. + pub include_mount_points: Option, + /// Mount point exclude filter. 
+ pub exclude_mount_points: Option, } impl Default for FilesystemFamilyConfig { @@ -289,6 +301,50 @@ impl Default for FilesystemFamilyConfig { interval: None, include_virtual_filesystems: false, limit: false, + include_devices: None, + exclude_devices: None, + include_fs_types: None, + exclude_fs_types: None, + include_mount_points: None, + exclude_mount_points: None, + } + } +} + +/// Filesystem type filter config. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct FilesystemTypeFilterConfig { + /// Filesystem types. + pub fs_types: Vec, + /// Match type. + pub match_type: MatchType, +} + +impl Default for FilesystemTypeFilterConfig { + fn default() -> Self { + Self { + fs_types: Vec::new(), + match_type: MatchType::Strict, + } + } +} + +/// Mount point filter config. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct MountPointFilterConfig { + /// Mount points. + pub mount_points: Vec, + /// Match type. + pub match_type: MatchType, +} + +impl Default for MountPointFilterConfig { + fn default() -> Self { + Self { + mount_points: Vec::new(), + match_type: MatchType::Strict, } } } @@ -431,6 +487,12 @@ struct RuntimeFilesystemFamily { interval: Duration, include_virtual_filesystems: bool, limit: bool, + include_devices: Option, + exclude_devices: Option, + include_fs_types: Option, + exclude_fs_types: Option, + include_mount_points: Option, + exclude_mount_points: Option, } #[derive(Clone)] @@ -715,6 +777,7 @@ fn due_family_count(due: ProcfsFamilies) -> u64 { + u64::from(due.paging) + u64::from(due.system) + u64::from(due.disk) + + u64::from(due.filesystem) + u64::from(due.network) + u64::from(due.processes) } @@ -739,6 +802,48 @@ impl TryFrom for RuntimeConfig { .map(|filter| CompiledFilter::compile(filter.match_type, filter.devices)) .transpose()? 
.flatten(); + let filesystem_include_devices = config + .families + .filesystem + .include_devices + .map(|filter| CompiledFilter::compile(filter.match_type, filter.devices)) + .transpose()? + .flatten(); + let filesystem_exclude_devices = config + .families + .filesystem + .exclude_devices + .map(|filter| CompiledFilter::compile(filter.match_type, filter.devices)) + .transpose()? + .flatten(); + let filesystem_include_fs_types = config + .families + .filesystem + .include_fs_types + .map(|filter| CompiledFilter::compile(filter.match_type, filter.fs_types)) + .transpose()? + .flatten(); + let filesystem_exclude_fs_types = config + .families + .filesystem + .exclude_fs_types + .map(|filter| CompiledFilter::compile(filter.match_type, filter.fs_types)) + .transpose()? + .flatten(); + let filesystem_include_mount_points = config + .families + .filesystem + .include_mount_points + .map(|filter| CompiledFilter::compile(filter.match_type, filter.mount_points)) + .transpose()? + .flatten(); + let filesystem_exclude_mount_points = config + .families + .filesystem + .exclude_mount_points + .map(|filter| CompiledFilter::compile(filter.match_type, filter.mount_points)) + .transpose()? 
+ .flatten(); let network_include = config .families .network @@ -787,6 +892,12 @@ impl TryFrom for RuntimeConfig { .filesystem .include_virtual_filesystems, limit: config.families.filesystem.limit, + include_devices: filesystem_include_devices, + exclude_devices: filesystem_exclude_devices, + include_fs_types: filesystem_include_fs_types, + exclude_fs_types: filesystem_exclude_fs_types, + include_mount_points: filesystem_include_mount_points, + exclude_mount_points: filesystem_exclude_mount_points, }, network: RuntimeNetworkFamily { enabled: config.families.network.enabled, @@ -1043,6 +1154,20 @@ impl local::Receiver for HostMetricsReceiver { filesystem_limit: config.families.filesystem.limit, disk_include: config.families.disk.include.clone(), disk_exclude: config.families.disk.exclude.clone(), + filesystem_include_devices: config.families.filesystem.include_devices.clone(), + filesystem_exclude_devices: config.families.filesystem.exclude_devices.clone(), + filesystem_include_fs_types: config.families.filesystem.include_fs_types.clone(), + filesystem_exclude_fs_types: config.families.filesystem.exclude_fs_types.clone(), + filesystem_include_mount_points: config + .families + .filesystem + .include_mount_points + .clone(), + filesystem_exclude_mount_points: config + .families + .filesystem + .exclude_mount_points + .clone(), network_include: config.families.network.include.clone(), network_exclude: config.families.network.exclude.clone(), validation: config.validation, @@ -1324,7 +1449,15 @@ mod tests { "filesystem": { "interval": "5m", "include_virtual_filesystems": true, - "limit": true + "limit": true, + "exclude_fs_types": { + "fs_types": ["tmpfs"], + "match_type": "strict" + }, + "include_mount_points": { + "mount_points": ["/", "/data*"], + "match_type": "glob" + } } } })) @@ -1336,6 +1469,26 @@ mod tests { ); assert!(config.families.filesystem.include_virtual_filesystems); assert!(config.families.filesystem.limit); + assert_eq!( + config + .families + 
.filesystem + .exclude_fs_types + .as_ref() + .expect("exclude fs types") + .fs_types, + vec!["tmpfs".to_owned()] + ); + assert_eq!( + config + .families + .filesystem + .include_mount_points + .as_ref() + .expect("include mount points") + .mount_points, + vec!["/".to_owned(), "/data*".to_owned()] + ); validate_config(&config).expect("valid config"); } diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 932fba26fb..a18415c076 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -65,6 +65,18 @@ pub struct ProcfsConfig { pub disk_include: Option, /// Disk exclude filter. pub disk_exclude: Option, + /// Filesystem device include filter. + pub filesystem_include_devices: Option, + /// Filesystem device exclude filter. + pub filesystem_exclude_devices: Option, + /// Filesystem type include filter. + pub filesystem_include_fs_types: Option, + /// Filesystem type exclude filter. + pub filesystem_exclude_fs_types: Option, + /// Filesystem mount point include filter. + pub filesystem_include_mount_points: Option, + /// Filesystem mount point exclude filter. + pub filesystem_exclude_mount_points: Option, /// Network include filter. pub network_include: Option, /// Network exclude filter. 
@@ -255,9 +267,23 @@ impl ProcfsSource { let filesystems = if due.filesystem { let include_virtual = self.config.filesystem_include_virtual; let emit_limit = self.config.filesystem_limit; + let include_devices = self.config.filesystem_include_devices.clone(); + let exclude_devices = self.config.filesystem_exclude_devices.clone(); + let include_fs_types = self.config.filesystem_include_fs_types.clone(); + let exclude_fs_types = self.config.filesystem_exclude_fs_types.clone(); + let include_mount_points = self.config.filesystem_include_mount_points.clone(); + let exclude_mount_points = self.config.filesystem_exclude_mount_points.clone(); match self.read_path(PathKind::Mountinfo) { Ok(mountinfo) => { - let mounts = parse_mountinfo(mountinfo, include_virtual, emit_limit); + let filters = FilesystemFilters { + include_devices: include_devices.as_ref(), + exclude_devices: exclude_devices.as_ref(), + include_fs_types: include_fs_types.as_ref(), + exclude_fs_types: exclude_fs_types.as_ref(), + include_mount_points: include_mount_points.as_ref(), + exclude_mount_points: exclude_mount_points.as_ref(), + }; + let mounts = parse_mountinfo(mountinfo, include_virtual, emit_limit, filters); self.read_filesystems(mounts) } Err(err) => { @@ -1356,10 +1382,21 @@ struct FilesystemMount { emit_limit: bool, } +#[derive(Clone, Copy, Default)] +struct FilesystemFilters<'a> { + include_devices: Option<&'a CompiledFilter>, + exclude_devices: Option<&'a CompiledFilter>, + include_fs_types: Option<&'a CompiledFilter>, + exclude_fs_types: Option<&'a CompiledFilter>, + include_mount_points: Option<&'a CompiledFilter>, + exclude_mount_points: Option<&'a CompiledFilter>, +} + fn parse_mountinfo( input: &str, include_virtual_filesystems: bool, emit_limit: bool, + filters: FilesystemFilters<'_>, ) -> Vec { let mut mounts = Vec::new(); for line in input.lines() { @@ -1388,9 +1425,24 @@ fn parse_mountinfo( if !include_virtual_filesystems && is_skipped_filesystem_type(fs_type) { continue; } + if 
!filter_allows(fs_type, filters.include_fs_types, filters.exclude_fs_types) { + continue; + } + let device = unescape_mountinfo(device); + if !filter_allows(&device, filters.include_devices, filters.exclude_devices) { + continue; + } + let mountpoint = unescape_mountinfo(mountpoint); + if !filter_allows( + &mountpoint, + filters.include_mount_points, + filters.exclude_mount_points, + ) { + continue; + } mounts.push(FilesystemMount { - device: unescape_mountinfo(device), - mountpoint: unescape_mountinfo(mountpoint), + device, + mountpoint, fs_type: fs_type.to_owned(), mode: filesystem_mode(options), emit_limit, @@ -2427,6 +2479,12 @@ mod tests { disk_limit: false, filesystem_include_virtual: false, filesystem_limit: false, + filesystem_include_devices: None, + filesystem_exclude_devices: None, + filesystem_include_fs_types: None, + filesystem_exclude_fs_types: None, + filesystem_include_mount_points: None, + filesystem_exclude_mount_points: None, disk_include: None, disk_exclude: None, network_include: None, @@ -2467,6 +2525,12 @@ mod tests { disk_limit: false, filesystem_include_virtual: false, filesystem_limit: false, + filesystem_include_devices: None, + filesystem_exclude_devices: None, + filesystem_include_fs_types: None, + filesystem_exclude_fs_types: None, + filesystem_include_mount_points: None, + filesystem_exclude_mount_points: None, disk_include: None, disk_exclude: None, network_include: None, @@ -2514,6 +2578,12 @@ mod tests { disk_limit: true, filesystem_include_virtual: false, filesystem_limit: false, + filesystem_include_devices: None, + filesystem_exclude_devices: None, + filesystem_include_fs_types: None, + filesystem_exclude_fs_types: None, + filesystem_include_mount_points: None, + filesystem_exclude_mount_points: None, disk_include: None, disk_exclude: None, network_include: None, @@ -2562,6 +2632,12 @@ mod tests { disk_limit: false, filesystem_include_virtual: false, filesystem_limit: true, + filesystem_include_devices: None, + 
filesystem_exclude_devices: None, + filesystem_include_fs_types: None, + filesystem_exclude_fs_types: None, + filesystem_include_mount_points: None, + filesystem_exclude_mount_points: None, disk_include: None, disk_exclude: None, network_include: None, @@ -2711,6 +2787,7 @@ mod tests { "36 25 8:1 / / rw,relatime - ext4 /dev/sda1 rw\n37 25 0:32 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw\n", false, true, + FilesystemFilters::default(), ); assert_eq!(mounts.len(), 1); @@ -2727,6 +2804,7 @@ mod tests { "36 25 8:1 / /mnt/data\\040disk rw,relatime - ext4 /dev/disk\\040one rw\n", false, false, + FilesystemFilters::default(), ); assert_eq!(mounts.len(), 1); @@ -2734,6 +2812,36 @@ mod tests { assert_eq!(mounts[0].mountpoint, "/mnt/data disk"); } + #[test] + fn mountinfo_parser_applies_filesystem_filters() { + let include_mounts = CompiledFilter::compile( + crate::receivers::host_metrics_receiver::MatchType::Glob, + vec!["/data*".to_owned()], + ) + .expect("valid") + .expect("filter"); + let exclude_fs_types = CompiledFilter::compile( + crate::receivers::host_metrics_receiver::MatchType::Strict, + vec!["xfs".to_owned()], + ) + .expect("valid") + .expect("filter"); + let mounts = parse_mountinfo( + "36 25 8:1 / / rw,relatime - ext4 /dev/sda1 rw\n37 25 8:2 / /data rw,relatime - ext4 /dev/sdb1 rw\n38 25 8:3 / /data2 rw,relatime - xfs /dev/sdc1 rw\n", + false, + false, + FilesystemFilters { + include_mount_points: Some(&include_mounts), + exclude_fs_types: Some(&exclude_fs_types), + ..FilesystemFilters::default() + }, + ); + + assert_eq!(mounts.len(), 1); + assert_eq!(mounts[0].device, "/dev/sdb1"); + assert_eq!(mounts[0].mountpoint, "/data"); + } + #[test] fn netdev_parser_reads_device_counters() { let interfaces = parse_netdev( From 8aeb3a896cb043458646b8a6b0779d14539caca5 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sat, 2 May 2026 23:37:56 -0700 Subject: [PATCH 19/60] Add memory opt-in metrics --- .../receivers/host_metrics_receiver/mod.rs | 67 
+++++++++++++++++-- .../receivers/host_metrics_receiver/procfs.rs | 52 ++++++++++++++ 2 files changed, 115 insertions(+), 4 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index bd61ace35c..b3ee8650dd 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -164,7 +164,7 @@ pub struct FamiliesConfig { /// CPU metrics. pub cpu: CpuFamilyConfig, /// Memory metrics. - pub memory: FamilyConfig, + pub memory: MemoryFamilyConfig, /// Paging metrics. pub paging: FamilyConfig, /// System metrics. @@ -238,6 +238,32 @@ impl Default for FamilyConfig { } } +/// Memory family config. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct MemoryFamilyConfig { + /// Enable memory metrics. + pub enabled: bool, + /// Family collection interval. Defaults to top-level `collection_interval`. + #[serde(default, with = "humantime_serde::option")] + pub interval: Option, + /// Enable memory limit metrics. + pub limit: bool, + /// Enable Linux shared memory metric. + pub shared: bool, +} + +impl Default for MemoryFamilyConfig { + fn default() -> Self { + Self { + enabled: true, + interval: None, + limit: false, + shared: false, + } + } +} + /// Disk family config. 
#[derive(Clone, Debug, Deserialize, Serialize)] #[serde(default, deny_unknown_fields)] @@ -451,6 +477,8 @@ struct RuntimeConfig { validation: HostViewValidationMode, initial_delay: Duration, cpu_utilization: bool, + memory_limit: bool, + memory_shared: bool, families: RuntimeFamilies, } @@ -864,9 +892,14 @@ impl TryFrom for RuntimeConfig { validation: config.host_view.validation, initial_delay: config.initial_delay, cpu_utilization: config.families.cpu.utilization, + memory_limit: config.families.memory.limit, + memory_shared: config.families.memory.shared, families: RuntimeFamilies { cpu: RuntimeFamily::new_cpu(&config.families.cpu, config.collection_interval), - memory: RuntimeFamily::new(&config.families.memory, config.collection_interval), + memory: RuntimeFamily::new_memory( + &config.families.memory, + config.collection_interval, + ), paging: RuntimeFamily::new(&config.families.paging, config.collection_interval), system: RuntimeFamily::new(&config.families.system, config.collection_interval), disk: RuntimeDiskFamily { @@ -936,6 +969,13 @@ impl RuntimeFamily { interval: config.interval.unwrap_or(default_interval), } } + + fn new_memory(config: &MemoryFamilyConfig, default_interval: Duration) -> Self { + Self { + enabled: config.enabled, + interval: config.interval.unwrap_or(default_interval), + } + } } #[derive(Clone, Copy, Debug, Eq, PartialEq)] @@ -1149,6 +1189,8 @@ impl local::Receiver for HostMetricsReceiver { network: config.families.network.enabled, processes: config.families.processes.enabled, cpu_utilization: config.cpu_utilization, + memory_limit: config.memory_limit, + memory_shared: config.memory_shared, disk_limit: config.families.disk.limit, filesystem_include_virtual: config.families.filesystem.include_virtual_filesystems, filesystem_limit: config.families.filesystem.limit, @@ -1321,9 +1363,9 @@ mod tests { enabled: false, ..CpuFamilyConfig::default() }, - memory: FamilyConfig { + memory: MemoryFamilyConfig { enabled: false, - 
..FamilyConfig::default() + ..MemoryFamilyConfig::default() }, paging: FamilyConfig { enabled: false, @@ -1427,6 +1469,23 @@ mod tests { validate_config(&config).expect("valid config"); } + #[test] + fn accepts_memory_opt_ins() { + let config: Config = serde_json::from_value(serde_json::json!({ + "families": { + "memory": { + "limit": true, + "shared": true + } + } + })) + .expect("valid memory config"); + + assert!(config.families.memory.limit); + assert!(config.families.memory.shared); + validate_config(&config).expect("valid config"); + } + #[test] fn accepts_disk_limit_opt_in() { let config: Config = serde_json::from_value(serde_json::json!({ diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index a18415c076..5ae3d45c38 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -55,6 +55,10 @@ pub struct ProcfsConfig { pub processes: bool, /// Derived aggregate CPU utilization. pub cpu_utilization: bool, + /// Emit memory limit metric. + pub memory_limit: bool, + /// Emit Linux shared memory metric. + pub memory_shared: bool, /// Derived disk limit from sysfs block device size. pub disk_limit: bool, /// Include virtual filesystems. 
@@ -300,6 +304,8 @@ impl ProcfsSource { let snapshot = HostSnapshot { now_unix_nano, start_time_unix_nano: stat.boot_time_unix_nano, + memory_limit: self.config.memory_limit, + memory_shared: self.config.memory_shared, cpu: stat.cpu, cpu_utilization, cpuinfo, @@ -563,6 +569,8 @@ pub struct HostScrape { pub struct HostSnapshot { now_unix_nano: u64, start_time_unix_nano: u64, + memory_limit: bool, + memory_shared: bool, cpu: Option, cpu_utilization: Option, cpuinfo: CpuInfo, @@ -709,6 +717,26 @@ impl HostSnapshot { ], "system.memory.linux.slab.state", ); + if self.memory_limit { + push_updown_single_u64( + &mut metrics, + "system.memory.limit", + "By", + start, + now, + memory.total, + ); + } + if self.memory_shared { + push_updown_single_u64( + &mut metrics, + "system.memory.linux.shared", + "By", + start, + now, + memory.shared, + ); + } } if let Some(uptime_seconds) = self.uptime_seconds { @@ -983,6 +1011,7 @@ struct MemoryStats { available: u64, cached: u64, buffered: u64, + shared: u64, slab_reclaimable: u64, slab_unreclaimable: u64, } @@ -1216,6 +1245,7 @@ fn parse_meminfo(input: &str) -> Option { let mut available = None; let mut buffers = 0; let mut cached = 0; + let mut shared = 0; let mut slab_reclaimable = 0; let mut slab_unreclaimable = 0; @@ -1231,6 +1261,7 @@ fn parse_meminfo(input: &str) -> Option { "MemAvailable" => available = Some(value), "Buffers" => buffers = value, "Cached" => cached = value, + "Shmem" => shared = value, "SReclaimable" => slab_reclaimable = value, "SUnreclaim" => slab_unreclaimable = value, _ => {} @@ -1249,6 +1280,7 @@ fn parse_meminfo(input: &str) -> Option { available, cached, buffered: buffers, + shared, slab_reclaimable, slab_unreclaimable, }) @@ -2264,6 +2296,8 @@ mod tests { let request = HostSnapshot { now_unix_nano: 2_000, start_time_unix_nano: 1_000, + memory_limit: true, + memory_shared: true, cpu: Some(CpuTimes { user: 1.0, nice: 2.0, @@ -2294,6 +2328,7 @@ mod tests { available: 20, cached: 5, buffered: 5, + shared: 
7, slab_reclaimable: 3, slab_unreclaimable: 2, }), @@ -2383,6 +2418,8 @@ mod tests { assert_metric_shape(metrics, "system.memory.utilization", "1", None); assert_metric_shape(metrics, "system.memory.linux.available", "By", Some(false)); assert_metric_shape(metrics, "system.memory.linux.slab.usage", "By", Some(false)); + assert_metric_shape(metrics, "system.memory.limit", "By", Some(false)); + assert_metric_shape(metrics, "system.memory.linux.shared", "By", Some(false)); assert_metric_shape(metrics, "system.uptime", "s", None); assert_metric_shape(metrics, "system.paging.faults", "{fault}", Some(true)); assert_first_point_attr( @@ -2476,6 +2513,8 @@ mod tests { network: false, processes: false, cpu_utilization: false, + memory_limit: false, + memory_shared: false, disk_limit: false, filesystem_include_virtual: false, filesystem_limit: false, @@ -2522,6 +2561,8 @@ mod tests { network: false, processes: false, cpu_utilization: false, + memory_limit: false, + memory_shared: false, disk_limit: false, filesystem_include_virtual: false, filesystem_limit: false, @@ -2575,6 +2616,8 @@ mod tests { network: false, processes: false, cpu_utilization: false, + memory_limit: false, + memory_shared: false, disk_limit: true, filesystem_include_virtual: false, filesystem_limit: false, @@ -2629,6 +2672,8 @@ mod tests { network: false, processes: false, cpu_utilization: false, + memory_limit: false, + memory_shared: false, disk_limit: false, filesystem_include_virtual: false, filesystem_limit: true, @@ -2730,6 +2775,13 @@ mod tests { assert_eq!(memory.used, 850 * BYTES_PER_KIB); } + #[test] + fn meminfo_parser_reads_shared_memory() { + let memory = + parse_meminfo("MemTotal: 1000 kB\nMemFree: 100 kB\nShmem: 12 kB\n").expect("memory"); + assert_eq!(memory.shared, 12 * BYTES_PER_KIB); + } + #[test] fn uptime_parser_reads_first_field() { assert_eq!(parse_uptime("123.45 67.89"), Some(123.45)); From 3aa1428da67040f6bfb5b9f81d26b8bd120de337 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: 
Sat, 2 May 2026 23:41:33 -0700 Subject: [PATCH 20/60] Fix host metrics config test initializer --- .../core-nodes/src/receivers/host_metrics_receiver/mod.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index b3ee8650dd..0a6e684dbc 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -1346,8 +1346,10 @@ mod tests { #[test] fn rejects_zero_collection_interval() { - let mut config = Config::default(); - config.collection_interval = Duration::ZERO; + let config = Config { + collection_interval: Duration::ZERO, + ..Config::default() + }; assert!(matches!( validate_config(&config), From ddef929801110f11b0d7e622efffb2ce29821564 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sun, 3 May 2026 00:03:07 -0700 Subject: [PATCH 21/60] Complete host metrics receiver phase 1 --- .../receivers/host_metrics_receiver/mod.rs | 15 +- .../receivers/host_metrics_receiver/procfs.rs | 485 +++++++++++++++++- 2 files changed, 478 insertions(+), 22 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 0a6e684dbc..e253f987e1 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -4,7 +4,6 @@ //! Host metrics receiver. 
use async_trait::async_trait; -use bytes::Bytes; use linkme::distributed_slice; use otap_df_config::node::NodeUserConfig; use otap_df_engine::MessageSourceLocalEffectHandlerExtension; @@ -19,13 +18,13 @@ use otap_df_engine::receiver::ReceiverWrapper; use otap_df_engine::terminal_state::TerminalState; use otap_df_otap::OTAP_RECEIVER_FACTORIES; use otap_df_otap::pdata::{Context, OtapPdata}; -use otap_df_pdata::OtlpProtoBytes; +use otap_df_pdata::encode::encode_metrics_otap_batch; use otap_df_pdata::otap::OtapArrowRecords; +use otap_df_pdata::proto::opentelemetry::metrics::v1::MetricsData; use otap_df_telemetry::instrument::{Counter, Mmsc}; use otap_df_telemetry::metrics::{MetricSet, MetricSetSnapshot}; use otap_df_telemetry::{otel_info, otel_warn}; use otap_df_telemetry_macros::metric_set; -use prost::Message as _; use regex::Regex; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ -1331,12 +1330,10 @@ impl local::Receiver for HostMetricsReceiver { fn encode_snapshot(snapshot: HostSnapshot) -> Result { let request = snapshot.into_export_request(); - let mut buf = Vec::with_capacity(request.encoded_len()); - request - .encode(&mut buf) - .expect("encoding metrics request to Vec cannot fail"); - let records: OtapArrowRecords = - OtlpProtoBytes::ExportMetricsRequest(Bytes::from(buf)).try_into()?; + let data = MetricsData { + resource_metrics: request.resource_metrics, + }; + let records: OtapArrowRecords = encode_metrics_otap_batch(&data)?; Ok(OtapPdata::new(Context::default(), records.into())) } diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 5ae3d45c38..f35253da6a 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -13,7 +13,7 @@ use otap_df_pdata::proto::opentelemetry::metrics::v1::{ 
metric, number_data_point, }; use otap_df_pdata::proto::opentelemetry::resource::v1::Resource; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::fs::File; use std::io::{self, Read}; use std::path::{Path, PathBuf}; @@ -33,6 +33,7 @@ pub struct ProcfsSource { clk_tck: f64, previous_cpu: Option, filesystem_worker: FilesystemStatWorker, + counter_tracker: CounterTracker, } /// Procfs collection config. @@ -120,6 +121,7 @@ impl ProcfsSource { clk_tck: clock_ticks_per_second(), previous_cpu: None, filesystem_worker: FilesystemStatWorker::new(), + counter_tracker: CounterTracker::default(), }; source.apply_startup_validation()?; Ok(source) @@ -300,10 +302,20 @@ impl ProcfsSource { }; let resource = self.read_resource(); + let counter_starts = self.counter_tracker.snapshot( + stat.boot_time_unix_nano, + now_unix_nano, + stat.cpu.as_ref(), + paging.as_ref(), + due.processes.then_some(stat.processes).as_ref(), + &disks, + &networks, + ); let snapshot = HostSnapshot { now_unix_nano, start_time_unix_nano: stat.boot_time_unix_nano, + counter_starts, memory_limit: self.config.memory_limit, memory_shared: self.config.memory_shared, cpu: stat.cpu, @@ -569,6 +581,7 @@ pub struct HostScrape { pub struct HostSnapshot { now_unix_nano: u64, start_time_unix_nano: u64, + counter_starts: CounterStarts, memory_limit: bool, memory_shared: bool, cpu: Option, @@ -607,6 +620,7 @@ impl HostSnapshot { let mut metrics = Vec::with_capacity(64); let now = self.now_unix_nano; let start = self.start_time_unix_nano; + let counter_starts = &self.counter_starts; if let Some(cpu) = self.cpu { push_sum_f64( @@ -615,6 +629,7 @@ impl HostSnapshot { "s", start, now, + counter_starts, &[ ("user", cpu.user), ("nice", cpu.nice), @@ -750,6 +765,7 @@ impl HostSnapshot { "{fault}", start, now, + counter_starts, &[ ("minor", paging.minor_faults), ("major", paging.major_faults), @@ -762,6 +778,7 @@ impl HostSnapshot { "{operation}", start, now, + counter_starts, &[("in", 
paging.swap_in), ("out", paging.swap_out)], "system.paging.direction", ); @@ -809,6 +826,7 @@ impl HostSnapshot { "{process}", start, now, + counter_starts, processes.created, ); } @@ -831,6 +849,7 @@ impl HostSnapshot { "By", start, now, + counter_starts, &disk, DiskProjection::Bytes, ); @@ -840,6 +859,7 @@ impl HostSnapshot { "{operation}", start, now, + counter_starts, &disk, DiskProjection::Operations, ); @@ -849,6 +869,7 @@ impl HostSnapshot { "s", start, now, + counter_starts, &disk, DiskProjection::IoTime, ); @@ -858,6 +879,7 @@ impl HostSnapshot { "s", start, now, + counter_starts, &disk, DiskProjection::OperationTime, ); @@ -867,6 +889,7 @@ impl HostSnapshot { "{operation}", start, now, + counter_starts, &disk, DiskProjection::Merged, ); @@ -887,6 +910,7 @@ impl HostSnapshot { "By", start, now, + counter_starts, &network, NetworkProjection::Bytes, ); @@ -896,6 +920,7 @@ impl HostSnapshot { "{packet}", start, now, + counter_starts, &network, NetworkProjection::Packets, ); @@ -905,6 +930,7 @@ impl HostSnapshot { "{packet}", start, now, + counter_starts, &network, NetworkProjection::Dropped, ); @@ -914,6 +940,7 @@ impl HostSnapshot { "{error}", start, now, + counter_starts, &network, NetworkProjection::Errors, ); @@ -966,6 +993,367 @@ impl HostResource { } } +#[derive(Default)] +struct CounterTracker { + states: HashMap, +} + +struct CounterState { + previous: f64, + start_time_unix_nano: u64, +} + +#[derive(Default)] +struct CounterStarts { + entries: Vec<(String, u64)>, +} + +impl CounterStarts { + fn get(&self, metric: &'static str, series: &str, default_start: u64) -> u64 { + self.entries + .iter() + .find_map(|(key, start)| counter_key_matches(key, metric, series).then_some(*start)) + .unwrap_or(default_start) + } + + fn get_joined( + &self, + metric: &'static str, + first: &str, + second: &'static str, + default_start: u64, + ) -> u64 { + self.entries + .iter() + .find_map(|(key, start)| { + counter_key_matches_joined(key, metric, first, 
second).then_some(*start) + }) + .unwrap_or(default_start) + } +} + +impl CounterTracker { + fn snapshot( + &mut self, + default_start: u64, + now: u64, + cpu: Option<&CpuTimes>, + paging: Option<&PagingStats>, + processes: Option<&ProcessStats>, + disks: &[DiskStats], + networks: &[NetworkStats], + ) -> CounterStarts { + let mut starts = CounterStarts::default(); + if let Some(cpu) = cpu { + self.observe_all( + "system.cpu.time", + default_start, + now, + &[ + ("user", cpu.user), + ("nice", cpu.nice), + ("system", cpu.system), + ("idle", cpu.idle), + ("wait", cpu.wait), + ("interrupt", cpu.interrupt), + ("steal", cpu.steal), + ], + &mut starts, + ); + } + if let Some(paging) = paging { + self.observe_all( + "system.paging.faults", + default_start, + now, + &[ + ("minor", paging.minor_faults as f64), + ("major", paging.major_faults as f64), + ], + &mut starts, + ); + self.observe_all( + "system.paging.operations", + default_start, + now, + &[ + ("in", paging.swap_in as f64), + ("out", paging.swap_out as f64), + ], + &mut starts, + ); + } + if let Some(processes) = processes { + self.observe( + "system.process.created", + "", + processes.created as f64, + default_start, + now, + &mut starts, + ); + } + for disk in disks { + self.observe_disk_all( + "system.disk.io", + default_start, + now, + &disk.name, + &[ + ("read", disk.read_bytes as f64), + ("write", disk.write_bytes as f64), + ], + &mut starts, + ); + self.observe_disk_all( + "system.disk.operations", + default_start, + now, + &disk.name, + &[ + ("read", disk.read_ops as f64), + ("write", disk.write_ops as f64), + ], + &mut starts, + ); + self.observe( + "system.disk.io_time", + &disk.name, + disk.io_time_seconds, + default_start, + now, + &mut starts, + ); + self.observe_disk_all( + "system.disk.operation_time", + default_start, + now, + &disk.name, + &[ + ("read", disk.read_time_seconds), + ("write", disk.write_time_seconds), + ], + &mut starts, + ); + self.observe_disk_all( + "system.disk.merged", + 
default_start, + now, + &disk.name, + &[ + ("read", disk.read_merged as f64), + ("write", disk.write_merged as f64), + ], + &mut starts, + ); + } + for network in networks { + self.observe_network( + "system.network.io", + default_start, + now, + network, + network.rx_bytes, + network.tx_bytes, + &mut starts, + ); + self.observe_network( + "system.network.packet.count", + default_start, + now, + network, + network.rx_packets, + network.tx_packets, + &mut starts, + ); + self.observe_network( + "system.network.packet.dropped", + default_start, + now, + network, + network.rx_dropped, + network.tx_dropped, + &mut starts, + ); + self.observe_network( + "system.network.errors", + default_start, + now, + network, + network.rx_errors, + network.tx_errors, + &mut starts, + ); + } + starts + } + + fn observe_all( + &mut self, + metric: &'static str, + default_start: u64, + now: u64, + values: &[(&str, f64)], + starts: &mut CounterStarts, + ) { + for (series, value) in values { + self.observe(metric, series, *value, default_start, now, starts); + } + } + + fn observe_disk_all( + &mut self, + metric: &'static str, + default_start: u64, + now: u64, + device: &str, + values: &[(&'static str, f64)], + starts: &mut CounterStarts, + ) { + for (direction, value) in values { + self.observe_joined( + metric, + device, + direction, + *value, + default_start, + now, + starts, + ); + } + } + + fn observe_network( + &mut self, + metric: &'static str, + default_start: u64, + now: u64, + network: &NetworkStats, + rx: u64, + tx: u64, + starts: &mut CounterStarts, + ) { + self.observe_joined( + metric, + &network.name, + "receive", + rx as f64, + default_start, + now, + starts, + ); + self.observe_joined( + metric, + &network.name, + "transmit", + tx as f64, + default_start, + now, + starts, + ); + } + + fn observe( + &mut self, + metric: &'static str, + series: &str, + value: f64, + default_start: u64, + now: u64, + starts: &mut CounterStarts, + ) { + self.observe_key( + counter_key(metric, 
series), + value, + default_start, + now, + starts, + ); + } + + fn observe_joined( + &mut self, + metric: &'static str, + first: &str, + second: &'static str, + value: f64, + default_start: u64, + now: u64, + starts: &mut CounterStarts, + ) { + self.observe_key( + counter_key_joined(metric, first, second), + value, + default_start, + now, + starts, + ); + } + + fn observe_key( + &mut self, + key: String, + value: f64, + default_start: u64, + now: u64, + starts: &mut CounterStarts, + ) { + let state = self.states.entry(key.clone()).or_insert(CounterState { + previous: value, + start_time_unix_nano: default_start, + }); + if state.start_time_unix_nano < default_start { + state.start_time_unix_nano = default_start; + } else if value < state.previous { + state.start_time_unix_nano = now; + } + state.previous = value; + starts.entries.push((key, state.start_time_unix_nano)); + } +} + +fn counter_key(metric: &'static str, series: &str) -> String { + let mut key = String::with_capacity(metric.len() + 1 + series.len()); + key.push_str(metric); + key.push('|'); + key.push_str(series); + key +} + +fn counter_key_joined(metric: &'static str, first: &str, second: &'static str) -> String { + let mut key = String::with_capacity(metric.len() + 2 + first.len() + second.len()); + key.push_str(metric); + key.push('|'); + key.push_str(first); + key.push('|'); + key.push_str(second); + key +} + +fn counter_key_matches(key: &str, metric: &'static str, series: &str) -> bool { + key.strip_prefix(metric) + .and_then(|rest| rest.strip_prefix('|')) + == Some(series) +} + +fn counter_key_matches_joined( + key: &str, + metric: &'static str, + first: &str, + second: &'static str, +) -> bool { + let Some(series) = key + .strip_prefix(metric) + .and_then(|rest| rest.strip_prefix('|')) + else { + return false; + }; + series + .strip_prefix(first) + .and_then(|rest| rest.strip_prefix('|')) + == Some(second) +} + fn host_arch() -> Option<&'static str> { match std::env::consts::ARCH { "aarch64" => 
Some("arm64"), @@ -1857,6 +2245,7 @@ fn push_sum_f64( unit: &'static str, start: u64, now: u64, + counter_starts: &CounterStarts, values: &[(&'static str, f64)], attr_name: &'static str, ) { @@ -1864,7 +2253,7 @@ fn push_sum_f64( for (state, value) in values { points.push(number_point_f64( vec![kv_str(attr_name, state)], - start, + counter_starts.get(name, state, start), now, *value, )); @@ -1878,6 +2267,7 @@ fn push_sum_u64( unit: &'static str, start: u64, now: u64, + counter_starts: &CounterStarts, values: &[(&'static str, u64)], attr_name: &'static str, ) { @@ -1885,7 +2275,7 @@ fn push_sum_u64( for (state, value) in values { points.push(number_point_i64( vec![kv_str(attr_name, state)], - start, + counter_starts.get(name, state, start), now, saturating_i64(*value), )); @@ -1899,6 +2289,7 @@ fn push_sum_single_u64( unit: &'static str, start: u64, now: u64, + counter_starts: &CounterStarts, value: u64, ) { push_sum_metric( @@ -1907,7 +2298,7 @@ fn push_sum_single_u64( unit, vec![number_point_i64( Vec::new(), - start, + counter_starts.get(name, "", start), now, saturating_i64(value), )], @@ -1920,6 +2311,7 @@ fn push_disk_sum( unit: &'static str, start: u64, now: u64, + counter_starts: &CounterStarts, disk: &DiskStats, projection: DiskProjection, ) { @@ -1930,7 +2322,7 @@ fn push_disk_sum( unit, vec![number_point_f64( vec![kv_str("system.device", &disk.name)], - start, + counter_starts.get(name, &disk.name, start), now, disk.io_time_seconds, )], @@ -1958,8 +2350,8 @@ fn push_disk_sum( DiskProjection::IoTime => unreachable!(), }; let points = vec![ - disk_number_point(&disk.name, "read", start, now, read), - disk_number_point(&disk.name, "write", start, now, write), + disk_number_point(&disk.name, "read", start, now, counter_starts, name, read), + disk_number_point(&disk.name, "write", start, now, counter_starts, name, write), ]; push_sum_metric(metrics, name, unit, points); } @@ -1984,6 +2376,8 @@ fn disk_number_point( direction: &'static str, start: u64, now: u64, 
+ counter_starts: &CounterStarts, + metric: &'static str, value: DiskValue, ) -> NumberDataPoint { let attributes = vec![ @@ -1991,10 +2385,18 @@ fn disk_number_point( kv_str("disk.io.direction", direction), ]; match value { - DiskValue::Integer(value) => { - number_point_i64(attributes, start, now, saturating_i64(value)) - } - DiskValue::Float(value) => number_point_f64(attributes, start, now, value), + DiskValue::Integer(value) => number_point_i64( + attributes, + counter_starts.get_joined(metric, device, direction, start), + now, + saturating_i64(value), + ), + DiskValue::Float(value) => number_point_f64( + attributes, + counter_starts.get_joined(metric, device, direction, start), + now, + value, + ), } } @@ -2128,6 +2530,7 @@ fn push_network_sum( unit: &'static str, start: u64, now: u64, + counter_starts: &CounterStarts, network: &NetworkStats, projection: NetworkProjection, ) { @@ -2151,7 +2554,7 @@ fn push_network_sum( kv_str(interface_attr, &network.name), kv_str("network.io.direction", "receive"), ], - start, + counter_starts.get_joined(name, &network.name, "receive", start), now, saturating_i64(rx), ), @@ -2160,7 +2563,7 @@ fn push_network_sum( kv_str(interface_attr, &network.name), kv_str("network.io.direction", "transmit"), ], - start, + counter_starts.get_joined(name, &network.name, "transmit", start), now, saturating_i64(tx), ), @@ -2296,6 +2699,7 @@ mod tests { let request = HostSnapshot { now_unix_nano: 2_000, start_time_unix_nano: 1_000, + counter_starts: CounterStarts::default(), memory_limit: true, memory_shared: true, cpu: Some(CpuTimes { @@ -2491,6 +2895,52 @@ mod tests { assert_metric_shape(metrics, "system.network.errors", "{error}", Some(true)); } + #[test] + fn projection_uses_counter_start_overrides_for_reset_series() { + let request = HostSnapshot { + now_unix_nano: 2_000, + start_time_unix_nano: 1_000, + counter_starts: CounterStarts { + entries: vec![(counter_key("system.process.created", ""), 1_500)], + }, + processes: Some(ProcessStats 
{ + created: 99, + ..ProcessStats::default() + }), + ..HostSnapshot::default() + } + .into_export_request(); + + let metrics = &request.resource_metrics[0].scope_metrics[0].metrics; + assert_first_sum_point_start(metrics, "system.process.created", 1_500); + } + + #[test] + fn counter_tracker_rebaselines_reset_series_only() { + let mut tracker = CounterTracker::default(); + let disks = vec![DiskStats { + name: "sda".to_owned(), + read_bytes: 100, + write_bytes: 200, + ..DiskStats::default() + }]; + let starts = tracker.snapshot(10, 20, None, None, None, &disks, &[]); + + assert_eq!(starts.get_joined("system.disk.io", "sda", "read", 10), 10); + assert_eq!(starts.get_joined("system.disk.io", "sda", "write", 10), 10); + + let disks = vec![DiskStats { + name: "sda".to_owned(), + read_bytes: 50, + write_bytes: 250, + ..DiskStats::default() + }]; + let starts = tracker.snapshot(10, 30, None, None, None, &disks, &[]); + + assert_eq!(starts.get_joined("system.disk.io", "sda", "read", 10), 30); + assert_eq!(starts.get_joined("system.disk.io", "sda", "write", 10), 10); + } + #[test] fn scrape_due_emits_successful_families_after_partial_read_error() { let root = tempfile::tempdir().expect("tempdir"); @@ -3003,6 +3453,15 @@ mod tests { assert_has_attr(&point.attributes, key, value); } + fn assert_first_sum_point_start(metrics: &[Metric], name: &'static str, expected_start: u64) { + let metric = metric_by_name(metrics, name); + let metric::Data::Sum(sum) = metric.data.as_ref().expect("metric data") else { + panic!("{name} should be a cumulative sum"); + }; + let point = sum.data_points.first().expect("data point"); + assert_eq!(point.start_time_unix_nano, expected_start); + } + fn metric_by_name<'a>(metrics: &'a [Metric], name: &'static str) -> &'a Metric { metrics .iter() From 6602933a78ffd6163c78c53c32a1796c13cc03fb Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sun, 3 May 2026 00:12:45 -0700 Subject: [PATCH 22/60] Add host memory hugepage metrics --- 
.../receivers/host_metrics_receiver/mod.rs | 10 +- .../receivers/host_metrics_receiver/procfs.rs | 160 +++++++++++++++++- 2 files changed, 168 insertions(+), 2 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index e253f987e1..76dfe52b78 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -250,6 +250,8 @@ pub struct MemoryFamilyConfig { pub limit: bool, /// Enable Linux shared memory metric. pub shared: bool, + /// Enable Linux hugepage metrics. + pub hugepages: bool, } impl Default for MemoryFamilyConfig { @@ -259,6 +261,7 @@ impl Default for MemoryFamilyConfig { interval: None, limit: false, shared: false, + hugepages: false, } } } @@ -478,6 +481,7 @@ struct RuntimeConfig { cpu_utilization: bool, memory_limit: bool, memory_shared: bool, + memory_hugepages: bool, families: RuntimeFamilies, } @@ -893,6 +897,7 @@ impl TryFrom for RuntimeConfig { cpu_utilization: config.families.cpu.utilization, memory_limit: config.families.memory.limit, memory_shared: config.families.memory.shared, + memory_hugepages: config.families.memory.hugepages, families: RuntimeFamilies { cpu: RuntimeFamily::new_cpu(&config.families.cpu, config.collection_interval), memory: RuntimeFamily::new_memory( @@ -1190,6 +1195,7 @@ impl local::Receiver for HostMetricsReceiver { cpu_utilization: config.cpu_utilization, memory_limit: config.memory_limit, memory_shared: config.memory_shared, + memory_hugepages: config.memory_hugepages, disk_limit: config.families.disk.limit, filesystem_include_virtual: config.families.filesystem.include_virtual_filesystems, filesystem_limit: config.families.filesystem.limit, @@ -1474,7 +1480,8 @@ mod tests { "families": { "memory": { "limit": true, - "shared": true + "shared": true, + "hugepages": true } } 
})) @@ -1482,6 +1489,7 @@ mod tests { assert!(config.families.memory.limit); assert!(config.families.memory.shared); + assert!(config.families.memory.hugepages); validate_config(&config).expect("valid config"); } diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index f35253da6a..689ed51f61 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -60,6 +60,8 @@ pub struct ProcfsConfig { pub memory_limit: bool, /// Emit Linux shared memory metric. pub memory_shared: bool, + /// Emit Linux hugepage metrics. + pub memory_hugepages: bool, /// Derived disk limit from sysfs block device size. pub disk_limit: bool, /// Include virtual filesystems. @@ -318,6 +320,7 @@ impl ProcfsSource { counter_starts, memory_limit: self.config.memory_limit, memory_shared: self.config.memory_shared, + memory_hugepages: self.config.memory_hugepages, cpu: stat.cpu, cpu_utilization, cpuinfo, @@ -584,6 +587,7 @@ pub struct HostSnapshot { counter_starts: CounterStarts, memory_limit: bool, memory_shared: bool, + memory_hugepages: bool, cpu: Option, cpu_utilization: Option, cpuinfo: CpuInfo, @@ -752,6 +756,9 @@ impl HostSnapshot { memory.shared, ); } + if self.memory_hugepages { + push_hugepage_metrics(&mut metrics, start, now, &memory.hugepages); + } } if let Some(uptime_seconds) = self.uptime_seconds { @@ -1402,6 +1409,16 @@ struct MemoryStats { shared: u64, slab_reclaimable: u64, slab_unreclaimable: u64, + hugepages: HugepageStats, +} + +#[derive(Copy, Clone, Default)] +struct HugepageStats { + total: u64, + free: u64, + reserved: u64, + surplus: u64, + page_size_bytes: u64, } #[derive(Copy, Clone, Default)] @@ -1636,13 +1653,15 @@ fn parse_meminfo(input: &str) -> Option { let mut shared = 0; let mut slab_reclaimable = 0; let mut 
slab_unreclaimable = 0; + let mut hugepages = HugepageStats::default(); for line in input.lines() { let mut fields = line.split_whitespace(); let Some(key) = fields.next() else { continue; }; - let value = fields.next().map(parse_u64).unwrap_or_default() * BYTES_PER_KIB; + let raw_value = fields.next().map(parse_u64).unwrap_or_default(); + let value = raw_value * BYTES_PER_KIB; match key.trim_end_matches(':') { "MemTotal" => total = value, "MemFree" => free = value, @@ -1652,6 +1671,11 @@ fn parse_meminfo(input: &str) -> Option { "Shmem" => shared = value, "SReclaimable" => slab_reclaimable = value, "SUnreclaim" => slab_unreclaimable = value, + "HugePages_Total" => hugepages.total = raw_value, + "HugePages_Free" => hugepages.free = raw_value, + "HugePages_Rsvd" => hugepages.reserved = raw_value, + "HugePages_Surp" => hugepages.surplus = raw_value, + "Hugepagesize" => hugepages.page_size_bytes = value, _ => {} } } @@ -1671,6 +1695,7 @@ fn parse_meminfo(input: &str) -> Option { shared, slab_reclaimable, slab_unreclaimable, + hugepages, }) } @@ -2172,6 +2197,65 @@ fn push_cpu_frequency(metrics: &mut Vec, now: u64, frequencies_hz: &[f64 }); } +fn push_hugepage_metrics( + metrics: &mut Vec, + start: u64, + now: u64, + hugepages: &HugepageStats, +) { + push_updown_single_u64( + metrics, + "system.memory.linux.hugepages.limit", + "{page}", + start, + now, + hugepages.total, + ); + push_updown_single_u64( + metrics, + "system.memory.linux.hugepages.page_size", + "By", + start, + now, + hugepages.page_size_bytes, + ); + push_updown_single_u64( + metrics, + "system.memory.linux.hugepages.reserved", + "{page}", + start, + now, + hugepages.reserved, + ); + push_updown_single_u64( + metrics, + "system.memory.linux.hugepages.surplus", + "{page}", + start, + now, + hugepages.surplus, + ); + let used = hugepages.total.saturating_sub(hugepages.free); + push_updown_u64( + metrics, + "system.memory.linux.hugepages.usage", + "{page}", + start, + now, + &[("used", used), ("free", 
hugepages.free)], + "system.memory.linux.hugepages.state", + ); + push_gauge_ratio( + metrics, + "system.memory.linux.hugepages.utilization", + "1", + now, + hugepages.total, + &[("used", used), ("free", hugepages.free)], + "system.memory.linux.hugepages.state", + ); +} + fn push_gauge_ratio( metrics: &mut Vec, name: &'static str, @@ -2702,6 +2786,7 @@ mod tests { counter_starts: CounterStarts::default(), memory_limit: true, memory_shared: true, + memory_hugepages: true, cpu: Some(CpuTimes { user: 1.0, nice: 2.0, @@ -2735,6 +2820,13 @@ mod tests { shared: 7, slab_reclaimable: 3, slab_unreclaimable: 2, + hugepages: HugepageStats { + total: 10, + free: 4, + reserved: 2, + surplus: 1, + page_size_bytes: 2 * BYTES_PER_KIB, + }, }), uptime_seconds: Some(42.0), paging: Some(PagingStats { @@ -2824,6 +2916,48 @@ mod tests { assert_metric_shape(metrics, "system.memory.linux.slab.usage", "By", Some(false)); assert_metric_shape(metrics, "system.memory.limit", "By", Some(false)); assert_metric_shape(metrics, "system.memory.linux.shared", "By", Some(false)); + assert_metric_shape( + metrics, + "system.memory.linux.hugepages.limit", + "{page}", + Some(false), + ); + assert_metric_shape( + metrics, + "system.memory.linux.hugepages.page_size", + "By", + Some(false), + ); + assert_metric_shape( + metrics, + "system.memory.linux.hugepages.reserved", + "{page}", + Some(false), + ); + assert_metric_shape( + metrics, + "system.memory.linux.hugepages.surplus", + "{page}", + Some(false), + ); + assert_metric_shape( + metrics, + "system.memory.linux.hugepages.usage", + "{page}", + Some(false), + ); + assert_first_point_attr( + metrics, + "system.memory.linux.hugepages.usage", + "system.memory.linux.hugepages.state", + "used", + ); + assert_metric_shape( + metrics, + "system.memory.linux.hugepages.utilization", + "1", + None, + ); assert_metric_shape(metrics, "system.uptime", "s", None); assert_metric_shape(metrics, "system.paging.faults", "{fault}", Some(true)); assert_first_point_attr( 
@@ -2965,6 +3099,7 @@ mod tests { cpu_utilization: false, memory_limit: false, memory_shared: false, + memory_hugepages: false, disk_limit: false, filesystem_include_virtual: false, filesystem_limit: false, @@ -3013,6 +3148,7 @@ mod tests { cpu_utilization: false, memory_limit: false, memory_shared: false, + memory_hugepages: false, disk_limit: false, filesystem_include_virtual: false, filesystem_limit: false, @@ -3068,6 +3204,7 @@ mod tests { cpu_utilization: false, memory_limit: false, memory_shared: false, + memory_hugepages: false, disk_limit: true, filesystem_include_virtual: false, filesystem_limit: false, @@ -3124,6 +3261,7 @@ mod tests { cpu_utilization: false, memory_limit: false, memory_shared: false, + memory_hugepages: false, disk_limit: false, filesystem_include_virtual: false, filesystem_limit: true, @@ -3232,6 +3370,26 @@ mod tests { assert_eq!(memory.shared, 12 * BYTES_PER_KIB); } + #[test] + fn meminfo_parser_reads_hugepage_stats() { + let memory = parse_meminfo( + "MemTotal: 1000 kB\n\ + MemFree: 100 kB\n\ + HugePages_Total: 8\n\ + HugePages_Free: 3\n\ + HugePages_Rsvd: 2\n\ + HugePages_Surp: 1\n\ + Hugepagesize: 2048 kB\n", + ) + .expect("memory"); + + assert_eq!(memory.hugepages.total, 8); + assert_eq!(memory.hugepages.free, 3); + assert_eq!(memory.hugepages.reserved, 2); + assert_eq!(memory.hugepages.surplus, 1); + assert_eq!(memory.hugepages.page_size_bytes, 2048 * BYTES_PER_KIB); + } + #[test] fn uptime_parser_reads_first_field() { assert_eq!(parse_uptime("123.45 67.89"), Some(123.45)); From 77aa4da9e8b7163fa15a7cdb5e9015a4fcd67153 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sun, 3 May 2026 00:21:36 -0700 Subject: [PATCH 23/60] Align host metric attributes with semconv --- .../receivers/host_metrics_receiver/procfs.rs | 74 +++++++++++++++---- 1 file changed, 60 insertions(+), 14 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs 
b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 689ed51f61..34e1567be2 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -639,7 +639,7 @@ impl HostSnapshot { ("nice", cpu.nice), ("system", cpu.system), ("idle", cpu.idle), - ("wait", cpu.wait), + ("iowait", cpu.wait), ("interrupt", cpu.interrupt), ("steal", cpu.steal), ], @@ -657,7 +657,7 @@ impl HostSnapshot { ("nice", cpu.nice), ("system", cpu.system), ("idle", cpu.idle), - ("wait", cpu.wait), + ("iowait", cpu.wait), ("interrupt", cpu.interrupt), ("steal", cpu.steal), ], @@ -1061,7 +1061,7 @@ impl CounterTracker { ("nice", cpu.nice), ("system", cpu.system), ("idle", cpu.idle), - ("wait", cpu.wait), + ("iowait", cpu.wait), ("interrupt", cpu.interrupt), ("steal", cpu.steal), ], @@ -2570,12 +2570,16 @@ fn push_filesystem_limit( metrics, "system.filesystem.limit", "By", - vec![filesystem_number_point( - filesystem, - "limit", + vec![number_point_i64( + vec![ + kv_str("system.device", &filesystem.device), + kv_str("system.filesystem.type", &filesystem.fs_type), + kv_str("system.filesystem.mode", filesystem.mode), + kv_str("system.filesystem.mountpoint", &filesystem.mountpoint), + ], start, now, - FilesystemValue::Integer(limit_bytes), + saturating_i64(limit_bytes), )], ); } @@ -2898,6 +2902,7 @@ mod tests { let metrics = &resource_metrics.scope_metrics[0].metrics; assert_metric_shape(metrics, "system.cpu.time", "s", Some(true)); assert_first_point_attr(metrics, "system.cpu.time", "cpu.mode", "user"); + assert_sum_point_attr(metrics, "system.cpu.time", "cpu.mode", "iowait"); assert_metric_shape(metrics, "system.cpu.utilization", "1", None); assert_first_point_attr(metrics, "system.cpu.utilization", "cpu.mode", "user"); assert_metric_shape(metrics, "system.cpu.logical.count", "{cpu}", Some(false)); @@ -2995,6 +3000,11 @@ mod tests { ); 
assert_metric_shape(metrics, "system.filesystem.utilization", "1", None); assert_metric_shape(metrics, "system.filesystem.limit", "By", Some(false)); + assert_no_first_point_attr( + metrics, + "system.filesystem.limit", + "system.filesystem.state", + ); assert_metric_shape(metrics, "system.network.io", "By", Some(true)); assert_first_point_attr( metrics, @@ -3611,6 +3621,38 @@ mod tests { assert_has_attr(&point.attributes, key, value); } + fn assert_sum_point_attr( + metrics: &[Metric], + name: &'static str, + key: &'static str, + value: &'static str, + ) { + let metric = metric_by_name(metrics, name); + let metric::Data::Sum(sum) = metric.data.as_ref().expect("metric data") else { + panic!("{name} should be a cumulative sum"); + }; + assert!( + sum.data_points + .iter() + .any(|point| has_attr(&point.attributes, key, value)), + "missing point attribute {key}={value}" + ); + } + + fn assert_no_first_point_attr(metrics: &[Metric], name: &'static str, key: &'static str) { + let metric = metric_by_name(metrics, name); + let point = match metric.data.as_ref().expect("metric data") { + metric::Data::Sum(sum) => sum.data_points.first(), + metric::Data::Gauge(gauge) => gauge.data_points.first(), + _ => None, + } + .expect("data point"); + assert!( + !point.attributes.iter().any(|attr| attr.key == key), + "unexpected attribute {key}" + ); + } + fn assert_first_sum_point_start(metrics: &[Metric], name: &'static str, expected_start: u64) { let metric = metric_by_name(metrics, name); let metric::Data::Sum(sum) = metric.data.as_ref().expect("metric data") else { @@ -3629,14 +3671,18 @@ mod tests { fn assert_has_attr(attributes: &[KeyValue], key: &'static str, value: &'static str) { assert!( - attributes.iter().any(|attr| { - attr.key == key - && matches!( - attr.value.as_ref().and_then(|value| value.value.as_ref()), - Some(any_value::Value::StringValue(actual)) if actual == value - ) - }), + has_attr(attributes, key, value), "missing attribute {key}={value}" ); } + + fn 
has_attr(attributes: &[KeyValue], key: &'static str, value: &'static str) -> bool { + attributes.iter().any(|attr| { + attr.key == key + && matches!( + attr.value.as_ref().and_then(|value| value.value.as_ref()), + Some(any_value::Value::StringValue(actual)) if actual == value + ) + }) + } } From e715992086c03dcd8bec1e3323e44e102c179dde Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sun, 3 May 2026 00:43:11 -0700 Subject: [PATCH 24/60] Add host metrics semconv drift check --- .../receivers/host_metrics_receiver/procfs.rs | 399 +++++++++++++----- 1 file changed, 292 insertions(+), 107 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 34e1567be2..7a3da67eea 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -2781,116 +2781,24 @@ fn saturating_i64(value: u64) -> i64 { #[cfg(test)] mod tests { use super::*; + #[cfg(feature = "dev-tools")] + use std::collections::{BTreeMap, BTreeSet}; + #[cfg(feature = "dev-tools")] + use weaver_common::{result::WResult, vdir::VirtualDirectoryPath}; + #[cfg(feature = "dev-tools")] + use weaver_forge::registry::ResolvedRegistry; + #[cfg(feature = "dev-tools")] + use weaver_resolver::SchemaResolver; + #[cfg(feature = "dev-tools")] + use weaver_semconv::{ + attribute::{BasicRequirementLevelSpec, RequirementLevel}, + group::{GroupType, InstrumentSpec}, + registry_repo::RegistryRepo, + }; #[test] fn projection_uses_expected_metric_shapes() { - let request = HostSnapshot { - now_unix_nano: 2_000, - start_time_unix_nano: 1_000, - counter_starts: CounterStarts::default(), - memory_limit: true, - memory_shared: true, - memory_hugepages: true, - cpu: Some(CpuTimes { - user: 1.0, - nice: 2.0, - system: 3.0, - idle: 4.0, - wait: 5.0, - interrupt: 6.0, - steal: 7.0, 
- }), - cpu_utilization: Some(CpuTimes { - user: 0.1, - nice: 0.1, - system: 0.2, - idle: 0.3, - wait: 0.1, - interrupt: 0.1, - steal: 0.1, - }), - cpuinfo: CpuInfo { - logical_count: 2, - physical_count: 1, - frequencies_hz: vec![2_400_000_000.0], - }, - memory: Some(MemoryStats { - total: 100, - used: 80, - free: 10, - available: 20, - cached: 5, - buffered: 5, - shared: 7, - slab_reclaimable: 3, - slab_unreclaimable: 2, - hugepages: HugepageStats { - total: 10, - free: 4, - reserved: 2, - surplus: 1, - page_size_bytes: 2 * BYTES_PER_KIB, - }, - }), - uptime_seconds: Some(42.0), - paging: Some(PagingStats { - minor_faults: 9, - major_faults: 1, - swap_in: 2, - swap_out: 3, - }), - swaps: vec![SwapStats { - name: "/dev/swap".to_owned(), - size: 100, - used: 25, - free: 75, - }], - processes: Some(ProcessStats { - running: 4, - blocked: 1, - created: 99, - }), - disks: vec![DiskStats { - name: "sda".to_owned(), - limit_bytes: Some(123), - read_bytes: 10, - write_bytes: 20, - read_ops: 1, - write_ops: 2, - read_merged: 3, - write_merged: 4, - read_time_seconds: 0.5, - write_time_seconds: 0.6, - io_time_seconds: 0.7, - }], - filesystems: vec![FilesystemStats { - device: "/dev/sda1".to_owned(), - mountpoint: "/".to_owned(), - fs_type: "ext4".to_owned(), - mode: "rw", - used: 60, - free: 30, - reserved: 10, - limit_bytes: Some(100), - }], - networks: vec![NetworkStats { - name: "eth0".to_owned(), - rx_bytes: 10, - tx_bytes: 20, - rx_packets: 1, - tx_packets: 2, - rx_errors: 3, - tx_errors: 4, - rx_dropped: 5, - tx_dropped: 6, - }], - resource: HostResource { - host_id: Some("host-id".to_owned()), - host_name: Some("host-name".to_owned()), - host_arch: Some("amd64"), - }, - } - .into_export_request(); + let request = projection_fixture_request(); let resource_metrics = request.resource_metrics.first().expect("resource metrics"); let resource = resource_metrics.resource.as_ref().expect("resource"); @@ -3039,6 +2947,37 @@ mod tests { assert_metric_shape(metrics, 
"system.network.errors", "{error}", Some(true)); } + #[cfg(feature = "dev-tools")] + #[test] + #[ignore = "dev-only semconv drift check; may access a local or remote semantic-conventions registry"] + fn emitted_phase1_metric_shapes_match_weaver_semconv() { + let registry = load_semconv_registry(); + let semconv_shapes = semconv_system_metric_shapes(®istry); + let emitted_shapes = emitted_phase1_metric_shapes(); + + for (name, emitted) in emitted_shapes { + let semconv = semconv_shapes + .get(&name) + .unwrap_or_else(|| panic!("missing semconv metric {name}")); + + assert_eq!(emitted.unit, semconv.unit, "unit mismatch for {name}"); + assert_eq!( + emitted.monotonic, semconv.monotonic, + "instrument/temporality mismatch for {name}" + ); + + for attr in &semconv.attributes { + if is_intentional_semconv_attribute_gap(name.as_str(), attr.as_str()) { + continue; + } + assert!( + emitted.attributes.contains(attr), + "missing semconv attribute {attr} on {name}" + ); + } + } + } + #[test] fn projection_uses_counter_start_overrides_for_reset_series() { let request = HostSnapshot { @@ -3569,6 +3508,252 @@ mod tests { } } + #[cfg(feature = "dev-tools")] + #[derive(Debug)] + struct MetricShape { + unit: String, + monotonic: Option, + attributes: BTreeSet, + } + + #[cfg(feature = "dev-tools")] + fn load_semconv_registry() -> ResolvedRegistry { + let registry_path = std::env::var("OTAP_HOST_METRICS_SEMCONV_REGISTRY") + .map(|path| { + path.parse::() + .expect("valid OTAP_HOST_METRICS_SEMCONV_REGISTRY") + }) + .unwrap_or_else(|_| VirtualDirectoryPath::GitRepo { + url: "https://github.com/open-telemetry/semantic-conventions.git".to_owned(), + sub_folder: Some("model".to_owned()), + refspec: None, + }); + + let registry_repo = + RegistryRepo::try_new("main", ®istry_path).expect("semantic convention registry"); + let registry = match SchemaResolver::load_semconv_repository(registry_repo, false) { + WResult::Ok(registry) | WResult::OkWithNFEs(registry, _) => registry, + 
WResult::FatalErr(err) => panic!("failed to load semantic convention registry: {err}"), + }; + let resolved_schema = match SchemaResolver::resolve(registry, true) { + WResult::Ok(schema) | WResult::OkWithNFEs(schema, _) => schema, + WResult::FatalErr(err) => { + panic!("failed to resolve semantic convention registry: {err}"); + } + }; + + ResolvedRegistry::try_from_resolved_registry( + &resolved_schema.registry, + resolved_schema.catalog(), + ) + .expect("resolved semantic convention registry") + } + + #[cfg(feature = "dev-tools")] + fn semconv_system_metric_shapes(registry: &ResolvedRegistry) -> BTreeMap { + registry + .groups + .iter() + .filter(|group| group.r#type == GroupType::Metric) + .filter_map(|group| { + let name = group.metric_name.as_ref()?; + if !name.starts_with("system.") { + return None; + } + + let monotonic = match group.instrument.as_ref()? { + InstrumentSpec::Counter => Some(true), + InstrumentSpec::UpDownCounter => Some(false), + InstrumentSpec::Gauge | InstrumentSpec::Histogram => None, + }; + let attributes = group + .attributes + .iter() + .filter(|attr| !is_opt_in_requirement(&attr.requirement_level)) + .map(|attr| attr.name.clone()) + .collect(); + + Some(( + name.clone(), + MetricShape { + unit: group.unit.clone().unwrap_or_default(), + monotonic, + attributes, + }, + )) + }) + .collect() + } + + #[cfg(feature = "dev-tools")] + fn is_opt_in_requirement(requirement_level: &RequirementLevel) -> bool { + matches!( + requirement_level, + RequirementLevel::Basic(BasicRequirementLevelSpec::OptIn) + | RequirementLevel::OptIn { .. 
} + ) + } + + #[cfg(feature = "dev-tools")] + fn emitted_phase1_metric_shapes() -> BTreeMap { + let metrics = projection_fixture_metrics(); + metrics + .iter() + .map(|metric| { + let (monotonic, points) = match metric.data.as_ref().expect("metric data") { + metric::Data::Sum(sum) => (Some(sum.is_monotonic), &sum.data_points), + metric::Data::Gauge(gauge) => (None, &gauge.data_points), + _ => panic!("unsupported metric data for {}", metric.name), + }; + let attributes = points + .iter() + .flat_map(|point| point.attributes.iter()) + .map(|attr| attr.key.clone()) + .collect(); + ( + metric.name.clone(), + MetricShape { + unit: metric.unit.clone(), + monotonic, + attributes, + }, + ) + }) + .collect() + } + + #[cfg(feature = "dev-tools")] + fn is_intentional_semconv_attribute_gap(name: &str, attr: &str) -> bool { + matches!( + (name, attr), + ("system.paging.operations", "system.paging.fault.type") + ) + } + + fn projection_fixture_request() -> ExportMetricsServiceRequest { + HostSnapshot { + now_unix_nano: 2_000, + start_time_unix_nano: 1_000, + counter_starts: CounterStarts::default(), + memory_limit: true, + memory_shared: true, + memory_hugepages: true, + cpu: Some(CpuTimes { + user: 1.0, + nice: 2.0, + system: 3.0, + idle: 4.0, + wait: 5.0, + interrupt: 6.0, + steal: 7.0, + }), + cpu_utilization: Some(CpuTimes { + user: 0.1, + nice: 0.1, + system: 0.2, + idle: 0.3, + wait: 0.1, + interrupt: 0.1, + steal: 0.1, + }), + cpuinfo: CpuInfo { + logical_count: 2, + physical_count: 1, + frequencies_hz: vec![2_400_000_000.0], + }, + memory: Some(MemoryStats { + total: 100, + used: 80, + free: 10, + available: 20, + cached: 5, + buffered: 5, + shared: 7, + slab_reclaimable: 3, + slab_unreclaimable: 2, + hugepages: HugepageStats { + total: 10, + free: 4, + reserved: 2, + surplus: 1, + page_size_bytes: 2 * BYTES_PER_KIB, + }, + }), + uptime_seconds: Some(42.0), + paging: Some(PagingStats { + minor_faults: 9, + major_faults: 1, + swap_in: 2, + swap_out: 3, + }), + swaps: 
vec![SwapStats { + name: "/dev/swap".to_owned(), + size: 100, + used: 25, + free: 75, + }], + processes: Some(ProcessStats { + running: 4, + blocked: 1, + created: 99, + }), + disks: vec![DiskStats { + name: "sda".to_owned(), + limit_bytes: Some(123), + read_bytes: 10, + write_bytes: 20, + read_ops: 1, + write_ops: 2, + read_merged: 3, + write_merged: 4, + read_time_seconds: 0.5, + write_time_seconds: 0.6, + io_time_seconds: 0.7, + }], + filesystems: vec![FilesystemStats { + device: "/dev/sda1".to_owned(), + mountpoint: "/".to_owned(), + fs_type: "ext4".to_owned(), + mode: "rw", + used: 60, + free: 30, + reserved: 10, + limit_bytes: Some(100), + }], + networks: vec![NetworkStats { + name: "eth0".to_owned(), + rx_bytes: 10, + tx_bytes: 20, + rx_packets: 1, + tx_packets: 2, + rx_errors: 3, + tx_errors: 4, + rx_dropped: 5, + tx_dropped: 6, + }], + resource: HostResource { + host_id: Some("host-id".to_owned()), + host_name: Some("host-name".to_owned()), + host_arch: Some("amd64"), + }, + } + .into_export_request() + } + + #[cfg(feature = "dev-tools")] + fn projection_fixture_metrics() -> Vec { + projection_fixture_request() + .resource_metrics + .into_iter() + .next() + .expect("resource metrics") + .scope_metrics + .into_iter() + .next() + .expect("scope metrics") + .metrics + } + fn assert_metric_shape( metrics: &[Metric], name: &'static str, From 25d09b935842b4e03e6a271603d0e4f5ddf00734 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sun, 3 May 2026 07:56:01 -0700 Subject: [PATCH 25/60] Run host metrics semconv check in CI --- .github/workflows/rust-ci.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml index 71925f05f1..8609f57e49 100644 --- a/.github/workflows/rust-ci.yml +++ b/.github/workflows/rust-ci.yml @@ -638,6 +638,31 @@ jobs: cargo clippy --all-targets --all-features --workspace -- -D warnings working-directory: ./rust/${{ matrix.folder }} + host-metrics-semconv: + 
runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + submodules: true + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + repository: open-telemetry/semantic-conventions + path: semantic-conventions + - uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 + with: + toolchain: stable + - uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 + with: + workspaces: ./rust/otap-dataflow + - name: Run host metrics semconv drift check + env: + OTAP_HOST_METRICS_SEMCONV_REGISTRY: ${{ github.workspace }}/semantic-conventions/model + run: | + cargo test -p otap-df-core-nodes \ + --features dev-tools,otap-df-otap/crypto-ring \ + emitted_phase1_metric_shapes_match_weaver_semconv --lib -- --ignored + working-directory: ./rust/otap-dataflow + # Required matrix combinations for deny: otap-dataflow only deny_required: runs-on: ubuntu-latest From 4d0063d168a8638f6ed30ce68305f0eef97f11d2 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sun, 3 May 2026 09:49:21 -0700 Subject: [PATCH 26/60] Fix host metrics semconv conformance gaps --- .../receivers/host_metrics_receiver/mod.rs | 1 - .../receivers/host_metrics_receiver/procfs.rs | 231 ++++++++++++++++-- 2 files changed, 206 insertions(+), 26 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 76dfe52b78..91577963ab 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -1267,7 +1267,6 @@ impl local::Receiver for HostMetricsReceiver { Ok(scrape) => { if let Some(metrics) = metrics.as_mut() { metrics.partial_errors.add(scrape.partial_errors); - metrics.source_read_errors.add(scrape.partial_errors); } let pdata = match 
encode_snapshot(scrape.snapshot) { Ok(pdata) => pdata, diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 7a3da67eea..f4c4b6b629 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -779,16 +779,7 @@ impl HostSnapshot { ], "system.paging.fault.type", ); - push_sum_u64( - &mut metrics, - "system.paging.operations", - "{operation}", - start, - now, - counter_starts, - &[("in", paging.swap_in), ("out", paging.swap_out)], - "system.paging.direction", - ); + push_paging_operations(&mut metrics, start, now, counter_starts, &paging); } for swap in self.swaps { @@ -823,7 +814,7 @@ impl HostSnapshot { now, &[ ("running", processes.running), - ("blocked", processes.blocked), + ("sleeping", processes.blocked), ], "process.state", ); @@ -1084,8 +1075,10 @@ impl CounterTracker { default_start, now, &[ - ("in", paging.swap_in as f64), - ("out", paging.swap_out as f64), + ("in|major", paging.swap_in as f64), + ("out|major", paging.swap_out as f64), + ("in|minor", paging.page_in as f64), + ("out|minor", paging.page_out as f64), ], &mut starts, ); @@ -1425,6 +1418,8 @@ struct HugepageStats { struct PagingStats { minor_faults: u64, major_faults: u64, + page_in: u64, + page_out: u64, swap_in: u64, swap_out: u64, } @@ -1706,6 +1701,8 @@ fn parse_uptime(input: &str) -> Option { fn parse_vmstat(input: &str) -> PagingStats { let mut total_faults = 0; let mut major_faults = 0; + let mut page_in = 0; + let mut page_out = 0; let mut swap_in = 0; let mut swap_out = 0; @@ -1718,6 +1715,8 @@ fn parse_vmstat(input: &str) -> PagingStats { match key { "pgfault" => total_faults = value, "pgmajfault" => major_faults = value, + "pgpgin" => page_in = value, + "pgpgout" => page_out = value, "pswpin" => swap_in = value, "pswpout" => 
swap_out = value, _ => {} @@ -1727,6 +1726,8 @@ fn parse_vmstat(input: &str) -> PagingStats { PagingStats { minor_faults: total_faults.saturating_sub(major_faults), major_faults, + page_in, + page_out, swap_in, swap_out, } @@ -2179,11 +2180,11 @@ fn push_cpu_frequency(metrics: &mut Vec, now: u64, frequencies_hz: &[f64 } let mut points = Vec::with_capacity(frequencies_hz.len()); for (idx, frequency) in frequencies_hz.iter().enumerate() { - points.push(number_point_f64( + points.push(number_point_i64( vec![kv_str("cpu.logical_number", &idx.to_string())], 0, now, - *frequency, + frequency_hz_i64(*frequency), )); } metrics.push(Metric { @@ -2197,6 +2198,44 @@ fn push_cpu_frequency(metrics: &mut Vec, now: u64, frequencies_hz: &[f64 }); } +fn frequency_hz_i64(value: f64) -> i64 { + if !value.is_finite() || value <= 0.0 { + return 0; + } + if value >= i64::MAX as f64 { + return i64::MAX; + } + value.round() as i64 +} + +fn push_paging_operations( + metrics: &mut Vec, + start: u64, + now: u64, + counter_starts: &CounterStarts, + paging: &PagingStats, +) { + let values = [ + ("in", "major", paging.swap_in), + ("out", "major", paging.swap_out), + ("in", "minor", paging.page_in), + ("out", "minor", paging.page_out), + ]; + let mut points = Vec::with_capacity(values.len()); + for (direction, fault_type, value) in values { + points.push(number_point_i64( + vec![ + kv_str("system.paging.direction", direction), + kv_str("system.paging.fault.type", fault_type), + ], + counter_starts.get_joined("system.paging.operations", direction, fault_type, start), + now, + saturating_i64(value), + )); + } + push_sum_metric(metrics, "system.paging.operations", "{operation}", points); +} + fn push_hugepage_metrics( metrics: &mut Vec, start: u64, @@ -2791,7 +2830,7 @@ mod tests { use weaver_resolver::SchemaResolver; #[cfg(feature = "dev-tools")] use weaver_semconv::{ - attribute::{BasicRequirementLevelSpec, RequirementLevel}, + attribute::{AttributeType, BasicRequirementLevelSpec, 
RequirementLevel, ValueSpec}, group::{GroupType, InstrumentSpec}, registry_repo::RegistryRepo, }; @@ -2816,6 +2855,7 @@ mod tests { assert_metric_shape(metrics, "system.cpu.logical.count", "{cpu}", Some(false)); assert_metric_shape(metrics, "system.cpu.physical.count", "{cpu}", Some(false)); assert_metric_shape(metrics, "system.cpu.frequency", "Hz", None); + assert_first_point_int(metrics, "system.cpu.frequency", 2_400_000_000); assert_first_point_attr(metrics, "system.cpu.frequency", "cpu.logical_number", "0"); assert_metric_shape(metrics, "system.memory.usage", "By", Some(false)); assert_first_point_attr( @@ -2885,6 +2925,18 @@ mod tests { "{operation}", Some(true), ); + assert_sum_point_attr( + metrics, + "system.paging.operations", + "system.paging.direction", + "in", + ); + assert_sum_point_attr( + metrics, + "system.paging.operations", + "system.paging.fault.type", + "minor", + ); assert_metric_shape(metrics, "system.paging.usage", "By", Some(false)); assert_first_point_attr(metrics, "system.paging.usage", "system.device", "/dev/swap"); assert_metric_shape(metrics, "system.paging.utilization", "1", None); @@ -2965,16 +3017,34 @@ mod tests { emitted.monotonic, semconv.monotonic, "instrument/temporality mismatch for {name}" ); + assert_eq!( + emitted.value_type, semconv.value_type, + "metric value type mismatch for {name}" + ); for attr in &semconv.attributes { - if is_intentional_semconv_attribute_gap(name.as_str(), attr.as_str()) { - continue; - } assert!( emitted.attributes.contains(attr), "missing semconv attribute {attr} on {name}" ); } + for attr in &emitted.attributes { + assert!( + semconv.all_attributes.contains(attr), + "unexpected semconv attribute {attr} on {name}" + ); + } + for (attr, values) in &emitted.enum_values { + let Some(allowed_values) = semconv.enum_values.get(attr) else { + continue; + }; + for value in values { + assert!( + allowed_values.contains(value), + "unexpected enum value {attr}={value} on {name}" + ); + } + } } } @@ -3346,9 
+3416,12 @@ mod tests { #[test] fn vmstat_parser_derives_minor_faults() { - let paging = parse_vmstat("pgfault 100\npgmajfault 7\npswpin 3\npswpout 4\n"); + let paging = + parse_vmstat("pgfault 100\npgmajfault 7\npgpgin 5\npgpgout 6\npswpin 3\npswpout 4\n"); assert_eq!(paging.minor_faults, 93); assert_eq!(paging.major_faults, 7); + assert_eq!(paging.page_in, 5); + assert_eq!(paging.page_out, 6); assert_eq!(paging.swap_in, 3); assert_eq!(paging.swap_out, 4); } @@ -3514,6 +3587,16 @@ mod tests { unit: String, monotonic: Option, attributes: BTreeSet, + all_attributes: BTreeSet, + enum_values: BTreeMap>, + value_type: Option, + } + + #[cfg(feature = "dev-tools")] + #[derive(Clone, Copy, Debug, Eq, PartialEq)] + enum MetricValueKind { + Int, + Double, } #[cfg(feature = "dev-tools")] @@ -3572,6 +3655,25 @@ mod tests { .filter(|attr| !is_opt_in_requirement(&attr.requirement_level)) .map(|attr| attr.name.clone()) .collect(); + let all_attributes = group + .attributes + .iter() + .map(|attr| attr.name.clone()) + .collect(); + let enum_values = group + .attributes + .iter() + .filter_map(|attr| match &attr.r#type { + AttributeType::Enum { members } => Some(( + attr.name.clone(), + members + .iter() + .map(|member| value_spec_string(&member.value)) + .collect(), + )), + _ => None, + }) + .collect(); Some(( name.clone(), @@ -3579,12 +3681,40 @@ mod tests { unit: group.unit.clone().unwrap_or_default(), monotonic, attributes, + all_attributes, + enum_values, + value_type: semconv_metric_value_type(group.annotations.as_ref()), }, )) }) .collect() } + #[cfg(feature = "dev-tools")] + fn semconv_metric_value_type( + annotations: Option<&BTreeMap>, + ) -> Option { + let code_generation = annotations?.get("code_generation")?.0.as_mapping()?; + let value_type = code_generation.iter().find_map(|(key, value)| { + (key.as_str() == Some("metric_value_type")).then(|| value.as_str())? 
+ })?; + match value_type { + "int" => Some(MetricValueKind::Int), + "double" => Some(MetricValueKind::Double), + _ => None, + } + } + + #[cfg(feature = "dev-tools")] + fn value_spec_string(value: &ValueSpec) -> String { + match value { + ValueSpec::Int(value) => value.to_string(), + ValueSpec::Double(value) => value.to_string(), + ValueSpec::String(value) => value.clone(), + ValueSpec::Bool(value) => value.to_string(), + } + } + #[cfg(feature = "dev-tools")] fn is_opt_in_requirement(requirement_level: &RequirementLevel) -> bool { matches!( @@ -3610,12 +3740,24 @@ mod tests { .flat_map(|point| point.attributes.iter()) .map(|attr| attr.key.clone()) .collect(); + let mut attribute_values: BTreeMap> = BTreeMap::new(); + for attr in points.iter().flat_map(|point| point.attributes.iter()) { + if let Some(value) = any_value_string(attr.value.as_ref()) { + let _ = attribute_values + .entry(attr.key.clone()) + .or_default() + .insert(value); + } + } ( metric.name.clone(), MetricShape { unit: metric.unit.clone(), monotonic, attributes, + all_attributes: BTreeSet::new(), + enum_values: attribute_values, + value_type: metric_value_type(points), }, ) }) @@ -3623,11 +3765,33 @@ mod tests { } #[cfg(feature = "dev-tools")] - fn is_intentional_semconv_attribute_gap(name: &str, attr: &str) -> bool { - matches!( - (name, attr), - ("system.paging.operations", "system.paging.fault.type") - ) + fn metric_value_type(points: &[NumberDataPoint]) -> Option { + let mut value_type = None; + for point in points { + let point_value_type = match point.value { + Some(number_data_point::Value::AsInt(_)) => MetricValueKind::Int, + Some(number_data_point::Value::AsDouble(_)) => MetricValueKind::Double, + None => continue, + }; + if value_type + .replace(point_value_type) + .is_some_and(|current| current != point_value_type) + { + panic!("mixed int/double data points"); + } + } + value_type + } + + #[cfg(feature = "dev-tools")] + fn any_value_string(value: Option<&AnyValue>) -> Option { + match 
value?.value.as_ref()? { + any_value::Value::StringValue(value) => Some(value.clone()), + any_value::Value::IntValue(value) => Some(value.to_string()), + any_value::Value::DoubleValue(value) => Some(value.to_string()), + any_value::Value::BoolValue(value) => Some(value.to_string()), + _ => None, + } } fn projection_fixture_request() -> ExportMetricsServiceRequest { @@ -3683,6 +3847,8 @@ mod tests { paging: Some(PagingStats { minor_faults: 9, major_faults: 1, + page_in: 4, + page_out: 5, swap_in: 2, swap_out: 3, }), @@ -3824,6 +3990,21 @@ mod tests { ); } + fn assert_first_point_int(metrics: &[Metric], name: &'static str, expected: i64) { + let metric = metric_by_name(metrics, name); + let point = match metric.data.as_ref().expect("metric data") { + metric::Data::Sum(sum) => sum.data_points.first(), + metric::Data::Gauge(gauge) => gauge.data_points.first(), + _ => None, + } + .expect("data point"); + assert_eq!( + point.value, + Some(number_data_point::Value::AsInt(expected)), + "{name} first point should be int" + ); + } + fn assert_no_first_point_attr(metrics: &[Metric], name: &'static str, key: &'static str) { let metric = metric_by_name(metrics, name); let point = match metric.data.as_ref().expect("metric data") { From 2b20d2f23b20a4d349c1176644c107260f7e5e41 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sun, 3 May 2026 10:06:31 -0700 Subject: [PATCH 27/60] Harden host metrics review findings --- .github/workflows/rust-ci.yml | 1 + .../receivers/host_metrics_receiver/mod.rs | 10 +-- .../receivers/host_metrics_receiver/procfs.rs | 68 +++++++++++++------ 3 files changed, 55 insertions(+), 24 deletions(-) diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml index 8609f57e49..d782142668 100644 --- a/.github/workflows/rust-ci.yml +++ b/.github/workflows/rust-ci.yml @@ -647,6 +647,7 @@ jobs: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: repository: open-telemetry/semantic-conventions + ref: v1.41.0 path: 
semantic-conventions - uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 with: diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 91577963ab..422508c139 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -1062,12 +1062,12 @@ impl FamilyScheduler { Self { entries } } - fn next_due(&self) -> Instant { + fn next_due(&self, now: Instant) -> Instant { self.entries .iter() .map(|entry| entry.next_due) .min() - .expect("scheduler has at least one enabled family") + .unwrap_or(now) } fn mark_due(&mut self, now: Instant) -> ProcfsFamilies { @@ -1253,8 +1253,8 @@ impl local::Receiver for HostMetricsReceiver { } } - _ = sleep_until(scheduler.next_due()) => { - let scheduled_due = scheduler.next_due(); + _ = sleep_until(scheduler.next_due(Instant::now())) => { + let scheduled_due = scheduler.next_due(Instant::now()); let now = Instant::now(); let due = scheduler.mark_due(now); let scrape_start = StdInstant::now(); @@ -1640,7 +1640,7 @@ mod tests { let now = Instant::now(); let mut scheduler = FamilyScheduler::new(&config, now); - assert_eq!(scheduler.next_due(), now + Duration::from_secs(1)); + assert_eq!(scheduler.next_due(now), now + Duration::from_secs(1)); assert_eq!( scheduler.mark_due(now), ProcfsFamilies::default(), diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index f4c4b6b629..4ba1a5e165 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -24,6 +24,7 @@ const NANOS_PER_SEC: u64 = 1_000_000_000; const BYTES_PER_KIB: u64 = 1024; 
const DISKSTAT_SECTOR_BYTES: u64 = 512; const FILESYSTEM_STAT_TIMEOUT: Duration = Duration::from_millis(100); +const COUNTER_KEY_SEPARATOR: char = '\x1f'; /// Procfs-backed source for host metrics. pub struct ProcfsSource { @@ -122,7 +123,7 @@ impl ProcfsSource { buf: String::with_capacity(16 * 1024), clk_tck: clock_ticks_per_second(), previous_cpu: None, - filesystem_worker: FilesystemStatWorker::new(), + filesystem_worker: FilesystemStatWorker::new()?, counter_tracker: CounterTracker::default(), }; source.apply_startup_validation()?; @@ -1315,7 +1316,7 @@ impl CounterTracker { fn counter_key(metric: &'static str, series: &str) -> String { let mut key = String::with_capacity(metric.len() + 1 + series.len()); key.push_str(metric); - key.push('|'); + key.push(COUNTER_KEY_SEPARATOR); key.push_str(series); key } @@ -1323,16 +1324,16 @@ fn counter_key(metric: &'static str, series: &str) -> String { fn counter_key_joined(metric: &'static str, first: &str, second: &'static str) -> String { let mut key = String::with_capacity(metric.len() + 2 + first.len() + second.len()); key.push_str(metric); - key.push('|'); + key.push(COUNTER_KEY_SEPARATOR); key.push_str(first); - key.push('|'); + key.push(COUNTER_KEY_SEPARATOR); key.push_str(second); key } fn counter_key_matches(key: &str, metric: &'static str, series: &str) -> bool { key.strip_prefix(metric) - .and_then(|rest| rest.strip_prefix('|')) + .and_then(|rest| rest.strip_prefix(COUNTER_KEY_SEPARATOR)) == Some(series) } @@ -1344,13 +1345,13 @@ fn counter_key_matches_joined( ) -> bool { let Some(series) = key .strip_prefix(metric) - .and_then(|rest| rest.strip_prefix('|')) + .and_then(|rest| rest.strip_prefix(COUNTER_KEY_SEPARATOR)) else { return false; }; series .strip_prefix(first) - .and_then(|rest| rest.strip_prefix('|')) + .and_then(|rest| rest.strip_prefix(COUNTER_KEY_SEPARATOR)) == Some(second) } @@ -1482,17 +1483,18 @@ struct FilesystemStat { } impl FilesystemStatWorker { - fn new() -> Self { + fn new() -> io::Result 
{ let (tx, rx) = mpsc::sync_channel::(1); - let _ = std::thread::Builder::new() + let _handle = std::thread::Builder::new() .name("host-metrics-statvfs".to_owned()) .spawn(move || { while let Ok(request) = rx.recv() { let result = statvfs_bytes(&request.path); let _ = request.response.send(result); } - }); - Self { tx } + }) + .map_err(io::Error::other)?; + Ok(Self { tx }) } fn statvfs(&self, path: PathBuf, timeout: Duration) -> io::Result { @@ -1937,7 +1939,7 @@ fn unescape_mountinfo(input: &str) -> String { let bytes = input.as_bytes(); let mut escaped = None; for idx in 0..bytes.len() { - if bytes[idx] == b'\\' && idx + 3 < bytes.len() { + if bytes[idx] == b'\\' && idx + 4 <= bytes.len() { escaped = Some(idx); break; } @@ -1946,22 +1948,22 @@ fn unescape_mountinfo(input: &str) -> String { return input.to_owned(); }; - let mut output = String::with_capacity(input.len()); - output.push_str(&input[..first_escape]); + let mut output = Vec::with_capacity(input.len()); + output.extend_from_slice(&bytes[..first_escape]); let mut idx = first_escape; while idx < bytes.len() { - if bytes[idx] == b'\\' && idx + 3 < bytes.len() { + if bytes[idx] == b'\\' && idx + 4 <= bytes.len() { let octal = &input[idx + 1..idx + 4]; if let Ok(value) = u8::from_str_radix(octal, 8) { - output.push(value as char); + output.push(value); idx += 4; continue; } } - output.push(bytes[idx] as char); + output.push(bytes[idx]); idx += 1; } - output + String::from_utf8_lossy(&output).into_owned() } fn parse_netdev( @@ -3094,6 +3096,20 @@ mod tests { assert_eq!(starts.get_joined("system.disk.io", "sda", "write", 10), 10); } + #[test] + fn counter_keys_do_not_collide_with_pipe_in_series_values() { + let metric = "system.disk.io"; + let device = "read|write"; + let joined = counter_key_joined(metric, device, "read"); + assert!(!counter_key_matches_joined( + &joined, + metric, + "read", + "write|read" + )); + assert!(counter_key_matches_joined(&joined, metric, device, "read")); + } + #[test] fn 
scrape_due_emits_successful_families_after_partial_read_error() { let root = tempfile::tempdir().expect("tempdir"); @@ -3494,6 +3510,20 @@ mod tests { assert_eq!(mounts[0].mountpoint, "/mnt/data disk"); } + #[test] + fn mountinfo_parser_preserves_utf8_while_unescaping_paths() { + let mounts = parse_mountinfo( + "36 25 8:1 / /mnt/caf\u{00e9}\\040disk rw,relatime - ext4 /dev/disk\\040\u{00e9} rw\n", + false, + false, + FilesystemFilters::default(), + ); + + assert_eq!(mounts.len(), 1); + assert_eq!(mounts[0].device, "/dev/disk \u{00e9}"); + assert_eq!(mounts[0].mountpoint, "/mnt/caf\u{00e9} disk"); + } + #[test] fn mountinfo_parser_applies_filesystem_filters() { let include_mounts = CompiledFilter::compile( @@ -3609,7 +3639,7 @@ mod tests { .unwrap_or_else(|_| VirtualDirectoryPath::GitRepo { url: "https://github.com/open-telemetry/semantic-conventions.git".to_owned(), sub_folder: Some("model".to_owned()), - refspec: None, + refspec: Some("v1.41.0".to_owned()), }); let registry_repo = From 4c14e78ff0c4d52bd4450e4ffb02a90d7585b678 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sun, 3 May 2026 10:23:35 -0700 Subject: [PATCH 28/60] Fix host metrics review regressions --- .../receivers/host_metrics_receiver/README.md | 98 +++++++++++ .../receivers/host_metrics_receiver/mod.rs | 10 +- .../receivers/host_metrics_receiver/procfs.rs | 164 ++++++++++++++++-- 3 files changed, 258 insertions(+), 14 deletions(-) create mode 100644 rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/README.md diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/README.md b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/README.md new file mode 100644 index 0000000000..5e3eac7e36 --- /dev/null +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/README.md @@ -0,0 +1,98 @@ +# Host Metrics Receiver + +**URN:** `urn:otel:receiver:host_metrics` + +Linux host metrics receiver backed by procfs and 
sysfs. It emits OpenTelemetry +`system.*` metrics for CPU, memory, paging, system uptime, disk, filesystem, +network, and aggregate process counts. + +## Configuration + +Minimal configuration: + +```yaml +receivers: + host_metrics: + collection_interval: 10s +``` + +Collect from a host root mounted into a container: + +```yaml +receivers: + host_metrics: + collection_interval: 10s + host_view: + root_path: /host + validation: fail_selected +``` + +Enable selected opt-in metrics: + +```yaml +receivers: + host_metrics: + families: + cpu: + utilization: true + memory: + limit: true + hugepages: true + disk: + limit: true + filesystem: + limit: true +``` + +## Configuration Options + +| Field | Type | Default | Description | +| ----- | ---- | ------- | ----------- | +| `collection_interval` | duration | `10s` | Default scrape interval. | +| `initial_delay` | duration | `0s` | Delay before the first scrape. | +| `host_view.root_path` | path | `/` | Host filesystem root to read procfs/sysfs from. | +| `host_view.validation` | enum | `fail_selected` | One of `fail_selected`, `warn_selected`, or `none`. | +| `families..enabled` | bool | `true` | Enables or disables a metric family. | +| `families..interval` | duration | unset | Per-family interval; falls back to `collection_interval`. | +| `families.cpu.utilization` | bool | `false` | Emits derived CPU utilization gauges. | +| `families.memory.limit` | bool | `false` | Emits `system.memory.limit`. | +| `families.memory.shared` | bool | `false` | Emits Linux shared memory. | +| `families.memory.hugepages` | bool | `false` | Emits Linux hugepage metrics. | +| `families.disk.limit` | bool | `false` | Emits disk capacity from sysfs. | +| `families.filesystem.limit` | bool | `false` | Emits filesystem capacity. | +| `families.filesystem.include_virtual_filesystems` | bool | `false` | Includes virtual filesystems such as tmpfs. 
| + +Families are `cpu`, `memory`, `paging`, `system`, `disk`, `filesystem`, +`network`, and `processes`. + +## Filters + +Disk, filesystem, and network families support include and exclude filters. +Filter `match_type` values are `strict`, `glob`, and `regexp`. + +```yaml +receivers: + host_metrics: + families: + disk: + exclude: + match_type: glob + devices: ["loop*", "ram*"] + network: + exclude: + match_type: strict + interfaces: ["lo"] + filesystem: + exclude_fs_types: + match_type: strict + fs_types: ["tmpfs", "proc", "sysfs"] +``` + +## Current Limits + +- Linux only. +- `families.cpu.per_cpu` is rejected in v1. +- `families.network.include_connection_count` is rejected in v1. +- Process metrics are aggregate host summaries, not per-process scrapes. +- Filesystem collection can time out individual `statvfs` calls; avoid enabling + remote filesystems unless the host environment is known to be healthy. diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 422508c139..c3a67d22af 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -1084,8 +1084,14 @@ impl FamilyScheduler { ScheduledFamilyKind::Network => due.network = true, ScheduledFamilyKind::Processes => due.processes = true, } - while entry.next_due <= now { - entry.next_due += entry.interval; + let elapsed = now.duration_since(entry.next_due); + let missed_ticks = elapsed.as_nanos() / entry.interval.as_nanos() + 1; + let advance = entry + .interval + .saturating_mul(u32::try_from(missed_ticks).unwrap_or(u32::MAX)); + entry.next_due += advance; + if entry.next_due <= now { + entry.next_due = now + entry.interval; } } } diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs 
b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 4ba1a5e165..5bd5cf7641 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -717,14 +717,16 @@ impl HostSnapshot { ], "system.memory.state", ); - push_updown_single_u64( - &mut metrics, - "system.memory.linux.available", - "By", - start, - now, - memory.available, - ); + if memory.has_available { + push_updown_single_u64( + &mut metrics, + "system.memory.linux.available", + "By", + start, + now, + memory.available, + ); + } push_updown_u64( &mut metrics, "system.memory.linux.slab.usage", @@ -815,7 +817,7 @@ impl HostSnapshot { now, &[ ("running", processes.running), - ("sleeping", processes.blocked), + ("blocked", processes.blocked), ], "process.state", ); @@ -1398,6 +1400,7 @@ struct MemoryStats { used: u64, free: u64, available: u64, + has_available: bool, cached: u64, buffered: u64, shared: u64, @@ -1680,6 +1683,7 @@ fn parse_meminfo(input: &str) -> Option { if total == 0 { return None; } + let has_available = available.is_some(); let available = available.unwrap_or_else(|| free.saturating_add(buffers).saturating_add(cached)); Some(MemoryStats { @@ -1687,6 +1691,7 @@ fn parse_meminfo(input: &str) -> Option { used: total.saturating_sub(available), free, available, + has_available, cached, buffered: buffers, shared, @@ -2183,7 +2188,10 @@ fn push_cpu_frequency(metrics: &mut Vec, now: u64, frequencies_hz: &[f64 let mut points = Vec::with_capacity(frequencies_hz.len()); for (idx, frequency) in frequencies_hz.iter().enumerate() { points.push(number_point_i64( - vec![kv_str("cpu.logical_number", &idx.to_string())], + vec![kv_i64( + "cpu.logical_number", + i64::try_from(idx).unwrap_or(i64::MAX), + )], 0, now, frequency_hz_i64(*frequency), @@ -2787,6 +2795,15 @@ fn kv_str(key: &str, value: &str) -> KeyValue { } } +fn kv_i64(key: &str, value: i64) 
-> KeyValue { + KeyValue { + key: key.to_owned(), + value: Some(AnyValue { + value: Some(any_value::Value::IntValue(value)), + }), + } +} + fn parse_u64(input: &str) -> u64 { input.parse().unwrap_or_default() } @@ -2832,7 +2849,10 @@ mod tests { use weaver_resolver::SchemaResolver; #[cfg(feature = "dev-tools")] use weaver_semconv::{ - attribute::{AttributeType, BasicRequirementLevelSpec, RequirementLevel, ValueSpec}, + attribute::{ + AttributeType, BasicRequirementLevelSpec, PrimitiveOrArrayTypeSpec, RequirementLevel, + ValueSpec, + }, group::{GroupType, InstrumentSpec}, registry_repo::RegistryRepo, }; @@ -2858,7 +2878,7 @@ mod tests { assert_metric_shape(metrics, "system.cpu.physical.count", "{cpu}", Some(false)); assert_metric_shape(metrics, "system.cpu.frequency", "Hz", None); assert_first_point_int(metrics, "system.cpu.frequency", 2_400_000_000); - assert_first_point_attr(metrics, "system.cpu.frequency", "cpu.logical_number", "0"); + assert_first_point_attr_int(metrics, "system.cpu.frequency", "cpu.logical_number", 0); assert_metric_shape(metrics, "system.memory.usage", "By", Some(false)); assert_first_point_attr( metrics, @@ -2943,6 +2963,7 @@ mod tests { assert_first_point_attr(metrics, "system.paging.usage", "system.device", "/dev/swap"); assert_metric_shape(metrics, "system.paging.utilization", "1", None); assert_metric_shape(metrics, "system.process.count", "{process}", Some(false)); + assert_sum_point_attr(metrics, "system.process.count", "process.state", "blocked"); assert_metric_shape(metrics, "system.process.created", "{process}", Some(true)); assert_metric_shape(metrics, "system.disk.io", "By", Some(true)); assert_first_point_attr(metrics, "system.disk.io", "disk.io.direction", "read"); @@ -3036,11 +3057,23 @@ mod tests { "unexpected semconv attribute {attr} on {name}" ); } + for (attr, emitted_kind) in &emitted.attribute_types { + let Some(semconv_kind) = semconv.attribute_types.get(attr) else { + continue; + }; + assert_eq!( + emitted_kind, 
semconv_kind, + "attribute value type mismatch for {attr} on {name}" + ); + } for (attr, values) in &emitted.enum_values { let Some(allowed_values) = semconv.enum_values.get(attr) else { continue; }; for value in values { + if is_intentional_semconv_enum_value_gap(name.as_str(), attr.as_str(), value) { + continue; + } assert!( allowed_values.contains(value), "unexpected enum value {attr}={value} on {name}" @@ -3394,6 +3427,7 @@ mod tests { let memory = parse_meminfo("MemTotal: 1000 kB\nMemFree: 100 kB\nBuffers: 20 kB\nCached: 30 kB\n") .expect("memory"); + assert!(!memory.has_available); assert_eq!(memory.available, 150 * BYTES_PER_KIB); assert_eq!(memory.used, 850 * BYTES_PER_KIB); } @@ -3618,6 +3652,7 @@ mod tests { monotonic: Option, attributes: BTreeSet, all_attributes: BTreeSet, + attribute_types: BTreeMap, enum_values: BTreeMap>, value_type: Option, } @@ -3629,6 +3664,15 @@ mod tests { Double, } + #[cfg(feature = "dev-tools")] + #[derive(Clone, Copy, Debug, Eq, PartialEq)] + enum AttributeValueKind { + Int, + Double, + String, + Bool, + } + #[cfg(feature = "dev-tools")] fn load_semconv_registry() -> ResolvedRegistry { let registry_path = std::env::var("OTAP_HOST_METRICS_SEMCONV_REGISTRY") @@ -3704,6 +3748,13 @@ mod tests { _ => None, }) .collect(); + let attribute_types = group + .attributes + .iter() + .filter_map(|attr| { + attribute_value_kind(&attr.r#type).map(|kind| (attr.name.clone(), kind)) + }) + .collect(); Some(( name.clone(), @@ -3712,6 +3763,7 @@ mod tests { monotonic, attributes, all_attributes, + attribute_types, enum_values, value_type: semconv_metric_value_type(group.annotations.as_ref()), }, @@ -3745,6 +3797,46 @@ mod tests { } } + #[cfg(feature = "dev-tools")] + fn attribute_value_kind(attribute_type: &AttributeType) -> Option { + match attribute_type { + AttributeType::PrimitiveOrArray(PrimitiveOrArrayTypeSpec::Int) => { + Some(AttributeValueKind::Int) + } + AttributeType::PrimitiveOrArray(PrimitiveOrArrayTypeSpec::Double) => { + 
Some(AttributeValueKind::Double) + } + AttributeType::PrimitiveOrArray(PrimitiveOrArrayTypeSpec::String) => { + Some(AttributeValueKind::String) + } + AttributeType::PrimitiveOrArray(PrimitiveOrArrayTypeSpec::Boolean) => { + Some(AttributeValueKind::Bool) + } + AttributeType::Enum { members } => { + members.first().map(|member| value_spec_kind(&member.value)) + } + _ => None, + } + } + + #[cfg(feature = "dev-tools")] + fn value_spec_kind(value: &ValueSpec) -> AttributeValueKind { + match value { + ValueSpec::Int(_) => AttributeValueKind::Int, + ValueSpec::Double(_) => AttributeValueKind::Double, + ValueSpec::String(_) => AttributeValueKind::String, + ValueSpec::Bool(_) => AttributeValueKind::Bool, + } + } + + #[cfg(feature = "dev-tools")] + fn is_intentional_semconv_enum_value_gap(name: &str, attr: &str, value: &str) -> bool { + matches!( + (name, attr, value), + ("system.process.count", "process.state", "blocked") + ) + } + #[cfg(feature = "dev-tools")] fn is_opt_in_requirement(requirement_level: &RequirementLevel) -> bool { matches!( @@ -3771,6 +3863,7 @@ mod tests { .map(|attr| attr.key.clone()) .collect(); let mut attribute_values: BTreeMap> = BTreeMap::new(); + let mut attribute_types: BTreeMap = BTreeMap::new(); for attr in points.iter().flat_map(|point| point.attributes.iter()) { if let Some(value) = any_value_string(attr.value.as_ref()) { let _ = attribute_values @@ -3778,6 +3871,15 @@ mod tests { .or_default() .insert(value); } + if let Some(kind) = any_value_kind(attr.value.as_ref()) { + let previous = attribute_types.insert(attr.key.clone(), kind); + assert!( + previous.is_none() || previous == Some(kind), + "mixed attribute value types for {} on {}", + attr.key, + metric.name + ); + } } ( metric.name.clone(), @@ -3786,6 +3888,7 @@ mod tests { monotonic, attributes, all_attributes: BTreeSet::new(), + attribute_types, enum_values: attribute_values, value_type: metric_value_type(points), }, @@ -3824,6 +3927,17 @@ mod tests { } } + #[cfg(feature = 
"dev-tools")] + fn any_value_kind(value: Option<&AnyValue>) -> Option { + match value?.value.as_ref()? { + any_value::Value::StringValue(_) => Some(AttributeValueKind::String), + any_value::Value::IntValue(_) => Some(AttributeValueKind::Int), + any_value::Value::DoubleValue(_) => Some(AttributeValueKind::Double), + any_value::Value::BoolValue(_) => Some(AttributeValueKind::Bool), + _ => None, + } + } + fn projection_fixture_request() -> ExportMetricsServiceRequest { HostSnapshot { now_unix_nano: 2_000, @@ -3860,6 +3974,7 @@ mod tests { used: 80, free: 10, available: 20, + has_available: true, cached: 5, buffered: 5, shared: 7, @@ -4035,6 +4150,31 @@ mod tests { ); } + fn assert_first_point_attr_int( + metrics: &[Metric], + name: &'static str, + key: &'static str, + expected: i64, + ) { + let metric = metric_by_name(metrics, name); + let point = match metric.data.as_ref().expect("metric data") { + metric::Data::Sum(sum) => sum.data_points.first(), + metric::Data::Gauge(gauge) => gauge.data_points.first(), + _ => None, + } + .expect("data point"); + assert!( + point.attributes.iter().any(|attr| { + attr.key == key + && matches!( + attr.value.as_ref().and_then(|value| value.value.as_ref()), + Some(any_value::Value::IntValue(actual)) if *actual == expected + ) + }), + "missing int attribute {key}={expected}" + ); + } + fn assert_no_first_point_attr(metrics: &[Metric], name: &'static str, key: &'static str) { let metric = metric_by_name(metrics, name); let point = match metric.data.as_ref().expect("metric data") { From 05ee33e8c9050e14706e4814f2a7f546151c2662 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sun, 3 May 2026 23:33:54 -0700 Subject: [PATCH 29/60] replace nix sysconf with libc in host metrics receiver --- rust/otap-dataflow/Cargo.toml | 1 + rust/otap-dataflow/crates/core-nodes/Cargo.toml | 3 ++- .../src/receivers/host_metrics_receiver/procfs.rs | 10 ++++------ 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/rust/otap-dataflow/Cargo.toml 
b/rust/otap-dataflow/Cargo.toml index ae0a1c0cef..65d78f28c4 100644 --- a/rust/otap-dataflow/Cargo.toml +++ b/rust/otap-dataflow/Cargo.toml @@ -122,6 +122,7 @@ tikv-jemalloc-sys = "0.6.1" memchr = "2.8.0" memmap2 = "0.9" memory-stats = "1" +libc = "0.2" nix = { version = "0.31.0", features = ["process", "signal", "fs", "mman"] } notify = "8.0" # Uses platform-native backend: inotify (Linux), kqueue (macOS), ReadDirectoryChanges (Windows) num_enum = "0.7" diff --git a/rust/otap-dataflow/crates/core-nodes/Cargo.toml b/rust/otap-dataflow/crates/core-nodes/Cargo.toml index dff437166c..e1dbca9c8c 100644 --- a/rust/otap-dataflow/crates/core-nodes/Cargo.toml +++ b/rust/otap-dataflow/crates/core-nodes/Cargo.toml @@ -38,7 +38,8 @@ futures.workspace = true futures-timer.workspace = true humantime-serde.workspace = true linkme.workspace = true -nix = { workspace = true, features = ["feature"] } +libc.workspace = true +nix.workspace = true object_store = {workspace = true, features = ["fs"]} parquet.workspace = true prost.workspace = true diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 5bd5cf7641..be99bae8b6 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -2816,13 +2816,11 @@ fn millis_to_seconds(ms: u64) -> f64 { ms as f64 / 1_000.0 } +#[allow(unsafe_code)] fn clock_ticks_per_second() -> f64 { - nix::unistd::sysconf(nix::unistd::SysconfVar::CLK_TCK) - .ok() - .flatten() - .filter(|ticks| *ticks > 0) - .map(|ticks| ticks as f64) - .unwrap_or(100.0) + // SAFETY: _SC_CLK_TCK is a valid sysconf name; the call has no side effects. 
+ let ticks = unsafe { libc::sysconf(libc::_SC_CLK_TCK) }; + if ticks > 0 { ticks as f64 } else { 100.0 } } fn now_unix_nano() -> u64 { From 8657a6a70c01da14b77ebbe0c1772a81380ac092 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sun, 3 May 2026 23:38:12 -0700 Subject: [PATCH 30/60] add direct OTAP Arrow builder for host metrics --- .../receivers/host_metrics_receiver/mod.rs | 1 + .../host_metrics_receiver/otap_builder.rs | 266 ++++++++++++++++++ .../receivers/host_metrics_receiver/procfs.rs | 8 +- 3 files changed, 271 insertions(+), 4 deletions(-) create mode 100644 rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index c3a67d22af..044630a99d 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -35,6 +35,7 @@ use std::sync::{LazyLock, Mutex}; use std::time::{Duration, Instant as StdInstant}; use tokio::time::{Instant, sleep_until}; +mod otap_builder; mod procfs; use procfs::{HostSnapshot, ProcfsConfig, ProcfsFamilies, ProcfsSource}; diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs new file mode 100644 index 0000000000..2fd9bf4ceb --- /dev/null +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs @@ -0,0 +1,266 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Direct OTAP Arrow record construction for host metrics. +//! +//! Builds `OtapArrowRecords::Metrics` without constructing intermediate OTLP +//! protobuf objects. All metric names, units, and attribute keys are written +//! 
directly into Arrow column buffers. + +use arrow::error::ArrowError; +use otap_df_pdata::encode::record::attributes::StrKeysAttributesRecordBatchBuilder; +use otap_df_pdata::encode::record::metrics::{ + MetricsRecordBatchBuilder, NumberDataPointsRecordBatchBuilder, +}; +use otap_df_pdata::otap::{Metrics, OtapArrowRecords}; +use otap_df_pdata::otlp::metrics::MetricType; +use otap_df_pdata::proto::opentelemetry::arrow::v1::ArrowPayloadType; + +use crate::receivers::host_metrics_receiver::procfs::HostResource; + +// Semconv version targeted by this receiver. +pub(crate) const SEMCONV_VERSION: &str = "1.41.0"; +const SEMCONV_SCHEMA_URL: &[u8] = b"https://opentelemetry.io/schemas/1.41.0"; + +const SCOPE_NAME: &[u8] = b"otap-df-core-nodes/host-metrics"; +const SCOPE_VERSION: &[u8] = env!("CARGO_PKG_VERSION").as_bytes(); + +// AggregationTemporality::Cumulative = 2 (OTLP proto enum value). +const AGGREGATION_TEMPORALITY_CUMULATIVE: i32 = 2; + +/// Wraps the per-datapoint attribute builder and hides the dp_id from callers. +pub(crate) struct DpAttrWriter<'a> { + attrs: &'a mut StrKeysAttributesRecordBatchBuilder, + dp_id: u32, +} + +impl DpAttrWriter<'_> { + /// Append a string-valued attribute. + pub fn str(&mut self, key: &'static str, value: &str) { + self.attrs.append_parent_id(&self.dp_id); + self.attrs.append_key(key); + self.attrs.any_values_builder.append_str(value.as_bytes()); + } + + /// Append an integer-valued attribute. + pub fn int(&mut self, key: &'static str, value: i64) { + self.attrs.append_parent_id(&self.dp_id); + self.attrs.append_key(key); + self.attrs.any_values_builder.append_int(value); + } +} + +/// Wraps the resource attribute builder and hides the resource_id from callers. 
+pub(crate) struct ResourceAttrWriter<'a> { + attrs: &'a mut StrKeysAttributesRecordBatchBuilder, +} + +impl ResourceAttrWriter<'_> { + fn str(&mut self, key: &'static str, value: &str) { + self.attrs.append_parent_id(&0u16); + self.attrs.append_key(key); + self.attrs.any_values_builder.append_str(value.as_bytes()); + } +} + +/// Builds an `OtapArrowRecords::Metrics` batch directly from host metric values. +/// +/// Call [`begin_sum_i64`], [`begin_sum_f64`], or [`begin_gauge_f64`] to open a +/// metric, then [`append_i64_dp`] / [`append_f64_dp`] for each data point. +/// Call [`finish`] to produce the final batch. +pub(crate) struct HostMetricsArrowBuilder { + metrics: MetricsRecordBatchBuilder, + ndp: NumberDataPointsRecordBatchBuilder, + resource_attrs: StrKeysAttributesRecordBatchBuilder, + ndp_attrs: StrKeysAttributesRecordBatchBuilder, + curr_metric_id: u16, + curr_dp_id: u32, +} + +impl HostMetricsArrowBuilder { + pub(crate) fn new() -> Self { + Self { + metrics: MetricsRecordBatchBuilder::new(), + ndp: NumberDataPointsRecordBatchBuilder::new(), + resource_attrs: StrKeysAttributesRecordBatchBuilder::new(), + ndp_attrs: StrKeysAttributesRecordBatchBuilder::new(), + curr_metric_id: 0, + curr_dp_id: 0, + } + } + + /// Append resource attributes (host.id, host.name, host.arch, os.type). + /// Must be called exactly once per batch before any metrics are appended. + pub(crate) fn append_resource(&mut self, resource: &HostResource) { + let mut w = ResourceAttrWriter { + attrs: &mut self.resource_attrs, + }; + w.str("os.type", "linux"); + if let Some(id) = &resource.host_id { + w.str("host.id", id); + } + if let Some(name) = &resource.host_name { + w.str("host.name", name); + } + if let Some(arch) = resource.host_arch { + w.str("host.arch", arch); + } + } + + // ── Metric openers ────────────────────────────────────────────────────── + + /// Open a monotonic cumulative Sum metric. 
Returns the metric_id for use + /// in subsequent [`append_i64_dp`] / [`append_f64_dp`] calls. + pub(crate) fn begin_counter_i64(&mut self, name: &str, unit: &str) -> u16 { + self.begin_metric(name, unit, MetricType::Sum, true) + } + + /// Open a non-monotonic cumulative Sum metric (UpDownCounter). + pub(crate) fn begin_updown_i64(&mut self, name: &str, unit: &str) -> u16 { + self.begin_metric(name, unit, MetricType::Sum, false) + } + + /// Open a Gauge metric (f64). + pub(crate) fn begin_gauge_f64(&mut self, name: &str, unit: &str) -> u16 { + self.begin_metric(name, unit, MetricType::Gauge, false) + } + + fn begin_metric( + &mut self, + name: &str, + unit: &str, + metric_type: MetricType, + is_monotonic: bool, + ) -> u16 { + let id = self.curr_metric_id; + self.metrics.append_id(id); + self.metrics.append_metric_type(metric_type as u8); + self.metrics.append_name(name.as_bytes()); + self.metrics.append_description(&[]); + self.metrics.append_unit(unit.as_bytes()); + match metric_type { + MetricType::Sum => { + self.metrics + .append_aggregation_temporality(Some(AGGREGATION_TEMPORALITY_CUMULATIVE)); + self.metrics.append_is_monotonic(Some(is_monotonic)); + } + _ => { + self.metrics.append_aggregation_temporality(None); + self.metrics.append_is_monotonic(None); + } + } + self.curr_metric_id = self.curr_metric_id.wrapping_add(1); + id + } + + // ── Datapoint appenders ───────────────────────────────────────────────── + + /// Append one i64 data point for `metric_id`. + /// `start` is `start_time_unix_nano`; pass `0` for gauges. 
+ pub(crate) fn append_i64_dp( + &mut self, + metric_id: u16, + start: u64, + now: u64, + value: i64, + attrs: F, + ) where + F: FnOnce(&mut DpAttrWriter<'_>), + { + let dp_id = self.curr_dp_id; + self.ndp.append_id(dp_id); + self.ndp.append_parent_id(metric_id); + self.ndp + .append_start_time_unix_nano(Some(start as i64)); + self.ndp.append_time_unix_nano(now as i64); + self.ndp.append_int_value(Some(value)); + self.ndp.append_double_value(None); + let mut w = DpAttrWriter { + attrs: &mut self.ndp_attrs, + dp_id, + }; + attrs(&mut w); + self.curr_dp_id = self.curr_dp_id.wrapping_add(1); + } + + /// Append one f64 data point for `metric_id`. + /// `start` is `start_time_unix_nano`; pass `0` for gauges. + pub(crate) fn append_f64_dp( + &mut self, + metric_id: u16, + start: u64, + now: u64, + value: f64, + attrs: F, + ) where + F: FnOnce(&mut DpAttrWriter<'_>), + { + let dp_id = self.curr_dp_id; + self.ndp.append_id(dp_id); + self.ndp.append_parent_id(metric_id); + self.ndp + .append_start_time_unix_nano(Some(start as i64)); + self.ndp.append_time_unix_nano(now as i64); + self.ndp.append_int_value(None); + self.ndp.append_double_value(Some(value)); + let mut w = DpAttrWriter { + attrs: &mut self.ndp_attrs, + dp_id, + }; + attrs(&mut w); + self.curr_dp_id = self.curr_dp_id.wrapping_add(1); + } + + // ── Finalization ───────────────────────────────────────────────────────── + + /// Finalize all builders and produce an `OtapArrowRecords::Metrics` batch. + pub(crate) fn finish(mut self) -> Result { + let n = self.metrics.len(); + // Resource: single entry with id=0 repeated for every metric row. + self.metrics.resource.append_id_n(0, n); + self.metrics.resource.append_schema_url_n(Some(SEMCONV_SCHEMA_URL), n); + self.metrics.resource.append_dropped_attributes_count_n(0, n); + // Scope: single entry with id=0. 
+ self.metrics.scope.append_id_n(0, n); + self.metrics.scope.append_name_n(Some(SCOPE_NAME), n); + self.metrics.scope.append_version_n(Some(SCOPE_VERSION), n); + self.metrics.scope.append_dropped_attributes_count_n(0, n); + // Schema URL on scope column. + self.metrics.append_scope_schema_url_n(SEMCONV_SCHEMA_URL, n); + + let mut records = OtapArrowRecords::Metrics(Metrics::default()); + finish_batch( + &mut records, + ArrowPayloadType::UnivariateMetrics, + self.metrics.finish()?, + ); + finish_batch( + &mut records, + ArrowPayloadType::NumberDataPoints, + self.ndp.finish()?, + ); + finish_batch( + &mut records, + ArrowPayloadType::ResourceAttrs, + self.resource_attrs.finish()?, + ); + finish_batch( + &mut records, + ArrowPayloadType::NumberDpAttrs, + self.ndp_attrs.finish()?, + ); + Ok(records) + } +} + +fn finish_batch( + records: &mut OtapArrowRecords, + payload_type: ArrowPayloadType, + rb: arrow::array::RecordBatch, +) { + if rb.num_rows() > 0 { + records + .set(payload_type, rb) + .expect("host metrics record batch schema is valid"); + } +} diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index be99bae8b6..520e622c51 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -971,10 +971,10 @@ impl HostSnapshot { } #[derive(Default)] -struct HostResource { - host_id: Option, - host_name: Option, - host_arch: Option<&'static str>, +pub(super) struct HostResource { + pub(super) host_id: Option, + pub(super) host_name: Option, + pub(super) host_arch: Option<&'static str>, } impl HostResource { From 2b6deae122e9958448a256c54138690cdc1b21ea Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sun, 3 May 2026 23:44:00 -0700 Subject: [PATCH 31/60] replace proto encoding path with direct OTAP Arrow construction 
--- .../receivers/host_metrics_receiver/mod.rs | 11 +- .../host_metrics_receiver/otap_builder.rs | 17 +- .../receivers/host_metrics_receiver/procfs.rs | 414 ++++++++++++++++++ 3 files changed, 429 insertions(+), 13 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 044630a99d..fd3157d889 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -18,9 +18,6 @@ use otap_df_engine::receiver::ReceiverWrapper; use otap_df_engine::terminal_state::TerminalState; use otap_df_otap::OTAP_RECEIVER_FACTORIES; use otap_df_otap::pdata::{Context, OtapPdata}; -use otap_df_pdata::encode::encode_metrics_otap_batch; -use otap_df_pdata::otap::OtapArrowRecords; -use otap_df_pdata::proto::opentelemetry::metrics::v1::MetricsData; use otap_df_telemetry::instrument::{Counter, Mmsc}; use otap_df_telemetry::metrics::{MetricSet, MetricSetSnapshot}; use otap_df_telemetry::{otel_info, otel_warn}; @@ -1340,12 +1337,8 @@ impl local::Receiver for HostMetricsReceiver { } } -fn encode_snapshot(snapshot: HostSnapshot) -> Result { - let request = snapshot.into_export_request(); - let data = MetricsData { - resource_metrics: request.resource_metrics, - }; - let records: OtapArrowRecords = encode_metrics_otap_batch(&data)?; +fn encode_snapshot(snapshot: HostSnapshot) -> Result { + let records = snapshot.into_otap_records()?; Ok(OtapPdata::new(Context::default(), records.into())) } diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs index 2fd9bf4ceb..f7a9ab928f 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs +++ 
b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs @@ -109,22 +109,31 @@ impl HostMetricsArrowBuilder { // ── Metric openers ────────────────────────────────────────────────────── - /// Open a monotonic cumulative Sum metric. Returns the metric_id for use - /// in subsequent [`append_i64_dp`] / [`append_f64_dp`] calls. + /// Open a monotonic cumulative Sum metric (i64 data points). pub(crate) fn begin_counter_i64(&mut self, name: &str, unit: &str) -> u16 { self.begin_metric(name, unit, MetricType::Sum, true) } - /// Open a non-monotonic cumulative Sum metric (UpDownCounter). + /// Open a monotonic cumulative Sum metric (f64 data points). + pub(crate) fn begin_counter_f64(&mut self, name: &str, unit: &str) -> u16 { + self.begin_metric(name, unit, MetricType::Sum, true) + } + + /// Open a non-monotonic cumulative Sum metric / UpDownCounter (i64 data points). pub(crate) fn begin_updown_i64(&mut self, name: &str, unit: &str) -> u16 { self.begin_metric(name, unit, MetricType::Sum, false) } - /// Open a Gauge metric (f64). + /// Open a Gauge metric (f64 data points). pub(crate) fn begin_gauge_f64(&mut self, name: &str, unit: &str) -> u16 { self.begin_metric(name, unit, MetricType::Gauge, false) } + /// Open a Gauge metric (i64 data points). 
+ pub(crate) fn begin_gauge_i64(&mut self, name: &str, unit: &str) -> u16 { + self.begin_metric(name, unit, MetricType::Gauge, false) + } + fn begin_metric( &mut self, name: &str, diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 520e622c51..a9da68b098 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -620,6 +620,17 @@ impl HostSnapshot { || !self.networks.is_empty() } + /// Converts a snapshot directly into an OTAP Arrow metrics batch. + pub fn into_otap_records( + self, + ) -> Result { + use crate::receivers::host_metrics_receiver::otap_builder::HostMetricsArrowBuilder; + let mut b = HostMetricsArrowBuilder::new(); + b.append_resource(&self.resource); + project_snapshot(&self, &mut b); + b.finish() + } + /// Converts a snapshot into an OTLP metrics request. 
pub fn into_export_request(self) -> ExportMetricsServiceRequest { let mut metrics = Vec::with_capacity(64); @@ -994,6 +1005,409 @@ impl HostResource { } } +fn project_snapshot( + snap: &HostSnapshot, + b: &mut crate::receivers::host_metrics_receiver::otap_builder::HostMetricsArrowBuilder, +) { + let now = snap.now_unix_nano; + let start = snap.start_time_unix_nano; + let cs = &snap.counter_starts; + + // ── CPU ────────────────────────────────────────────────────────────────── + if let Some(cpu) = snap.cpu { + let m = b.begin_counter_f64("system.cpu.time", "s"); + for (mode, value) in [ + ("user", cpu.user), + ("nice", cpu.nice), + ("system", cpu.system), + ("idle", cpu.idle), + ("iowait", cpu.wait), + ("interrupt", cpu.interrupt), + ("steal", cpu.steal), + ] { + b.append_f64_dp(m, cs.get("system.cpu.time", mode, start), now, value, |w| { + w.str("cpu.mode", mode); + }); + } + } + if let Some(cpu) = snap.cpu_utilization { + let m = b.begin_gauge_f64("system.cpu.utilization", "1"); + for (mode, value) in [ + ("user", cpu.user), + ("nice", cpu.nice), + ("system", cpu.system), + ("idle", cpu.idle), + ("iowait", cpu.wait), + ("interrupt", cpu.interrupt), + ("steal", cpu.steal), + ] { + b.append_f64_dp(m, 0, now, value, |w| { + w.str("cpu.mode", mode); + }); + } + } + if snap.cpuinfo.logical_count != 0 { + let m = b.begin_updown_i64("system.cpu.logical.count", "{cpu}"); + b.append_i64_dp(m, start, now, saturating_i64(snap.cpuinfo.logical_count), |_| {}); + } + if snap.cpuinfo.physical_count != 0 { + let m = b.begin_updown_i64("system.cpu.physical.count", "{cpu}"); + b.append_i64_dp(m, start, now, saturating_i64(snap.cpuinfo.physical_count), |_| {}); + } + if !snap.cpuinfo.frequencies_hz.is_empty() { + let m = b.begin_gauge_i64("system.cpu.frequency", "Hz"); + for (idx, &freq) in snap.cpuinfo.frequencies_hz.iter().enumerate() { + let logical = i64::try_from(idx).unwrap_or(i64::MAX); + b.append_i64_dp(m, 0, now, frequency_hz_i64(freq), |w| { + w.int("cpu.logical_number", 
logical); + }); + } + } + + // ── Memory ─────────────────────────────────────────────────────────────── + if let Some(memory) = snap.memory { + let m = b.begin_updown_i64("system.memory.usage", "By"); + for (state, value) in [ + ("used", memory.used), + ("free", memory.free), + ("cached", memory.cached), + ("buffers", memory.buffered), + ] { + b.append_i64_dp(m, start, now, saturating_i64(value), |w| { + w.str("system.memory.state", state); + }); + } + if memory.total > 0 { + let m = b.begin_gauge_f64("system.memory.utilization", "1"); + let total = memory.total as f64; + for (state, value) in [ + ("used", memory.used), + ("free", memory.free), + ("cached", memory.cached), + ("buffers", memory.buffered), + ] { + b.append_f64_dp(m, 0, now, value as f64 / total, |w| { + w.str("system.memory.state", state); + }); + } + } + if memory.has_available { + let m = b.begin_updown_i64("system.memory.linux.available", "By"); + b.append_i64_dp(m, start, now, saturating_i64(memory.available), |_| {}); + } + let m = b.begin_updown_i64("system.memory.linux.slab.usage", "By"); + for (state, value) in [ + ("reclaimable", memory.slab_reclaimable), + ("unreclaimable", memory.slab_unreclaimable), + ] { + b.append_i64_dp(m, start, now, saturating_i64(value), |w| { + w.str("system.memory.linux.slab.state", state); + }); + } + if snap.memory_limit { + let m = b.begin_updown_i64("system.memory.limit", "By"); + b.append_i64_dp(m, start, now, saturating_i64(memory.total), |_| {}); + } + if snap.memory_shared { + let m = b.begin_updown_i64("system.memory.linux.shared", "By"); + b.append_i64_dp(m, start, now, saturating_i64(memory.shared), |_| {}); + } + if snap.memory_hugepages { + project_hugepages(snap, b, start, now, &memory.hugepages); + } + } + + // ── System / uptime ────────────────────────────────────────────────────── + if let Some(uptime) = snap.uptime_seconds { + let m = b.begin_gauge_f64("system.uptime", "s"); + b.append_f64_dp(m, 0, now, uptime, |_| {}); + } + + // ── Paging 
─────────────────────────────────────────────────────────────── + if let Some(paging) = snap.paging { + let m = b.begin_counter_i64("system.paging.faults", "{fault}"); + for (fault_type, value) in + [("minor", paging.minor_faults), ("major", paging.major_faults)] + { + b.append_i64_dp( + m, + cs.get("system.paging.faults", fault_type, start), + now, + saturating_i64(value), + |w| { + w.str("system.paging.fault.type", fault_type); + }, + ); + } + let m = b.begin_counter_i64("system.paging.operations", "{operation}"); + for (direction, fault_type, value) in [ + ("in", "major", paging.swap_in), + ("out", "major", paging.swap_out), + ("in", "minor", paging.page_in), + ("out", "minor", paging.page_out), + ] { + b.append_i64_dp( + m, + cs.get_joined("system.paging.operations", direction, fault_type, start), + now, + saturating_i64(value), + |w| { + w.str("system.paging.direction", direction); + w.str("system.paging.fault.type", fault_type); + }, + ); + } + } + for swap in &snap.swaps { + let m = b.begin_updown_i64("system.paging.usage", "By"); + for (state, value) in [("used", swap.used), ("free", swap.free)] { + b.append_i64_dp(m, start, now, saturating_i64(value), |w| { + w.str("system.device", &swap.name); + w.str("system.paging.state", state); + }); + } + let size = swap.size; + if size > 0 { + let m = b.begin_gauge_f64("system.paging.utilization", "1"); + let total = size as f64; + for (state, value) in [("used", swap.used), ("free", swap.free)] { + b.append_f64_dp(m, 0, now, value as f64 / total, |w| { + w.str("system.device", &swap.name); + w.str("system.paging.state", state); + }); + } + } + } + + // ── Processes ──────────────────────────────────────────────────────────── + if let Some(processes) = snap.processes { + let m = b.begin_updown_i64("system.process.count", "{process}"); + for (state, value) in + [("running", processes.running), ("blocked", processes.blocked)] + { + b.append_i64_dp(m, start, now, saturating_i64(value), |w| { + w.str("process.state", 
state); + }); + } + let m = b.begin_counter_i64("system.process.created", "{process}"); + b.append_i64_dp( + m, + cs.get("system.process.created", "", start), + now, + saturating_i64(processes.created), + |_| {}, + ); + } + + // ── Disk ───────────────────────────────────────────────────────────────── + for disk in &snap.disks { + if let Some(limit_bytes) = disk.limit_bytes { + let m = b.begin_updown_i64("system.disk.limit", "By"); + b.append_i64_dp(m, start, now, saturating_i64(limit_bytes), |w| { + w.str("system.device", &disk.name); + }); + } + let m = b.begin_counter_i64("system.disk.io", "By"); + for (dir, value) in [("read", disk.read_bytes), ("write", disk.write_bytes)] { + b.append_i64_dp( + m, + cs.get_joined("system.disk.io", &disk.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str("system.device", &disk.name); + w.str("disk.io.direction", dir); + }, + ); + } + let m = b.begin_counter_i64("system.disk.operations", "{operation}"); + for (dir, value) in [("read", disk.read_ops), ("write", disk.write_ops)] { + b.append_i64_dp( + m, + cs.get_joined("system.disk.operations", &disk.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str("system.device", &disk.name); + w.str("disk.io.direction", dir); + }, + ); + } + let m = b.begin_counter_f64("system.disk.io_time", "s"); + b.append_f64_dp( + m, + cs.get("system.disk.io_time", &disk.name, start), + now, + disk.io_time_seconds, + |w| { + w.str("system.device", &disk.name); + }, + ); + let m = b.begin_counter_f64("system.disk.operation_time", "s"); + for (dir, value) in [ + ("read", disk.read_time_seconds), + ("write", disk.write_time_seconds), + ] { + b.append_f64_dp( + m, + cs.get_joined("system.disk.operation_time", &disk.name, dir, start), + now, + value, + |w| { + w.str("system.device", &disk.name); + w.str("disk.io.direction", dir); + }, + ); + } + let m = b.begin_counter_i64("system.disk.merged", "{operation}"); + for (dir, value) in [("read", disk.read_merged), ("write", 
disk.write_merged)] { + b.append_i64_dp( + m, + cs.get_joined("system.disk.merged", &disk.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str("system.device", &disk.name); + w.str("disk.io.direction", dir); + }, + ); + } + } + + // ── Filesystem ─────────────────────────────────────────────────────────── + for fs in &snap.filesystems { + let total = fs.used.saturating_add(fs.free).saturating_add(fs.reserved); + let m = b.begin_updown_i64("system.filesystem.usage", "By"); + for (state, value) in [("used", fs.used), ("free", fs.free), ("reserved", fs.reserved)] + { + b.append_i64_dp(m, start, now, saturating_i64(value), |w| { + w.str("system.device", &fs.device); + w.str("system.filesystem.state", state); + w.str("system.filesystem.type", &fs.fs_type); + w.str("system.filesystem.mode", fs.mode); + w.str("system.filesystem.mountpoint", &fs.mountpoint); + }); + } + if total > 0 { + let m = b.begin_gauge_f64("system.filesystem.utilization", "1"); + let total_f = total as f64; + for (state, value) in + [("used", fs.used), ("free", fs.free), ("reserved", fs.reserved)] + { + b.append_f64_dp(m, 0, now, value as f64 / total_f, |w| { + w.str("system.device", &fs.device); + w.str("system.filesystem.state", state); + w.str("system.filesystem.type", &fs.fs_type); + w.str("system.filesystem.mode", fs.mode); + w.str("system.filesystem.mountpoint", &fs.mountpoint); + }); + } + } + if let Some(limit_bytes) = fs.limit_bytes { + let m = b.begin_updown_i64("system.filesystem.limit", "By"); + b.append_i64_dp(m, start, now, saturating_i64(limit_bytes), |w| { + w.str("system.device", &fs.device); + w.str("system.filesystem.type", &fs.fs_type); + w.str("system.filesystem.mode", fs.mode); + w.str("system.filesystem.mountpoint", &fs.mountpoint); + }); + } + } + + // ── Network ────────────────────────────────────────────────────────────── + for net in &snap.networks { + let m = b.begin_counter_i64("system.network.io", "By"); + for (dir, iface_attr, value) in [ + ("receive", 
"network.interface.name", net.rx_bytes), + ("transmit", "network.interface.name", net.tx_bytes), + ] { + b.append_i64_dp( + m, + cs.get_joined("system.network.io", &net.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str(iface_attr, &net.name); + w.str("network.io.direction", dir); + }, + ); + } + let m = b.begin_counter_i64("system.network.packet.count", "{packet}"); + for (dir, value) in [("receive", net.rx_packets), ("transmit", net.tx_packets)] { + b.append_i64_dp( + m, + cs.get_joined("system.network.packet.count", &net.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str("system.device", &net.name); + w.str("network.io.direction", dir); + }, + ); + } + let m = b.begin_counter_i64("system.network.packet.dropped", "{packet}"); + for (dir, value) in [("receive", net.rx_dropped), ("transmit", net.tx_dropped)] { + b.append_i64_dp( + m, + cs.get_joined("system.network.packet.dropped", &net.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str("network.interface.name", &net.name); + w.str("network.io.direction", dir); + }, + ); + } + let m = b.begin_counter_i64("system.network.errors", "{error}"); + for (dir, value) in [("receive", net.rx_errors), ("transmit", net.tx_errors)] { + b.append_i64_dp( + m, + cs.get_joined("system.network.errors", &net.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str("network.interface.name", &net.name); + w.str("network.io.direction", dir); + }, + ); + } + } +} + +fn project_hugepages( + snap: &HostSnapshot, + b: &mut crate::receivers::host_metrics_receiver::otap_builder::HostMetricsArrowBuilder, + start: u64, + now: u64, + hugepages: &HugepageStats, +) { + let m = b.begin_updown_i64("system.memory.linux.hugepages.limit", "{page}"); + b.append_i64_dp(m, start, now, saturating_i64(hugepages.total), |_| {}); + let m = b.begin_updown_i64("system.memory.linux.hugepages.page_size", "By"); + b.append_i64_dp(m, start, now, saturating_i64(hugepages.page_size_bytes), |_| {}); + let m = 
b.begin_updown_i64("system.memory.linux.hugepages.reserved", "{page}"); + b.append_i64_dp(m, start, now, saturating_i64(hugepages.reserved), |_| {}); + let m = b.begin_updown_i64("system.memory.linux.hugepages.surplus", "{page}"); + b.append_i64_dp(m, start, now, saturating_i64(hugepages.surplus), |_| {}); + let used = hugepages.total.saturating_sub(hugepages.free); + let m = b.begin_updown_i64("system.memory.linux.hugepages.usage", "{page}"); + for (state, value) in [("used", used), ("free", hugepages.free)] { + b.append_i64_dp(m, start, now, saturating_i64(value), |w| { + w.str("system.memory.linux.hugepages.state", state); + }); + } + if hugepages.total > 0 { + let total = hugepages.total as f64; + let m = b.begin_gauge_f64("system.memory.linux.hugepages.utilization", "1"); + for (state, value) in [("used", used), ("free", hugepages.free)] { + b.append_f64_dp(m, 0, now, value as f64 / total, |w| { + w.str("system.memory.linux.hugepages.state", state); + }); + } + } + let _ = snap; // suppress unused warning; snap may be used in future extensions +} + #[derive(Default)] struct CounterTracker { states: HashMap, From 9bf73f4c6318ecbabec1c0d484f992c9d86fa67d Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sun, 3 May 2026 23:50:18 -0700 Subject: [PATCH 32/60] remove proto intermediate path and dead push_* helpers --- .../receivers/host_metrics_receiver/procfs.rs | 1166 +---------------- 1 file changed, 29 insertions(+), 1137 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index a9da68b098..e9c2dde96f 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -4,15 +4,6 @@ //! Linux procfs-backed host metric source. 
use crate::receivers::host_metrics_receiver::{CompiledFilter, HostViewValidationMode}; -use otap_df_pdata::proto::opentelemetry::collector::metrics::v1::ExportMetricsServiceRequest; -use otap_df_pdata::proto::opentelemetry::common::v1::{ - AnyValue, InstrumentationScope, KeyValue, any_value, -}; -use otap_df_pdata::proto::opentelemetry::metrics::v1::{ - AggregationTemporality, Gauge, Metric, NumberDataPoint, ResourceMetrics, ScopeMetrics, Sum, - metric, number_data_point, -}; -use otap_df_pdata::proto::opentelemetry::resource::v1::Resource; use std::collections::{HashMap, HashSet}; use std::fs::File; use std::io::{self, Read}; @@ -630,355 +621,6 @@ impl HostSnapshot { project_snapshot(&self, &mut b); b.finish() } - - /// Converts a snapshot into an OTLP metrics request. - pub fn into_export_request(self) -> ExportMetricsServiceRequest { - let mut metrics = Vec::with_capacity(64); - let now = self.now_unix_nano; - let start = self.start_time_unix_nano; - let counter_starts = &self.counter_starts; - - if let Some(cpu) = self.cpu { - push_sum_f64( - &mut metrics, - "system.cpu.time", - "s", - start, - now, - counter_starts, - &[ - ("user", cpu.user), - ("nice", cpu.nice), - ("system", cpu.system), - ("idle", cpu.idle), - ("iowait", cpu.wait), - ("interrupt", cpu.interrupt), - ("steal", cpu.steal), - ], - "cpu.mode", - ); - } - if let Some(cpu) = self.cpu_utilization { - push_gauge_f64_by_attr( - &mut metrics, - "system.cpu.utilization", - "1", - now, - &[ - ("user", cpu.user), - ("nice", cpu.nice), - ("system", cpu.system), - ("idle", cpu.idle), - ("iowait", cpu.wait), - ("interrupt", cpu.interrupt), - ("steal", cpu.steal), - ], - "cpu.mode", - ); - } - - if self.cpuinfo.logical_count != 0 { - push_updown_single_u64( - &mut metrics, - "system.cpu.logical.count", - "{cpu}", - start, - now, - self.cpuinfo.logical_count, - ); - } - if self.cpuinfo.physical_count != 0 { - push_updown_single_u64( - &mut metrics, - "system.cpu.physical.count", - "{cpu}", - start, - now, - 
self.cpuinfo.physical_count, - ); - } - push_cpu_frequency(&mut metrics, now, &self.cpuinfo.frequencies_hz); - - if let Some(memory) = self.memory { - push_updown_u64( - &mut metrics, - "system.memory.usage", - "By", - start, - now, - &[ - ("used", memory.used), - ("free", memory.free), - ("cached", memory.cached), - ("buffers", memory.buffered), - ], - "system.memory.state", - ); - push_gauge_ratio( - &mut metrics, - "system.memory.utilization", - "1", - now, - memory.total, - &[ - ("used", memory.used), - ("free", memory.free), - ("cached", memory.cached), - ("buffers", memory.buffered), - ], - "system.memory.state", - ); - if memory.has_available { - push_updown_single_u64( - &mut metrics, - "system.memory.linux.available", - "By", - start, - now, - memory.available, - ); - } - push_updown_u64( - &mut metrics, - "system.memory.linux.slab.usage", - "By", - start, - now, - &[ - ("reclaimable", memory.slab_reclaimable), - ("unreclaimable", memory.slab_unreclaimable), - ], - "system.memory.linux.slab.state", - ); - if self.memory_limit { - push_updown_single_u64( - &mut metrics, - "system.memory.limit", - "By", - start, - now, - memory.total, - ); - } - if self.memory_shared { - push_updown_single_u64( - &mut metrics, - "system.memory.linux.shared", - "By", - start, - now, - memory.shared, - ); - } - if self.memory_hugepages { - push_hugepage_metrics(&mut metrics, start, now, &memory.hugepages); - } - } - - if let Some(uptime_seconds) = self.uptime_seconds { - push_gauge_f64(&mut metrics, "system.uptime", "s", now, uptime_seconds); - } - - if let Some(paging) = self.paging { - push_sum_u64( - &mut metrics, - "system.paging.faults", - "{fault}", - start, - now, - counter_starts, - &[ - ("minor", paging.minor_faults), - ("major", paging.major_faults), - ], - "system.paging.fault.type", - ); - push_paging_operations(&mut metrics, start, now, counter_starts, &paging); - } - - for swap in self.swaps { - push_updown_u64_with_device( - &mut metrics, - 
"system.paging.usage", - "By", - start, - now, - &swap.name, - &[("used", swap.used), ("free", swap.free)], - "system.paging.state", - ); - push_gauge_ratio_with_device( - &mut metrics, - "system.paging.utilization", - "1", - now, - &swap.name, - swap.size, - &[("used", swap.used), ("free", swap.free)], - "system.paging.state", - ); - } - - if let Some(processes) = self.processes { - push_updown_u64( - &mut metrics, - "system.process.count", - "{process}", - start, - now, - &[ - ("running", processes.running), - ("blocked", processes.blocked), - ], - "process.state", - ); - push_sum_single_u64( - &mut metrics, - "system.process.created", - "{process}", - start, - now, - counter_starts, - processes.created, - ); - } - - for disk in self.disks { - if let Some(limit_bytes) = disk.limit_bytes { - push_updown_single_u64_with_device( - &mut metrics, - "system.disk.limit", - "By", - start, - now, - &disk.name, - limit_bytes, - ); - } - push_disk_sum( - &mut metrics, - "system.disk.io", - "By", - start, - now, - counter_starts, - &disk, - DiskProjection::Bytes, - ); - push_disk_sum( - &mut metrics, - "system.disk.operations", - "{operation}", - start, - now, - counter_starts, - &disk, - DiskProjection::Operations, - ); - push_disk_sum( - &mut metrics, - "system.disk.io_time", - "s", - start, - now, - counter_starts, - &disk, - DiskProjection::IoTime, - ); - push_disk_sum( - &mut metrics, - "system.disk.operation_time", - "s", - start, - now, - counter_starts, - &disk, - DiskProjection::OperationTime, - ); - push_disk_sum( - &mut metrics, - "system.disk.merged", - "{operation}", - start, - now, - counter_starts, - &disk, - DiskProjection::Merged, - ); - } - - for filesystem in self.filesystems { - push_filesystem_usage(&mut metrics, start, now, &filesystem); - push_filesystem_utilization(&mut metrics, now, &filesystem); - if let Some(limit_bytes) = filesystem.limit_bytes { - push_filesystem_limit(&mut metrics, start, now, &filesystem, limit_bytes); - } - } - - for network 
in self.networks { - push_network_sum( - &mut metrics, - "system.network.io", - "By", - start, - now, - counter_starts, - &network, - NetworkProjection::Bytes, - ); - push_network_sum( - &mut metrics, - "system.network.packet.count", - "{packet}", - start, - now, - counter_starts, - &network, - NetworkProjection::Packets, - ); - push_network_sum( - &mut metrics, - "system.network.packet.dropped", - "{packet}", - start, - now, - counter_starts, - &network, - NetworkProjection::Dropped, - ); - push_network_sum( - &mut metrics, - "system.network.errors", - "{error}", - start, - now, - counter_starts, - &network, - NetworkProjection::Errors, - ); - } - - ExportMetricsServiceRequest { - resource_metrics: vec![ResourceMetrics { - resource: Some(Resource { - attributes: self.resource.into_attributes(), - dropped_attributes_count: 0, - entity_refs: Vec::new(), - }), - scope_metrics: vec![ScopeMetrics { - scope: Some(InstrumentationScope { - name: "otap-df-core-nodes/host-metrics".to_owned(), - version: env!("CARGO_PKG_VERSION").to_owned(), - attributes: Vec::new(), - dropped_attributes_count: 0, - }), - metrics, - schema_url: String::new(), - }], - schema_url: String::new(), - }], - } - } } #[derive(Default)] @@ -988,22 +630,6 @@ pub(super) struct HostResource { pub(super) host_arch: Option<&'static str>, } -impl HostResource { - fn into_attributes(self) -> Vec { - let mut attributes = Vec::with_capacity(4); - attributes.push(kv_str("os.type", "linux")); - if let Some(host_id) = self.host_id { - attributes.push(kv_str("host.id", &host_id)); - } - if let Some(host_name) = self.host_name { - attributes.push(kv_str("host.name", &host_name)); - } - if let Some(host_arch) = self.host_arch { - attributes.push(kv_str("host.arch", host_arch)); - } - attributes - } -} fn project_snapshot( snap: &HostSnapshot, @@ -2463,165 +2089,6 @@ fn record_partial_error( } } -fn push_gauge_f64( - metrics: &mut Vec, - name: &'static str, - unit: &'static str, - now: u64, - value: f64, -) { - 
metrics.push(Metric { - name: name.to_owned(), - description: String::new(), - unit: unit.to_owned(), - metadata: Vec::new(), - data: Some(metric::Data::Gauge(Gauge { - data_points: vec![number_point_f64(Vec::new(), 0, now, value)], - })), - }); -} - -fn push_gauge_f64_by_attr( - metrics: &mut Vec, - name: &'static str, - unit: &'static str, - now: u64, - values: &[(&'static str, f64)], - attr_name: &'static str, -) { - let mut points = Vec::with_capacity(values.len()); - for (state, value) in values { - points.push(number_point_f64( - vec![kv_str(attr_name, state)], - 0, - now, - *value, - )); - } - metrics.push(Metric { - name: name.to_owned(), - description: String::new(), - unit: unit.to_owned(), - metadata: Vec::new(), - data: Some(metric::Data::Gauge(Gauge { - data_points: points, - })), - }); -} - -fn push_updown_u64( - metrics: &mut Vec, - name: &'static str, - unit: &'static str, - start: u64, - now: u64, - values: &[(&'static str, u64)], - attr_name: &'static str, -) { - let mut points = Vec::with_capacity(values.len()); - for (state, value) in values { - points.push(number_point_i64( - vec![kv_str(attr_name, state)], - start, - now, - saturating_i64(*value), - )); - } - push_updown_metric(metrics, name, unit, points); -} - -fn push_updown_u64_with_device( - metrics: &mut Vec, - name: &'static str, - unit: &'static str, - start: u64, - now: u64, - device: &str, - values: &[(&'static str, u64)], - attr_name: &'static str, -) { - let mut points = Vec::with_capacity(values.len()); - for (state, value) in values { - points.push(number_point_i64( - vec![kv_str("system.device", device), kv_str(attr_name, state)], - start, - now, - saturating_i64(*value), - )); - } - push_updown_metric(metrics, name, unit, points); -} - -fn push_updown_single_u64( - metrics: &mut Vec, - name: &'static str, - unit: &'static str, - start: u64, - now: u64, - value: u64, -) { - push_updown_metric( - metrics, - name, - unit, - vec![number_point_i64( - Vec::new(), - start, - now, - 
saturating_i64(value), - )], - ); -} - -fn push_updown_single_u64_with_device( - metrics: &mut Vec, - name: &'static str, - unit: &'static str, - start: u64, - now: u64, - device: &str, - value: u64, -) { - push_updown_metric( - metrics, - name, - unit, - vec![number_point_i64( - vec![kv_str("system.device", device)], - start, - now, - saturating_i64(value), - )], - ); -} - -fn push_cpu_frequency(metrics: &mut Vec, now: u64, frequencies_hz: &[f64]) { - if frequencies_hz.is_empty() { - return; - } - let mut points = Vec::with_capacity(frequencies_hz.len()); - for (idx, frequency) in frequencies_hz.iter().enumerate() { - points.push(number_point_i64( - vec![kv_i64( - "cpu.logical_number", - i64::try_from(idx).unwrap_or(i64::MAX), - )], - 0, - now, - frequency_hz_i64(*frequency), - )); - } - metrics.push(Metric { - name: "system.cpu.frequency".to_owned(), - description: String::new(), - unit: "Hz".to_owned(), - metadata: Vec::new(), - data: Some(metric::Data::Gauge(Gauge { - data_points: points, - })), - }); -} - fn frequency_hz_i64(value: f64) -> i64 { if !value.is_finite() || value <= 0.0 { return 0; @@ -2632,592 +2099,6 @@ fn frequency_hz_i64(value: f64) -> i64 { value.round() as i64 } -fn push_paging_operations( - metrics: &mut Vec, - start: u64, - now: u64, - counter_starts: &CounterStarts, - paging: &PagingStats, -) { - let values = [ - ("in", "major", paging.swap_in), - ("out", "major", paging.swap_out), - ("in", "minor", paging.page_in), - ("out", "minor", paging.page_out), - ]; - let mut points = Vec::with_capacity(values.len()); - for (direction, fault_type, value) in values { - points.push(number_point_i64( - vec![ - kv_str("system.paging.direction", direction), - kv_str("system.paging.fault.type", fault_type), - ], - counter_starts.get_joined("system.paging.operations", direction, fault_type, start), - now, - saturating_i64(value), - )); - } - push_sum_metric(metrics, "system.paging.operations", "{operation}", points); -} - -fn push_hugepage_metrics( - 
metrics: &mut Vec, - start: u64, - now: u64, - hugepages: &HugepageStats, -) { - push_updown_single_u64( - metrics, - "system.memory.linux.hugepages.limit", - "{page}", - start, - now, - hugepages.total, - ); - push_updown_single_u64( - metrics, - "system.memory.linux.hugepages.page_size", - "By", - start, - now, - hugepages.page_size_bytes, - ); - push_updown_single_u64( - metrics, - "system.memory.linux.hugepages.reserved", - "{page}", - start, - now, - hugepages.reserved, - ); - push_updown_single_u64( - metrics, - "system.memory.linux.hugepages.surplus", - "{page}", - start, - now, - hugepages.surplus, - ); - let used = hugepages.total.saturating_sub(hugepages.free); - push_updown_u64( - metrics, - "system.memory.linux.hugepages.usage", - "{page}", - start, - now, - &[("used", used), ("free", hugepages.free)], - "system.memory.linux.hugepages.state", - ); - push_gauge_ratio( - metrics, - "system.memory.linux.hugepages.utilization", - "1", - now, - hugepages.total, - &[("used", used), ("free", hugepages.free)], - "system.memory.linux.hugepages.state", - ); -} - -fn push_gauge_ratio( - metrics: &mut Vec, - name: &'static str, - unit: &'static str, - now: u64, - total: u64, - values: &[(&'static str, u64)], - attr_name: &'static str, -) { - if total == 0 { - return; - } - let total = total as f64; - let mut points = Vec::with_capacity(values.len()); - for (state, value) in values { - points.push(number_point_f64( - vec![kv_str(attr_name, state)], - 0, - now, - *value as f64 / total, - )); - } - metrics.push(Metric { - name: name.to_owned(), - description: String::new(), - unit: unit.to_owned(), - metadata: Vec::new(), - data: Some(metric::Data::Gauge(Gauge { - data_points: points, - })), - }); -} - -fn push_gauge_ratio_with_device( - metrics: &mut Vec, - name: &'static str, - unit: &'static str, - now: u64, - device: &str, - total: u64, - values: &[(&'static str, u64)], - attr_name: &'static str, -) { - if total == 0 { - return; - } - let total = total as f64; - 
let mut points = Vec::with_capacity(values.len()); - for (state, value) in values { - points.push(number_point_f64( - vec![kv_str("system.device", device), kv_str(attr_name, state)], - 0, - now, - *value as f64 / total, - )); - } - metrics.push(Metric { - name: name.to_owned(), - description: String::new(), - unit: unit.to_owned(), - metadata: Vec::new(), - data: Some(metric::Data::Gauge(Gauge { - data_points: points, - })), - }); -} - -fn push_sum_f64( - metrics: &mut Vec, - name: &'static str, - unit: &'static str, - start: u64, - now: u64, - counter_starts: &CounterStarts, - values: &[(&'static str, f64)], - attr_name: &'static str, -) { - let mut points = Vec::with_capacity(values.len()); - for (state, value) in values { - points.push(number_point_f64( - vec![kv_str(attr_name, state)], - counter_starts.get(name, state, start), - now, - *value, - )); - } - push_sum_metric(metrics, name, unit, points); -} - -fn push_sum_u64( - metrics: &mut Vec, - name: &'static str, - unit: &'static str, - start: u64, - now: u64, - counter_starts: &CounterStarts, - values: &[(&'static str, u64)], - attr_name: &'static str, -) { - let mut points = Vec::with_capacity(values.len()); - for (state, value) in values { - points.push(number_point_i64( - vec![kv_str(attr_name, state)], - counter_starts.get(name, state, start), - now, - saturating_i64(*value), - )); - } - push_sum_metric(metrics, name, unit, points); -} - -fn push_sum_single_u64( - metrics: &mut Vec, - name: &'static str, - unit: &'static str, - start: u64, - now: u64, - counter_starts: &CounterStarts, - value: u64, -) { - push_sum_metric( - metrics, - name, - unit, - vec![number_point_i64( - Vec::new(), - counter_starts.get(name, "", start), - now, - saturating_i64(value), - )], - ); -} - -fn push_disk_sum( - metrics: &mut Vec, - name: &'static str, - unit: &'static str, - start: u64, - now: u64, - counter_starts: &CounterStarts, - disk: &DiskStats, - projection: DiskProjection, -) { - if let DiskProjection::IoTime = 
projection { - push_sum_metric( - metrics, - name, - unit, - vec![number_point_f64( - vec![kv_str("system.device", &disk.name)], - counter_starts.get(name, &disk.name, start), - now, - disk.io_time_seconds, - )], - ); - return; - } - - let (read, write) = match projection { - DiskProjection::Bytes => ( - DiskValue::Integer(disk.read_bytes), - DiskValue::Integer(disk.write_bytes), - ), - DiskProjection::Operations => ( - DiskValue::Integer(disk.read_ops), - DiskValue::Integer(disk.write_ops), - ), - DiskProjection::OperationTime => ( - DiskValue::Float(disk.read_time_seconds), - DiskValue::Float(disk.write_time_seconds), - ), - DiskProjection::Merged => ( - DiskValue::Integer(disk.read_merged), - DiskValue::Integer(disk.write_merged), - ), - DiskProjection::IoTime => unreachable!(), - }; - let points = vec![ - disk_number_point(&disk.name, "read", start, now, counter_starts, name, read), - disk_number_point(&disk.name, "write", start, now, counter_starts, name, write), - ]; - push_sum_metric(metrics, name, unit, points); -} - -#[derive(Copy, Clone)] -enum DiskProjection { - Bytes, - Operations, - IoTime, - OperationTime, - Merged, -} - -#[derive(Copy, Clone)] -enum DiskValue { - Integer(u64), - Float(f64), -} - -fn disk_number_point( - device: &str, - direction: &'static str, - start: u64, - now: u64, - counter_starts: &CounterStarts, - metric: &'static str, - value: DiskValue, -) -> NumberDataPoint { - let attributes = vec![ - kv_str("system.device", device), - kv_str("disk.io.direction", direction), - ]; - match value { - DiskValue::Integer(value) => number_point_i64( - attributes, - counter_starts.get_joined(metric, device, direction, start), - now, - saturating_i64(value), - ), - DiskValue::Float(value) => number_point_f64( - attributes, - counter_starts.get_joined(metric, device, direction, start), - now, - value, - ), - } -} - -fn push_filesystem_usage( - metrics: &mut Vec, - start: u64, - now: u64, - filesystem: &FilesystemStats, -) { - let points = vec![ - 
filesystem_number_point( - filesystem, - "used", - start, - now, - FilesystemValue::Integer(filesystem.used), - ), - filesystem_number_point( - filesystem, - "free", - start, - now, - FilesystemValue::Integer(filesystem.free), - ), - filesystem_number_point( - filesystem, - "reserved", - start, - now, - FilesystemValue::Integer(filesystem.reserved), - ), - ]; - push_updown_metric(metrics, "system.filesystem.usage", "By", points); -} - -fn push_filesystem_utilization(metrics: &mut Vec, now: u64, filesystem: &FilesystemStats) { - let total = filesystem - .used - .saturating_add(filesystem.free) - .saturating_add(filesystem.reserved); - if total == 0 { - return; - } - let total = total as f64; - let points = vec![ - filesystem_number_point( - filesystem, - "used", - 0, - now, - FilesystemValue::Float(filesystem.used as f64 / total), - ), - filesystem_number_point( - filesystem, - "free", - 0, - now, - FilesystemValue::Float(filesystem.free as f64 / total), - ), - filesystem_number_point( - filesystem, - "reserved", - 0, - now, - FilesystemValue::Float(filesystem.reserved as f64 / total), - ), - ]; - metrics.push(Metric { - name: "system.filesystem.utilization".to_owned(), - description: String::new(), - unit: "1".to_owned(), - metadata: Vec::new(), - data: Some(metric::Data::Gauge(Gauge { - data_points: points, - })), - }); -} - -fn push_filesystem_limit( - metrics: &mut Vec, - start: u64, - now: u64, - filesystem: &FilesystemStats, - limit_bytes: u64, -) { - push_updown_metric( - metrics, - "system.filesystem.limit", - "By", - vec![number_point_i64( - vec![ - kv_str("system.device", &filesystem.device), - kv_str("system.filesystem.type", &filesystem.fs_type), - kv_str("system.filesystem.mode", filesystem.mode), - kv_str("system.filesystem.mountpoint", &filesystem.mountpoint), - ], - start, - now, - saturating_i64(limit_bytes), - )], - ); -} - -#[derive(Copy, Clone)] -enum FilesystemValue { - Integer(u64), - Float(f64), -} - -fn filesystem_number_point( - filesystem: 
&FilesystemStats, - state: &'static str, - start: u64, - now: u64, - value: FilesystemValue, -) -> NumberDataPoint { - let attributes = vec![ - kv_str("system.device", &filesystem.device), - kv_str("system.filesystem.state", state), - kv_str("system.filesystem.type", &filesystem.fs_type), - kv_str("system.filesystem.mode", filesystem.mode), - kv_str("system.filesystem.mountpoint", &filesystem.mountpoint), - ]; - match value { - FilesystemValue::Integer(value) => { - number_point_i64(attributes, start, now, saturating_i64(value)) - } - FilesystemValue::Float(value) => number_point_f64(attributes, start, now, value), - } -} - -fn push_network_sum( - metrics: &mut Vec, - name: &'static str, - unit: &'static str, - start: u64, - now: u64, - counter_starts: &CounterStarts, - network: &NetworkStats, - projection: NetworkProjection, -) { - let (rx, tx, interface_attr) = match projection { - NetworkProjection::Bytes => (network.rx_bytes, network.tx_bytes, "network.interface.name"), - NetworkProjection::Packets => (network.rx_packets, network.tx_packets, "system.device"), - NetworkProjection::Dropped => ( - network.rx_dropped, - network.tx_dropped, - "network.interface.name", - ), - NetworkProjection::Errors => ( - network.rx_errors, - network.tx_errors, - "network.interface.name", - ), - }; - let points = vec![ - number_point_i64( - vec![ - kv_str(interface_attr, &network.name), - kv_str("network.io.direction", "receive"), - ], - counter_starts.get_joined(name, &network.name, "receive", start), - now, - saturating_i64(rx), - ), - number_point_i64( - vec![ - kv_str(interface_attr, &network.name), - kv_str("network.io.direction", "transmit"), - ], - counter_starts.get_joined(name, &network.name, "transmit", start), - now, - saturating_i64(tx), - ), - ]; - push_sum_metric(metrics, name, unit, points); -} - -#[derive(Copy, Clone)] -enum NetworkProjection { - Bytes, - Packets, - Dropped, - Errors, -} - -fn push_sum_metric( - metrics: &mut Vec, - name: &'static str, - unit: 
&'static str, - points: Vec, -) { - push_sum_metric_with_monotonic(metrics, name, unit, points, true); -} - -fn push_updown_metric( - metrics: &mut Vec, - name: &'static str, - unit: &'static str, - points: Vec, -) { - push_sum_metric_with_monotonic(metrics, name, unit, points, false); -} - -fn push_sum_metric_with_monotonic( - metrics: &mut Vec, - name: &'static str, - unit: &'static str, - points: Vec, - is_monotonic: bool, -) { - metrics.push(Metric { - name: name.to_owned(), - description: String::new(), - unit: unit.to_owned(), - metadata: Vec::new(), - data: Some(metric::Data::Sum(Sum { - data_points: points, - aggregation_temporality: AggregationTemporality::Cumulative.into(), - is_monotonic, - })), - }); -} - -fn number_point_f64( - attributes: Vec, - start_time_unix_nano: u64, - time_unix_nano: u64, - value: f64, -) -> NumberDataPoint { - NumberDataPoint { - attributes, - start_time_unix_nano, - time_unix_nano, - exemplars: Vec::new(), - flags: 0, - value: Some(number_data_point::Value::AsDouble(value)), - } -} - -fn number_point_i64( - attributes: Vec, - start_time_unix_nano: u64, - time_unix_nano: u64, - value: i64, -) -> NumberDataPoint { - NumberDataPoint { - attributes, - start_time_unix_nano, - time_unix_nano, - exemplars: Vec::new(), - flags: 0, - value: Some(number_data_point::Value::AsInt(value)), - } -} - -fn kv_str(key: &str, value: &str) -> KeyValue { - KeyValue { - key: key.to_owned(), - value: Some(AnyValue { - value: Some(any_value::Value::StringValue(value.to_owned())), - }), - } -} - -fn kv_i64(key: &str, value: i64) -> KeyValue { - KeyValue { - key: key.to_owned(), - value: Some(AnyValue { - value: Some(any_value::Value::IntValue(value)), - }), - } -} - fn parse_u64(input: &str) -> u64 { input.parse().unwrap_or_default() } @@ -3251,6 +2132,11 @@ fn saturating_i64(value: u64) -> i64 { #[cfg(test)] mod tests { use super::*; + use otap_df_pdata::proto::opentelemetry::common::v1::{AnyValue, KeyValue, any_value}; + use 
otap_df_pdata::proto::opentelemetry::metrics::v1::{ + AggregationTemporality, Metric, MetricsData, NumberDataPoint, metric, number_data_point, + }; + use otap_df_pdata::testing::round_trip::decode_metrics; #[cfg(feature = "dev-tools")] use std::collections::{BTreeMap, BTreeSet}; #[cfg(feature = "dev-tools")] @@ -3271,9 +2157,9 @@ mod tests { #[test] fn projection_uses_expected_metric_shapes() { - let request = projection_fixture_request(); + let data = projection_fixture_request(); - let resource_metrics = request.resource_metrics.first().expect("resource metrics"); + let resource_metrics = data.resource_metrics.first().expect("resource metrics"); let resource = resource_metrics.resource.as_ref().expect("resource"); assert_has_attr(&resource.attributes, "os.type", "linux"); assert_has_attr(&resource.attributes, "host.id", "host-id"); @@ -3497,21 +2383,24 @@ mod tests { #[test] fn projection_uses_counter_start_overrides_for_reset_series() { - let request = HostSnapshot { - now_unix_nano: 2_000, - start_time_unix_nano: 1_000, - counter_starts: CounterStarts { - entries: vec![(counter_key("system.process.created", ""), 1_500)], - }, - processes: Some(ProcessStats { - created: 99, - ..ProcessStats::default() - }), - ..HostSnapshot::default() - } - .into_export_request(); + let data = decode_metrics( + HostSnapshot { + now_unix_nano: 2_000, + start_time_unix_nano: 1_000, + counter_starts: CounterStarts { + entries: vec![(counter_key("system.process.created", ""), 1_500)], + }, + processes: Some(ProcessStats { + created: 99, + ..ProcessStats::default() + }), + ..HostSnapshot::default() + } + .into_otap_records() + .expect("encode ok"), + ); - let metrics = &request.resource_metrics[0].scope_metrics[0].metrics; + let metrics = &data.resource_metrics[0].scope_metrics[0].metrics; assert_first_sum_point_start(metrics, "system.process.created", 1_500); } @@ -4350,7 +3239,8 @@ mod tests { } } - fn projection_fixture_request() -> ExportMetricsServiceRequest { + fn 
projection_fixture_request() -> MetricsData { + decode_metrics( HostSnapshot { now_unix_nano: 2_000, start_time_unix_nano: 1_000, @@ -4460,7 +3350,9 @@ mod tests { host_arch: Some("amd64"), }, } - .into_export_request() + .into_otap_records() + .expect("encode ok"), + ) } #[cfg(feature = "dev-tools")] From f199defae7c1cd39d4c491f1c3448b4e251d7ac4 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sun, 3 May 2026 23:51:40 -0700 Subject: [PATCH 33/60] fix clippy: allow dead_code on SEMCONV_VERSION constant --- .../host_metrics_receiver/otap_builder.rs | 20 +- .../receivers/host_metrics_receiver/procfs.rs | 274 ++++++++++-------- 2 files changed, 161 insertions(+), 133 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs index f7a9ab928f..a72230cd91 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs @@ -18,7 +18,8 @@ use otap_df_pdata::proto::opentelemetry::arrow::v1::ArrowPayloadType; use crate::receivers::host_metrics_receiver::procfs::HostResource; -// Semconv version targeted by this receiver. +/// Semconv version targeted by this receiver's projection layer. 
+#[allow(dead_code)] pub(crate) const SEMCONV_VERSION: &str = "1.41.0"; const SEMCONV_SCHEMA_URL: &[u8] = b"https://opentelemetry.io/schemas/1.41.0"; @@ -179,8 +180,7 @@ impl HostMetricsArrowBuilder { let dp_id = self.curr_dp_id; self.ndp.append_id(dp_id); self.ndp.append_parent_id(metric_id); - self.ndp - .append_start_time_unix_nano(Some(start as i64)); + self.ndp.append_start_time_unix_nano(Some(start as i64)); self.ndp.append_time_unix_nano(now as i64); self.ndp.append_int_value(Some(value)); self.ndp.append_double_value(None); @@ -207,8 +207,7 @@ impl HostMetricsArrowBuilder { let dp_id = self.curr_dp_id; self.ndp.append_id(dp_id); self.ndp.append_parent_id(metric_id); - self.ndp - .append_start_time_unix_nano(Some(start as i64)); + self.ndp.append_start_time_unix_nano(Some(start as i64)); self.ndp.append_time_unix_nano(now as i64); self.ndp.append_int_value(None); self.ndp.append_double_value(Some(value)); @@ -227,15 +226,20 @@ impl HostMetricsArrowBuilder { let n = self.metrics.len(); // Resource: single entry with id=0 repeated for every metric row. self.metrics.resource.append_id_n(0, n); - self.metrics.resource.append_schema_url_n(Some(SEMCONV_SCHEMA_URL), n); - self.metrics.resource.append_dropped_attributes_count_n(0, n); + self.metrics + .resource + .append_schema_url_n(Some(SEMCONV_SCHEMA_URL), n); + self.metrics + .resource + .append_dropped_attributes_count_n(0, n); // Scope: single entry with id=0. self.metrics.scope.append_id_n(0, n); self.metrics.scope.append_name_n(Some(SCOPE_NAME), n); self.metrics.scope.append_version_n(Some(SCOPE_VERSION), n); self.metrics.scope.append_dropped_attributes_count_n(0, n); // Schema URL on scope column. 
- self.metrics.append_scope_schema_url_n(SEMCONV_SCHEMA_URL, n); + self.metrics + .append_scope_schema_url_n(SEMCONV_SCHEMA_URL, n); let mut records = OtapArrowRecords::Metrics(Metrics::default()); finish_batch( diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index e9c2dde96f..433827dc45 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -630,7 +630,6 @@ pub(super) struct HostResource { pub(super) host_arch: Option<&'static str>, } - fn project_snapshot( snap: &HostSnapshot, b: &mut crate::receivers::host_metrics_receiver::otap_builder::HostMetricsArrowBuilder, @@ -674,11 +673,23 @@ fn project_snapshot( } if snap.cpuinfo.logical_count != 0 { let m = b.begin_updown_i64("system.cpu.logical.count", "{cpu}"); - b.append_i64_dp(m, start, now, saturating_i64(snap.cpuinfo.logical_count), |_| {}); + b.append_i64_dp( + m, + start, + now, + saturating_i64(snap.cpuinfo.logical_count), + |_| {}, + ); } if snap.cpuinfo.physical_count != 0 { let m = b.begin_updown_i64("system.cpu.physical.count", "{cpu}"); - b.append_i64_dp(m, start, now, saturating_i64(snap.cpuinfo.physical_count), |_| {}); + b.append_i64_dp( + m, + start, + now, + saturating_i64(snap.cpuinfo.physical_count), + |_| {}, + ); } if !snap.cpuinfo.frequencies_hz.is_empty() { let m = b.begin_gauge_i64("system.cpu.frequency", "Hz"); @@ -752,9 +763,10 @@ fn project_snapshot( // ── Paging ─────────────────────────────────────────────────────────────── if let Some(paging) = snap.paging { let m = b.begin_counter_i64("system.paging.faults", "{fault}"); - for (fault_type, value) in - [("minor", paging.minor_faults), ("major", paging.major_faults)] - { + for (fault_type, value) in [ + ("minor", paging.minor_faults), + ("major", paging.major_faults), + ] { 
b.append_i64_dp( m, cs.get("system.paging.faults", fault_type, start), @@ -808,9 +820,10 @@ fn project_snapshot( // ── Processes ──────────────────────────────────────────────────────────── if let Some(processes) = snap.processes { let m = b.begin_updown_i64("system.process.count", "{process}"); - for (state, value) in - [("running", processes.running), ("blocked", processes.blocked)] - { + for (state, value) in [ + ("running", processes.running), + ("blocked", processes.blocked), + ] { b.append_i64_dp(m, start, now, saturating_i64(value), |w| { w.str("process.state", state); }); @@ -904,8 +917,11 @@ fn project_snapshot( for fs in &snap.filesystems { let total = fs.used.saturating_add(fs.free).saturating_add(fs.reserved); let m = b.begin_updown_i64("system.filesystem.usage", "By"); - for (state, value) in [("used", fs.used), ("free", fs.free), ("reserved", fs.reserved)] - { + for (state, value) in [ + ("used", fs.used), + ("free", fs.free), + ("reserved", fs.reserved), + ] { b.append_i64_dp(m, start, now, saturating_i64(value), |w| { w.str("system.device", &fs.device); w.str("system.filesystem.state", state); @@ -917,9 +933,11 @@ fn project_snapshot( if total > 0 { let m = b.begin_gauge_f64("system.filesystem.utilization", "1"); let total_f = total as f64; - for (state, value) in - [("used", fs.used), ("free", fs.free), ("reserved", fs.reserved)] - { + for (state, value) in [ + ("used", fs.used), + ("free", fs.free), + ("reserved", fs.reserved), + ] { b.append_f64_dp(m, 0, now, value as f64 / total_f, |w| { w.str("system.device", &fs.device); w.str("system.filesystem.state", state); @@ -1010,7 +1028,13 @@ fn project_hugepages( let m = b.begin_updown_i64("system.memory.linux.hugepages.limit", "{page}"); b.append_i64_dp(m, start, now, saturating_i64(hugepages.total), |_| {}); let m = b.begin_updown_i64("system.memory.linux.hugepages.page_size", "By"); - b.append_i64_dp(m, start, now, saturating_i64(hugepages.page_size_bytes), |_| {}); + b.append_i64_dp( + m, + start, 
+ now, + saturating_i64(hugepages.page_size_bytes), + |_| {}, + ); let m = b.begin_updown_i64("system.memory.linux.hugepages.reserved", "{page}"); b.append_i64_dp(m, start, now, saturating_i64(hugepages.reserved), |_| {}); let m = b.begin_updown_i64("system.memory.linux.hugepages.surplus", "{page}"); @@ -3241,117 +3265,117 @@ mod tests { fn projection_fixture_request() -> MetricsData { decode_metrics( - HostSnapshot { - now_unix_nano: 2_000, - start_time_unix_nano: 1_000, - counter_starts: CounterStarts::default(), - memory_limit: true, - memory_shared: true, - memory_hugepages: true, - cpu: Some(CpuTimes { - user: 1.0, - nice: 2.0, - system: 3.0, - idle: 4.0, - wait: 5.0, - interrupt: 6.0, - steal: 7.0, - }), - cpu_utilization: Some(CpuTimes { - user: 0.1, - nice: 0.1, - system: 0.2, - idle: 0.3, - wait: 0.1, - interrupt: 0.1, - steal: 0.1, - }), - cpuinfo: CpuInfo { - logical_count: 2, - physical_count: 1, - frequencies_hz: vec![2_400_000_000.0], - }, - memory: Some(MemoryStats { - total: 100, - used: 80, - free: 10, - available: 20, - has_available: true, - cached: 5, - buffered: 5, - shared: 7, - slab_reclaimable: 3, - slab_unreclaimable: 2, - hugepages: HugepageStats { - total: 10, - free: 4, - reserved: 2, - surplus: 1, - page_size_bytes: 2 * BYTES_PER_KIB, + HostSnapshot { + now_unix_nano: 2_000, + start_time_unix_nano: 1_000, + counter_starts: CounterStarts::default(), + memory_limit: true, + memory_shared: true, + memory_hugepages: true, + cpu: Some(CpuTimes { + user: 1.0, + nice: 2.0, + system: 3.0, + idle: 4.0, + wait: 5.0, + interrupt: 6.0, + steal: 7.0, + }), + cpu_utilization: Some(CpuTimes { + user: 0.1, + nice: 0.1, + system: 0.2, + idle: 0.3, + wait: 0.1, + interrupt: 0.1, + steal: 0.1, + }), + cpuinfo: CpuInfo { + logical_count: 2, + physical_count: 1, + frequencies_hz: vec![2_400_000_000.0], }, - }), - uptime_seconds: Some(42.0), - paging: Some(PagingStats { - minor_faults: 9, - major_faults: 1, - page_in: 4, - page_out: 5, - swap_in: 2, - 
swap_out: 3, - }), - swaps: vec![SwapStats { - name: "/dev/swap".to_owned(), - size: 100, - used: 25, - free: 75, - }], - processes: Some(ProcessStats { - running: 4, - blocked: 1, - created: 99, - }), - disks: vec![DiskStats { - name: "sda".to_owned(), - limit_bytes: Some(123), - read_bytes: 10, - write_bytes: 20, - read_ops: 1, - write_ops: 2, - read_merged: 3, - write_merged: 4, - read_time_seconds: 0.5, - write_time_seconds: 0.6, - io_time_seconds: 0.7, - }], - filesystems: vec![FilesystemStats { - device: "/dev/sda1".to_owned(), - mountpoint: "/".to_owned(), - fs_type: "ext4".to_owned(), - mode: "rw", - used: 60, - free: 30, - reserved: 10, - limit_bytes: Some(100), - }], - networks: vec![NetworkStats { - name: "eth0".to_owned(), - rx_bytes: 10, - tx_bytes: 20, - rx_packets: 1, - tx_packets: 2, - rx_errors: 3, - tx_errors: 4, - rx_dropped: 5, - tx_dropped: 6, - }], - resource: HostResource { - host_id: Some("host-id".to_owned()), - host_name: Some("host-name".to_owned()), - host_arch: Some("amd64"), - }, - } - .into_otap_records() - .expect("encode ok"), + memory: Some(MemoryStats { + total: 100, + used: 80, + free: 10, + available: 20, + has_available: true, + cached: 5, + buffered: 5, + shared: 7, + slab_reclaimable: 3, + slab_unreclaimable: 2, + hugepages: HugepageStats { + total: 10, + free: 4, + reserved: 2, + surplus: 1, + page_size_bytes: 2 * BYTES_PER_KIB, + }, + }), + uptime_seconds: Some(42.0), + paging: Some(PagingStats { + minor_faults: 9, + major_faults: 1, + page_in: 4, + page_out: 5, + swap_in: 2, + swap_out: 3, + }), + swaps: vec![SwapStats { + name: "/dev/swap".to_owned(), + size: 100, + used: 25, + free: 75, + }], + processes: Some(ProcessStats { + running: 4, + blocked: 1, + created: 99, + }), + disks: vec![DiskStats { + name: "sda".to_owned(), + limit_bytes: Some(123), + read_bytes: 10, + write_bytes: 20, + read_ops: 1, + write_ops: 2, + read_merged: 3, + write_merged: 4, + read_time_seconds: 0.5, + write_time_seconds: 0.6, + 
io_time_seconds: 0.7, + }], + filesystems: vec![FilesystemStats { + device: "/dev/sda1".to_owned(), + mountpoint: "/".to_owned(), + fs_type: "ext4".to_owned(), + mode: "rw", + used: 60, + free: 30, + reserved: 10, + limit_bytes: Some(100), + }], + networks: vec![NetworkStats { + name: "eth0".to_owned(), + rx_bytes: 10, + tx_bytes: 20, + rx_packets: 1, + tx_packets: 2, + rx_errors: 3, + tx_errors: 4, + rx_dropped: 5, + tx_dropped: 6, + }], + resource: HostResource { + host_id: Some("host-id".to_owned()), + host_name: Some("host-name".to_owned()), + host_arch: Some("amd64"), + }, + } + .into_otap_records() + .expect("encode ok"), ) } From 84a26b6b6cf9836d5c0b70b01e668e9b8da99585 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Sun, 3 May 2026 23:59:14 -0700 Subject: [PATCH 34/60] append flags=0 on every number datapoint --- .../src/receivers/host_metrics_receiver/otap_builder.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs index a72230cd91..f1c488535f 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs @@ -184,6 +184,7 @@ impl HostMetricsArrowBuilder { self.ndp.append_time_unix_nano(now as i64); self.ndp.append_int_value(Some(value)); self.ndp.append_double_value(None); + self.ndp.append_flags(0); let mut w = DpAttrWriter { attrs: &mut self.ndp_attrs, dp_id, @@ -211,6 +212,7 @@ impl HostMetricsArrowBuilder { self.ndp.append_time_unix_nano(now as i64); self.ndp.append_int_value(None); self.ndp.append_double_value(Some(value)); + self.ndp.append_flags(0); let mut w = DpAttrWriter { attrs: &mut self.ndp_attrs, dp_id, From 2550105383d0b9a9cd04330f87a2ab1e6c533131 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Mon, 4 May 2026 00:00:22 -0700 
Subject: [PATCH 35/60] add compile-time assertion that schema URL matches semconv version --- .../host_metrics_receiver/otap_builder.rs | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs index f1c488535f..2d27821d48 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs @@ -19,10 +19,30 @@ use otap_df_pdata::proto::opentelemetry::arrow::v1::ArrowPayloadType; use crate::receivers::host_metrics_receiver::procfs::HostResource; /// Semconv version targeted by this receiver's projection layer. -#[allow(dead_code)] +/// +/// `SEMCONV_SCHEMA_URL` must be kept in sync with this value. pub(crate) const SEMCONV_VERSION: &str = "1.41.0"; const SEMCONV_SCHEMA_URL: &[u8] = b"https://opentelemetry.io/schemas/1.41.0"; +const _: () = { + // Enforce that SEMCONV_SCHEMA_URL ends with SEMCONV_VERSION. 
+ let url = SEMCONV_SCHEMA_URL; + let ver = SEMCONV_VERSION.as_bytes(); + assert!( + url.len() >= ver.len(), + "SEMCONV_SCHEMA_URL is shorter than SEMCONV_VERSION" + ); + let suffix = url.split_at(url.len() - ver.len()).1; + let mut i = 0; + while i < ver.len() { + assert!( + suffix[i] == ver[i], + "SEMCONV_SCHEMA_URL suffix does not match SEMCONV_VERSION" + ); + i += 1; + } +}; + const SCOPE_NAME: &[u8] = b"otap-df-core-nodes/host-metrics"; const SCOPE_VERSION: &[u8] = env!("CARGO_PKG_VERSION").as_bytes(); From d4931c42212ef49818f59819d4ce7be67c750a7b Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Mon, 4 May 2026 00:21:52 -0700 Subject: [PATCH 36/60] gate procfs and scheduler on linux, keep platform check for non-linux --- .../receivers/host_metrics_receiver/mod.rs | 72 +++++++++++++++---- .../host_metrics_receiver/otap_builder.rs | 2 + 2 files changed, 60 insertions(+), 14 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index fd3157d889..a6f3ab48fa 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -2,39 +2,62 @@ // SPDX-License-Identifier: Apache-2.0 //! Host metrics receiver. +//! +//! Most implementation is Linux-only (`#[cfg(target_os = "linux")]`). +//! Structs and filters defined here are dead on other platforms by design. 
+#![cfg_attr(not(target_os = "linux"), allow(dead_code))] +#[cfg(target_os = "linux")] use async_trait::async_trait; use linkme::distributed_slice; use otap_df_config::node::NodeUserConfig; +#[cfg(target_os = "linux")] use otap_df_engine::MessageSourceLocalEffectHandlerExtension; use otap_df_engine::ReceiverFactory; use otap_df_engine::config::ReceiverConfig; use otap_df_engine::context::PipelineContext; +#[cfg(target_os = "linux")] use otap_df_engine::control::NodeControlMsg; +#[cfg(target_os = "linux")] use otap_df_engine::error::{Error, ReceiverErrorKind, TypedError}; +#[cfg(target_os = "linux")] use otap_df_engine::local::receiver as local; use otap_df_engine::node::NodeId; +#[cfg(target_os = "linux")] use otap_df_engine::receiver::ReceiverWrapper; +#[cfg(target_os = "linux")] use otap_df_engine::terminal_state::TerminalState; use otap_df_otap::OTAP_RECEIVER_FACTORIES; -use otap_df_otap::pdata::{Context, OtapPdata}; +#[cfg(target_os = "linux")] +use otap_df_otap::pdata::Context; +use otap_df_otap::pdata::OtapPdata; use otap_df_telemetry::instrument::{Counter, Mmsc}; -use otap_df_telemetry::metrics::{MetricSet, MetricSetSnapshot}; +#[cfg(target_os = "linux")] +use otap_df_telemetry::metrics::MetricSetSnapshot; +use otap_df_telemetry::metrics::MetricSet; +#[cfg(target_os = "linux")] use otap_df_telemetry::{otel_info, otel_warn}; use otap_df_telemetry_macros::metric_set; use regex::Regex; use serde::{Deserialize, Serialize}; +#[cfg(target_os = "linux")] use serde_json::Value; use std::collections::HashSet; use std::path::{Component, Path, PathBuf}; use std::sync::Arc; use std::sync::{LazyLock, Mutex}; -use std::time::{Duration, Instant as StdInstant}; +#[cfg(target_os = "linux")] +use std::time::Instant as StdInstant; +use std::time::Duration; +#[cfg(target_os = "linux")] use tokio::time::{Instant, sleep_until}; +#[cfg(target_os = "linux")] mod otap_builder; +#[cfg(target_os = "linux")] mod procfs; +#[cfg(target_os = "linux")] use procfs::{HostSnapshot, 
ProcfsConfig, ProcfsFamilies, ProcfsSource}; /// The URN for the host metrics receiver. @@ -623,23 +646,31 @@ pub struct HostMetricsReceiver { pub static HOST_METRICS_RECEIVER: ReceiverFactory = ReceiverFactory { name: HOST_METRICS_RECEIVER_URN, create: |pipeline: PipelineContext, - node: NodeId, - node_config: Arc, - receiver_config: &ReceiverConfig| { + #[cfg(target_os = "linux")] node: NodeId, + #[cfg(not(target_os = "linux"))] _node: NodeId, + #[cfg(target_os = "linux")] node_config: Arc, + #[cfg(not(target_os = "linux"))] _node_config: Arc, + #[cfg(target_os = "linux")] receiver_config: &ReceiverConfig, + #[cfg(not(target_os = "linux"))] _receiver_config: &ReceiverConfig| { validate_supported_platform()?; if pipeline.num_cores() > 1 { return Err(otap_df_config::error::Error::InvalidUserConfig { error: "host-wide collection must run in a one-core source pipeline; use receiver:host_metrics -> exporter:topic and fan out downstream".to_owned(), }); } - let mut receiver = HostMetricsReceiver::from_config(&node_config.config)?; - receiver.metrics = Some(pipeline.register_metrics::()); - Ok(ReceiverWrapper::local( - receiver, - node, - node_config, - receiver_config, - )) + #[cfg(target_os = "linux")] + { + let mut receiver = HostMetricsReceiver::from_config(&node_config.config)?; + receiver.metrics = Some(pipeline.register_metrics::()); + return Ok(ReceiverWrapper::local( + receiver, + node, + node_config, + receiver_config, + )); + } + #[cfg(not(target_os = "linux"))] + unreachable!("validate_supported_platform returned Ok on a non-Linux platform") }, wiring_contract: otap_df_engine::wiring_contract::WiringContract::UNRESTRICTED, validate_config: |config| { @@ -652,6 +683,7 @@ pub static HOST_METRICS_RECEIVER: ReceiverFactory = ReceiverFactory { }, }; +#[cfg(target_os = "linux")] impl HostMetricsReceiver { /// Creates a new host metrics receiver. 
pub fn new(config: Config) -> Result { @@ -781,14 +813,17 @@ fn validate_family_interval( Ok(()) } +#[cfg(target_os = "linux")] fn duration_nanos(duration: Duration) -> f64 { duration.as_secs_f64() * 1e9 } +#[cfg(target_os = "linux")] fn elapsed_nanos(start: StdInstant) -> f64 { duration_nanos(start.elapsed()) } +#[cfg(target_os = "linux")] fn terminal_state( deadline: StdInstant, metrics: &Option>, @@ -800,6 +835,7 @@ fn terminal_state( } } +#[cfg(target_os = "linux")] fn due_family_count(due: ProcfsFamilies) -> u64 { u64::from(due.cpu) + u64::from(due.memory) @@ -980,6 +1016,7 @@ impl RuntimeFamily { } } +#[cfg(target_os = "linux")] #[derive(Clone, Copy, Debug, Eq, PartialEq)] enum ScheduledFamilyKind { Cpu, @@ -992,16 +1029,19 @@ enum ScheduledFamilyKind { Processes, } +#[cfg(target_os = "linux")] struct ScheduledFamily { kind: ScheduledFamilyKind, interval: Duration, next_due: Instant, } +#[cfg(target_os = "linux")] struct FamilyScheduler { entries: Vec, } +#[cfg(target_os = "linux")] impl FamilyScheduler { fn new(config: &RuntimeConfig, now: Instant) -> Self { let first_due = now + config.initial_delay; @@ -1097,6 +1137,7 @@ impl FamilyScheduler { } } +#[cfg(target_os = "linux")] fn push_scheduled( entries: &mut Vec, kind: ScheduledFamilyKind, @@ -1173,6 +1214,7 @@ fn normalized_root_path(root_path: Option<&Path>) -> Result for HostMetricsReceiver { async fn start( @@ -1337,6 +1379,7 @@ impl local::Receiver for HostMetricsReceiver { } } +#[cfg(target_os = "linux")] fn encode_snapshot(snapshot: HostSnapshot) -> Result { let records = snapshot.into_otap_records()?; Ok(OtapPdata::new(Context::default(), records.into())) @@ -1622,6 +1665,7 @@ mod tests { ); } + #[cfg(target_os = "linux")] #[test] fn scheduler_honors_initial_delay_and_family_intervals() { let config = Config { diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs 
b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs index 2d27821d48..5ef55d1bd8 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs @@ -16,6 +16,7 @@ use otap_df_pdata::otap::{Metrics, OtapArrowRecords}; use otap_df_pdata::otlp::metrics::MetricType; use otap_df_pdata::proto::opentelemetry::arrow::v1::ArrowPayloadType; +#[cfg(target_os = "linux")] use crate::receivers::host_metrics_receiver::procfs::HostResource; /// Semconv version targeted by this receiver's projection layer. @@ -112,6 +113,7 @@ impl HostMetricsArrowBuilder { /// Append resource attributes (host.id, host.name, host.arch, os.type). /// Must be called exactly once per batch before any metrics are appended. + #[cfg(target_os = "linux")] pub(crate) fn append_resource(&mut self, resource: &HostResource) { let mut w = ResourceAttrWriter { attrs: &mut self.resource_attrs, From 363334315a144c839f9691398bd183da48b8e382 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Mon, 4 May 2026 00:23:33 -0700 Subject: [PATCH 37/60] implement CollectTelemetry handler to emit receiver metrics snapshot --- .../core-nodes/src/receivers/host_metrics_receiver/mod.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index a6f3ab48fa..6f58c7e254 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -1284,7 +1284,11 @@ impl local::Receiver for HostMetricsReceiver { msg = ctrl_msg_recv.recv() => { match msg { - Ok(NodeControlMsg::CollectTelemetry { .. 
}) => {} + Ok(NodeControlMsg::CollectTelemetry { mut metrics_reporter }) => { + if let Some(metrics) = metrics.as_mut() { + let _ = metrics_reporter.report(metrics); + } + } Ok(NodeControlMsg::DrainIngress { deadline, .. }) => { otel_info!("host_metrics_receiver.drain_ingress"); effect_handler.notify_receiver_drained().await?; From 2a86b04bd7f5951623120262261e5a13836861f2 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Mon, 4 May 2026 09:44:45 -0700 Subject: [PATCH 38/60] propagate OTAP set errors instead of panicking in finish_batch; replace wrapping_add with checked_add --- .../host_metrics_receiver/otap_builder.rs | 38 +++++++------------ 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs index 5ef55d1bd8..21e2954a71 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs @@ -181,7 +181,8 @@ impl HostMetricsArrowBuilder { self.metrics.append_is_monotonic(None); } } - self.curr_metric_id = self.curr_metric_id.wrapping_add(1); + self.curr_metric_id = self.curr_metric_id.checked_add(1) + .expect("metric_id overflow: more than u16::MAX metrics in one batch"); id } @@ -212,7 +213,8 @@ impl HostMetricsArrowBuilder { dp_id, }; attrs(&mut w); - self.curr_dp_id = self.curr_dp_id.wrapping_add(1); + self.curr_dp_id = self.curr_dp_id.checked_add(1) + .expect("dp_id overflow: more than u32::MAX datapoints in one batch"); } /// Append one f64 data point for `metric_id`. 
@@ -240,7 +242,8 @@ impl HostMetricsArrowBuilder { dp_id, }; attrs(&mut w); - self.curr_dp_id = self.curr_dp_id.wrapping_add(1); + self.curr_dp_id = self.curr_dp_id.checked_add(1) + .expect("dp_id overflow: more than u32::MAX datapoints in one batch"); } // ── Finalization ───────────────────────────────────────────────────────── @@ -266,26 +269,10 @@ impl HostMetricsArrowBuilder { .append_scope_schema_url_n(SEMCONV_SCHEMA_URL, n); let mut records = OtapArrowRecords::Metrics(Metrics::default()); - finish_batch( - &mut records, - ArrowPayloadType::UnivariateMetrics, - self.metrics.finish()?, - ); - finish_batch( - &mut records, - ArrowPayloadType::NumberDataPoints, - self.ndp.finish()?, - ); - finish_batch( - &mut records, - ArrowPayloadType::ResourceAttrs, - self.resource_attrs.finish()?, - ); - finish_batch( - &mut records, - ArrowPayloadType::NumberDpAttrs, - self.ndp_attrs.finish()?, - ); + finish_batch(&mut records, ArrowPayloadType::UnivariateMetrics, self.metrics.finish()?)?; + finish_batch(&mut records, ArrowPayloadType::NumberDataPoints, self.ndp.finish()?)?; + finish_batch(&mut records, ArrowPayloadType::ResourceAttrs, self.resource_attrs.finish()?)?; + finish_batch(&mut records, ArrowPayloadType::NumberDpAttrs, self.ndp_attrs.finish()?)?; Ok(records) } } @@ -294,10 +281,11 @@ fn finish_batch( records: &mut OtapArrowRecords, payload_type: ArrowPayloadType, rb: arrow::array::RecordBatch, -) { +) -> Result<(), ArrowError> { if rb.num_rows() > 0 { records .set(payload_type, rb) - .expect("host metrics record batch schema is valid"); + .map_err(|e| ArrowError::ExternalError(Box::new(e)))?; } + Ok(()) } From bea7421fa6640f32f493cc5a1540fa0c364d6353 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Mon, 4 May 2026 09:51:07 -0700 Subject: [PATCH 39/60] cargo fmt --- .../receivers/host_metrics_receiver/mod.rs | 4 +-- .../host_metrics_receiver/otap_builder.rs | 36 +++++++++++++++---- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git 
a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 6f58c7e254..3491648970 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -32,9 +32,9 @@ use otap_df_otap::OTAP_RECEIVER_FACTORIES; use otap_df_otap::pdata::Context; use otap_df_otap::pdata::OtapPdata; use otap_df_telemetry::instrument::{Counter, Mmsc}; +use otap_df_telemetry::metrics::MetricSet; #[cfg(target_os = "linux")] use otap_df_telemetry::metrics::MetricSetSnapshot; -use otap_df_telemetry::metrics::MetricSet; #[cfg(target_os = "linux")] use otap_df_telemetry::{otel_info, otel_warn}; use otap_df_telemetry_macros::metric_set; @@ -46,9 +46,9 @@ use std::collections::HashSet; use std::path::{Component, Path, PathBuf}; use std::sync::Arc; use std::sync::{LazyLock, Mutex}; +use std::time::Duration; #[cfg(target_os = "linux")] use std::time::Instant as StdInstant; -use std::time::Duration; #[cfg(target_os = "linux")] use tokio::time::{Instant, sleep_until}; diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs index 21e2954a71..143600ff3c 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs @@ -181,7 +181,9 @@ impl HostMetricsArrowBuilder { self.metrics.append_is_monotonic(None); } } - self.curr_metric_id = self.curr_metric_id.checked_add(1) + self.curr_metric_id = self + .curr_metric_id + .checked_add(1) .expect("metric_id overflow: more than u16::MAX metrics in one batch"); id } @@ -213,7 +215,9 @@ impl HostMetricsArrowBuilder { dp_id, }; attrs(&mut w); - self.curr_dp_id = 
self.curr_dp_id.checked_add(1) + self.curr_dp_id = self + .curr_dp_id + .checked_add(1) .expect("dp_id overflow: more than u32::MAX datapoints in one batch"); } @@ -242,7 +246,9 @@ impl HostMetricsArrowBuilder { dp_id, }; attrs(&mut w); - self.curr_dp_id = self.curr_dp_id.checked_add(1) + self.curr_dp_id = self + .curr_dp_id + .checked_add(1) .expect("dp_id overflow: more than u32::MAX datapoints in one batch"); } @@ -269,10 +275,26 @@ impl HostMetricsArrowBuilder { .append_scope_schema_url_n(SEMCONV_SCHEMA_URL, n); let mut records = OtapArrowRecords::Metrics(Metrics::default()); - finish_batch(&mut records, ArrowPayloadType::UnivariateMetrics, self.metrics.finish()?)?; - finish_batch(&mut records, ArrowPayloadType::NumberDataPoints, self.ndp.finish()?)?; - finish_batch(&mut records, ArrowPayloadType::ResourceAttrs, self.resource_attrs.finish()?)?; - finish_batch(&mut records, ArrowPayloadType::NumberDpAttrs, self.ndp_attrs.finish()?)?; + finish_batch( + &mut records, + ArrowPayloadType::UnivariateMetrics, + self.metrics.finish()?, + )?; + finish_batch( + &mut records, + ArrowPayloadType::NumberDataPoints, + self.ndp.finish()?, + )?; + finish_batch( + &mut records, + ArrowPayloadType::ResourceAttrs, + self.resource_attrs.finish()?, + )?; + finish_batch( + &mut records, + ArrowPayloadType::NumberDpAttrs, + self.ndp_attrs.finish()?, + )?; Ok(records) } } From 993fc3d151dedbaabe636c0e29be73066172d1a0 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Mon, 4 May 2026 16:54:49 -0700 Subject: [PATCH 40/60] Centralize host metrics semconv constants --- .../receivers/host_metrics_receiver/mod.rs | 4 + .../host_metrics_receiver/otap_builder.rs | 34 +-- .../receivers/host_metrics_receiver/procfs.rs | 269 +++++++++--------- .../host_metrics_receiver/semconv.rs | 65 +++++ .../docs/host-metrics-receiver.md | 2 +- 5 files changed, 204 insertions(+), 170 deletions(-) create mode 100644 rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/semconv.rs diff --git 
a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 3491648970..3626aa8dd2 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -56,6 +56,8 @@ use tokio::time::{Instant, sleep_until}; mod otap_builder; #[cfg(target_os = "linux")] mod procfs; +#[cfg(target_os = "linux")] +mod semconv; #[cfg(target_os = "linux")] use procfs::{HostSnapshot, ProcfsConfig, ProcfsFamilies, ProcfsSource}; @@ -76,6 +78,8 @@ pub struct HostMetricsReceiverMetrics { /// Number of fatal scrape failures. #[metric(unit = "{scrape}")] pub scrapes_failed: Counter, + // TODO: Decide whether fixed per-family error counters are needed here. + // Metric-level attributes are not supported by the internal telemetry API today. /// Number of source read errors skipped because other families succeeded. #[metric(unit = "{error}")] pub partial_errors: Counter, diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs index 143600ff3c..017ee6f07f 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs @@ -18,31 +18,7 @@ use otap_df_pdata::proto::opentelemetry::arrow::v1::ArrowPayloadType; #[cfg(target_os = "linux")] use crate::receivers::host_metrics_receiver::procfs::HostResource; - -/// Semconv version targeted by this receiver's projection layer. -/// -/// `SEMCONV_SCHEMA_URL` must be kept in sync with this value. 
-pub(crate) const SEMCONV_VERSION: &str = "1.41.0"; -const SEMCONV_SCHEMA_URL: &[u8] = b"https://opentelemetry.io/schemas/1.41.0"; - -const _: () = { - // Enforce that SEMCONV_SCHEMA_URL ends with SEMCONV_VERSION. - let url = SEMCONV_SCHEMA_URL; - let ver = SEMCONV_VERSION.as_bytes(); - assert!( - url.len() >= ver.len(), - "SEMCONV_SCHEMA_URL is shorter than SEMCONV_VERSION" - ); - let suffix = url.split_at(url.len() - ver.len()).1; - let mut i = 0; - while i < ver.len() { - assert!( - suffix[i] == ver[i], - "SEMCONV_SCHEMA_URL suffix does not match SEMCONV_VERSION" - ); - i += 1; - } -}; +use crate::receivers::host_metrics_receiver::semconv; const SCOPE_NAME: &[u8] = b"otap-df-core-nodes/host-metrics"; const SCOPE_VERSION: &[u8] = env!("CARGO_PKG_VERSION").as_bytes(); @@ -87,8 +63,8 @@ impl ResourceAttrWriter<'_> { /// Builds an `OtapArrowRecords::Metrics` batch directly from host metric values. /// -/// Call [`begin_sum_i64`], [`begin_sum_f64`], or [`begin_gauge_f64`] to open a -/// metric, then [`append_i64_dp`] / [`append_f64_dp`] for each data point. +/// Call a `begin_*` method to open a metric, then [`append_i64_dp`] / +/// [`append_f64_dp`] for each data point. /// Call [`finish`] to produce the final batch. pub(crate) struct HostMetricsArrowBuilder { metrics: MetricsRecordBatchBuilder, @@ -261,7 +237,7 @@ impl HostMetricsArrowBuilder { self.metrics.resource.append_id_n(0, n); self.metrics .resource - .append_schema_url_n(Some(SEMCONV_SCHEMA_URL), n); + .append_schema_url_n(Some(semconv::SCHEMA_URL), n); self.metrics .resource .append_dropped_attributes_count_n(0, n); @@ -272,7 +248,7 @@ impl HostMetricsArrowBuilder { self.metrics.scope.append_dropped_attributes_count_n(0, n); // Schema URL on scope column. 
self.metrics - .append_scope_schema_url_n(SEMCONV_SCHEMA_URL, n); + .append_scope_schema_url_n(semconv::SCHEMA_URL, n); let mut records = OtapArrowRecords::Metrics(Metrics::default()); finish_batch( diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 433827dc45..f277474cf0 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -3,6 +3,7 @@ //! Linux procfs-backed host metric source. +use crate::receivers::host_metrics_receiver::semconv::metric; use crate::receivers::host_metrics_receiver::{CompiledFilter, HostViewValidationMode}; use std::collections::{HashMap, HashSet}; use std::fs::File; @@ -640,7 +641,7 @@ fn project_snapshot( // ── CPU ────────────────────────────────────────────────────────────────── if let Some(cpu) = snap.cpu { - let m = b.begin_counter_f64("system.cpu.time", "s"); + let m = b.begin_counter_f64(metric::CPU_TIME, "s"); for (mode, value) in [ ("user", cpu.user), ("nice", cpu.nice), @@ -650,13 +651,13 @@ fn project_snapshot( ("interrupt", cpu.interrupt), ("steal", cpu.steal), ] { - b.append_f64_dp(m, cs.get("system.cpu.time", mode, start), now, value, |w| { + b.append_f64_dp(m, cs.get(metric::CPU_TIME, mode, start), now, value, |w| { w.str("cpu.mode", mode); }); } } if let Some(cpu) = snap.cpu_utilization { - let m = b.begin_gauge_f64("system.cpu.utilization", "1"); + let m = b.begin_gauge_f64(metric::CPU_UTILIZATION, "1"); for (mode, value) in [ ("user", cpu.user), ("nice", cpu.nice), @@ -672,7 +673,7 @@ fn project_snapshot( } } if snap.cpuinfo.logical_count != 0 { - let m = b.begin_updown_i64("system.cpu.logical.count", "{cpu}"); + let m = b.begin_updown_i64(metric::CPU_LOGICAL_COUNT, "{cpu}"); b.append_i64_dp( m, start, @@ -682,7 +683,7 @@ fn project_snapshot( ); } if 
snap.cpuinfo.physical_count != 0 { - let m = b.begin_updown_i64("system.cpu.physical.count", "{cpu}"); + let m = b.begin_updown_i64(metric::CPU_PHYSICAL_COUNT, "{cpu}"); b.append_i64_dp( m, start, @@ -692,7 +693,7 @@ fn project_snapshot( ); } if !snap.cpuinfo.frequencies_hz.is_empty() { - let m = b.begin_gauge_i64("system.cpu.frequency", "Hz"); + let m = b.begin_gauge_i64(metric::CPU_FREQUENCY, "Hz"); for (idx, &freq) in snap.cpuinfo.frequencies_hz.iter().enumerate() { let logical = i64::try_from(idx).unwrap_or(i64::MAX); b.append_i64_dp(m, 0, now, frequency_hz_i64(freq), |w| { @@ -703,7 +704,7 @@ fn project_snapshot( // ── Memory ─────────────────────────────────────────────────────────────── if let Some(memory) = snap.memory { - let m = b.begin_updown_i64("system.memory.usage", "By"); + let m = b.begin_updown_i64(metric::MEMORY_USAGE, "By"); for (state, value) in [ ("used", memory.used), ("free", memory.free), @@ -715,7 +716,7 @@ fn project_snapshot( }); } if memory.total > 0 { - let m = b.begin_gauge_f64("system.memory.utilization", "1"); + let m = b.begin_gauge_f64(metric::MEMORY_UTILIZATION, "1"); let total = memory.total as f64; for (state, value) in [ ("used", memory.used), @@ -729,10 +730,10 @@ fn project_snapshot( } } if memory.has_available { - let m = b.begin_updown_i64("system.memory.linux.available", "By"); + let m = b.begin_updown_i64(metric::MEMORY_LINUX_AVAILABLE, "By"); b.append_i64_dp(m, start, now, saturating_i64(memory.available), |_| {}); } - let m = b.begin_updown_i64("system.memory.linux.slab.usage", "By"); + let m = b.begin_updown_i64(metric::MEMORY_LINUX_SLAB_USAGE, "By"); for (state, value) in [ ("reclaimable", memory.slab_reclaimable), ("unreclaimable", memory.slab_unreclaimable), @@ -742,11 +743,11 @@ fn project_snapshot( }); } if snap.memory_limit { - let m = b.begin_updown_i64("system.memory.limit", "By"); + let m = b.begin_updown_i64(metric::MEMORY_LIMIT, "By"); b.append_i64_dp(m, start, now, saturating_i64(memory.total), |_| {}); } 
if snap.memory_shared { - let m = b.begin_updown_i64("system.memory.linux.shared", "By"); + let m = b.begin_updown_i64(metric::MEMORY_LINUX_SHARED, "By"); b.append_i64_dp(m, start, now, saturating_i64(memory.shared), |_| {}); } if snap.memory_hugepages { @@ -756,20 +757,20 @@ fn project_snapshot( // ── System / uptime ────────────────────────────────────────────────────── if let Some(uptime) = snap.uptime_seconds { - let m = b.begin_gauge_f64("system.uptime", "s"); + let m = b.begin_gauge_f64(metric::UPTIME, "s"); b.append_f64_dp(m, 0, now, uptime, |_| {}); } // ── Paging ─────────────────────────────────────────────────────────────── if let Some(paging) = snap.paging { - let m = b.begin_counter_i64("system.paging.faults", "{fault}"); + let m = b.begin_counter_i64(metric::PAGING_FAULTS, "{fault}"); for (fault_type, value) in [ ("minor", paging.minor_faults), ("major", paging.major_faults), ] { b.append_i64_dp( m, - cs.get("system.paging.faults", fault_type, start), + cs.get(metric::PAGING_FAULTS, fault_type, start), now, saturating_i64(value), |w| { @@ -777,7 +778,7 @@ fn project_snapshot( }, ); } - let m = b.begin_counter_i64("system.paging.operations", "{operation}"); + let m = b.begin_counter_i64(metric::PAGING_OPERATIONS, "{operation}"); for (direction, fault_type, value) in [ ("in", "major", paging.swap_in), ("out", "major", paging.swap_out), @@ -786,7 +787,7 @@ fn project_snapshot( ] { b.append_i64_dp( m, - cs.get_joined("system.paging.operations", direction, fault_type, start), + cs.get_joined(metric::PAGING_OPERATIONS, direction, fault_type, start), now, saturating_i64(value), |w| { @@ -797,7 +798,7 @@ fn project_snapshot( } } for swap in &snap.swaps { - let m = b.begin_updown_i64("system.paging.usage", "By"); + let m = b.begin_updown_i64(metric::PAGING_USAGE, "By"); for (state, value) in [("used", swap.used), ("free", swap.free)] { b.append_i64_dp(m, start, now, saturating_i64(value), |w| { w.str("system.device", &swap.name); @@ -806,7 +807,7 @@ fn 
project_snapshot( } let size = swap.size; if size > 0 { - let m = b.begin_gauge_f64("system.paging.utilization", "1"); + let m = b.begin_gauge_f64(metric::PAGING_UTILIZATION, "1"); let total = size as f64; for (state, value) in [("used", swap.used), ("free", swap.free)] { b.append_f64_dp(m, 0, now, value as f64 / total, |w| { @@ -819,7 +820,7 @@ fn project_snapshot( // ── Processes ──────────────────────────────────────────────────────────── if let Some(processes) = snap.processes { - let m = b.begin_updown_i64("system.process.count", "{process}"); + let m = b.begin_updown_i64(metric::PROCESS_COUNT, "{process}"); for (state, value) in [ ("running", processes.running), ("blocked", processes.blocked), @@ -828,10 +829,10 @@ fn project_snapshot( w.str("process.state", state); }); } - let m = b.begin_counter_i64("system.process.created", "{process}"); + let m = b.begin_counter_i64(metric::PROCESS_CREATED, "{process}"); b.append_i64_dp( m, - cs.get("system.process.created", "", start), + cs.get(metric::PROCESS_CREATED, "", start), now, saturating_i64(processes.created), |_| {}, @@ -841,16 +842,16 @@ fn project_snapshot( // ── Disk ───────────────────────────────────────────────────────────────── for disk in &snap.disks { if let Some(limit_bytes) = disk.limit_bytes { - let m = b.begin_updown_i64("system.disk.limit", "By"); + let m = b.begin_updown_i64(metric::DISK_LIMIT, "By"); b.append_i64_dp(m, start, now, saturating_i64(limit_bytes), |w| { w.str("system.device", &disk.name); }); } - let m = b.begin_counter_i64("system.disk.io", "By"); + let m = b.begin_counter_i64(metric::DISK_IO, "By"); for (dir, value) in [("read", disk.read_bytes), ("write", disk.write_bytes)] { b.append_i64_dp( m, - cs.get_joined("system.disk.io", &disk.name, dir, start), + cs.get_joined(metric::DISK_IO, &disk.name, dir, start), now, saturating_i64(value), |w| { @@ -859,11 +860,11 @@ fn project_snapshot( }, ); } - let m = b.begin_counter_i64("system.disk.operations", "{operation}"); + let m = 
b.begin_counter_i64(metric::DISK_OPERATIONS, "{operation}"); for (dir, value) in [("read", disk.read_ops), ("write", disk.write_ops)] { b.append_i64_dp( m, - cs.get_joined("system.disk.operations", &disk.name, dir, start), + cs.get_joined(metric::DISK_OPERATIONS, &disk.name, dir, start), now, saturating_i64(value), |w| { @@ -872,24 +873,24 @@ fn project_snapshot( }, ); } - let m = b.begin_counter_f64("system.disk.io_time", "s"); + let m = b.begin_counter_f64(metric::DISK_IO_TIME, "s"); b.append_f64_dp( m, - cs.get("system.disk.io_time", &disk.name, start), + cs.get(metric::DISK_IO_TIME, &disk.name, start), now, disk.io_time_seconds, |w| { w.str("system.device", &disk.name); }, ); - let m = b.begin_counter_f64("system.disk.operation_time", "s"); + let m = b.begin_counter_f64(metric::DISK_OPERATION_TIME, "s"); for (dir, value) in [ ("read", disk.read_time_seconds), ("write", disk.write_time_seconds), ] { b.append_f64_dp( m, - cs.get_joined("system.disk.operation_time", &disk.name, dir, start), + cs.get_joined(metric::DISK_OPERATION_TIME, &disk.name, dir, start), now, value, |w| { @@ -898,11 +899,11 @@ fn project_snapshot( }, ); } - let m = b.begin_counter_i64("system.disk.merged", "{operation}"); + let m = b.begin_counter_i64(metric::DISK_MERGED, "{operation}"); for (dir, value) in [("read", disk.read_merged), ("write", disk.write_merged)] { b.append_i64_dp( m, - cs.get_joined("system.disk.merged", &disk.name, dir, start), + cs.get_joined(metric::DISK_MERGED, &disk.name, dir, start), now, saturating_i64(value), |w| { @@ -916,7 +917,7 @@ fn project_snapshot( // ── Filesystem ─────────────────────────────────────────────────────────── for fs in &snap.filesystems { let total = fs.used.saturating_add(fs.free).saturating_add(fs.reserved); - let m = b.begin_updown_i64("system.filesystem.usage", "By"); + let m = b.begin_updown_i64(metric::FILESYSTEM_USAGE, "By"); for (state, value) in [ ("used", fs.used), ("free", fs.free), @@ -931,7 +932,7 @@ fn project_snapshot( }); } if 
total > 0 { - let m = b.begin_gauge_f64("system.filesystem.utilization", "1"); + let m = b.begin_gauge_f64(metric::FILESYSTEM_UTILIZATION, "1"); let total_f = total as f64; for (state, value) in [ ("used", fs.used), @@ -948,7 +949,7 @@ fn project_snapshot( } } if let Some(limit_bytes) = fs.limit_bytes { - let m = b.begin_updown_i64("system.filesystem.limit", "By"); + let m = b.begin_updown_i64(metric::FILESYSTEM_LIMIT, "By"); b.append_i64_dp(m, start, now, saturating_i64(limit_bytes), |w| { w.str("system.device", &fs.device); w.str("system.filesystem.type", &fs.fs_type); @@ -960,14 +961,14 @@ fn project_snapshot( // ── Network ────────────────────────────────────────────────────────────── for net in &snap.networks { - let m = b.begin_counter_i64("system.network.io", "By"); + let m = b.begin_counter_i64(metric::NETWORK_IO, "By"); for (dir, iface_attr, value) in [ ("receive", "network.interface.name", net.rx_bytes), ("transmit", "network.interface.name", net.tx_bytes), ] { b.append_i64_dp( m, - cs.get_joined("system.network.io", &net.name, dir, start), + cs.get_joined(metric::NETWORK_IO, &net.name, dir, start), now, saturating_i64(value), |w| { @@ -976,11 +977,11 @@ fn project_snapshot( }, ); } - let m = b.begin_counter_i64("system.network.packet.count", "{packet}"); + let m = b.begin_counter_i64(metric::NETWORK_PACKET_COUNT, "{packet}"); for (dir, value) in [("receive", net.rx_packets), ("transmit", net.tx_packets)] { b.append_i64_dp( m, - cs.get_joined("system.network.packet.count", &net.name, dir, start), + cs.get_joined(metric::NETWORK_PACKET_COUNT, &net.name, dir, start), now, saturating_i64(value), |w| { @@ -989,11 +990,11 @@ fn project_snapshot( }, ); } - let m = b.begin_counter_i64("system.network.packet.dropped", "{packet}"); + let m = b.begin_counter_i64(metric::NETWORK_PACKET_DROPPED, "{packet}"); for (dir, value) in [("receive", net.rx_dropped), ("transmit", net.tx_dropped)] { b.append_i64_dp( m, - cs.get_joined("system.network.packet.dropped", &net.name, 
dir, start), + cs.get_joined(metric::NETWORK_PACKET_DROPPED, &net.name, dir, start), now, saturating_i64(value), |w| { @@ -1002,11 +1003,11 @@ fn project_snapshot( }, ); } - let m = b.begin_counter_i64("system.network.errors", "{error}"); + let m = b.begin_counter_i64(metric::NETWORK_ERRORS, "{error}"); for (dir, value) in [("receive", net.rx_errors), ("transmit", net.tx_errors)] { b.append_i64_dp( m, - cs.get_joined("system.network.errors", &net.name, dir, start), + cs.get_joined(metric::NETWORK_ERRORS, &net.name, dir, start), now, saturating_i64(value), |w| { @@ -1025,9 +1026,9 @@ fn project_hugepages( now: u64, hugepages: &HugepageStats, ) { - let m = b.begin_updown_i64("system.memory.linux.hugepages.limit", "{page}"); + let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_LIMIT, "{page}"); b.append_i64_dp(m, start, now, saturating_i64(hugepages.total), |_| {}); - let m = b.begin_updown_i64("system.memory.linux.hugepages.page_size", "By"); + let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_PAGE_SIZE, "By"); b.append_i64_dp( m, start, @@ -1035,12 +1036,12 @@ fn project_hugepages( saturating_i64(hugepages.page_size_bytes), |_| {}, ); - let m = b.begin_updown_i64("system.memory.linux.hugepages.reserved", "{page}"); + let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_RESERVED, "{page}"); b.append_i64_dp(m, start, now, saturating_i64(hugepages.reserved), |_| {}); - let m = b.begin_updown_i64("system.memory.linux.hugepages.surplus", "{page}"); + let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_SURPLUS, "{page}"); b.append_i64_dp(m, start, now, saturating_i64(hugepages.surplus), |_| {}); let used = hugepages.total.saturating_sub(hugepages.free); - let m = b.begin_updown_i64("system.memory.linux.hugepages.usage", "{page}"); + let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_USAGE, "{page}"); for (state, value) in [("used", used), ("free", hugepages.free)] { b.append_i64_dp(m, start, now, saturating_i64(value), |w| { 
w.str("system.memory.linux.hugepages.state", state); @@ -1048,7 +1049,7 @@ fn project_hugepages( } if hugepages.total > 0 { let total = hugepages.total as f64; - let m = b.begin_gauge_f64("system.memory.linux.hugepages.utilization", "1"); + let m = b.begin_gauge_f64(metric::MEMORY_LINUX_HUGEPAGES_UTILIZATION, "1"); for (state, value) in [("used", used), ("free", hugepages.free)] { b.append_f64_dp(m, 0, now, value as f64 / total, |w| { w.str("system.memory.linux.hugepages.state", state); @@ -1111,7 +1112,7 @@ impl CounterTracker { let mut starts = CounterStarts::default(); if let Some(cpu) = cpu { self.observe_all( - "system.cpu.time", + metric::CPU_TIME, default_start, now, &[ @@ -1128,7 +1129,7 @@ impl CounterTracker { } if let Some(paging) = paging { self.observe_all( - "system.paging.faults", + metric::PAGING_FAULTS, default_start, now, &[ @@ -1138,7 +1139,7 @@ impl CounterTracker { &mut starts, ); self.observe_all( - "system.paging.operations", + metric::PAGING_OPERATIONS, default_start, now, &[ @@ -1152,7 +1153,7 @@ impl CounterTracker { } if let Some(processes) = processes { self.observe( - "system.process.created", + metric::PROCESS_CREATED, "", processes.created as f64, default_start, @@ -1162,7 +1163,7 @@ impl CounterTracker { } for disk in disks { self.observe_disk_all( - "system.disk.io", + metric::DISK_IO, default_start, now, &disk.name, @@ -1173,7 +1174,7 @@ impl CounterTracker { &mut starts, ); self.observe_disk_all( - "system.disk.operations", + metric::DISK_OPERATIONS, default_start, now, &disk.name, @@ -1184,7 +1185,7 @@ impl CounterTracker { &mut starts, ); self.observe( - "system.disk.io_time", + metric::DISK_IO_TIME, &disk.name, disk.io_time_seconds, default_start, @@ -1192,7 +1193,7 @@ impl CounterTracker { &mut starts, ); self.observe_disk_all( - "system.disk.operation_time", + metric::DISK_OPERATION_TIME, default_start, now, &disk.name, @@ -1203,7 +1204,7 @@ impl CounterTracker { &mut starts, ); self.observe_disk_all( - "system.disk.merged", 
+ metric::DISK_MERGED, default_start, now, &disk.name, @@ -1216,7 +1217,7 @@ impl CounterTracker { } for network in networks { self.observe_network( - "system.network.io", + metric::NETWORK_IO, default_start, now, network, @@ -1225,7 +1226,7 @@ impl CounterTracker { &mut starts, ); self.observe_network( - "system.network.packet.count", + metric::NETWORK_PACKET_COUNT, default_start, now, network, @@ -1234,7 +1235,7 @@ impl CounterTracker { &mut starts, ); self.observe_network( - "system.network.packet.dropped", + metric::NETWORK_PACKET_DROPPED, default_start, now, network, @@ -1243,7 +1244,7 @@ impl CounterTracker { &mut starts, ); self.observe_network( - "system.network.errors", + metric::NETWORK_ERRORS, default_start, now, network, @@ -2191,157 +2192,148 @@ mod tests { assert_has_attr(&resource.attributes, "host.arch", "amd64"); let metrics = &resource_metrics.scope_metrics[0].metrics; - assert_metric_shape(metrics, "system.cpu.time", "s", Some(true)); - assert_first_point_attr(metrics, "system.cpu.time", "cpu.mode", "user"); - assert_sum_point_attr(metrics, "system.cpu.time", "cpu.mode", "iowait"); - assert_metric_shape(metrics, "system.cpu.utilization", "1", None); - assert_first_point_attr(metrics, "system.cpu.utilization", "cpu.mode", "user"); - assert_metric_shape(metrics, "system.cpu.logical.count", "{cpu}", Some(false)); - assert_metric_shape(metrics, "system.cpu.physical.count", "{cpu}", Some(false)); - assert_metric_shape(metrics, "system.cpu.frequency", "Hz", None); - assert_first_point_int(metrics, "system.cpu.frequency", 2_400_000_000); - assert_first_point_attr_int(metrics, "system.cpu.frequency", "cpu.logical_number", 0); - assert_metric_shape(metrics, "system.memory.usage", "By", Some(false)); - assert_first_point_attr( - metrics, - "system.memory.usage", - "system.memory.state", - "used", - ); - assert_metric_shape(metrics, "system.memory.utilization", "1", None); - assert_metric_shape(metrics, "system.memory.linux.available", "By", Some(false)); - 
assert_metric_shape(metrics, "system.memory.linux.slab.usage", "By", Some(false)); - assert_metric_shape(metrics, "system.memory.limit", "By", Some(false)); - assert_metric_shape(metrics, "system.memory.linux.shared", "By", Some(false)); + assert_metric_shape(metrics, metric::CPU_TIME, "s", Some(true)); + assert_first_point_attr(metrics, metric::CPU_TIME, "cpu.mode", "user"); + assert_sum_point_attr(metrics, metric::CPU_TIME, "cpu.mode", "iowait"); + assert_metric_shape(metrics, metric::CPU_UTILIZATION, "1", None); + assert_first_point_attr(metrics, metric::CPU_UTILIZATION, "cpu.mode", "user"); + assert_metric_shape(metrics, metric::CPU_LOGICAL_COUNT, "{cpu}", Some(false)); + assert_metric_shape(metrics, metric::CPU_PHYSICAL_COUNT, "{cpu}", Some(false)); + assert_metric_shape(metrics, metric::CPU_FREQUENCY, "Hz", None); + assert_first_point_int(metrics, metric::CPU_FREQUENCY, 2_400_000_000); + assert_first_point_attr_int(metrics, metric::CPU_FREQUENCY, "cpu.logical_number", 0); + assert_metric_shape(metrics, metric::MEMORY_USAGE, "By", Some(false)); + assert_first_point_attr(metrics, metric::MEMORY_USAGE, "system.memory.state", "used"); + assert_metric_shape(metrics, metric::MEMORY_UTILIZATION, "1", None); + assert_metric_shape(metrics, metric::MEMORY_LINUX_AVAILABLE, "By", Some(false)); + assert_metric_shape(metrics, metric::MEMORY_LINUX_SLAB_USAGE, "By", Some(false)); + assert_metric_shape(metrics, metric::MEMORY_LIMIT, "By", Some(false)); + assert_metric_shape(metrics, metric::MEMORY_LINUX_SHARED, "By", Some(false)); assert_metric_shape( metrics, - "system.memory.linux.hugepages.limit", + metric::MEMORY_LINUX_HUGEPAGES_LIMIT, "{page}", Some(false), ); assert_metric_shape( metrics, - "system.memory.linux.hugepages.page_size", + metric::MEMORY_LINUX_HUGEPAGES_PAGE_SIZE, "By", Some(false), ); assert_metric_shape( metrics, - "system.memory.linux.hugepages.reserved", + metric::MEMORY_LINUX_HUGEPAGES_RESERVED, "{page}", Some(false), ); assert_metric_shape( metrics, - 
"system.memory.linux.hugepages.surplus", + metric::MEMORY_LINUX_HUGEPAGES_SURPLUS, "{page}", Some(false), ); assert_metric_shape( metrics, - "system.memory.linux.hugepages.usage", + metric::MEMORY_LINUX_HUGEPAGES_USAGE, "{page}", Some(false), ); assert_first_point_attr( metrics, - "system.memory.linux.hugepages.usage", + metric::MEMORY_LINUX_HUGEPAGES_USAGE, "system.memory.linux.hugepages.state", "used", ); assert_metric_shape( metrics, - "system.memory.linux.hugepages.utilization", + metric::MEMORY_LINUX_HUGEPAGES_UTILIZATION, "1", None, ); - assert_metric_shape(metrics, "system.uptime", "s", None); - assert_metric_shape(metrics, "system.paging.faults", "{fault}", Some(true)); + assert_metric_shape(metrics, metric::UPTIME, "s", None); + assert_metric_shape(metrics, metric::PAGING_FAULTS, "{fault}", Some(true)); assert_first_point_attr( metrics, - "system.paging.faults", + metric::PAGING_FAULTS, "system.paging.fault.type", "minor", ); assert_metric_shape( metrics, - "system.paging.operations", + metric::PAGING_OPERATIONS, "{operation}", Some(true), ); assert_sum_point_attr( metrics, - "system.paging.operations", + metric::PAGING_OPERATIONS, "system.paging.direction", "in", ); assert_sum_point_attr( metrics, - "system.paging.operations", + metric::PAGING_OPERATIONS, "system.paging.fault.type", "minor", ); - assert_metric_shape(metrics, "system.paging.usage", "By", Some(false)); - assert_first_point_attr(metrics, "system.paging.usage", "system.device", "/dev/swap"); - assert_metric_shape(metrics, "system.paging.utilization", "1", None); - assert_metric_shape(metrics, "system.process.count", "{process}", Some(false)); - assert_sum_point_attr(metrics, "system.process.count", "process.state", "blocked"); - assert_metric_shape(metrics, "system.process.created", "{process}", Some(true)); - assert_metric_shape(metrics, "system.disk.io", "By", Some(true)); - assert_first_point_attr(metrics, "system.disk.io", "disk.io.direction", "read"); - assert_metric_shape(metrics, 
"system.disk.operations", "{operation}", Some(true)); - assert_metric_shape(metrics, "system.disk.io_time", "s", Some(true)); - assert_first_point_attr(metrics, "system.disk.io_time", "system.device", "sda"); - assert_metric_shape(metrics, "system.disk.operation_time", "s", Some(true)); - assert_metric_shape(metrics, "system.disk.merged", "{operation}", Some(true)); - assert_metric_shape(metrics, "system.disk.limit", "By", Some(false)); - assert_first_point_attr(metrics, "system.disk.limit", "system.device", "sda"); - assert_metric_shape(metrics, "system.filesystem.usage", "By", Some(false)); + assert_metric_shape(metrics, metric::PAGING_USAGE, "By", Some(false)); + assert_first_point_attr(metrics, metric::PAGING_USAGE, "system.device", "/dev/swap"); + assert_metric_shape(metrics, metric::PAGING_UTILIZATION, "1", None); + assert_metric_shape(metrics, metric::PROCESS_COUNT, "{process}", Some(false)); + assert_sum_point_attr(metrics, metric::PROCESS_COUNT, "process.state", "blocked"); + assert_metric_shape(metrics, metric::PROCESS_CREATED, "{process}", Some(true)); + assert_metric_shape(metrics, metric::DISK_IO, "By", Some(true)); + assert_first_point_attr(metrics, metric::DISK_IO, "disk.io.direction", "read"); + assert_metric_shape(metrics, metric::DISK_OPERATIONS, "{operation}", Some(true)); + assert_metric_shape(metrics, metric::DISK_IO_TIME, "s", Some(true)); + assert_first_point_attr(metrics, metric::DISK_IO_TIME, "system.device", "sda"); + assert_metric_shape(metrics, metric::DISK_OPERATION_TIME, "s", Some(true)); + assert_metric_shape(metrics, metric::DISK_MERGED, "{operation}", Some(true)); + assert_metric_shape(metrics, metric::DISK_LIMIT, "By", Some(false)); + assert_first_point_attr(metrics, metric::DISK_LIMIT, "system.device", "sda"); + assert_metric_shape(metrics, metric::FILESYSTEM_USAGE, "By", Some(false)); assert_first_point_attr( metrics, - "system.filesystem.usage", + metric::FILESYSTEM_USAGE, "system.filesystem.state", "used", ); - 
assert_metric_shape(metrics, "system.filesystem.utilization", "1", None); - assert_metric_shape(metrics, "system.filesystem.limit", "By", Some(false)); - assert_no_first_point_attr( - metrics, - "system.filesystem.limit", - "system.filesystem.state", - ); - assert_metric_shape(metrics, "system.network.io", "By", Some(true)); + assert_metric_shape(metrics, metric::FILESYSTEM_UTILIZATION, "1", None); + assert_metric_shape(metrics, metric::FILESYSTEM_LIMIT, "By", Some(false)); + assert_no_first_point_attr(metrics, metric::FILESYSTEM_LIMIT, "system.filesystem.state"); + assert_metric_shape(metrics, metric::NETWORK_IO, "By", Some(true)); assert_first_point_attr( metrics, - "system.network.io", + metric::NETWORK_IO, "network.interface.name", "eth0", ); assert_metric_shape( metrics, - "system.network.packet.count", + metric::NETWORK_PACKET_COUNT, "{packet}", Some(true), ); assert_first_point_attr( metrics, - "system.network.packet.count", + metric::NETWORK_PACKET_COUNT, "system.device", "eth0", ); assert_metric_shape( metrics, - "system.network.packet.dropped", + metric::NETWORK_PACKET_DROPPED, "{packet}", Some(true), ); assert_first_point_attr( metrics, - "system.network.packet.dropped", + metric::NETWORK_PACKET_DROPPED, "network.interface.name", "eth0", ); - assert_metric_shape(metrics, "system.network.errors", "{error}", Some(true)); + assert_metric_shape(metrics, metric::NETWORK_ERRORS, "{error}", Some(true)); } #[cfg(feature = "dev-tools")] @@ -2412,7 +2404,7 @@ mod tests { now_unix_nano: 2_000, start_time_unix_nano: 1_000, counter_starts: CounterStarts { - entries: vec![(counter_key("system.process.created", ""), 1_500)], + entries: vec![(counter_key(metric::PROCESS_CREATED, ""), 1_500)], }, processes: Some(ProcessStats { created: 99, @@ -2425,7 +2417,7 @@ mod tests { ); let metrics = &data.resource_metrics[0].scope_metrics[0].metrics; - assert_first_sum_point_start(metrics, "system.process.created", 1_500); + assert_first_sum_point_start(metrics, 
metric::PROCESS_CREATED, 1_500); } #[test] @@ -2439,8 +2431,8 @@ mod tests { }]; let starts = tracker.snapshot(10, 20, None, None, None, &disks, &[]); - assert_eq!(starts.get_joined("system.disk.io", "sda", "read", 10), 10); - assert_eq!(starts.get_joined("system.disk.io", "sda", "write", 10), 10); + assert_eq!(starts.get_joined(metric::DISK_IO, "sda", "read", 10), 10); + assert_eq!(starts.get_joined(metric::DISK_IO, "sda", "write", 10), 10); let disks = vec![DiskStats { name: "sda".to_owned(), @@ -2450,13 +2442,13 @@ mod tests { }]; let starts = tracker.snapshot(10, 30, None, None, None, &disks, &[]); - assert_eq!(starts.get_joined("system.disk.io", "sda", "read", 10), 30); - assert_eq!(starts.get_joined("system.disk.io", "sda", "write", 10), 10); + assert_eq!(starts.get_joined(metric::DISK_IO, "sda", "read", 10), 30); + assert_eq!(starts.get_joined(metric::DISK_IO, "sda", "write", 10), 10); } #[test] fn counter_keys_do_not_collide_with_pipe_in_series_values() { - let metric = "system.disk.io"; + let metric = metric::DISK_IO; let device = "read|write"; let joined = counter_key_joined(metric, device, "read"); assert!(!counter_key_matches_joined( @@ -3156,10 +3148,7 @@ mod tests { #[cfg(feature = "dev-tools")] fn is_intentional_semconv_enum_value_gap(name: &str, attr: &str, value: &str) -> bool { - matches!( - (name, attr, value), - ("system.process.count", "process.state", "blocked") - ) + name == metric::PROCESS_COUNT && attr == "process.state" && value == "blocked" } #[cfg(feature = "dev-tools")] diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/semconv.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/semconv.rs new file mode 100644 index 0000000000..f190856cea --- /dev/null +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/semconv.rs @@ -0,0 +1,65 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! 
Semantic convention constants used by the host metrics receiver. + +/// Semconv version targeted by this receiver's projection layer. +pub(crate) const VERSION: &str = "1.41.0"; + +/// Schema URL emitted with host metric batches. +pub(crate) const SCHEMA_URL: &[u8] = b"https://opentelemetry.io/schemas/1.41.0"; + +const _: () = { + let url = SCHEMA_URL; + let ver = VERSION.as_bytes(); + assert!(url.len() >= ver.len(), "SCHEMA_URL is shorter than VERSION"); + let suffix = url.split_at(url.len() - ver.len()).1; + let mut i = 0; + while i < ver.len() { + assert!(suffix[i] == ver[i], "SCHEMA_URL suffix must match VERSION"); + i += 1; + } +}; + +pub(crate) mod metric { + pub(crate) const CPU_FREQUENCY: &str = "system.cpu.frequency"; + pub(crate) const CPU_LOGICAL_COUNT: &str = "system.cpu.logical.count"; + pub(crate) const CPU_PHYSICAL_COUNT: &str = "system.cpu.physical.count"; + pub(crate) const CPU_TIME: &str = "system.cpu.time"; + pub(crate) const CPU_UTILIZATION: &str = "system.cpu.utilization"; + pub(crate) const DISK_IO: &str = "system.disk.io"; + pub(crate) const DISK_IO_TIME: &str = "system.disk.io_time"; + pub(crate) const DISK_LIMIT: &str = "system.disk.limit"; + pub(crate) const DISK_MERGED: &str = "system.disk.merged"; + pub(crate) const DISK_OPERATION_TIME: &str = "system.disk.operation_time"; + pub(crate) const DISK_OPERATIONS: &str = "system.disk.operations"; + pub(crate) const FILESYSTEM_LIMIT: &str = "system.filesystem.limit"; + pub(crate) const FILESYSTEM_USAGE: &str = "system.filesystem.usage"; + pub(crate) const FILESYSTEM_UTILIZATION: &str = "system.filesystem.utilization"; + pub(crate) const MEMORY_LIMIT: &str = "system.memory.limit"; + pub(crate) const MEMORY_LINUX_AVAILABLE: &str = "system.memory.linux.available"; + pub(crate) const MEMORY_LINUX_HUGEPAGES_LIMIT: &str = "system.memory.linux.hugepages.limit"; + pub(crate) const MEMORY_LINUX_HUGEPAGES_PAGE_SIZE: &str = + "system.memory.linux.hugepages.page_size"; + pub(crate) const 
MEMORY_LINUX_HUGEPAGES_RESERVED: &str = + "system.memory.linux.hugepages.reserved"; + pub(crate) const MEMORY_LINUX_HUGEPAGES_SURPLUS: &str = "system.memory.linux.hugepages.surplus"; + pub(crate) const MEMORY_LINUX_HUGEPAGES_USAGE: &str = "system.memory.linux.hugepages.usage"; + pub(crate) const MEMORY_LINUX_HUGEPAGES_UTILIZATION: &str = + "system.memory.linux.hugepages.utilization"; + pub(crate) const MEMORY_LINUX_SHARED: &str = "system.memory.linux.shared"; + pub(crate) const MEMORY_LINUX_SLAB_USAGE: &str = "system.memory.linux.slab.usage"; + pub(crate) const MEMORY_USAGE: &str = "system.memory.usage"; + pub(crate) const MEMORY_UTILIZATION: &str = "system.memory.utilization"; + pub(crate) const NETWORK_ERRORS: &str = "system.network.errors"; + pub(crate) const NETWORK_IO: &str = "system.network.io"; + pub(crate) const NETWORK_PACKET_COUNT: &str = "system.network.packet.count"; + pub(crate) const NETWORK_PACKET_DROPPED: &str = "system.network.packet.dropped"; + pub(crate) const PAGING_FAULTS: &str = "system.paging.faults"; + pub(crate) const PAGING_OPERATIONS: &str = "system.paging.operations"; + pub(crate) const PAGING_USAGE: &str = "system.paging.usage"; + pub(crate) const PAGING_UTILIZATION: &str = "system.paging.utilization"; + pub(crate) const PROCESS_COUNT: &str = "system.process.count"; + pub(crate) const PROCESS_CREATED: &str = "system.process.created"; + pub(crate) const UPTIME: &str = "system.uptime"; +} diff --git a/rust/otap-dataflow/docs/host-metrics-receiver.md b/rust/otap-dataflow/docs/host-metrics-receiver.md index 640ed98a4b..08c38875d5 100644 --- a/rust/otap-dataflow/docs/host-metrics-receiver.md +++ b/rust/otap-dataflow/docs/host-metrics-receiver.md @@ -534,7 +534,7 @@ timestamp. | Linux hugepage metrics | No | Mixed | Mixed | Use current `system.memory.linux.hugepages.*` registry definitions. | | `system.paging.usage` | Yes | UpDownCounter | `By` | `system.paging.state`, `system.device`; use `/proc/swaps` for swap device identity. 
| | `system.paging.utilization` | Yes | Gauge | `1` | `system.paging.state`, `system.device`; use `/proc/swaps` for swap device identity. | -| `system.paging.operations` | Yes | Counter | `{operation}` | `system.paging.direction` from `pswpin` and `pswpout`; intentionally omit `system.paging.fault.type` because Linux swap-in/out counters are not broken down by fault type. | +| `system.paging.operations` | Yes | Counter | `{operation}` | `system.paging.direction`, `system.paging.fault.type`; follow the current registry shape. Linux projection follows the Go Collector precedent: `pswpin`/`pswpout` as `major`, `pgpgin`/`pgpgout` as `minor`. Linux does not expose this as a direct fault-type split, so maintainers may choose a narrower projection. | | `system.paging.faults` | Yes | Counter | `{fault}` | `system.paging.fault.type`; use `pgmajfault` for `major` and `pgfault - pgmajfault` for `minor` when both are available. | | `system.uptime` | Yes | Gauge | `s` | Prefer `CLOCK_BOOTTIME`; fall back to `/proc/uptime`. Emit double seconds. | | `system.disk.io` | Yes | Counter | `By` | `system.device`, `disk.io.direction`. 
| From 1e2f34ee65522e1768caf787c65e23329c646218 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Mon, 4 May 2026 20:59:52 -0700 Subject: [PATCH 41/60] Fix host metrics review issues --- .github/workflows/rust-ci.yml | 5 +++ .../crates/core-nodes/Cargo.toml | 4 +- .../receivers/host_metrics_receiver/mod.rs | 26 +++++++++++-- .../host_metrics_receiver/otap_builder.rs | 4 ++ .../receivers/host_metrics_receiver/procfs.rs | 37 ++++++++++--------- 5 files changed, 54 insertions(+), 22 deletions(-) diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml index d782142668..8830f3f230 100644 --- a/.github/workflows/rust-ci.yml +++ b/.github/workflows/rust-ci.yml @@ -812,6 +812,7 @@ jobs: - pest-fmt - no_default_features_check - pipeline_perf_test + - host-metrics-semconv steps: - name: Check if all required jobs succeeded run: | @@ -851,4 +852,8 @@ jobs: echo "pipeline_perf_test failed or was cancelled" exit 1 fi + if [[ "${{ needs.host-metrics-semconv.result }}" != "success" ]]; then + echo "host-metrics-semconv failed or was cancelled" + exit 1 + fi echo "All required checks passed!" 
diff --git a/rust/otap-dataflow/crates/core-nodes/Cargo.toml b/rust/otap-dataflow/crates/core-nodes/Cargo.toml index e1dbca9c8c..098d3c24cd 100644 --- a/rust/otap-dataflow/crates/core-nodes/Cargo.toml +++ b/rust/otap-dataflow/crates/core-nodes/Cargo.toml @@ -39,7 +39,6 @@ futures-timer.workspace = true humantime-serde.workspace = true linkme.workspace = true libc.workspace = true -nix.workspace = true object_store = {workspace = true, features = ["fs"]} parquet.workspace = true prost.workspace = true @@ -68,6 +67,9 @@ weaver_resolved_schema = { workspace = true, optional = true } weaver_resolver = { workspace = true, optional = true } weaver_semconv = { workspace = true, optional = true } +[target.'cfg(target_os = "linux")'.dependencies] +nix.workspace = true + [features] dev-tools = ["dep:weaver_common", "dep:weaver_forge", "dep:weaver_resolved_schema", "dep:weaver_resolver", "dep:weaver_semconv"] bench = [] diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 3626aa8dd2..b4cb9e6068 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -666,12 +666,12 @@ pub static HOST_METRICS_RECEIVER: ReceiverFactory = ReceiverFactory { { let mut receiver = HostMetricsReceiver::from_config(&node_config.config)?; receiver.metrics = Some(pipeline.register_metrics::()); - return Ok(ReceiverWrapper::local( + Ok(ReceiverWrapper::local( receiver, node, node_config, receiver_config, - )); + )) } #[cfg(not(target_os = "linux"))] unreachable!("validate_supported_platform returned Ok on a non-Linux platform") @@ -683,7 +683,7 @@ pub static HOST_METRICS_RECEIVER: ReceiverFactory = ReceiverFactory { error: e.to_string(), } })?; - validate_config(&config) + RuntimeConfig::try_from(config).map(|_| ()) }, }; @@ -1321,6 +1321,7 @@ impl 
local::Receiver for HostMetricsReceiver { Ok(scrape) => { if let Some(metrics) = metrics.as_mut() { metrics.partial_errors.add(scrape.partial_errors); + metrics.source_read_errors.add(scrape.partial_errors); } let pdata = match encode_snapshot(scrape.snapshot) { Ok(pdata) => pdata, @@ -1731,4 +1732,23 @@ mod tests { assert!(filter.matches("eth0")); assert!(!filter.matches("lo")); } + + #[test] + fn factory_validation_rejects_invalid_regex_filter() { + let config = serde_json::json!({ + "families": { + "disk": { + "include": { + "devices": ["["], + "match_type": "regexp" + } + } + } + }); + + assert!(matches!( + (HOST_METRICS_RECEIVER.validate_config)(&config), + Err(otap_df_config::error::Error::InvalidUserConfig { .. }) + )); + } } diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs index 017ee6f07f..7b29e68930 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs @@ -73,6 +73,7 @@ pub(crate) struct HostMetricsArrowBuilder { ndp_attrs: StrKeysAttributesRecordBatchBuilder, curr_metric_id: u16, curr_dp_id: u32, + resource_appended: bool, } impl HostMetricsArrowBuilder { @@ -84,6 +85,7 @@ impl HostMetricsArrowBuilder { ndp_attrs: StrKeysAttributesRecordBatchBuilder::new(), curr_metric_id: 0, curr_dp_id: 0, + resource_appended: false, } } @@ -91,6 +93,8 @@ impl HostMetricsArrowBuilder { /// Must be called exactly once per batch before any metrics are appended. 
#[cfg(target_os = "linux")] pub(crate) fn append_resource(&mut self, resource: &HostResource) { + debug_assert!(!self.resource_appended, "resource already appended"); + self.resource_appended = true; let mut w = ResourceAttrWriter { attrs: &mut self.resource_attrs, }; diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index f277474cf0..2d27de0681 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -1579,9 +1579,9 @@ fn statvfs_bytes(path: &Path) -> io::Result { let stat = nix::sys::statvfs::statvfs(path).map_err(io::Error::other)?; let block_size = stat.fragment_size(); Ok(FilesystemStat { - total_bytes: u64::from(stat.blocks()).saturating_mul(block_size), - free_bytes: u64::from(stat.blocks_free()).saturating_mul(block_size), - available_bytes: u64::from(stat.blocks_available()).saturating_mul(block_size), + total_bytes: stat.blocks().saturating_mul(block_size), + free_bytes: stat.blocks_free().saturating_mul(block_size), + available_bytes: stat.blocks_available().saturating_mul(block_size), }) } @@ -2159,7 +2159,8 @@ mod tests { use super::*; use otap_df_pdata::proto::opentelemetry::common::v1::{AnyValue, KeyValue, any_value}; use otap_df_pdata::proto::opentelemetry::metrics::v1::{ - AggregationTemporality, Metric, MetricsData, NumberDataPoint, metric, number_data_point, + AggregationTemporality, Metric, MetricsData, NumberDataPoint, metric as otlp_metric, + number_data_point, }; use otap_df_pdata::testing::round_trip::decode_metrics; #[cfg(feature = "dev-tools")] @@ -3167,8 +3168,8 @@ mod tests { .iter() .map(|metric| { let (monotonic, points) = match metric.data.as_ref().expect("metric data") { - metric::Data::Sum(sum) => (Some(sum.is_monotonic), &sum.data_points), - metric::Data::Gauge(gauge) => 
(None, &gauge.data_points), + otlp_metric::Data::Sum(sum) => (Some(sum.is_monotonic), &sum.data_points), + otlp_metric::Data::Gauge(gauge) => (None, &gauge.data_points), _ => panic!("unsupported metric data for {}", metric.name), }; let attributes = points @@ -3391,7 +3392,7 @@ mod tests { let metric = metric_by_name(metrics, name); assert_eq!(metric.unit, unit); match metric.data.as_ref().expect("metric data") { - metric::Data::Sum(sum) => { + otlp_metric::Data::Sum(sum) => { let expected_monotonic = monotonic_sum.unwrap_or_else(|| panic!("{name} should be a gauge")); assert_eq!( @@ -3405,7 +3406,7 @@ mod tests { .all(|point| point.start_time_unix_nano == 1_000) ); } - metric::Data::Gauge(gauge) => { + otlp_metric::Data::Gauge(gauge) => { assert!(monotonic_sum.is_none(), "{name} should be a cumulative sum"); assert!( gauge @@ -3426,8 +3427,8 @@ mod tests { ) { let metric = metric_by_name(metrics, name); let point = match metric.data.as_ref().expect("metric data") { - metric::Data::Sum(sum) => sum.data_points.first(), - metric::Data::Gauge(gauge) => gauge.data_points.first(), + otlp_metric::Data::Sum(sum) => sum.data_points.first(), + otlp_metric::Data::Gauge(gauge) => gauge.data_points.first(), _ => None, } .expect("data point"); @@ -3441,7 +3442,7 @@ mod tests { value: &'static str, ) { let metric = metric_by_name(metrics, name); - let metric::Data::Sum(sum) = metric.data.as_ref().expect("metric data") else { + let otlp_metric::Data::Sum(sum) = metric.data.as_ref().expect("metric data") else { panic!("{name} should be a cumulative sum"); }; assert!( @@ -3455,8 +3456,8 @@ mod tests { fn assert_first_point_int(metrics: &[Metric], name: &'static str, expected: i64) { let metric = metric_by_name(metrics, name); let point = match metric.data.as_ref().expect("metric data") { - metric::Data::Sum(sum) => sum.data_points.first(), - metric::Data::Gauge(gauge) => gauge.data_points.first(), + otlp_metric::Data::Sum(sum) => sum.data_points.first(), + 
otlp_metric::Data::Gauge(gauge) => gauge.data_points.first(), _ => None, } .expect("data point"); @@ -3475,8 +3476,8 @@ mod tests { ) { let metric = metric_by_name(metrics, name); let point = match metric.data.as_ref().expect("metric data") { - metric::Data::Sum(sum) => sum.data_points.first(), - metric::Data::Gauge(gauge) => gauge.data_points.first(), + otlp_metric::Data::Sum(sum) => sum.data_points.first(), + otlp_metric::Data::Gauge(gauge) => gauge.data_points.first(), _ => None, } .expect("data point"); @@ -3495,8 +3496,8 @@ mod tests { fn assert_no_first_point_attr(metrics: &[Metric], name: &'static str, key: &'static str) { let metric = metric_by_name(metrics, name); let point = match metric.data.as_ref().expect("metric data") { - metric::Data::Sum(sum) => sum.data_points.first(), - metric::Data::Gauge(gauge) => gauge.data_points.first(), + otlp_metric::Data::Sum(sum) => sum.data_points.first(), + otlp_metric::Data::Gauge(gauge) => gauge.data_points.first(), _ => None, } .expect("data point"); @@ -3508,7 +3509,7 @@ mod tests { fn assert_first_sum_point_start(metrics: &[Metric], name: &'static str, expected_start: u64) { let metric = metric_by_name(metrics, name); - let metric::Data::Sum(sum) = metric.data.as_ref().expect("metric data") else { + let otlp_metric::Data::Sum(sum) = metric.data.as_ref().expect("metric data") else { panic!("{name} should be a cumulative sum"); }; let point = sum.data_points.first().expect("data point"); From d4037913e6356e79e646ace5e234c8379226cb3b Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Mon, 4 May 2026 23:05:07 -0700 Subject: [PATCH 42/60] Harden host metrics scrape behavior --- .../receivers/host_metrics_receiver/mod.rs | 10 +- .../host_metrics_receiver/otap_builder.rs | 8 +- .../receivers/host_metrics_receiver/procfs.rs | 243 +++++++++++------- .../host_metrics_receiver/semconv.rs | 25 ++ 4 files changed, 183 insertions(+), 103 deletions(-) diff --git 
a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index b4cb9e6068..50f9eefd22 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -1374,12 +1374,10 @@ impl local::Receiver for HostMetricsReceiver { metrics.source_read_errors.add(1); metrics.scrape_duration_ns.record(elapsed_nanos(scrape_start)); } - return Err(Error::ReceiverError { - receiver: effect_handler.receiver_id(), - kind: ReceiverErrorKind::Other, - error: format!("failed to collect host metrics: {err}"), - source_detail: String::new(), - }); + otel_warn!( + "host metrics scrape failed; receiver will retry", + error = err.to_string() + ); } } } diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs index 7b29e68930..59db71ea8d 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs @@ -98,15 +98,15 @@ impl HostMetricsArrowBuilder { let mut w = ResourceAttrWriter { attrs: &mut self.resource_attrs, }; - w.str("os.type", "linux"); + w.str(semconv::attr::OS_TYPE, "linux"); if let Some(id) = &resource.host_id { - w.str("host.id", id); + w.str(semconv::attr::HOST_ID, id); } if let Some(name) = &resource.host_name { - w.str("host.name", name); + w.str(semconv::attr::HOST_NAME, name); } if let Some(arch) = resource.host_arch { - w.str("host.arch", arch); + w.str(semconv::attr::HOST_ARCH, arch); } } diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 
2d27de0681..6cb196bce1 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -3,7 +3,7 @@ //! Linux procfs-backed host metric source. -use crate::receivers::host_metrics_receiver::semconv::metric; +use crate::receivers::host_metrics_receiver::semconv::{attr, metric}; use crate::receivers::host_metrics_receiver::{CompiledFilter, HostViewValidationMode}; use std::collections::{HashMap, HashSet}; use std::fs::File; @@ -27,6 +27,8 @@ pub struct ProcfsSource { previous_cpu: Option, filesystem_worker: FilesystemStatWorker, counter_tracker: CounterTracker, + boot_time_unix_nano: Option, + resource: Option, } /// Procfs collection config. @@ -117,6 +119,8 @@ impl ProcfsSource { previous_cpu: None, filesystem_worker: FilesystemStatWorker::new()?, counter_tracker: CounterTracker::default(), + boot_time_unix_nano: None, + resource: None, }; source.apply_startup_validation()?; Ok(source) @@ -138,7 +142,15 @@ impl ProcfsSource { let clk_tck = self.clk_tck; let mut partial_errors = 0; let mut first_error = None; - let needs_stat = due.cpu || due.system || due.processes; + let needs_start_time = due.cpu + || due.memory + || due.paging + || due.disk + || due.filesystem + || due.network + || due.processes; + let needs_stat = + due.cpu || due.processes || (needs_start_time && self.boot_time_unix_nano.is_none()); let stat = match needs_stat .then(|| self.read_path(PathKind::Stat)) .transpose() @@ -150,6 +162,10 @@ impl ProcfsSource { StatSnapshot::default() } }; + if stat.boot_time_unix_nano != 0 { + self.boot_time_unix_nano = Some(stat.boot_time_unix_nano); + } + let start_time_unix_nano = self.boot_time_unix_nano.unwrap_or(now_unix_nano); let cpu_utilization = if due.cpu && self.config.cpu_utilization { let utilization = stat.cpu.and_then(|current| { self.previous_cpu @@ -285,7 +301,7 @@ impl ProcfsSource { exclude_mount_points: 
exclude_mount_points.as_ref(), }; let mounts = parse_mountinfo(mountinfo, include_virtual, emit_limit, filters); - self.read_filesystems(mounts) + self.read_filesystems(mounts, &mut partial_errors, &mut first_error) } Err(err) => { record_partial_error(&mut partial_errors, &mut first_error, err); @@ -296,11 +312,11 @@ impl ProcfsSource { Vec::new() }; - let resource = self.read_resource(); + let resource = self.read_resource().clone(); let counter_starts = self.counter_tracker.snapshot( - stat.boot_time_unix_nano, + start_time_unix_nano, now_unix_nano, - stat.cpu.as_ref(), + due.cpu.then_some(stat.cpu).flatten().as_ref(), paging.as_ref(), due.processes.then_some(stat.processes).as_ref(), &disks, @@ -309,12 +325,12 @@ impl ProcfsSource { let snapshot = HostSnapshot { now_unix_nano, - start_time_unix_nano: stat.boot_time_unix_nano, + start_time_unix_nano, counter_starts, memory_limit: self.config.memory_limit, memory_shared: self.config.memory_shared, memory_hugepages: self.config.memory_hugepages, - cpu: stat.cpu, + cpu: due.cpu.then_some(stat.cpu).flatten(), cpu_utilization, cpuinfo, memory, @@ -429,15 +445,24 @@ impl ProcfsSource { Ok(sectors.saturating_mul(DISKSTAT_SECTOR_BYTES)) } - fn read_filesystems(&mut self, mounts: Vec) -> Vec { + fn read_filesystems( + &mut self, + mounts: Vec, + partial_errors: &mut u64, + first_error: &mut Option, + ) -> Vec { let mut filesystems = Vec::with_capacity(mounts.len()); for mount in mounts { let path = self.paths.host_path(&mount.mountpoint); - let Ok(stat) = self + let stat = match self .filesystem_worker .statvfs(path, FILESYSTEM_STAT_TIMEOUT) - else { - continue; + { + Ok(stat) => stat, + Err(err) => { + record_partial_error(partial_errors, first_error, err); + continue; + } }; let free = stat.available_bytes; let reserved = stat.free_bytes.saturating_sub(stat.available_bytes); @@ -456,14 +481,19 @@ impl ProcfsSource { filesystems } - fn read_resource(&mut self) -> HostResource { - HostResource { - host_id: self + fn 
read_resource(&mut self) -> &HostResource { + if self.resource.is_none() { + let host_id = self .read_trimmed_optional(PathKind::MachineId) - .or_else(|| self.read_trimmed_optional(PathKind::DbusMachineId)), - host_name: self.read_trimmed_optional(PathKind::Hostname), - host_arch: host_arch(), + .or_else(|| self.read_trimmed_optional(PathKind::DbusMachineId)); + let host_name = self.read_trimmed_optional(PathKind::Hostname); + self.resource = Some(HostResource { + host_id, + host_name, + host_arch: host_arch(), + }); } + self.resource.as_ref().expect("resource is initialized") } fn read_trimmed_optional(&mut self, kind: PathKind) -> Option { @@ -624,7 +654,7 @@ impl HostSnapshot { } } -#[derive(Default)] +#[derive(Clone, Default)] pub(super) struct HostResource { pub(super) host_id: Option, pub(super) host_name: Option, @@ -652,7 +682,7 @@ fn project_snapshot( ("steal", cpu.steal), ] { b.append_f64_dp(m, cs.get(metric::CPU_TIME, mode, start), now, value, |w| { - w.str("cpu.mode", mode); + w.str(attr::CPU_MODE, mode); }); } } @@ -668,7 +698,7 @@ fn project_snapshot( ("steal", cpu.steal), ] { b.append_f64_dp(m, 0, now, value, |w| { - w.str("cpu.mode", mode); + w.str(attr::CPU_MODE, mode); }); } } @@ -697,7 +727,7 @@ fn project_snapshot( for (idx, &freq) in snap.cpuinfo.frequencies_hz.iter().enumerate() { let logical = i64::try_from(idx).unwrap_or(i64::MAX); b.append_i64_dp(m, 0, now, frequency_hz_i64(freq), |w| { - w.int("cpu.logical_number", logical); + w.int(attr::CPU_LOGICAL_NUMBER, logical); }); } } @@ -712,7 +742,7 @@ fn project_snapshot( ("buffers", memory.buffered), ] { b.append_i64_dp(m, start, now, saturating_i64(value), |w| { - w.str("system.memory.state", state); + w.str(attr::SYSTEM_MEMORY_STATE, state); }); } if memory.total > 0 { @@ -725,7 +755,7 @@ fn project_snapshot( ("buffers", memory.buffered), ] { b.append_f64_dp(m, 0, now, value as f64 / total, |w| { - w.str("system.memory.state", state); + w.str(attr::SYSTEM_MEMORY_STATE, state); }); } } @@ 
-739,7 +769,7 @@ fn project_snapshot( ("unreclaimable", memory.slab_unreclaimable), ] { b.append_i64_dp(m, start, now, saturating_i64(value), |w| { - w.str("system.memory.linux.slab.state", state); + w.str(attr::SYSTEM_MEMORY_LINUX_SLAB_STATE, state); }); } if snap.memory_limit { @@ -774,11 +804,14 @@ fn project_snapshot( now, saturating_i64(value), |w| { - w.str("system.paging.fault.type", fault_type); + w.str(attr::SYSTEM_PAGING_FAULT_TYPE, fault_type); }, ); } let m = b.begin_counter_i64(metric::PAGING_OPERATIONS, "{operation}"); + // Linux exposes swap operations and page-in/page-out counters separately. + // Semconv requires both direction and fault.type for this metric, so the + // receiver keeps the phase-1 mapping explicit here. for (direction, fault_type, value) in [ ("in", "major", paging.swap_in), ("out", "major", paging.swap_out), @@ -791,8 +824,8 @@ fn project_snapshot( now, saturating_i64(value), |w| { - w.str("system.paging.direction", direction); - w.str("system.paging.fault.type", fault_type); + w.str(attr::SYSTEM_PAGING_DIRECTION, direction); + w.str(attr::SYSTEM_PAGING_FAULT_TYPE, fault_type); }, ); } @@ -801,8 +834,8 @@ fn project_snapshot( let m = b.begin_updown_i64(metric::PAGING_USAGE, "By"); for (state, value) in [("used", swap.used), ("free", swap.free)] { b.append_i64_dp(m, start, now, saturating_i64(value), |w| { - w.str("system.device", &swap.name); - w.str("system.paging.state", state); + w.str(attr::SYSTEM_DEVICE, &swap.name); + w.str(attr::SYSTEM_PAGING_STATE, state); }); } let size = swap.size; @@ -811,8 +844,8 @@ fn project_snapshot( let total = size as f64; for (state, value) in [("used", swap.used), ("free", swap.free)] { b.append_f64_dp(m, 0, now, value as f64 / total, |w| { - w.str("system.device", &swap.name); - w.str("system.paging.state", state); + w.str(attr::SYSTEM_DEVICE, &swap.name); + w.str(attr::SYSTEM_PAGING_STATE, state); }); } } @@ -826,7 +859,7 @@ fn project_snapshot( ("blocked", processes.blocked), ] { 
b.append_i64_dp(m, start, now, saturating_i64(value), |w| { - w.str("process.state", state); + w.str(attr::PROCESS_STATE, state); }); } let m = b.begin_counter_i64(metric::PROCESS_CREATED, "{process}"); @@ -844,7 +877,7 @@ fn project_snapshot( if let Some(limit_bytes) = disk.limit_bytes { let m = b.begin_updown_i64(metric::DISK_LIMIT, "By"); b.append_i64_dp(m, start, now, saturating_i64(limit_bytes), |w| { - w.str("system.device", &disk.name); + w.str(attr::SYSTEM_DEVICE, &disk.name); }); } let m = b.begin_counter_i64(metric::DISK_IO, "By"); @@ -855,8 +888,8 @@ fn project_snapshot( now, saturating_i64(value), |w| { - w.str("system.device", &disk.name); - w.str("disk.io.direction", dir); + w.str(attr::SYSTEM_DEVICE, &disk.name); + w.str(attr::DISK_IO_DIRECTION, dir); }, ); } @@ -868,8 +901,8 @@ fn project_snapshot( now, saturating_i64(value), |w| { - w.str("system.device", &disk.name); - w.str("disk.io.direction", dir); + w.str(attr::SYSTEM_DEVICE, &disk.name); + w.str(attr::DISK_IO_DIRECTION, dir); }, ); } @@ -880,7 +913,7 @@ fn project_snapshot( now, disk.io_time_seconds, |w| { - w.str("system.device", &disk.name); + w.str(attr::SYSTEM_DEVICE, &disk.name); }, ); let m = b.begin_counter_f64(metric::DISK_OPERATION_TIME, "s"); @@ -894,8 +927,8 @@ fn project_snapshot( now, value, |w| { - w.str("system.device", &disk.name); - w.str("disk.io.direction", dir); + w.str(attr::SYSTEM_DEVICE, &disk.name); + w.str(attr::DISK_IO_DIRECTION, dir); }, ); } @@ -907,8 +940,8 @@ fn project_snapshot( now, saturating_i64(value), |w| { - w.str("system.device", &disk.name); - w.str("disk.io.direction", dir); + w.str(attr::SYSTEM_DEVICE, &disk.name); + w.str(attr::DISK_IO_DIRECTION, dir); }, ); } @@ -924,11 +957,11 @@ fn project_snapshot( ("reserved", fs.reserved), ] { b.append_i64_dp(m, start, now, saturating_i64(value), |w| { - w.str("system.device", &fs.device); - w.str("system.filesystem.state", state); - w.str("system.filesystem.type", &fs.fs_type); - w.str("system.filesystem.mode", 
fs.mode); - w.str("system.filesystem.mountpoint", &fs.mountpoint); + w.str(attr::SYSTEM_DEVICE, &fs.device); + w.str(attr::SYSTEM_FILESYSTEM_STATE, state); + w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); + w.str(attr::SYSTEM_FILESYSTEM_MODE, fs.mode); + w.str(attr::SYSTEM_FILESYSTEM_MOUNTPOINT, &fs.mountpoint); }); } if total > 0 { @@ -940,21 +973,21 @@ fn project_snapshot( ("reserved", fs.reserved), ] { b.append_f64_dp(m, 0, now, value as f64 / total_f, |w| { - w.str("system.device", &fs.device); - w.str("system.filesystem.state", state); - w.str("system.filesystem.type", &fs.fs_type); - w.str("system.filesystem.mode", fs.mode); - w.str("system.filesystem.mountpoint", &fs.mountpoint); + w.str(attr::SYSTEM_DEVICE, &fs.device); + w.str(attr::SYSTEM_FILESYSTEM_STATE, state); + w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); + w.str(attr::SYSTEM_FILESYSTEM_MODE, fs.mode); + w.str(attr::SYSTEM_FILESYSTEM_MOUNTPOINT, &fs.mountpoint); }); } } if let Some(limit_bytes) = fs.limit_bytes { let m = b.begin_updown_i64(metric::FILESYSTEM_LIMIT, "By"); b.append_i64_dp(m, start, now, saturating_i64(limit_bytes), |w| { - w.str("system.device", &fs.device); - w.str("system.filesystem.type", &fs.fs_type); - w.str("system.filesystem.mode", fs.mode); - w.str("system.filesystem.mountpoint", &fs.mountpoint); + w.str(attr::SYSTEM_DEVICE, &fs.device); + w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); + w.str(attr::SYSTEM_FILESYSTEM_MODE, fs.mode); + w.str(attr::SYSTEM_FILESYSTEM_MOUNTPOINT, &fs.mountpoint); }); } } @@ -963,8 +996,8 @@ fn project_snapshot( for net in &snap.networks { let m = b.begin_counter_i64(metric::NETWORK_IO, "By"); for (dir, iface_attr, value) in [ - ("receive", "network.interface.name", net.rx_bytes), - ("transmit", "network.interface.name", net.tx_bytes), + ("receive", attr::NETWORK_INTERFACE_NAME, net.rx_bytes), + ("transmit", attr::NETWORK_INTERFACE_NAME, net.tx_bytes), ] { b.append_i64_dp( m, @@ -973,7 +1006,7 @@ fn project_snapshot( 
saturating_i64(value), |w| { w.str(iface_attr, &net.name); - w.str("network.io.direction", dir); + w.str(attr::NETWORK_IO_DIRECTION, dir); }, ); } @@ -985,8 +1018,10 @@ fn project_snapshot( now, saturating_i64(value), |w| { - w.str("system.device", &net.name); - w.str("network.io.direction", dir); + // Semconv uses system.device here, while sibling network + // metrics use network.interface.name. + w.str(attr::SYSTEM_DEVICE, &net.name); + w.str(attr::NETWORK_IO_DIRECTION, dir); }, ); } @@ -998,8 +1033,8 @@ fn project_snapshot( now, saturating_i64(value), |w| { - w.str("network.interface.name", &net.name); - w.str("network.io.direction", dir); + w.str(attr::NETWORK_INTERFACE_NAME, &net.name); + w.str(attr::NETWORK_IO_DIRECTION, dir); }, ); } @@ -1011,8 +1046,8 @@ fn project_snapshot( now, saturating_i64(value), |w| { - w.str("network.interface.name", &net.name); - w.str("network.io.direction", dir); + w.str(attr::NETWORK_INTERFACE_NAME, &net.name); + w.str(attr::NETWORK_IO_DIRECTION, dir); }, ); } @@ -1044,7 +1079,7 @@ fn project_hugepages( let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_USAGE, "{page}"); for (state, value) in [("used", used), ("free", hugepages.free)] { b.append_i64_dp(m, start, now, saturating_i64(value), |w| { - w.str("system.memory.linux.hugepages.state", state); + w.str(attr::SYSTEM_MEMORY_LINUX_HUGEPAGES_STATE, state); }); } if hugepages.total > 0 { @@ -1052,7 +1087,7 @@ fn project_hugepages( let m = b.begin_gauge_f64(metric::MEMORY_LINUX_HUGEPAGES_UTILIZATION, "1"); for (state, value) in [("used", used), ("free", hugepages.free)] { b.append_f64_dp(m, 0, now, value as f64 / total, |w| { - w.str("system.memory.linux.hugepages.state", state); + w.str(attr::SYSTEM_MEMORY_LINUX_HUGEPAGES_STATE, state); }); } } @@ -2187,24 +2222,29 @@ mod tests { let resource_metrics = data.resource_metrics.first().expect("resource metrics"); let resource = resource_metrics.resource.as_ref().expect("resource"); - assert_has_attr(&resource.attributes, 
"os.type", "linux"); - assert_has_attr(&resource.attributes, "host.id", "host-id"); - assert_has_attr(&resource.attributes, "host.name", "host-name"); - assert_has_attr(&resource.attributes, "host.arch", "amd64"); + assert_has_attr(&resource.attributes, attr::OS_TYPE, "linux"); + assert_has_attr(&resource.attributes, attr::HOST_ID, "host-id"); + assert_has_attr(&resource.attributes, attr::HOST_NAME, "host-name"); + assert_has_attr(&resource.attributes, attr::HOST_ARCH, "amd64"); let metrics = &resource_metrics.scope_metrics[0].metrics; assert_metric_shape(metrics, metric::CPU_TIME, "s", Some(true)); - assert_first_point_attr(metrics, metric::CPU_TIME, "cpu.mode", "user"); - assert_sum_point_attr(metrics, metric::CPU_TIME, "cpu.mode", "iowait"); + assert_first_point_attr(metrics, metric::CPU_TIME, attr::CPU_MODE, "user"); + assert_sum_point_attr(metrics, metric::CPU_TIME, attr::CPU_MODE, "iowait"); assert_metric_shape(metrics, metric::CPU_UTILIZATION, "1", None); - assert_first_point_attr(metrics, metric::CPU_UTILIZATION, "cpu.mode", "user"); + assert_first_point_attr(metrics, metric::CPU_UTILIZATION, attr::CPU_MODE, "user"); assert_metric_shape(metrics, metric::CPU_LOGICAL_COUNT, "{cpu}", Some(false)); assert_metric_shape(metrics, metric::CPU_PHYSICAL_COUNT, "{cpu}", Some(false)); assert_metric_shape(metrics, metric::CPU_FREQUENCY, "Hz", None); assert_first_point_int(metrics, metric::CPU_FREQUENCY, 2_400_000_000); - assert_first_point_attr_int(metrics, metric::CPU_FREQUENCY, "cpu.logical_number", 0); + assert_first_point_attr_int(metrics, metric::CPU_FREQUENCY, attr::CPU_LOGICAL_NUMBER, 0); assert_metric_shape(metrics, metric::MEMORY_USAGE, "By", Some(false)); - assert_first_point_attr(metrics, metric::MEMORY_USAGE, "system.memory.state", "used"); + assert_first_point_attr( + metrics, + metric::MEMORY_USAGE, + attr::SYSTEM_MEMORY_STATE, + "used", + ); assert_metric_shape(metrics, metric::MEMORY_UTILIZATION, "1", None); assert_metric_shape(metrics, 
metric::MEMORY_LINUX_AVAILABLE, "By", Some(false)); assert_metric_shape(metrics, metric::MEMORY_LINUX_SLAB_USAGE, "By", Some(false)); @@ -2243,7 +2283,7 @@ mod tests { assert_first_point_attr( metrics, metric::MEMORY_LINUX_HUGEPAGES_USAGE, - "system.memory.linux.hugepages.state", + attr::SYSTEM_MEMORY_LINUX_HUGEPAGES_STATE, "used", ); assert_metric_shape( @@ -2257,7 +2297,7 @@ mod tests { assert_first_point_attr( metrics, metric::PAGING_FAULTS, - "system.paging.fault.type", + attr::SYSTEM_PAGING_FAULT_TYPE, "minor", ); assert_metric_shape( @@ -2269,45 +2309,59 @@ mod tests { assert_sum_point_attr( metrics, metric::PAGING_OPERATIONS, - "system.paging.direction", + attr::SYSTEM_PAGING_DIRECTION, "in", ); assert_sum_point_attr( metrics, metric::PAGING_OPERATIONS, - "system.paging.fault.type", + attr::SYSTEM_PAGING_FAULT_TYPE, "minor", ); assert_metric_shape(metrics, metric::PAGING_USAGE, "By", Some(false)); - assert_first_point_attr(metrics, metric::PAGING_USAGE, "system.device", "/dev/swap"); + assert_first_point_attr( + metrics, + metric::PAGING_USAGE, + attr::SYSTEM_DEVICE, + "/dev/swap", + ); assert_metric_shape(metrics, metric::PAGING_UTILIZATION, "1", None); assert_metric_shape(metrics, metric::PROCESS_COUNT, "{process}", Some(false)); - assert_sum_point_attr(metrics, metric::PROCESS_COUNT, "process.state", "blocked"); + assert_sum_point_attr( + metrics, + metric::PROCESS_COUNT, + attr::PROCESS_STATE, + "blocked", + ); assert_metric_shape(metrics, metric::PROCESS_CREATED, "{process}", Some(true)); assert_metric_shape(metrics, metric::DISK_IO, "By", Some(true)); - assert_first_point_attr(metrics, metric::DISK_IO, "disk.io.direction", "read"); + assert_first_point_attr(metrics, metric::DISK_IO, attr::DISK_IO_DIRECTION, "read"); assert_metric_shape(metrics, metric::DISK_OPERATIONS, "{operation}", Some(true)); assert_metric_shape(metrics, metric::DISK_IO_TIME, "s", Some(true)); - assert_first_point_attr(metrics, metric::DISK_IO_TIME, "system.device", "sda"); + 
assert_first_point_attr(metrics, metric::DISK_IO_TIME, attr::SYSTEM_DEVICE, "sda"); assert_metric_shape(metrics, metric::DISK_OPERATION_TIME, "s", Some(true)); assert_metric_shape(metrics, metric::DISK_MERGED, "{operation}", Some(true)); assert_metric_shape(metrics, metric::DISK_LIMIT, "By", Some(false)); - assert_first_point_attr(metrics, metric::DISK_LIMIT, "system.device", "sda"); + assert_first_point_attr(metrics, metric::DISK_LIMIT, attr::SYSTEM_DEVICE, "sda"); assert_metric_shape(metrics, metric::FILESYSTEM_USAGE, "By", Some(false)); assert_first_point_attr( metrics, metric::FILESYSTEM_USAGE, - "system.filesystem.state", + attr::SYSTEM_FILESYSTEM_STATE, "used", ); assert_metric_shape(metrics, metric::FILESYSTEM_UTILIZATION, "1", None); assert_metric_shape(metrics, metric::FILESYSTEM_LIMIT, "By", Some(false)); - assert_no_first_point_attr(metrics, metric::FILESYSTEM_LIMIT, "system.filesystem.state"); + assert_no_first_point_attr( + metrics, + metric::FILESYSTEM_LIMIT, + attr::SYSTEM_FILESYSTEM_STATE, + ); assert_metric_shape(metrics, metric::NETWORK_IO, "By", Some(true)); assert_first_point_attr( metrics, metric::NETWORK_IO, - "network.interface.name", + attr::NETWORK_INTERFACE_NAME, "eth0", ); assert_metric_shape( @@ -2319,7 +2373,7 @@ mod tests { assert_first_point_attr( metrics, metric::NETWORK_PACKET_COUNT, - "system.device", + attr::SYSTEM_DEVICE, "eth0", ); assert_metric_shape( @@ -2331,7 +2385,7 @@ mod tests { assert_first_point_attr( metrics, metric::NETWORK_PACKET_DROPPED, - "network.interface.name", + attr::NETWORK_INTERFACE_NAME, "eth0", ); assert_metric_shape(metrics, metric::NETWORK_ERRORS, "{error}", Some(true)); @@ -3001,7 +3055,10 @@ mod tests { .unwrap_or_else(|_| VirtualDirectoryPath::GitRepo { url: "https://github.com/open-telemetry/semantic-conventions.git".to_owned(), sub_folder: Some("model".to_owned()), - refspec: Some("v1.41.0".to_owned()), + refspec: Some(format!( + "v{}", + crate::receivers::host_metrics_receiver::semconv::VERSION + 
)), }); let registry_repo = @@ -3149,7 +3206,7 @@ mod tests { #[cfg(feature = "dev-tools")] fn is_intentional_semconv_enum_value_gap(name: &str, attr: &str, value: &str) -> bool { - name == metric::PROCESS_COUNT && attr == "process.state" && value == "blocked" + name == metric::PROCESS_COUNT && attr == attr::PROCESS_STATE && value == "blocked" } #[cfg(feature = "dev-tools")] diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/semconv.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/semconv.rs index f190856cea..f55a5bc439 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/semconv.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/semconv.rs @@ -63,3 +63,28 @@ pub(crate) mod metric { pub(crate) const PROCESS_CREATED: &str = "system.process.created"; pub(crate) const UPTIME: &str = "system.uptime"; } + +pub(crate) mod attr { + pub(crate) const CPU_LOGICAL_NUMBER: &str = "cpu.logical_number"; + pub(crate) const CPU_MODE: &str = "cpu.mode"; + pub(crate) const DISK_IO_DIRECTION: &str = "disk.io.direction"; + pub(crate) const HOST_ARCH: &str = "host.arch"; + pub(crate) const HOST_ID: &str = "host.id"; + pub(crate) const HOST_NAME: &str = "host.name"; + pub(crate) const NETWORK_INTERFACE_NAME: &str = "network.interface.name"; + pub(crate) const NETWORK_IO_DIRECTION: &str = "network.io.direction"; + pub(crate) const OS_TYPE: &str = "os.type"; + pub(crate) const PROCESS_STATE: &str = "process.state"; + pub(crate) const SYSTEM_DEVICE: &str = "system.device"; + pub(crate) const SYSTEM_FILESYSTEM_MODE: &str = "system.filesystem.mode"; + pub(crate) const SYSTEM_FILESYSTEM_MOUNTPOINT: &str = "system.filesystem.mountpoint"; + pub(crate) const SYSTEM_FILESYSTEM_STATE: &str = "system.filesystem.state"; + pub(crate) const SYSTEM_FILESYSTEM_TYPE: &str = "system.filesystem.type"; + pub(crate) const SYSTEM_MEMORY_LINUX_HUGEPAGES_STATE: &str = + 
"system.memory.linux.hugepages.state"; + pub(crate) const SYSTEM_MEMORY_LINUX_SLAB_STATE: &str = "system.memory.linux.slab.state"; + pub(crate) const SYSTEM_MEMORY_STATE: &str = "system.memory.state"; + pub(crate) const SYSTEM_PAGING_DIRECTION: &str = "system.paging.direction"; + pub(crate) const SYSTEM_PAGING_FAULT_TYPE: &str = "system.paging.fault.type"; + pub(crate) const SYSTEM_PAGING_STATE: &str = "system.paging.state"; +} From dbf01f25f8ae1eefdb00c7dd7caef2adb4821ebe Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Mon, 4 May 2026 23:37:42 -0700 Subject: [PATCH 43/60] Align host process state with semconv --- .../receivers/host_metrics_receiver/procfs.rs | 109 ++++++++++++++++-- 1 file changed, 98 insertions(+), 11 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 6cb196bce1..344d3fe3af 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -854,14 +854,11 @@ fn project_snapshot( // ── Processes ──────────────────────────────────────────────────────────── if let Some(processes) = snap.processes { let m = b.begin_updown_i64(metric::PROCESS_COUNT, "{process}"); - for (state, value) in [ - ("running", processes.running), - ("blocked", processes.blocked), - ] { - b.append_i64_dp(m, start, now, saturating_i64(value), |w| { - w.str(attr::PROCESS_STATE, state); - }); - } + b.append_i64_dp(m, start, now, saturating_i64(processes.running), |w| { + w.str(attr::PROCESS_STATE, "running"); + }); + // /proc/stat procs_blocked has no registered process.state value. + // Do not map it to sleeping; Linux blocked tasks are not the same state. 
let m = b.begin_counter_i64(metric::PROCESS_CREATED, "{process}"); b.append_i64_dp( m, @@ -2331,7 +2328,7 @@ mod tests { metrics, metric::PROCESS_COUNT, attr::PROCESS_STATE, - "blocked", + "running", ); assert_metric_shape(metrics, metric::PROCESS_CREATED, "{process}", Some(true)); assert_metric_shape(metrics, metric::DISK_IO, "By", Some(true)); @@ -2677,6 +2674,96 @@ mod tests { ); } + #[test] + fn scrape_due_uses_boot_time_for_counter_only_family_ticks() { + let root = tempfile::tempdir().expect("tempdir"); + let proc = root.path().join("proc"); + let proc_one = proc.join("1"); + std::fs::create_dir_all(&proc_one).expect("proc dirs"); + std::fs::write(proc.join("stat"), "btime 123\n").expect("stat"); + std::fs::write( + proc.join("diskstats"), + "8 0 sda 1 0 2 3 4 0 5 6 0 0 0 0 0 0 0 0\n", + ) + .expect("diskstats"); + std::fs::write( + proc_one.join("net/dev"), + "Inter-| Receive | Transmit\n\ + face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n\ + eth0: 10 1 0 0 0 0 0 0 20 2 0 0 0 0 0 0\n", + ) + .expect("netdev"); + std::fs::write( + proc.join("vmstat"), + "pgfault 10\npgmajfault 1\npgpgin 2\npgpgout 3\npswpin 4\npswpout 5\n", + ) + .expect("vmstat"); + std::fs::write(proc.join("swaps"), "Filename Type Size Used Priority\n").expect("swaps"); + + let mut source = ProcfsSource::new( + Some(root.path()), + ProcfsConfig { + cpu: false, + memory: false, + paging: true, + system: false, + disk: true, + filesystem: false, + network: true, + processes: false, + cpu_utilization: false, + memory_limit: false, + memory_shared: false, + memory_hugepages: false, + disk_limit: false, + filesystem_include_virtual: false, + filesystem_limit: false, + filesystem_include_devices: None, + filesystem_exclude_devices: None, + filesystem_include_fs_types: None, + filesystem_exclude_fs_types: None, + filesystem_include_mount_points: None, + filesystem_exclude_mount_points: None, + disk_include: None, + disk_exclude: 
None, + network_include: None, + network_exclude: None, + validation: HostViewValidationMode::None, + }, + ) + .expect("source"); + + let expected_start = 123 * NANOS_PER_SEC; + let disk_scrape = source + .scrape_due(ProcfsFamilies { + disk: true, + ..ProcfsFamilies::default() + }) + .expect("disk scrape"); + assert_eq!(disk_scrape.snapshot.start_time_unix_nano, expected_start); + assert_eq!(disk_scrape.snapshot.disks.len(), 1); + + std::fs::remove_file(proc.join("stat")).expect("remove stat after cache"); + + let network_scrape = source + .scrape_due(ProcfsFamilies { + network: true, + ..ProcfsFamilies::default() + }) + .expect("network scrape"); + assert_eq!(network_scrape.snapshot.start_time_unix_nano, expected_start); + assert_eq!(network_scrape.snapshot.networks.len(), 1); + + let paging_scrape = source + .scrape_due(ProcfsFamilies { + paging: true, + ..ProcfsFamilies::default() + }) + .expect("paging scrape"); + assert_eq!(paging_scrape.snapshot.start_time_unix_nano, expected_start); + assert!(paging_scrape.snapshot.paging.is_some()); + } + #[test] fn scrape_due_reads_filesystem_usage_from_mountinfo() { let root = tempfile::tempdir().expect("tempdir"); @@ -3205,8 +3292,8 @@ mod tests { } #[cfg(feature = "dev-tools")] - fn is_intentional_semconv_enum_value_gap(name: &str, attr: &str, value: &str) -> bool { - name == metric::PROCESS_COUNT && attr == attr::PROCESS_STATE && value == "blocked" + fn is_intentional_semconv_enum_value_gap(_name: &str, _attr: &str, _value: &str) -> bool { + false } #[cfg(feature = "dev-tools")] From 6c12c987dac4b5d87c79004b67d4809ae2d8e60d Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Mon, 4 May 2026 23:54:07 -0700 Subject: [PATCH 44/60] Document host semconv constants ownership --- .../core-nodes/src/receivers/host_metrics_receiver/semconv.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/semconv.rs 
b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/semconv.rs index f55a5bc439..a212246518 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/semconv.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/semconv.rs @@ -2,6 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 //! Semantic convention constants used by the host metrics receiver. +// Keep these strings centralized here. If this surface grows, prefer generating +// constants from the semconv registry instead of adding scattered literals. /// Semconv version targeted by this receiver's projection layer. pub(crate) const VERSION: &str = "1.41.0"; From 4d86475281a069ad426e3e6506d7dda7b3323417 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Tue, 5 May 2026 00:03:53 -0700 Subject: [PATCH 45/60] Harden host metrics validation and start times --- .../receivers/host_metrics_receiver/mod.rs | 11 +- .../host_metrics_receiver/otap_builder.rs | 14 ++- .../receivers/host_metrics_receiver/procfs.rs | 112 +++++++++++------- 3 files changed, 87 insertions(+), 50 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 50f9eefd22..65a0fe68cf 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -678,6 +678,7 @@ pub static HOST_METRICS_RECEIVER: ReceiverFactory = ReceiverFactory { }, wiring_contract: otap_df_engine::wiring_contract::WiringContract::UNRESTRICTED, validate_config: |config| { + validate_supported_platform()?; let config: Config = serde_json::from_value(config.clone()).map_err(|e| { otap_df_config::error::Error::InvalidUserConfig { error: e.to_string(), @@ -1203,10 +1204,14 @@ fn normalized_root_path(root_path: Option<&Path>) -> Result {} - Component::CurDir => {} 
Component::Normal(part) => normalized.push(part), - Component::ParentDir => { - let _ = normalized.pop(); + Component::CurDir | Component::ParentDir => { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: format!( + "root_path must not contain . or .. components: {}", + path.display() + ), + }); } Component::Prefix(_) => { return Err(otap_df_config::error::Error::InvalidUserConfig { diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs index 59db71ea8d..7ed3f0fbbf 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs @@ -171,11 +171,11 @@ impl HostMetricsArrowBuilder { // ── Datapoint appenders ───────────────────────────────────────────────── /// Append one i64 data point for `metric_id`. - /// `start` is `start_time_unix_nano`; pass `0` for gauges. + /// `start` is `start_time_unix_nano`; use `None` for gauges. pub(crate) fn append_i64_dp( &mut self, metric_id: u16, - start: u64, + start: Option, now: u64, value: i64, attrs: F, @@ -185,7 +185,8 @@ impl HostMetricsArrowBuilder { let dp_id = self.curr_dp_id; self.ndp.append_id(dp_id); self.ndp.append_parent_id(metric_id); - self.ndp.append_start_time_unix_nano(Some(start as i64)); + self.ndp + .append_start_time_unix_nano(start.map(|v| v as i64)); self.ndp.append_time_unix_nano(now as i64); self.ndp.append_int_value(Some(value)); self.ndp.append_double_value(None); @@ -202,11 +203,11 @@ impl HostMetricsArrowBuilder { } /// Append one f64 data point for `metric_id`. - /// `start` is `start_time_unix_nano`; pass `0` for gauges. + /// `start` is `start_time_unix_nano`; use `None` for gauges. 
pub(crate) fn append_f64_dp( &mut self, metric_id: u16, - start: u64, + start: Option, now: u64, value: f64, attrs: F, @@ -216,7 +217,8 @@ impl HostMetricsArrowBuilder { let dp_id = self.curr_dp_id; self.ndp.append_id(dp_id); self.ndp.append_parent_id(metric_id); - self.ndp.append_start_time_unix_nano(Some(start as i64)); + self.ndp + .append_start_time_unix_nano(start.map(|v| v as i64)); self.ndp.append_time_unix_nano(now as i64); self.ndp.append_int_value(None); self.ndp.append_double_value(Some(value)); diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 344d3fe3af..cc5fcbfec4 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -681,9 +681,15 @@ fn project_snapshot( ("interrupt", cpu.interrupt), ("steal", cpu.steal), ] { - b.append_f64_dp(m, cs.get(metric::CPU_TIME, mode, start), now, value, |w| { - w.str(attr::CPU_MODE, mode); - }); + b.append_f64_dp( + m, + Some(cs.get(metric::CPU_TIME, mode, start)), + now, + value, + |w| { + w.str(attr::CPU_MODE, mode); + }, + ); } } if let Some(cpu) = snap.cpu_utilization { @@ -697,7 +703,7 @@ fn project_snapshot( ("interrupt", cpu.interrupt), ("steal", cpu.steal), ] { - b.append_f64_dp(m, 0, now, value, |w| { + b.append_f64_dp(m, None, now, value, |w| { w.str(attr::CPU_MODE, mode); }); } @@ -706,7 +712,7 @@ fn project_snapshot( let m = b.begin_updown_i64(metric::CPU_LOGICAL_COUNT, "{cpu}"); b.append_i64_dp( m, - start, + Some(start), now, saturating_i64(snap.cpuinfo.logical_count), |_| {}, @@ -716,7 +722,7 @@ fn project_snapshot( let m = b.begin_updown_i64(metric::CPU_PHYSICAL_COUNT, "{cpu}"); b.append_i64_dp( m, - start, + Some(start), now, saturating_i64(snap.cpuinfo.physical_count), |_| {}, @@ -726,7 +732,7 @@ fn project_snapshot( let m = 
b.begin_gauge_i64(metric::CPU_FREQUENCY, "Hz"); for (idx, &freq) in snap.cpuinfo.frequencies_hz.iter().enumerate() { let logical = i64::try_from(idx).unwrap_or(i64::MAX); - b.append_i64_dp(m, 0, now, frequency_hz_i64(freq), |w| { + b.append_i64_dp(m, None, now, frequency_hz_i64(freq), |w| { w.int(attr::CPU_LOGICAL_NUMBER, logical); }); } @@ -741,7 +747,7 @@ fn project_snapshot( ("cached", memory.cached), ("buffers", memory.buffered), ] { - b.append_i64_dp(m, start, now, saturating_i64(value), |w| { + b.append_i64_dp(m, Some(start), now, saturating_i64(value), |w| { w.str(attr::SYSTEM_MEMORY_STATE, state); }); } @@ -754,31 +760,37 @@ fn project_snapshot( ("cached", memory.cached), ("buffers", memory.buffered), ] { - b.append_f64_dp(m, 0, now, value as f64 / total, |w| { + b.append_f64_dp(m, None, now, value as f64 / total, |w| { w.str(attr::SYSTEM_MEMORY_STATE, state); }); } } if memory.has_available { let m = b.begin_updown_i64(metric::MEMORY_LINUX_AVAILABLE, "By"); - b.append_i64_dp(m, start, now, saturating_i64(memory.available), |_| {}); + b.append_i64_dp( + m, + Some(start), + now, + saturating_i64(memory.available), + |_| {}, + ); } let m = b.begin_updown_i64(metric::MEMORY_LINUX_SLAB_USAGE, "By"); for (state, value) in [ ("reclaimable", memory.slab_reclaimable), ("unreclaimable", memory.slab_unreclaimable), ] { - b.append_i64_dp(m, start, now, saturating_i64(value), |w| { + b.append_i64_dp(m, Some(start), now, saturating_i64(value), |w| { w.str(attr::SYSTEM_MEMORY_LINUX_SLAB_STATE, state); }); } if snap.memory_limit { let m = b.begin_updown_i64(metric::MEMORY_LIMIT, "By"); - b.append_i64_dp(m, start, now, saturating_i64(memory.total), |_| {}); + b.append_i64_dp(m, Some(start), now, saturating_i64(memory.total), |_| {}); } if snap.memory_shared { let m = b.begin_updown_i64(metric::MEMORY_LINUX_SHARED, "By"); - b.append_i64_dp(m, start, now, saturating_i64(memory.shared), |_| {}); + b.append_i64_dp(m, Some(start), now, saturating_i64(memory.shared), |_| {}); } 
if snap.memory_hugepages { project_hugepages(snap, b, start, now, &memory.hugepages); @@ -788,7 +800,7 @@ fn project_snapshot( // ── System / uptime ────────────────────────────────────────────────────── if let Some(uptime) = snap.uptime_seconds { let m = b.begin_gauge_f64(metric::UPTIME, "s"); - b.append_f64_dp(m, 0, now, uptime, |_| {}); + b.append_f64_dp(m, None, now, uptime, |_| {}); } // ── Paging ─────────────────────────────────────────────────────────────── @@ -800,7 +812,7 @@ fn project_snapshot( ] { b.append_i64_dp( m, - cs.get(metric::PAGING_FAULTS, fault_type, start), + Some(cs.get(metric::PAGING_FAULTS, fault_type, start)), now, saturating_i64(value), |w| { @@ -820,7 +832,7 @@ fn project_snapshot( ] { b.append_i64_dp( m, - cs.get_joined(metric::PAGING_OPERATIONS, direction, fault_type, start), + Some(cs.get_joined(metric::PAGING_OPERATIONS, direction, fault_type, start)), now, saturating_i64(value), |w| { @@ -833,7 +845,7 @@ fn project_snapshot( for swap in &snap.swaps { let m = b.begin_updown_i64(metric::PAGING_USAGE, "By"); for (state, value) in [("used", swap.used), ("free", swap.free)] { - b.append_i64_dp(m, start, now, saturating_i64(value), |w| { + b.append_i64_dp(m, Some(start), now, saturating_i64(value), |w| { w.str(attr::SYSTEM_DEVICE, &swap.name); w.str(attr::SYSTEM_PAGING_STATE, state); }); @@ -843,7 +855,7 @@ fn project_snapshot( let m = b.begin_gauge_f64(metric::PAGING_UTILIZATION, "1"); let total = size as f64; for (state, value) in [("used", swap.used), ("free", swap.free)] { - b.append_f64_dp(m, 0, now, value as f64 / total, |w| { + b.append_f64_dp(m, None, now, value as f64 / total, |w| { w.str(attr::SYSTEM_DEVICE, &swap.name); w.str(attr::SYSTEM_PAGING_STATE, state); }); @@ -854,15 +866,21 @@ fn project_snapshot( // ── Processes ──────────────────────────────────────────────────────────── if let Some(processes) = snap.processes { let m = b.begin_updown_i64(metric::PROCESS_COUNT, "{process}"); - b.append_i64_dp(m, start, now, 
saturating_i64(processes.running), |w| { - w.str(attr::PROCESS_STATE, "running"); - }); + b.append_i64_dp( + m, + Some(start), + now, + saturating_i64(processes.running), + |w| { + w.str(attr::PROCESS_STATE, "running"); + }, + ); // /proc/stat procs_blocked has no registered process.state value. // Do not map it to sleeping; Linux blocked tasks are not the same state. let m = b.begin_counter_i64(metric::PROCESS_CREATED, "{process}"); b.append_i64_dp( m, - cs.get(metric::PROCESS_CREATED, "", start), + Some(cs.get(metric::PROCESS_CREATED, "", start)), now, saturating_i64(processes.created), |_| {}, @@ -873,7 +891,7 @@ fn project_snapshot( for disk in &snap.disks { if let Some(limit_bytes) = disk.limit_bytes { let m = b.begin_updown_i64(metric::DISK_LIMIT, "By"); - b.append_i64_dp(m, start, now, saturating_i64(limit_bytes), |w| { + b.append_i64_dp(m, Some(start), now, saturating_i64(limit_bytes), |w| { w.str(attr::SYSTEM_DEVICE, &disk.name); }); } @@ -881,7 +899,7 @@ fn project_snapshot( for (dir, value) in [("read", disk.read_bytes), ("write", disk.write_bytes)] { b.append_i64_dp( m, - cs.get_joined(metric::DISK_IO, &disk.name, dir, start), + Some(cs.get_joined(metric::DISK_IO, &disk.name, dir, start)), now, saturating_i64(value), |w| { @@ -894,7 +912,7 @@ fn project_snapshot( for (dir, value) in [("read", disk.read_ops), ("write", disk.write_ops)] { b.append_i64_dp( m, - cs.get_joined(metric::DISK_OPERATIONS, &disk.name, dir, start), + Some(cs.get_joined(metric::DISK_OPERATIONS, &disk.name, dir, start)), now, saturating_i64(value), |w| { @@ -906,7 +924,7 @@ fn project_snapshot( let m = b.begin_counter_f64(metric::DISK_IO_TIME, "s"); b.append_f64_dp( m, - cs.get(metric::DISK_IO_TIME, &disk.name, start), + Some(cs.get(metric::DISK_IO_TIME, &disk.name, start)), now, disk.io_time_seconds, |w| { @@ -920,7 +938,7 @@ fn project_snapshot( ] { b.append_f64_dp( m, - cs.get_joined(metric::DISK_OPERATION_TIME, &disk.name, dir, start), + 
Some(cs.get_joined(metric::DISK_OPERATION_TIME, &disk.name, dir, start)), now, value, |w| { @@ -933,7 +951,7 @@ fn project_snapshot( for (dir, value) in [("read", disk.read_merged), ("write", disk.write_merged)] { b.append_i64_dp( m, - cs.get_joined(metric::DISK_MERGED, &disk.name, dir, start), + Some(cs.get_joined(metric::DISK_MERGED, &disk.name, dir, start)), now, saturating_i64(value), |w| { @@ -953,7 +971,7 @@ fn project_snapshot( ("free", fs.free), ("reserved", fs.reserved), ] { - b.append_i64_dp(m, start, now, saturating_i64(value), |w| { + b.append_i64_dp(m, Some(start), now, saturating_i64(value), |w| { w.str(attr::SYSTEM_DEVICE, &fs.device); w.str(attr::SYSTEM_FILESYSTEM_STATE, state); w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); @@ -969,7 +987,7 @@ fn project_snapshot( ("free", fs.free), ("reserved", fs.reserved), ] { - b.append_f64_dp(m, 0, now, value as f64 / total_f, |w| { + b.append_f64_dp(m, None, now, value as f64 / total_f, |w| { w.str(attr::SYSTEM_DEVICE, &fs.device); w.str(attr::SYSTEM_FILESYSTEM_STATE, state); w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); @@ -980,7 +998,7 @@ fn project_snapshot( } if let Some(limit_bytes) = fs.limit_bytes { let m = b.begin_updown_i64(metric::FILESYSTEM_LIMIT, "By"); - b.append_i64_dp(m, start, now, saturating_i64(limit_bytes), |w| { + b.append_i64_dp(m, Some(start), now, saturating_i64(limit_bytes), |w| { w.str(attr::SYSTEM_DEVICE, &fs.device); w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); w.str(attr::SYSTEM_FILESYSTEM_MODE, fs.mode); @@ -998,7 +1016,7 @@ fn project_snapshot( ] { b.append_i64_dp( m, - cs.get_joined(metric::NETWORK_IO, &net.name, dir, start), + Some(cs.get_joined(metric::NETWORK_IO, &net.name, dir, start)), now, saturating_i64(value), |w| { @@ -1011,7 +1029,7 @@ fn project_snapshot( for (dir, value) in [("receive", net.rx_packets), ("transmit", net.tx_packets)] { b.append_i64_dp( m, - cs.get_joined(metric::NETWORK_PACKET_COUNT, &net.name, dir, start), + 
Some(cs.get_joined(metric::NETWORK_PACKET_COUNT, &net.name, dir, start)), now, saturating_i64(value), |w| { @@ -1026,7 +1044,7 @@ fn project_snapshot( for (dir, value) in [("receive", net.rx_dropped), ("transmit", net.tx_dropped)] { b.append_i64_dp( m, - cs.get_joined(metric::NETWORK_PACKET_DROPPED, &net.name, dir, start), + Some(cs.get_joined(metric::NETWORK_PACKET_DROPPED, &net.name, dir, start)), now, saturating_i64(value), |w| { @@ -1039,7 +1057,7 @@ fn project_snapshot( for (dir, value) in [("receive", net.rx_errors), ("transmit", net.tx_errors)] { b.append_i64_dp( m, - cs.get_joined(metric::NETWORK_ERRORS, &net.name, dir, start), + Some(cs.get_joined(metric::NETWORK_ERRORS, &net.name, dir, start)), now, saturating_i64(value), |w| { @@ -1059,23 +1077,35 @@ fn project_hugepages( hugepages: &HugepageStats, ) { let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_LIMIT, "{page}"); - b.append_i64_dp(m, start, now, saturating_i64(hugepages.total), |_| {}); + b.append_i64_dp(m, Some(start), now, saturating_i64(hugepages.total), |_| {}); let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_PAGE_SIZE, "By"); b.append_i64_dp( m, - start, + Some(start), now, saturating_i64(hugepages.page_size_bytes), |_| {}, ); let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_RESERVED, "{page}"); - b.append_i64_dp(m, start, now, saturating_i64(hugepages.reserved), |_| {}); + b.append_i64_dp( + m, + Some(start), + now, + saturating_i64(hugepages.reserved), + |_| {}, + ); let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_SURPLUS, "{page}"); - b.append_i64_dp(m, start, now, saturating_i64(hugepages.surplus), |_| {}); + b.append_i64_dp( + m, + Some(start), + now, + saturating_i64(hugepages.surplus), + |_| {}, + ); let used = hugepages.total.saturating_sub(hugepages.free); let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_USAGE, "{page}"); for (state, value) in [("used", used), ("free", hugepages.free)] { - b.append_i64_dp(m, start, now, 
saturating_i64(value), |w| { + b.append_i64_dp(m, Some(start), now, saturating_i64(value), |w| { w.str(attr::SYSTEM_MEMORY_LINUX_HUGEPAGES_STATE, state); }); } @@ -1083,7 +1113,7 @@ fn project_hugepages( let total = hugepages.total as f64; let m = b.begin_gauge_f64(metric::MEMORY_LINUX_HUGEPAGES_UTILIZATION, "1"); for (state, value) in [("used", used), ("free", hugepages.free)] { - b.append_f64_dp(m, 0, now, value as f64 / total, |w| { + b.append_f64_dp(m, None, now, value as f64 / total, |w| { w.str(attr::SYSTEM_MEMORY_LINUX_HUGEPAGES_STATE, state); }); } From bc47e78f332b0f565ae1ba236aeeb7079ec7d1ce Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Tue, 5 May 2026 08:06:43 -0700 Subject: [PATCH 46/60] Tighten host metrics OTAP builder API --- .../host_metrics_receiver/otap_builder.rs | 91 ++++++++-- .../receivers/host_metrics_receiver/procfs.rs | 168 ++++++++---------- 2 files changed, 144 insertions(+), 115 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs index 7ed3f0fbbf..67976e5a9f 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/otap_builder.rs @@ -26,6 +26,9 @@ const SCOPE_VERSION: &[u8] = env!("CARGO_PKG_VERSION").as_bytes(); // AggregationTemporality::Cumulative = 2 (OTLP proto enum value). const AGGREGATION_TEMPORALITY_CUMULATIVE: i32 = 2; +#[derive(Clone, Copy)] +pub(crate) struct MetricHandle(u16); + /// Wraps the per-datapoint attribute builder and hides the dp_id from callers. pub(crate) struct DpAttrWriter<'a> { attrs: &'a mut StrKeysAttributesRecordBatchBuilder, @@ -63,8 +66,8 @@ impl ResourceAttrWriter<'_> { /// Builds an `OtapArrowRecords::Metrics` batch directly from host metric values. 
/// -/// Call a `begin_*` method to open a metric, then [`append_i64_dp`] / -/// [`append_f64_dp`] for each data point. +/// Call a `begin_*` method to open a metric, then one of the typed data point +/// appenders for each value. /// Call [`finish`] to produce the final batch. pub(crate) struct HostMetricsArrowBuilder { metrics: MetricsRecordBatchBuilder, @@ -113,27 +116,27 @@ impl HostMetricsArrowBuilder { // ── Metric openers ────────────────────────────────────────────────────── /// Open a monotonic cumulative Sum metric (i64 data points). - pub(crate) fn begin_counter_i64(&mut self, name: &str, unit: &str) -> u16 { + pub(crate) fn begin_counter_i64(&mut self, name: &str, unit: &str) -> MetricHandle { self.begin_metric(name, unit, MetricType::Sum, true) } /// Open a monotonic cumulative Sum metric (f64 data points). - pub(crate) fn begin_counter_f64(&mut self, name: &str, unit: &str) -> u16 { + pub(crate) fn begin_counter_f64(&mut self, name: &str, unit: &str) -> MetricHandle { self.begin_metric(name, unit, MetricType::Sum, true) } /// Open a non-monotonic cumulative Sum metric / UpDownCounter (i64 data points). - pub(crate) fn begin_updown_i64(&mut self, name: &str, unit: &str) -> u16 { + pub(crate) fn begin_updown_i64(&mut self, name: &str, unit: &str) -> MetricHandle { self.begin_metric(name, unit, MetricType::Sum, false) } /// Open a Gauge metric (f64 data points). - pub(crate) fn begin_gauge_f64(&mut self, name: &str, unit: &str) -> u16 { + pub(crate) fn begin_gauge_f64(&mut self, name: &str, unit: &str) -> MetricHandle { self.begin_metric(name, unit, MetricType::Gauge, false) } /// Open a Gauge metric (i64 data points). 
- pub(crate) fn begin_gauge_i64(&mut self, name: &str, unit: &str) -> u16 { + pub(crate) fn begin_gauge_i64(&mut self, name: &str, unit: &str) -> MetricHandle { self.begin_metric(name, unit, MetricType::Gauge, false) } @@ -143,7 +146,7 @@ impl HostMetricsArrowBuilder { unit: &str, metric_type: MetricType, is_monotonic: bool, - ) -> u16 { + ) -> MetricHandle { let id = self.curr_metric_id; self.metrics.append_id(id); self.metrics.append_metric_type(metric_type as u8); @@ -165,16 +168,41 @@ impl HostMetricsArrowBuilder { .curr_metric_id .checked_add(1) .expect("metric_id overflow: more than u16::MAX metrics in one batch"); - id + MetricHandle(id) } // ── Datapoint appenders ───────────────────────────────────────────────── - /// Append one i64 data point for `metric_id`. - /// `start` is `start_time_unix_nano`; use `None` for gauges. - pub(crate) fn append_i64_dp( + /// Append one i64 Sum data point. + pub(crate) fn append_i64_sum_dp( + &mut self, + metric: MetricHandle, + start: u64, + now: u64, + value: i64, + attrs: F, + ) where + F: FnOnce(&mut DpAttrWriter<'_>), + { + self.append_i64_dp(metric, Some(start), now, value, attrs); + } + + /// Append one i64 Gauge data point. + pub(crate) fn append_i64_gauge_dp( + &mut self, + metric: MetricHandle, + now: u64, + value: i64, + attrs: F, + ) where + F: FnOnce(&mut DpAttrWriter<'_>), + { + self.append_i64_dp(metric, None, now, value, attrs); + } + + fn append_i64_dp( &mut self, - metric_id: u16, + metric: MetricHandle, start: Option, now: u64, value: i64, @@ -184,7 +212,7 @@ impl HostMetricsArrowBuilder { { let dp_id = self.curr_dp_id; self.ndp.append_id(dp_id); - self.ndp.append_parent_id(metric_id); + self.ndp.append_parent_id(metric.0); self.ndp .append_start_time_unix_nano(start.map(|v| v as i64)); self.ndp.append_time_unix_nano(now as i64); @@ -202,11 +230,36 @@ impl HostMetricsArrowBuilder { .expect("dp_id overflow: more than u32::MAX datapoints in one batch"); } - /// Append one f64 data point for `metric_id`. 
- /// `start` is `start_time_unix_nano`; use `None` for gauges. - pub(crate) fn append_f64_dp( + /// Append one f64 Sum data point. + pub(crate) fn append_f64_sum_dp( + &mut self, + metric: MetricHandle, + start: u64, + now: u64, + value: f64, + attrs: F, + ) where + F: FnOnce(&mut DpAttrWriter<'_>), + { + self.append_f64_dp(metric, Some(start), now, value, attrs); + } + + /// Append one f64 Gauge data point. + pub(crate) fn append_f64_gauge_dp( + &mut self, + metric: MetricHandle, + now: u64, + value: f64, + attrs: F, + ) where + F: FnOnce(&mut DpAttrWriter<'_>), + { + self.append_f64_dp(metric, None, now, value, attrs); + } + + fn append_f64_dp( &mut self, - metric_id: u16, + metric: MetricHandle, start: Option, now: u64, value: f64, @@ -216,7 +269,7 @@ impl HostMetricsArrowBuilder { { let dp_id = self.curr_dp_id; self.ndp.append_id(dp_id); - self.ndp.append_parent_id(metric_id); + self.ndp.append_parent_id(metric.0); self.ndp .append_start_time_unix_nano(start.map(|v| v as i64)); self.ndp.append_time_unix_nano(now as i64); diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index cc5fcbfec4..887723b833 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -108,6 +108,21 @@ pub struct ProcfsFamilies { pub processes: bool, } +impl ProcfsFamilies { + fn enabled_by(self, config: &ProcfsConfig) -> Self { + Self { + cpu: self.cpu && config.cpu, + memory: self.memory && config.memory, + paging: self.paging && config.paging, + system: self.system && config.system, + disk: self.disk && config.disk, + filesystem: self.filesystem && config.filesystem, + network: self.network && config.network, + processes: self.processes && config.processes, + } + } +} + impl ProcfsSource { /// Creates a procfs source rooted at `/` 
or at a host root bind mount. pub fn new(root_path: Option<&Path>, config: ProcfsConfig) -> io::Result { @@ -128,16 +143,7 @@ impl ProcfsSource { /// Collects one host snapshot for the due family set. pub fn scrape_due(&mut self, due: ProcfsFamilies) -> io::Result { - let due = ProcfsFamilies { - cpu: due.cpu && self.config.cpu, - memory: due.memory && self.config.memory, - paging: due.paging && self.config.paging, - system: due.system && self.config.system, - disk: due.disk && self.config.disk, - filesystem: due.filesystem && self.config.filesystem, - network: due.network && self.config.network, - processes: due.processes && self.config.processes, - }; + let due = due.enabled_by(&self.config); let now_unix_nano = now_unix_nano(); let clk_tck = self.clk_tck; let mut partial_errors = 0; @@ -681,15 +687,9 @@ fn project_snapshot( ("interrupt", cpu.interrupt), ("steal", cpu.steal), ] { - b.append_f64_dp( - m, - Some(cs.get(metric::CPU_TIME, mode, start)), - now, - value, - |w| { - w.str(attr::CPU_MODE, mode); - }, - ); + b.append_f64_sum_dp(m, cs.get(metric::CPU_TIME, mode, start), now, value, |w| { + w.str(attr::CPU_MODE, mode); + }); } } if let Some(cpu) = snap.cpu_utilization { @@ -703,16 +703,16 @@ fn project_snapshot( ("interrupt", cpu.interrupt), ("steal", cpu.steal), ] { - b.append_f64_dp(m, None, now, value, |w| { + b.append_f64_gauge_dp(m, now, value, |w| { w.str(attr::CPU_MODE, mode); }); } } if snap.cpuinfo.logical_count != 0 { let m = b.begin_updown_i64(metric::CPU_LOGICAL_COUNT, "{cpu}"); - b.append_i64_dp( + b.append_i64_sum_dp( m, - Some(start), + start, now, saturating_i64(snap.cpuinfo.logical_count), |_| {}, @@ -720,9 +720,9 @@ fn project_snapshot( } if snap.cpuinfo.physical_count != 0 { let m = b.begin_updown_i64(metric::CPU_PHYSICAL_COUNT, "{cpu}"); - b.append_i64_dp( + b.append_i64_sum_dp( m, - Some(start), + start, now, saturating_i64(snap.cpuinfo.physical_count), |_| {}, @@ -732,7 +732,7 @@ fn project_snapshot( let m = 
b.begin_gauge_i64(metric::CPU_FREQUENCY, "Hz"); for (idx, &freq) in snap.cpuinfo.frequencies_hz.iter().enumerate() { let logical = i64::try_from(idx).unwrap_or(i64::MAX); - b.append_i64_dp(m, None, now, frequency_hz_i64(freq), |w| { + b.append_i64_gauge_dp(m, now, frequency_hz_i64(freq), |w| { w.int(attr::CPU_LOGICAL_NUMBER, logical); }); } @@ -747,7 +747,7 @@ fn project_snapshot( ("cached", memory.cached), ("buffers", memory.buffered), ] { - b.append_i64_dp(m, Some(start), now, saturating_i64(value), |w| { + b.append_i64_sum_dp(m, start, now, saturating_i64(value), |w| { w.str(attr::SYSTEM_MEMORY_STATE, state); }); } @@ -760,37 +760,31 @@ fn project_snapshot( ("cached", memory.cached), ("buffers", memory.buffered), ] { - b.append_f64_dp(m, None, now, value as f64 / total, |w| { + b.append_f64_gauge_dp(m, now, value as f64 / total, |w| { w.str(attr::SYSTEM_MEMORY_STATE, state); }); } } if memory.has_available { let m = b.begin_updown_i64(metric::MEMORY_LINUX_AVAILABLE, "By"); - b.append_i64_dp( - m, - Some(start), - now, - saturating_i64(memory.available), - |_| {}, - ); + b.append_i64_sum_dp(m, start, now, saturating_i64(memory.available), |_| {}); } let m = b.begin_updown_i64(metric::MEMORY_LINUX_SLAB_USAGE, "By"); for (state, value) in [ ("reclaimable", memory.slab_reclaimable), ("unreclaimable", memory.slab_unreclaimable), ] { - b.append_i64_dp(m, Some(start), now, saturating_i64(value), |w| { + b.append_i64_sum_dp(m, start, now, saturating_i64(value), |w| { w.str(attr::SYSTEM_MEMORY_LINUX_SLAB_STATE, state); }); } if snap.memory_limit { let m = b.begin_updown_i64(metric::MEMORY_LIMIT, "By"); - b.append_i64_dp(m, Some(start), now, saturating_i64(memory.total), |_| {}); + b.append_i64_sum_dp(m, start, now, saturating_i64(memory.total), |_| {}); } if snap.memory_shared { let m = b.begin_updown_i64(metric::MEMORY_LINUX_SHARED, "By"); - b.append_i64_dp(m, Some(start), now, saturating_i64(memory.shared), |_| {}); + b.append_i64_sum_dp(m, start, now, 
saturating_i64(memory.shared), |_| {}); } if snap.memory_hugepages { project_hugepages(snap, b, start, now, &memory.hugepages); @@ -800,7 +794,7 @@ fn project_snapshot( // ── System / uptime ────────────────────────────────────────────────────── if let Some(uptime) = snap.uptime_seconds { let m = b.begin_gauge_f64(metric::UPTIME, "s"); - b.append_f64_dp(m, None, now, uptime, |_| {}); + b.append_f64_gauge_dp(m, now, uptime, |_| {}); } // ── Paging ─────────────────────────────────────────────────────────────── @@ -810,9 +804,9 @@ fn project_snapshot( ("minor", paging.minor_faults), ("major", paging.major_faults), ] { - b.append_i64_dp( + b.append_i64_sum_dp( m, - Some(cs.get(metric::PAGING_FAULTS, fault_type, start)), + cs.get(metric::PAGING_FAULTS, fault_type, start), now, saturating_i64(value), |w| { @@ -830,9 +824,9 @@ fn project_snapshot( ("in", "minor", paging.page_in), ("out", "minor", paging.page_out), ] { - b.append_i64_dp( + b.append_i64_sum_dp( m, - Some(cs.get_joined(metric::PAGING_OPERATIONS, direction, fault_type, start)), + cs.get_joined(metric::PAGING_OPERATIONS, direction, fault_type, start), now, saturating_i64(value), |w| { @@ -845,7 +839,7 @@ fn project_snapshot( for swap in &snap.swaps { let m = b.begin_updown_i64(metric::PAGING_USAGE, "By"); for (state, value) in [("used", swap.used), ("free", swap.free)] { - b.append_i64_dp(m, Some(start), now, saturating_i64(value), |w| { + b.append_i64_sum_dp(m, start, now, saturating_i64(value), |w| { w.str(attr::SYSTEM_DEVICE, &swap.name); w.str(attr::SYSTEM_PAGING_STATE, state); }); @@ -855,7 +849,7 @@ fn project_snapshot( let m = b.begin_gauge_f64(metric::PAGING_UTILIZATION, "1"); let total = size as f64; for (state, value) in [("used", swap.used), ("free", swap.free)] { - b.append_f64_dp(m, None, now, value as f64 / total, |w| { + b.append_f64_gauge_dp(m, now, value as f64 / total, |w| { w.str(attr::SYSTEM_DEVICE, &swap.name); w.str(attr::SYSTEM_PAGING_STATE, state); }); @@ -866,21 +860,15 @@ fn 
project_snapshot( // ── Processes ──────────────────────────────────────────────────────────── if let Some(processes) = snap.processes { let m = b.begin_updown_i64(metric::PROCESS_COUNT, "{process}"); - b.append_i64_dp( - m, - Some(start), - now, - saturating_i64(processes.running), - |w| { - w.str(attr::PROCESS_STATE, "running"); - }, - ); + b.append_i64_sum_dp(m, start, now, saturating_i64(processes.running), |w| { + w.str(attr::PROCESS_STATE, "running"); + }); // /proc/stat procs_blocked has no registered process.state value. // Do not map it to sleeping; Linux blocked tasks are not the same state. let m = b.begin_counter_i64(metric::PROCESS_CREATED, "{process}"); - b.append_i64_dp( + b.append_i64_sum_dp( m, - Some(cs.get(metric::PROCESS_CREATED, "", start)), + cs.get(metric::PROCESS_CREATED, "", start), now, saturating_i64(processes.created), |_| {}, @@ -891,15 +879,15 @@ fn project_snapshot( for disk in &snap.disks { if let Some(limit_bytes) = disk.limit_bytes { let m = b.begin_updown_i64(metric::DISK_LIMIT, "By"); - b.append_i64_dp(m, Some(start), now, saturating_i64(limit_bytes), |w| { + b.append_i64_sum_dp(m, start, now, saturating_i64(limit_bytes), |w| { w.str(attr::SYSTEM_DEVICE, &disk.name); }); } let m = b.begin_counter_i64(metric::DISK_IO, "By"); for (dir, value) in [("read", disk.read_bytes), ("write", disk.write_bytes)] { - b.append_i64_dp( + b.append_i64_sum_dp( m, - Some(cs.get_joined(metric::DISK_IO, &disk.name, dir, start)), + cs.get_joined(metric::DISK_IO, &disk.name, dir, start), now, saturating_i64(value), |w| { @@ -910,9 +898,9 @@ fn project_snapshot( } let m = b.begin_counter_i64(metric::DISK_OPERATIONS, "{operation}"); for (dir, value) in [("read", disk.read_ops), ("write", disk.write_ops)] { - b.append_i64_dp( + b.append_i64_sum_dp( m, - Some(cs.get_joined(metric::DISK_OPERATIONS, &disk.name, dir, start)), + cs.get_joined(metric::DISK_OPERATIONS, &disk.name, dir, start), now, saturating_i64(value), |w| { @@ -922,9 +910,9 @@ fn 
project_snapshot( ); } let m = b.begin_counter_f64(metric::DISK_IO_TIME, "s"); - b.append_f64_dp( + b.append_f64_sum_dp( m, - Some(cs.get(metric::DISK_IO_TIME, &disk.name, start)), + cs.get(metric::DISK_IO_TIME, &disk.name, start), now, disk.io_time_seconds, |w| { @@ -936,9 +924,9 @@ fn project_snapshot( ("read", disk.read_time_seconds), ("write", disk.write_time_seconds), ] { - b.append_f64_dp( + b.append_f64_sum_dp( m, - Some(cs.get_joined(metric::DISK_OPERATION_TIME, &disk.name, dir, start)), + cs.get_joined(metric::DISK_OPERATION_TIME, &disk.name, dir, start), now, value, |w| { @@ -949,9 +937,9 @@ fn project_snapshot( } let m = b.begin_counter_i64(metric::DISK_MERGED, "{operation}"); for (dir, value) in [("read", disk.read_merged), ("write", disk.write_merged)] { - b.append_i64_dp( + b.append_i64_sum_dp( m, - Some(cs.get_joined(metric::DISK_MERGED, &disk.name, dir, start)), + cs.get_joined(metric::DISK_MERGED, &disk.name, dir, start), now, saturating_i64(value), |w| { @@ -971,7 +959,7 @@ fn project_snapshot( ("free", fs.free), ("reserved", fs.reserved), ] { - b.append_i64_dp(m, Some(start), now, saturating_i64(value), |w| { + b.append_i64_sum_dp(m, start, now, saturating_i64(value), |w| { w.str(attr::SYSTEM_DEVICE, &fs.device); w.str(attr::SYSTEM_FILESYSTEM_STATE, state); w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); @@ -987,7 +975,7 @@ fn project_snapshot( ("free", fs.free), ("reserved", fs.reserved), ] { - b.append_f64_dp(m, None, now, value as f64 / total_f, |w| { + b.append_f64_gauge_dp(m, now, value as f64 / total_f, |w| { w.str(attr::SYSTEM_DEVICE, &fs.device); w.str(attr::SYSTEM_FILESYSTEM_STATE, state); w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); @@ -998,7 +986,7 @@ fn project_snapshot( } if let Some(limit_bytes) = fs.limit_bytes { let m = b.begin_updown_i64(metric::FILESYSTEM_LIMIT, "By"); - b.append_i64_dp(m, Some(start), now, saturating_i64(limit_bytes), |w| { + b.append_i64_sum_dp(m, start, now, saturating_i64(limit_bytes), |w| { 
w.str(attr::SYSTEM_DEVICE, &fs.device); w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); w.str(attr::SYSTEM_FILESYSTEM_MODE, fs.mode); @@ -1014,9 +1002,9 @@ fn project_snapshot( ("receive", attr::NETWORK_INTERFACE_NAME, net.rx_bytes), ("transmit", attr::NETWORK_INTERFACE_NAME, net.tx_bytes), ] { - b.append_i64_dp( + b.append_i64_sum_dp( m, - Some(cs.get_joined(metric::NETWORK_IO, &net.name, dir, start)), + cs.get_joined(metric::NETWORK_IO, &net.name, dir, start), now, saturating_i64(value), |w| { @@ -1027,9 +1015,9 @@ fn project_snapshot( } let m = b.begin_counter_i64(metric::NETWORK_PACKET_COUNT, "{packet}"); for (dir, value) in [("receive", net.rx_packets), ("transmit", net.tx_packets)] { - b.append_i64_dp( + b.append_i64_sum_dp( m, - Some(cs.get_joined(metric::NETWORK_PACKET_COUNT, &net.name, dir, start)), + cs.get_joined(metric::NETWORK_PACKET_COUNT, &net.name, dir, start), now, saturating_i64(value), |w| { @@ -1042,9 +1030,9 @@ fn project_snapshot( } let m = b.begin_counter_i64(metric::NETWORK_PACKET_DROPPED, "{packet}"); for (dir, value) in [("receive", net.rx_dropped), ("transmit", net.tx_dropped)] { - b.append_i64_dp( + b.append_i64_sum_dp( m, - Some(cs.get_joined(metric::NETWORK_PACKET_DROPPED, &net.name, dir, start)), + cs.get_joined(metric::NETWORK_PACKET_DROPPED, &net.name, dir, start), now, saturating_i64(value), |w| { @@ -1055,9 +1043,9 @@ fn project_snapshot( } let m = b.begin_counter_i64(metric::NETWORK_ERRORS, "{error}"); for (dir, value) in [("receive", net.rx_errors), ("transmit", net.tx_errors)] { - b.append_i64_dp( + b.append_i64_sum_dp( m, - Some(cs.get_joined(metric::NETWORK_ERRORS, &net.name, dir, start)), + cs.get_joined(metric::NETWORK_ERRORS, &net.name, dir, start), now, saturating_i64(value), |w| { @@ -1077,35 +1065,23 @@ fn project_hugepages( hugepages: &HugepageStats, ) { let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_LIMIT, "{page}"); - b.append_i64_dp(m, Some(start), now, saturating_i64(hugepages.total), |_| {}); + 
b.append_i64_sum_dp(m, start, now, saturating_i64(hugepages.total), |_| {}); let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_PAGE_SIZE, "By"); - b.append_i64_dp( + b.append_i64_sum_dp( m, - Some(start), + start, now, saturating_i64(hugepages.page_size_bytes), |_| {}, ); let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_RESERVED, "{page}"); - b.append_i64_dp( - m, - Some(start), - now, - saturating_i64(hugepages.reserved), - |_| {}, - ); + b.append_i64_sum_dp(m, start, now, saturating_i64(hugepages.reserved), |_| {}); let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_SURPLUS, "{page}"); - b.append_i64_dp( - m, - Some(start), - now, - saturating_i64(hugepages.surplus), - |_| {}, - ); + b.append_i64_sum_dp(m, start, now, saturating_i64(hugepages.surplus), |_| {}); let used = hugepages.total.saturating_sub(hugepages.free); let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_USAGE, "{page}"); for (state, value) in [("used", used), ("free", hugepages.free)] { - b.append_i64_dp(m, Some(start), now, saturating_i64(value), |w| { + b.append_i64_sum_dp(m, start, now, saturating_i64(value), |w| { w.str(attr::SYSTEM_MEMORY_LINUX_HUGEPAGES_STATE, state); }); } @@ -1113,7 +1089,7 @@ fn project_hugepages( let total = hugepages.total as f64; let m = b.begin_gauge_f64(metric::MEMORY_LINUX_HUGEPAGES_UTILIZATION, "1"); for (state, value) in [("used", used), ("free", hugepages.free)] { - b.append_f64_dp(m, None, now, value as f64 / total, |w| { + b.append_f64_gauge_dp(m, now, value as f64 / total, |w| { w.str(attr::SYSTEM_MEMORY_LINUX_HUGEPAGES_STATE, state); }); } From 759bda3bd7ab43072140330b6090efb74be6ccbf Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Tue, 5 May 2026 10:34:37 -0700 Subject: [PATCH 47/60] Fix paging operation counter starts --- .../receivers/host_metrics_receiver/procfs.rs | 87 ++++++++++++++----- 1 file changed, 67 insertions(+), 20 deletions(-) diff --git 
a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 887723b833..52ce3179d8 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -787,7 +787,7 @@ fn project_snapshot( b.append_i64_sum_dp(m, start, now, saturating_i64(memory.shared), |_| {}); } if snap.memory_hugepages { - project_hugepages(snap, b, start, now, &memory.hugepages); + project_hugepages(b, start, now, &memory.hugepages); } } @@ -998,17 +998,14 @@ fn project_snapshot( // ── Network ────────────────────────────────────────────────────────────── for net in &snap.networks { let m = b.begin_counter_i64(metric::NETWORK_IO, "By"); - for (dir, iface_attr, value) in [ - ("receive", attr::NETWORK_INTERFACE_NAME, net.rx_bytes), - ("transmit", attr::NETWORK_INTERFACE_NAME, net.tx_bytes), - ] { + for (dir, value) in [("receive", net.rx_bytes), ("transmit", net.tx_bytes)] { b.append_i64_sum_dp( m, cs.get_joined(metric::NETWORK_IO, &net.name, dir, start), now, saturating_i64(value), |w| { - w.str(iface_attr, &net.name); + w.str(attr::NETWORK_INTERFACE_NAME, &net.name); w.str(attr::NETWORK_IO_DIRECTION, dir); }, ); @@ -1058,7 +1055,6 @@ fn project_snapshot( } fn project_hugepages( - snap: &HostSnapshot, b: &mut crate::receivers::host_metrics_receiver::otap_builder::HostMetricsArrowBuilder, start: u64, now: u64, @@ -1094,7 +1090,6 @@ fn project_hugepages( }); } } - let _ = snap; // suppress unused warning; snap may be used in future extensions } #[derive(Default)] @@ -1176,18 +1171,22 @@ impl CounterTracker { ], &mut starts, ); - self.observe_all( - metric::PAGING_OPERATIONS, - default_start, - now, - &[ - ("in|major", paging.swap_in as f64), - ("out|major", paging.swap_out as f64), - ("in|minor", paging.page_in as f64), - ("out|minor", paging.page_out as f64), - ], 
- &mut starts, - ); + for (direction, fault_type, value) in [ + ("in", "major", paging.swap_in), + ("out", "major", paging.swap_out), + ("in", "minor", paging.page_in), + ("out", "minor", paging.page_out), + ] { + self.observe_joined( + metric::PAGING_OPERATIONS, + direction, + fault_type, + value as f64, + default_start, + now, + &mut starts, + ); + } } if let Some(processes) = processes { self.observe( @@ -2504,6 +2503,54 @@ mod tests { assert_eq!(starts.get_joined(metric::DISK_IO, "sda", "write", 10), 10); } + #[test] + fn counter_tracker_rebaselines_paging_operations_by_direction_and_fault_type() { + let mut tracker = CounterTracker::default(); + let paging = PagingStats { + swap_in: 100, + swap_out: 200, + page_in: 300, + page_out: 400, + ..PagingStats::default() + }; + let starts = tracker.snapshot(10, 20, None, Some(&paging), None, &[], &[]); + + assert_eq!( + starts.get_joined(metric::PAGING_OPERATIONS, "in", "major", 10), + 10 + ); + assert_eq!( + starts.get_joined(metric::PAGING_OPERATIONS, "out", "minor", 10), + 10 + ); + + let paging = PagingStats { + swap_in: 50, + swap_out: 250, + page_in: 350, + page_out: 450, + ..PagingStats::default() + }; + let starts = tracker.snapshot(10, 30, None, Some(&paging), None, &[], &[]); + + assert_eq!( + starts.get_joined(metric::PAGING_OPERATIONS, "in", "major", 10), + 30 + ); + assert_eq!( + starts.get_joined(metric::PAGING_OPERATIONS, "out", "major", 10), + 10 + ); + assert_eq!( + starts.get_joined(metric::PAGING_OPERATIONS, "in", "minor", 10), + 10 + ); + assert_eq!( + starts.get_joined(metric::PAGING_OPERATIONS, "out", "minor", 10), + 10 + ); + } + #[test] fn counter_keys_do_not_collide_with_pipe_in_series_values() { let metric = metric::DISK_IO; From 230cbe6d65bdb3c7cc9a39849198a05c111d7b96 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Tue, 5 May 2026 10:43:34 -0700 Subject: [PATCH 48/60] Reduce host metrics projection rows --- .../crates/core-nodes/Cargo.toml | 2 +- 
.../receivers/host_metrics_receiver/procfs.rs | 333 +++++++++--------- 2 files changed, 176 insertions(+), 159 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/Cargo.toml b/rust/otap-dataflow/crates/core-nodes/Cargo.toml index 098d3c24cd..66e4c40607 100644 --- a/rust/otap-dataflow/crates/core-nodes/Cargo.toml +++ b/rust/otap-dataflow/crates/core-nodes/Cargo.toml @@ -38,7 +38,6 @@ futures.workspace = true futures-timer.workspace = true humantime-serde.workspace = true linkme.workspace = true -libc.workspace = true object_store = {workspace = true, features = ["fs"]} parquet.workspace = true prost.workspace = true @@ -68,6 +67,7 @@ weaver_resolver = { workspace = true, optional = true } weaver_semconv = { workspace = true, optional = true } [target.'cfg(target_os = "linux")'.dependencies] +libc.workspace = true nix.workspace = true [features] diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 52ce3179d8..7e183ebb0e 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -836,17 +836,22 @@ fn project_snapshot( ); } } - for swap in &snap.swaps { - let m = b.begin_updown_i64(metric::PAGING_USAGE, "By"); - for (state, value) in [("used", swap.used), ("free", swap.free)] { - b.append_i64_sum_dp(m, start, now, saturating_i64(value), |w| { - w.str(attr::SYSTEM_DEVICE, &swap.name); - w.str(attr::SYSTEM_PAGING_STATE, state); - }); - } - let size = swap.size; - if size > 0 { - let m = b.begin_gauge_f64(metric::PAGING_UTILIZATION, "1"); + if !snap.swaps.is_empty() { + let usage_m = b.begin_updown_i64(metric::PAGING_USAGE, "By"); + let mut utilization_m = None; + for swap in &snap.swaps { + for (state, value) in [("used", swap.used), ("free", swap.free)] { + b.append_i64_sum_dp(usage_m, start, now, 
saturating_i64(value), |w| { + w.str(attr::SYSTEM_DEVICE, &swap.name); + w.str(attr::SYSTEM_PAGING_STATE, state); + }); + } + let size = swap.size; + if size == 0 { + continue; + } + let m = *utilization_m + .get_or_insert_with(|| b.begin_gauge_f64(metric::PAGING_UTILIZATION, "1")); let total = size as f64; for (state, value) in [("used", swap.used), ("free", swap.free)] { b.append_f64_gauge_dp(m, now, value as f64 / total, |w| { @@ -876,106 +881,97 @@ fn project_snapshot( } // ── Disk ───────────────────────────────────────────────────────────────── - for disk in &snap.disks { - if let Some(limit_bytes) = disk.limit_bytes { - let m = b.begin_updown_i64(metric::DISK_LIMIT, "By"); - b.append_i64_sum_dp(m, start, now, saturating_i64(limit_bytes), |w| { - w.str(attr::SYSTEM_DEVICE, &disk.name); - }); - } - let m = b.begin_counter_i64(metric::DISK_IO, "By"); - for (dir, value) in [("read", disk.read_bytes), ("write", disk.write_bytes)] { - b.append_i64_sum_dp( - m, - cs.get_joined(metric::DISK_IO, &disk.name, dir, start), - now, - saturating_i64(value), - |w| { - w.str(attr::SYSTEM_DEVICE, &disk.name); - w.str(attr::DISK_IO_DIRECTION, dir); - }, - ); - } - let m = b.begin_counter_i64(metric::DISK_OPERATIONS, "{operation}"); - for (dir, value) in [("read", disk.read_ops), ("write", disk.write_ops)] { - b.append_i64_sum_dp( - m, - cs.get_joined(metric::DISK_OPERATIONS, &disk.name, dir, start), - now, - saturating_i64(value), - |w| { + if !snap.disks.is_empty() { + let mut limit_m = None; + let io_m = b.begin_counter_i64(metric::DISK_IO, "By"); + let operations_m = b.begin_counter_i64(metric::DISK_OPERATIONS, "{operation}"); + let io_time_m = b.begin_counter_f64(metric::DISK_IO_TIME, "s"); + let operation_time_m = b.begin_counter_f64(metric::DISK_OPERATION_TIME, "s"); + let merged_m = b.begin_counter_i64(metric::DISK_MERGED, "{operation}"); + for disk in &snap.disks { + if let Some(limit_bytes) = disk.limit_bytes { + let m = + *limit_m.get_or_insert_with(|| 
b.begin_updown_i64(metric::DISK_LIMIT, "By")); + b.append_i64_sum_dp(m, start, now, saturating_i64(limit_bytes), |w| { w.str(attr::SYSTEM_DEVICE, &disk.name); - w.str(attr::DISK_IO_DIRECTION, dir); - }, - ); - } - let m = b.begin_counter_f64(metric::DISK_IO_TIME, "s"); - b.append_f64_sum_dp( - m, - cs.get(metric::DISK_IO_TIME, &disk.name, start), - now, - disk.io_time_seconds, - |w| { - w.str(attr::SYSTEM_DEVICE, &disk.name); - }, - ); - let m = b.begin_counter_f64(metric::DISK_OPERATION_TIME, "s"); - for (dir, value) in [ - ("read", disk.read_time_seconds), - ("write", disk.write_time_seconds), - ] { + }); + } + for (dir, value) in [("read", disk.read_bytes), ("write", disk.write_bytes)] { + b.append_i64_sum_dp( + io_m, + cs.get_joined(metric::DISK_IO, &disk.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str(attr::SYSTEM_DEVICE, &disk.name); + w.str(attr::DISK_IO_DIRECTION, dir); + }, + ); + } + for (dir, value) in [("read", disk.read_ops), ("write", disk.write_ops)] { + b.append_i64_sum_dp( + operations_m, + cs.get_joined(metric::DISK_OPERATIONS, &disk.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str(attr::SYSTEM_DEVICE, &disk.name); + w.str(attr::DISK_IO_DIRECTION, dir); + }, + ); + } b.append_f64_sum_dp( - m, - cs.get_joined(metric::DISK_OPERATION_TIME, &disk.name, dir, start), + io_time_m, + cs.get(metric::DISK_IO_TIME, &disk.name, start), now, - value, - |w| { - w.str(attr::SYSTEM_DEVICE, &disk.name); - w.str(attr::DISK_IO_DIRECTION, dir); - }, - ); - } - let m = b.begin_counter_i64(metric::DISK_MERGED, "{operation}"); - for (dir, value) in [("read", disk.read_merged), ("write", disk.write_merged)] { - b.append_i64_sum_dp( - m, - cs.get_joined(metric::DISK_MERGED, &disk.name, dir, start), - now, - saturating_i64(value), + disk.io_time_seconds, |w| { w.str(attr::SYSTEM_DEVICE, &disk.name); - w.str(attr::DISK_IO_DIRECTION, dir); }, ); + for (dir, value) in [ + ("read", disk.read_time_seconds), + ("write", 
disk.write_time_seconds), + ] { + b.append_f64_sum_dp( + operation_time_m, + cs.get_joined(metric::DISK_OPERATION_TIME, &disk.name, dir, start), + now, + value, + |w| { + w.str(attr::SYSTEM_DEVICE, &disk.name); + w.str(attr::DISK_IO_DIRECTION, dir); + }, + ); + } + for (dir, value) in [("read", disk.read_merged), ("write", disk.write_merged)] { + b.append_i64_sum_dp( + merged_m, + cs.get_joined(metric::DISK_MERGED, &disk.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str(attr::SYSTEM_DEVICE, &disk.name); + w.str(attr::DISK_IO_DIRECTION, dir); + }, + ); + } } } // ── Filesystem ─────────────────────────────────────────────────────────── - for fs in &snap.filesystems { - let total = fs.used.saturating_add(fs.free).saturating_add(fs.reserved); - let m = b.begin_updown_i64(metric::FILESYSTEM_USAGE, "By"); - for (state, value) in [ - ("used", fs.used), - ("free", fs.free), - ("reserved", fs.reserved), - ] { - b.append_i64_sum_dp(m, start, now, saturating_i64(value), |w| { - w.str(attr::SYSTEM_DEVICE, &fs.device); - w.str(attr::SYSTEM_FILESYSTEM_STATE, state); - w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); - w.str(attr::SYSTEM_FILESYSTEM_MODE, fs.mode); - w.str(attr::SYSTEM_FILESYSTEM_MOUNTPOINT, &fs.mountpoint); - }); - } - if total > 0 { - let m = b.begin_gauge_f64(metric::FILESYSTEM_UTILIZATION, "1"); - let total_f = total as f64; + if !snap.filesystems.is_empty() { + let usage_m = b.begin_updown_i64(metric::FILESYSTEM_USAGE, "By"); + let mut utilization_m = None; + let mut limit_m = None; + for fs in &snap.filesystems { + let total = fs.used.saturating_add(fs.free).saturating_add(fs.reserved); for (state, value) in [ ("used", fs.used), ("free", fs.free), ("reserved", fs.reserved), ] { - b.append_f64_gauge_dp(m, now, value as f64 / total_f, |w| { + b.append_i64_sum_dp(usage_m, start, now, saturating_i64(value), |w| { w.str(attr::SYSTEM_DEVICE, &fs.device); w.str(attr::SYSTEM_FILESYSTEM_STATE, state); w.str(attr::SYSTEM_FILESYSTEM_TYPE, 
&fs.fs_type); @@ -983,73 +979,94 @@ fn project_snapshot( w.str(attr::SYSTEM_FILESYSTEM_MOUNTPOINT, &fs.mountpoint); }); } - } - if let Some(limit_bytes) = fs.limit_bytes { - let m = b.begin_updown_i64(metric::FILESYSTEM_LIMIT, "By"); - b.append_i64_sum_dp(m, start, now, saturating_i64(limit_bytes), |w| { - w.str(attr::SYSTEM_DEVICE, &fs.device); - w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); - w.str(attr::SYSTEM_FILESYSTEM_MODE, fs.mode); - w.str(attr::SYSTEM_FILESYSTEM_MOUNTPOINT, &fs.mountpoint); - }); + if total > 0 { + let m = *utilization_m + .get_or_insert_with(|| b.begin_gauge_f64(metric::FILESYSTEM_UTILIZATION, "1")); + let total_f = total as f64; + for (state, value) in [ + ("used", fs.used), + ("free", fs.free), + ("reserved", fs.reserved), + ] { + b.append_f64_gauge_dp(m, now, value as f64 / total_f, |w| { + w.str(attr::SYSTEM_DEVICE, &fs.device); + w.str(attr::SYSTEM_FILESYSTEM_STATE, state); + w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); + w.str(attr::SYSTEM_FILESYSTEM_MODE, fs.mode); + w.str(attr::SYSTEM_FILESYSTEM_MOUNTPOINT, &fs.mountpoint); + }); + } + } + if let Some(limit_bytes) = fs.limit_bytes { + let m = *limit_m + .get_or_insert_with(|| b.begin_updown_i64(metric::FILESYSTEM_LIMIT, "By")); + b.append_i64_sum_dp(m, start, now, saturating_i64(limit_bytes), |w| { + w.str(attr::SYSTEM_DEVICE, &fs.device); + w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); + w.str(attr::SYSTEM_FILESYSTEM_MODE, fs.mode); + w.str(attr::SYSTEM_FILESYSTEM_MOUNTPOINT, &fs.mountpoint); + }); + } } } // ── Network ────────────────────────────────────────────────────────────── - for net in &snap.networks { - let m = b.begin_counter_i64(metric::NETWORK_IO, "By"); - for (dir, value) in [("receive", net.rx_bytes), ("transmit", net.tx_bytes)] { - b.append_i64_sum_dp( - m, - cs.get_joined(metric::NETWORK_IO, &net.name, dir, start), - now, - saturating_i64(value), - |w| { - w.str(attr::NETWORK_INTERFACE_NAME, &net.name); - w.str(attr::NETWORK_IO_DIRECTION, dir); - }, 
- ); - } - let m = b.begin_counter_i64(metric::NETWORK_PACKET_COUNT, "{packet}"); - for (dir, value) in [("receive", net.rx_packets), ("transmit", net.tx_packets)] { - b.append_i64_sum_dp( - m, - cs.get_joined(metric::NETWORK_PACKET_COUNT, &net.name, dir, start), - now, - saturating_i64(value), - |w| { - // Semconv uses system.device here, while sibling network - // metrics use network.interface.name. - w.str(attr::SYSTEM_DEVICE, &net.name); - w.str(attr::NETWORK_IO_DIRECTION, dir); - }, - ); - } - let m = b.begin_counter_i64(metric::NETWORK_PACKET_DROPPED, "{packet}"); - for (dir, value) in [("receive", net.rx_dropped), ("transmit", net.tx_dropped)] { - b.append_i64_sum_dp( - m, - cs.get_joined(metric::NETWORK_PACKET_DROPPED, &net.name, dir, start), - now, - saturating_i64(value), - |w| { - w.str(attr::NETWORK_INTERFACE_NAME, &net.name); - w.str(attr::NETWORK_IO_DIRECTION, dir); - }, - ); - } - let m = b.begin_counter_i64(metric::NETWORK_ERRORS, "{error}"); - for (dir, value) in [("receive", net.rx_errors), ("transmit", net.tx_errors)] { - b.append_i64_sum_dp( - m, - cs.get_joined(metric::NETWORK_ERRORS, &net.name, dir, start), - now, - saturating_i64(value), - |w| { - w.str(attr::NETWORK_INTERFACE_NAME, &net.name); - w.str(attr::NETWORK_IO_DIRECTION, dir); - }, - ); + if !snap.networks.is_empty() { + let io_m = b.begin_counter_i64(metric::NETWORK_IO, "By"); + let packet_count_m = b.begin_counter_i64(metric::NETWORK_PACKET_COUNT, "{packet}"); + let packet_dropped_m = b.begin_counter_i64(metric::NETWORK_PACKET_DROPPED, "{packet}"); + let errors_m = b.begin_counter_i64(metric::NETWORK_ERRORS, "{error}"); + for net in &snap.networks { + for (dir, value) in [("receive", net.rx_bytes), ("transmit", net.tx_bytes)] { + b.append_i64_sum_dp( + io_m, + cs.get_joined(metric::NETWORK_IO, &net.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str(attr::NETWORK_INTERFACE_NAME, &net.name); + w.str(attr::NETWORK_IO_DIRECTION, dir); + }, + ); + } + for (dir, value) in 
[("receive", net.rx_packets), ("transmit", net.tx_packets)] { + b.append_i64_sum_dp( + packet_count_m, + cs.get_joined(metric::NETWORK_PACKET_COUNT, &net.name, dir, start), + now, + saturating_i64(value), + |w| { + // Semconv uses system.device here, while sibling network + // metrics use network.interface.name. + w.str(attr::SYSTEM_DEVICE, &net.name); + w.str(attr::NETWORK_IO_DIRECTION, dir); + }, + ); + } + for (dir, value) in [("receive", net.rx_dropped), ("transmit", net.tx_dropped)] { + b.append_i64_sum_dp( + packet_dropped_m, + cs.get_joined(metric::NETWORK_PACKET_DROPPED, &net.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str(attr::NETWORK_INTERFACE_NAME, &net.name); + w.str(attr::NETWORK_IO_DIRECTION, dir); + }, + ); + } + for (dir, value) in [("receive", net.rx_errors), ("transmit", net.tx_errors)] { + b.append_i64_sum_dp( + errors_m, + cs.get_joined(metric::NETWORK_ERRORS, &net.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str(attr::NETWORK_INTERFACE_NAME, &net.name); + w.str(attr::NETWORK_IO_DIRECTION, dir); + }, + ); + } } } } From c00ef86ad7630851d97375e0014f518f6a2b3b2f Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Tue, 5 May 2026 11:11:04 -0700 Subject: [PATCH 49/60] Merge host metric shapes in semconv check --- .../receivers/host_metrics_receiver/procfs.rs | 99 ++++++++++--------- 1 file changed, 54 insertions(+), 45 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 7e183ebb0e..79af7ffebe 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -3378,52 +3378,61 @@ mod tests { #[cfg(feature = "dev-tools")] fn emitted_phase1_metric_shapes() -> BTreeMap { let metrics = projection_fixture_metrics(); - metrics - .iter() - .map(|metric| { - 
let (monotonic, points) = match metric.data.as_ref().expect("metric data") { - otlp_metric::Data::Sum(sum) => (Some(sum.is_monotonic), &sum.data_points), - otlp_metric::Data::Gauge(gauge) => (None, &gauge.data_points), - _ => panic!("unsupported metric data for {}", metric.name), - }; - let attributes = points - .iter() - .flat_map(|point| point.attributes.iter()) - .map(|attr| attr.key.clone()) - .collect(); - let mut attribute_values: BTreeMap> = BTreeMap::new(); - let mut attribute_types: BTreeMap = BTreeMap::new(); - for attr in points.iter().flat_map(|point| point.attributes.iter()) { - if let Some(value) = any_value_string(attr.value.as_ref()) { - let _ = attribute_values - .entry(attr.key.clone()) - .or_default() - .insert(value); - } - if let Some(kind) = any_value_kind(attr.value.as_ref()) { - let previous = attribute_types.insert(attr.key.clone(), kind); - assert!( - previous.is_none() || previous == Some(kind), - "mixed attribute value types for {} on {}", - attr.key, - metric.name - ); - } + let mut shapes = BTreeMap::new(); + for metric in &metrics { + let (monotonic, points) = match metric.data.as_ref().expect("metric data") { + otlp_metric::Data::Sum(sum) => (Some(sum.is_monotonic), &sum.data_points), + otlp_metric::Data::Gauge(gauge) => (None, &gauge.data_points), + _ => panic!("unsupported metric data for {}", metric.name), + }; + let value_type = metric_value_type(points); + let shape = shapes + .entry(metric.name.clone()) + .or_insert_with(|| MetricShape { + unit: metric.unit.clone(), + monotonic, + attributes: BTreeSet::new(), + all_attributes: BTreeSet::new(), + attribute_types: BTreeMap::new(), + enum_values: BTreeMap::new(), + value_type, + }); + assert_eq!( + shape.unit, metric.unit, + "unit mismatch across {}", + metric.name + ); + assert_eq!( + shape.monotonic, monotonic, + "instrument/temporality mismatch across {}", + metric.name + ); + assert_eq!( + shape.value_type, value_type, + "value type mismatch across {}", + metric.name + ); + 
for attr in points.iter().flat_map(|point| point.attributes.iter()) { + let _ = shape.attributes.insert(attr.key.clone()); + if let Some(value) = any_value_string(attr.value.as_ref()) { + let _ = shape + .enum_values + .entry(attr.key.clone()) + .or_default() + .insert(value); } - ( - metric.name.clone(), - MetricShape { - unit: metric.unit.clone(), - monotonic, - attributes, - all_attributes: BTreeSet::new(), - attribute_types, - enum_values: attribute_values, - value_type: metric_value_type(points), - }, - ) - }) - .collect() + if let Some(kind) = any_value_kind(attr.value.as_ref()) { + let previous = shape.attribute_types.insert(attr.key.clone(), kind); + assert!( + previous.is_none() || previous == Some(kind), + "mixed attribute value types for {} on {}", + attr.key, + metric.name + ); + } + } + } + shapes } #[cfg(feature = "dev-tools")] From 74804901d458f21bf35e9f822966271af8762983 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Tue, 5 May 2026 11:58:15 -0700 Subject: [PATCH 50/60] Group host metric datapoints by parent --- .../receivers/host_metrics_receiver/procfs.rs | 124 +++++++++++------- 1 file changed, 76 insertions(+), 48 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 79af7ffebe..1b3e7f9d2e 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -837,21 +837,23 @@ fn project_snapshot( } } if !snap.swaps.is_empty() { - let usage_m = b.begin_updown_i64(metric::PAGING_USAGE, "By"); - let mut utilization_m = None; + let m = b.begin_updown_i64(metric::PAGING_USAGE, "By"); for swap in &snap.swaps { for (state, value) in [("used", swap.used), ("free", swap.free)] { - b.append_i64_sum_dp(usage_m, start, now, saturating_i64(value), |w| { + b.append_i64_sum_dp(m, start, now, 
saturating_i64(value), |w| { w.str(attr::SYSTEM_DEVICE, &swap.name); w.str(attr::SYSTEM_PAGING_STATE, state); }); } + } + } + if snap.swaps.iter().any(|swap| swap.size > 0) { + let m = b.begin_gauge_f64(metric::PAGING_UTILIZATION, "1"); + for swap in &snap.swaps { let size = swap.size; if size == 0 { continue; } - let m = *utilization_m - .get_or_insert_with(|| b.begin_gauge_f64(metric::PAGING_UTILIZATION, "1")); let total = size as f64; for (state, value) in [("used", swap.used), ("free", swap.free)] { b.append_f64_gauge_dp(m, now, value as f64 / total, |w| { @@ -881,24 +883,23 @@ fn project_snapshot( } // ── Disk ───────────────────────────────────────────────────────────────── + if snap.disks.iter().any(|disk| disk.limit_bytes.is_some()) { + let m = b.begin_updown_i64(metric::DISK_LIMIT, "By"); + for disk in &snap.disks { + let Some(limit_bytes) = disk.limit_bytes else { + continue; + }; + b.append_i64_sum_dp(m, start, now, saturating_i64(limit_bytes), |w| { + w.str(attr::SYSTEM_DEVICE, &disk.name); + }); + } + } if !snap.disks.is_empty() { - let mut limit_m = None; - let io_m = b.begin_counter_i64(metric::DISK_IO, "By"); - let operations_m = b.begin_counter_i64(metric::DISK_OPERATIONS, "{operation}"); - let io_time_m = b.begin_counter_f64(metric::DISK_IO_TIME, "s"); - let operation_time_m = b.begin_counter_f64(metric::DISK_OPERATION_TIME, "s"); - let merged_m = b.begin_counter_i64(metric::DISK_MERGED, "{operation}"); + let m = b.begin_counter_i64(metric::DISK_IO, "By"); for disk in &snap.disks { - if let Some(limit_bytes) = disk.limit_bytes { - let m = - *limit_m.get_or_insert_with(|| b.begin_updown_i64(metric::DISK_LIMIT, "By")); - b.append_i64_sum_dp(m, start, now, saturating_i64(limit_bytes), |w| { - w.str(attr::SYSTEM_DEVICE, &disk.name); - }); - } for (dir, value) in [("read", disk.read_bytes), ("write", disk.write_bytes)] { b.append_i64_sum_dp( - io_m, + m, cs.get_joined(metric::DISK_IO, &disk.name, dir, start), now, saturating_i64(value), @@ -908,9 
+909,12 @@ fn project_snapshot( }, ); } + } + let m = b.begin_counter_i64(metric::DISK_OPERATIONS, "{operation}"); + for disk in &snap.disks { for (dir, value) in [("read", disk.read_ops), ("write", disk.write_ops)] { b.append_i64_sum_dp( - operations_m, + m, cs.get_joined(metric::DISK_OPERATIONS, &disk.name, dir, start), now, saturating_i64(value), @@ -920,8 +924,11 @@ fn project_snapshot( }, ); } + } + let m = b.begin_counter_f64(metric::DISK_IO_TIME, "s"); + for disk in &snap.disks { b.append_f64_sum_dp( - io_time_m, + m, cs.get(metric::DISK_IO_TIME, &disk.name, start), now, disk.io_time_seconds, @@ -929,12 +936,15 @@ fn project_snapshot( w.str(attr::SYSTEM_DEVICE, &disk.name); }, ); + } + let m = b.begin_counter_f64(metric::DISK_OPERATION_TIME, "s"); + for disk in &snap.disks { for (dir, value) in [ ("read", disk.read_time_seconds), ("write", disk.write_time_seconds), ] { b.append_f64_sum_dp( - operation_time_m, + m, cs.get_joined(metric::DISK_OPERATION_TIME, &disk.name, dir, start), now, value, @@ -944,9 +954,12 @@ fn project_snapshot( }, ); } + } + let m = b.begin_counter_i64(metric::DISK_MERGED, "{operation}"); + for disk in &snap.disks { for (dir, value) in [("read", disk.read_merged), ("write", disk.write_merged)] { b.append_i64_sum_dp( - merged_m, + m, cs.get_joined(metric::DISK_MERGED, &disk.name, dir, start), now, saturating_i64(value), @@ -961,17 +974,14 @@ fn project_snapshot( // ── Filesystem ─────────────────────────────────────────────────────────── if !snap.filesystems.is_empty() { - let usage_m = b.begin_updown_i64(metric::FILESYSTEM_USAGE, "By"); - let mut utilization_m = None; - let mut limit_m = None; + let m = b.begin_updown_i64(metric::FILESYSTEM_USAGE, "By"); for fs in &snap.filesystems { - let total = fs.used.saturating_add(fs.free).saturating_add(fs.reserved); for (state, value) in [ ("used", fs.used), ("free", fs.free), ("reserved", fs.reserved), ] { - b.append_i64_sum_dp(usage_m, start, now, saturating_i64(value), |w| { + 
b.append_i64_sum_dp(m, start, now, saturating_i64(value), |w| { w.str(attr::SYSTEM_DEVICE, &fs.device); w.str(attr::SYSTEM_FILESYSTEM_STATE, state); w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); @@ -979,9 +989,17 @@ fn project_snapshot( w.str(attr::SYSTEM_FILESYSTEM_MOUNTPOINT, &fs.mountpoint); }); } + } + } + if snap + .filesystems + .iter() + .any(|fs| fs.used.saturating_add(fs.free).saturating_add(fs.reserved) > 0) + { + let m = b.begin_gauge_f64(metric::FILESYSTEM_UTILIZATION, "1"); + for fs in &snap.filesystems { + let total = fs.used.saturating_add(fs.free).saturating_add(fs.reserved); if total > 0 { - let m = *utilization_m - .get_or_insert_with(|| b.begin_gauge_f64(metric::FILESYSTEM_UTILIZATION, "1")); let total_f = total as f64; for (state, value) in [ ("used", fs.used), @@ -997,29 +1015,30 @@ fn project_snapshot( }); } } - if let Some(limit_bytes) = fs.limit_bytes { - let m = *limit_m - .get_or_insert_with(|| b.begin_updown_i64(metric::FILESYSTEM_LIMIT, "By")); - b.append_i64_sum_dp(m, start, now, saturating_i64(limit_bytes), |w| { - w.str(attr::SYSTEM_DEVICE, &fs.device); - w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); - w.str(attr::SYSTEM_FILESYSTEM_MODE, fs.mode); - w.str(attr::SYSTEM_FILESYSTEM_MOUNTPOINT, &fs.mountpoint); - }); - } + } + } + if snap.filesystems.iter().any(|fs| fs.limit_bytes.is_some()) { + let m = b.begin_updown_i64(metric::FILESYSTEM_LIMIT, "By"); + for fs in &snap.filesystems { + let Some(limit_bytes) = fs.limit_bytes else { + continue; + }; + b.append_i64_sum_dp(m, start, now, saturating_i64(limit_bytes), |w| { + w.str(attr::SYSTEM_DEVICE, &fs.device); + w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); + w.str(attr::SYSTEM_FILESYSTEM_MODE, fs.mode); + w.str(attr::SYSTEM_FILESYSTEM_MOUNTPOINT, &fs.mountpoint); + }); } } // ── Network ────────────────────────────────────────────────────────────── if !snap.networks.is_empty() { - let io_m = b.begin_counter_i64(metric::NETWORK_IO, "By"); - let packet_count_m = 
b.begin_counter_i64(metric::NETWORK_PACKET_COUNT, "{packet}"); - let packet_dropped_m = b.begin_counter_i64(metric::NETWORK_PACKET_DROPPED, "{packet}"); - let errors_m = b.begin_counter_i64(metric::NETWORK_ERRORS, "{error}"); + let m = b.begin_counter_i64(metric::NETWORK_IO, "By"); for net in &snap.networks { for (dir, value) in [("receive", net.rx_bytes), ("transmit", net.tx_bytes)] { b.append_i64_sum_dp( - io_m, + m, cs.get_joined(metric::NETWORK_IO, &net.name, dir, start), now, saturating_i64(value), @@ -1029,9 +1048,12 @@ fn project_snapshot( }, ); } + } + let m = b.begin_counter_i64(metric::NETWORK_PACKET_COUNT, "{packet}"); + for net in &snap.networks { for (dir, value) in [("receive", net.rx_packets), ("transmit", net.tx_packets)] { b.append_i64_sum_dp( - packet_count_m, + m, cs.get_joined(metric::NETWORK_PACKET_COUNT, &net.name, dir, start), now, saturating_i64(value), @@ -1043,9 +1065,12 @@ fn project_snapshot( }, ); } + } + let m = b.begin_counter_i64(metric::NETWORK_PACKET_DROPPED, "{packet}"); + for net in &snap.networks { for (dir, value) in [("receive", net.rx_dropped), ("transmit", net.tx_dropped)] { b.append_i64_sum_dp( - packet_dropped_m, + m, cs.get_joined(metric::NETWORK_PACKET_DROPPED, &net.name, dir, start), now, saturating_i64(value), @@ -1055,9 +1080,12 @@ fn project_snapshot( }, ); } + } + let m = b.begin_counter_i64(metric::NETWORK_ERRORS, "{error}"); + for net in &snap.networks { for (dir, value) in [("receive", net.rx_errors), ("transmit", net.tx_errors)] { b.append_i64_sum_dp( - errors_m, + m, cs.get_joined(metric::NETWORK_ERRORS, &net.name, dir, start), now, saturating_i64(value), From c9d8b2c55c2551274e765f56360a4945dbe2a8f1 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Tue, 5 May 2026 12:04:33 -0700 Subject: [PATCH 51/60] Align host metrics docs with implementation --- .../receivers/host_metrics_receiver/README.md | 154 ++++++++++++++---- .../docs/host-metrics-receiver.md | 39 +++-- 2 files changed, 140 insertions(+), 53 
deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/README.md b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/README.md index 5e3eac7e36..9646b59117 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/README.md +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/README.md @@ -1,5 +1,7 @@ # Host Metrics Receiver + + **URN:** `urn:otel:receiver:host_metrics` Linux host metrics receiver backed by procfs and sysfs. It emits OpenTelemetry @@ -11,37 +13,91 @@ network, and aggregate process counts. Minimal configuration: ```yaml -receivers: - host_metrics: - collection_interval: 10s +groups: + host: + pipelines: + collect: + policies: + resources: + core_allocation: + type: core_count + count: 1 + nodes: + host_metrics: + type: receiver:host_metrics + config: + collection_interval: 10s + publish: + type: exporter:topic + config: + topic: host_metrics + connections: + - from: host_metrics + to: publish ``` Collect from a host root mounted into a container: ```yaml -receivers: - host_metrics: - collection_interval: 10s - host_view: - root_path: /host - validation: fail_selected +groups: + host: + pipelines: + collect: + policies: + resources: + core_allocation: + type: core_count + count: 1 + nodes: + host_metrics: + type: receiver:host_metrics + config: + collection_interval: 10s + host_view: + root_path: /host + validation: fail_selected + publish: + type: exporter:topic + config: + topic: host_metrics + connections: + - from: host_metrics + to: publish ``` Enable selected opt-in metrics: ```yaml -receivers: - host_metrics: - families: - cpu: - utilization: true - memory: - limit: true - hugepages: true - disk: - limit: true - filesystem: - limit: true +groups: + host: + pipelines: + collect: + policies: + resources: + core_allocation: + type: core_count + count: 1 + nodes: + host_metrics: + type: receiver:host_metrics + config: + 
families: + cpu: + utilization: true + memory: + limit: true + hugepages: true + disk: + limit: true + filesystem: + limit: true + publish: + type: exporter:topic + config: + topic: host_metrics + connections: + - from: host_metrics + to: publish ``` ## Configuration Options @@ -65,34 +121,60 @@ receivers: Families are `cpu`, `memory`, `paging`, `system`, `disk`, `filesystem`, `network`, and `processes`. +Host-wide collection must run in a one-core source pipeline. Use a topic +exporter to fan out to multicore downstream processing when needed. + ## Filters Disk, filesystem, and network families support include and exclude filters. Filter `match_type` values are `strict`, `glob`, and `regexp`. ```yaml -receivers: - host_metrics: - families: - disk: - exclude: - match_type: glob - devices: ["loop*", "ram*"] - network: - exclude: - match_type: strict - interfaces: ["lo"] - filesystem: - exclude_fs_types: - match_type: strict - fs_types: ["tmpfs", "proc", "sysfs"] +groups: + host: + pipelines: + collect: + policies: + resources: + core_allocation: + type: core_count + count: 1 + nodes: + host_metrics: + type: receiver:host_metrics + config: + families: + disk: + exclude: + match_type: glob + devices: ["loop*", "ram*"] + network: + exclude: + match_type: strict + interfaces: ["lo"] + filesystem: + exclude_fs_types: + match_type: strict + fs_types: ["tmpfs", "proc", "sysfs"] + publish: + type: exporter:topic + config: + topic: host_metrics + connections: + - from: host_metrics + to: publish ``` ## Current Limits - Linux only. +- Load metrics are not emitted in v1 because Semantic Conventions 1.41.0 does + not register a system load metric. - `families.cpu.per_cpu` is rejected in v1. - `families.network.include_connection_count` is rejected in v1. - Process metrics are aggregate host summaries, not per-process scrapes. +- `system.process.count` emits the registered `process.state=running` summary. 
+ Linux `procs_blocked` is parsed but not emitted because `blocked` is not a + registered `process.state` value. - Filesystem collection can time out individual `statvfs` calls; avoid enabling remote filesystems unless the host environment is known to be healthy. diff --git a/rust/otap-dataflow/docs/host-metrics-receiver.md b/rust/otap-dataflow/docs/host-metrics-receiver.md index 08c38875d5..21d47d2c10 100644 --- a/rust/otap-dataflow/docs/host-metrics-receiver.md +++ b/rust/otap-dataflow/docs/host-metrics-receiver.md @@ -10,7 +10,7 @@ Receiver URN: `urn:otel:receiver:host_metrics` Target crate: `crates/core-nodes` -Target module: `crates/core-nodes/src/receivers/host_metrics` +Target module: `crates/core-nodes/src/receivers/host_metrics_receiver` The issue explicitly asks for `core-nodes`. If maintainers prefer to stage this receiver in `contrib-nodes` while the implementation and system semantic @@ -99,7 +99,7 @@ partial scrape behavior that emits successfully collected metrics. Use a narrow module layout and keep the boundaries explicit: ```text -crates/core-nodes/src/receivers/host_metrics/ +crates/core-nodes/src/receivers/host_metrics_receiver/ mod.rs config.rs metrics.rs @@ -215,10 +215,10 @@ Rules: - `include_connection_count: true` is invalid in v1. - `processes.mode` only accepts `summary` in v1. - `processes.mode: summary` emits `system.process.count` and - `system.process.created`; `system.process.count` is limited to `running` and - `blocked` states from `/proc/stat`. `blocked` is a documented custom - `process.state` value because the current registry has no well-known value - for `procs_blocked`. It must not emit per-PID series or PID attributes. + `system.process.created`; `system.process.count` is limited to registered + `process.state` values. The v1 implementation emits `running` from + `/proc/stat`. It parses `procs_blocked` but does not emit it because + `blocked` is not a registered `process.state` value. 
- The load family is not shown in the default example because Semantic Conventions 1.41.0 does not register a load metric. If maintainers choose an experimental Linux load metric, add it as an explicit opt-in. @@ -355,10 +355,13 @@ must not call blocking `statfs` directly on the receiver task. Use a bounded blocking worker path with per-mount timeout/cancellation behavior, and skip remote filesystems plus known virtual filesystem types by default. -For process summary, use `/proc/stat` fields first: `processes`, -`procs_running`, and `procs_blocked`. Do not walk `/proc//stat` in the v1 -default path. A per-PID walk is reserved for future richer process modes and -must tolerate PIDs disappearing between directory read and file read. +For process summary, use `/proc/stat` fields first. Project `processes` into +`system.process.created` and `procs_running` into `system.process.count` with +`process.state=running`. Parse `procs_blocked` for future use, but do not emit +it in v1 because `blocked` is not a registered `process.state` value. Do not +walk `/proc//stat` in the v1 default path. A per-PID walk is reserved for +future richer process modes and must tolerate PIDs disappearing between +directory read and file read. ## Scheduler @@ -550,7 +553,7 @@ timestamp. | `system.network.packet.count` | Yes | Counter | `{packet}` | `system.device`, `network.io.direction`. | | `system.network.packet.dropped` | Yes | Counter | `{packet}` | `network.interface.name`, `network.io.direction`. | | `system.network.errors` | Yes | Counter | `{error}` | `network.interface.name`, `network.io.direction`. | -| `system.process.count` | Yes | UpDownCounter | `{process}` | `process.state`; v1 summary emits `running` and custom `blocked` from `/proc/stat`. | +| `system.process.count` | Yes | UpDownCounter | `{process}` | `process.state`; v1 summary emits `running` from `/proc/stat`. `procs_blocked` is parsed but not emitted because `blocked` is not a registered value. 
| | `system.process.created` | Yes | Counter | `{process}` | Cumulative process creations from `/proc/stat`. | CPU time and utilization aggregate across logical CPUs by default because @@ -646,14 +649,16 @@ Initial metric set: | `families_scraped` | Counter | `{family}` | Count due families processed. | | `scrape_duration_ns` | Mmsc | `ns` | Scrape duration distribution. | | `scrape_lag_ns` | Mmsc | `ns` | Scheduled time to actual start. | -| `source_read_errors` | Counter | `{error}` | Attributes: `family`, `error_class`. | -| `partial_errors` | Counter | `{error}` | Attributes: `family`, `error_class`. | +| `source_read_errors` | Counter | `{error}` | Total source read errors seen during scrapes. | +| `partial_errors` | Counter | `{error}` | Source read errors skipped because other families succeeded. | | `batches_sent` | Counter | `{batch}` | Downstream sends. | -| `send_failures` | Counter | `{error}` | Attribute: `error_class`. | +| `send_failures` | Counter | `{error}` | Downstream send failures. | -Use `#[attribute_set(name = "...")]` for the low-cardinality attribute set -covering `family` and `error_class`. Do not put source paths or device names -into receiver self-observability metric attributes. +The current internal `MetricSet` API does not support attributes on individual +metric observations. The implementation therefore uses aggregate counters and a +code TODO to decide whether fixed per-family/error-class counters are needed +later. Do not put source paths or device names into receiver +self-observability metric names or attributes. 
## Validation Plan From 4a734c0a8bb3f49fec946b5d942c536a28cf8a1c Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Tue, 5 May 2026 14:07:27 -0700 Subject: [PATCH 52/60] Fix host metrics partial scrape test --- .../core-nodes/src/receivers/host_metrics_receiver/procfs.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index 1b3e7f9d2e..ebf39cd22d 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -2620,6 +2620,9 @@ mod tests { "MemTotal: 1000 kB\nMemFree: 100 kB\nMemAvailable: 200 kB\n", ) .expect("meminfo"); + // Cumulative metrics read /proc/stat once to cache boot time. Provide + // btime here so this test only exercises the missing diskstats error. + std::fs::write(proc.join("stat"), "btime 1700000000\n").expect("stat"); let mut source = ProcfsSource::new( Some(root.path()), ProcfsConfig { From a8a36ecc6d5d8d6a1f5cc9071b1df32e8a31023e Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Tue, 5 May 2026 14:35:57 -0700 Subject: [PATCH 53/60] Fix host metrics netdev test fixture --- .../core-nodes/src/receivers/host_metrics_receiver/procfs.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs index ebf39cd22d..36552c2d85 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs @@ -2780,7 +2780,7 @@ mod tests { let root = tempfile::tempdir().expect("tempdir"); let proc = root.path().join("proc"); let proc_one = proc.join("1"); - 
std::fs::create_dir_all(&proc_one).expect("proc dirs"); + std::fs::create_dir_all(proc_one.join("net")).expect("proc dirs"); std::fs::write(proc.join("stat"), "btime 123\n").expect("stat"); std::fs::write( proc.join("diskstats"), From 1e8e42288d4aca0f5e4628e5efedf590f31dcdc2 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Tue, 5 May 2026 17:17:21 -0700 Subject: [PATCH 54/60] Accept Unix host roots on Windows tests --- .../core-nodes/src/receivers/host_metrics_receiver/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 65a0fe68cf..3ead8bce28 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -1194,7 +1194,8 @@ impl Drop for HostMetricsLease { fn normalized_root_path(root_path: Option<&Path>) -> Result { let path = root_path.unwrap_or_else(|| Path::new("/")); - if !path.is_absolute() { + let path_text = path.to_string_lossy(); + if !path.is_absolute() && !path_text.starts_with('/') { return Err(otap_df_config::error::Error::InvalidUserConfig { error: format!("root_path must be absolute: {}", path.display()), }); From 131b7d6d06b1f641f6dac5892f06c05b2829dbcd Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Wed, 6 May 2026 20:23:02 -0700 Subject: [PATCH 55/60] Address host metrics review feedback --- .../receivers/host_metrics_receiver/config.rs | 835 ++++ .../receivers/host_metrics_receiver/mod.rs | 948 +--- .../receivers/host_metrics_receiver/procfs.rs | 3796 ----------------- .../host_metrics_receiver/procfs/mod.rs | 535 +++ .../host_metrics_receiver/procfs/paths.rs | 93 + .../procfs/projection.rs | 975 +++++ .../host_metrics_receiver/procfs/readings.rs | 768 ++++ .../host_metrics_receiver/procfs/tests.rs | 1742 ++++++++ 8 files changed, 5020 
insertions(+), 4672 deletions(-) create mode 100644 rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/config.rs delete mode 100644 rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs create mode 100644 rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/mod.rs create mode 100644 rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/paths.rs create mode 100644 rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/projection.rs create mode 100644 rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/readings.rs create mode 100644 rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/config.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/config.rs new file mode 100644 index 0000000000..e93948a651 --- /dev/null +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/config.rs @@ -0,0 +1,835 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Configuration types for the host metrics receiver. + +use regex::RegexSet; +use serde::{Deserialize, Serialize}; +use std::path::{Component, Path, PathBuf}; +use std::time::Duration; + +fn default_collection_interval() -> Duration { + Duration::from_secs(10) +} + +fn default_root_path() -> PathBuf { + PathBuf::from("/") +} + +/// Configuration for the host metrics receiver. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(deny_unknown_fields)] +pub struct Config { + /// Collection interval. + #[serde(default = "default_collection_interval", with = "humantime_serde")] + pub collection_interval: Duration, + + /// Delay before the first scrape. + #[serde(default, with = "humantime_serde")] + pub initial_delay: Duration, + + /// Optional legacy host root path. 
Prefer `host_view.root_path`. + #[serde(default)] + pub root_path: Option, + + /// Host filesystem view. + #[serde(default)] + pub host_view: HostViewConfig, + + /// Metric family configuration. + #[serde(default)] + pub families: FamiliesConfig, +} + +impl Default for Config { + fn default() -> Self { + Self { + collection_interval: default_collection_interval(), + initial_delay: Duration::ZERO, + root_path: None, + host_view: HostViewConfig::default(), + families: FamiliesConfig::default(), + } + } +} + +/// Host filesystem view configuration. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct HostViewConfig { + /// Root path for the observed host filesystem. + #[serde(default = "default_root_path")] + pub root_path: PathBuf, + /// Startup validation mode. + pub validation: HostViewValidationMode, +} + +impl Default for HostViewConfig { + fn default() -> Self { + Self { + root_path: default_root_path(), + validation: HostViewValidationMode::FailSelected, + } + } +} + +/// Host view startup validation mode. +#[derive(Clone, Copy, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum HostViewValidationMode { + /// Fail startup if selected sources are unavailable. + #[default] + FailSelected, + /// Start and disable unavailable selected sources. + WarnSelected, + /// Skip startup validation. + None, +} + +/// Metric family configuration. +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct FamiliesConfig { + /// CPU metrics. + pub cpu: CpuFamilyConfig, + /// Memory metrics. + pub memory: MemoryFamilyConfig, + /// Paging metrics. + pub paging: FamilyConfig, + /// System metrics. + pub system: FamilyConfig, + /// Disk metrics. + pub disk: DiskFamilyConfig, + /// Filesystem metrics. + pub filesystem: FilesystemFamilyConfig, + /// Network metrics. + pub network: NetworkFamilyConfig, + /// Process summary metrics. 
+ pub processes: ProcessesFamilyConfig, +} + +impl FamiliesConfig { + fn enabled_count(&self) -> usize { + usize::from(self.cpu.enabled) + + usize::from(self.memory.enabled) + + usize::from(self.paging.enabled) + + usize::from(self.system.enabled) + + usize::from(self.disk.enabled) + + usize::from(self.filesystem.enabled) + + usize::from(self.network.enabled) + + usize::from(self.processes.enabled) + } +} + +/// CPU family config. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct CpuFamilyConfig { + /// Enable CPU metrics. + pub enabled: bool, + /// Family collection interval. Defaults to top-level `collection_interval`. + #[serde(default, with = "humantime_serde::option")] + pub interval: Option, + /// Emit per-logical-CPU time series. Not supported in v1. + pub per_cpu: bool, + /// Emit aggregate CPU utilization derived from CPU time deltas. + pub utilization: bool, +} + +impl Default for CpuFamilyConfig { + fn default() -> Self { + Self { + enabled: true, + interval: None, + per_cpu: false, + utilization: false, + } + } +} + +/// Common family config. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct FamilyConfig { + /// Enable this family. + pub enabled: bool, + /// Family collection interval. Defaults to top-level `collection_interval`. + #[serde(default, with = "humantime_serde::option")] + pub interval: Option, +} + +impl Default for FamilyConfig { + fn default() -> Self { + Self { + enabled: true, + interval: None, + } + } +} + +/// Memory family config. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct MemoryFamilyConfig { + /// Enable memory metrics. + pub enabled: bool, + /// Family collection interval. Defaults to top-level `collection_interval`. + #[serde(default, with = "humantime_serde::option")] + pub interval: Option, + /// Enable memory limit metrics. 
+ pub limit: bool, + /// Enable Linux shared memory metric. + pub shared: bool, + /// Enable Linux hugepage metrics. + pub hugepages: bool, +} + +impl Default for MemoryFamilyConfig { + fn default() -> Self { + Self { + enabled: true, + interval: None, + limit: false, + shared: false, + hugepages: false, + } + } +} + +/// Disk family config. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct DiskFamilyConfig { + /// Enable disk metrics. + pub enabled: bool, + /// Family collection interval. Defaults to top-level `collection_interval`. + #[serde(default, with = "humantime_serde::option")] + pub interval: Option, + /// Enable disk limit metrics. + pub limit: bool, + /// Device include filter. + pub include: Option, + /// Device exclude filter. + pub exclude: Option, +} + +impl Default for DiskFamilyConfig { + fn default() -> Self { + Self { + enabled: true, + interval: None, + limit: false, + include: None, + exclude: None, + } + } +} + +/// Filesystem family config. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct FilesystemFamilyConfig { + /// Enable filesystem metrics. + pub enabled: bool, + /// Family collection interval. Defaults to top-level `collection_interval`. + #[serde(default, with = "humantime_serde::option")] + pub interval: Option, + /// Include virtual filesystems. + pub include_virtual_filesystems: bool, + /// Enable filesystem limit metrics. + pub limit: bool, + /// Device include filter. + pub include_devices: Option, + /// Device exclude filter. + pub exclude_devices: Option, + /// Filesystem type include filter. + pub include_fs_types: Option, + /// Filesystem type exclude filter. + pub exclude_fs_types: Option, + /// Mount point include filter. + pub include_mount_points: Option, + /// Mount point exclude filter. 
+ pub exclude_mount_points: Option, +} + +impl Default for FilesystemFamilyConfig { + fn default() -> Self { + Self { + enabled: true, + interval: None, + include_virtual_filesystems: false, + limit: false, + include_devices: None, + exclude_devices: None, + include_fs_types: None, + exclude_fs_types: None, + include_mount_points: None, + exclude_mount_points: None, + } + } +} + +/// Filesystem type filter config. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct FilesystemTypeFilterConfig { + /// Filesystem types. + pub fs_types: Vec, + /// Match type. + pub match_type: MatchType, +} + +impl Default for FilesystemTypeFilterConfig { + fn default() -> Self { + Self { + fs_types: Vec::new(), + match_type: MatchType::Strict, + } + } +} + +/// Mount point filter config. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct MountPointFilterConfig { + /// Mount points. + pub mount_points: Vec, + /// Match type. + pub match_type: MatchType, +} + +impl Default for MountPointFilterConfig { + fn default() -> Self { + Self { + mount_points: Vec::new(), + match_type: MatchType::Strict, + } + } +} + +/// Network family config. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct NetworkFamilyConfig { + /// Enable network metrics. + pub enabled: bool, + /// Family collection interval. Defaults to top-level `collection_interval`. + #[serde(default, with = "humantime_serde::option")] + pub interval: Option, + /// Interface include filter. + pub include: Option, + /// Interface exclude filter. + pub exclude: Option, + /// Connection count is not supported in v1. + pub include_connection_count: bool, +} + +impl Default for NetworkFamilyConfig { + fn default() -> Self { + Self { + enabled: true, + interval: None, + include: None, + exclude: None, + include_connection_count: false, + } + } +} + +/// Process family config. 
+#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct ProcessesFamilyConfig { + /// Enable process summary metrics. + pub enabled: bool, + /// Family collection interval. Defaults to top-level `collection_interval`. + #[serde(default, with = "humantime_serde::option")] + pub interval: Option, + /// Only `summary` is supported in v1. + pub mode: ProcessMode, +} + +impl Default for ProcessesFamilyConfig { + fn default() -> Self { + Self { + enabled: true, + interval: None, + mode: ProcessMode::Summary, + } + } +} + +/// Process collection mode. +#[derive(Clone, Copy, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ProcessMode { + /// Aggregate host process summary. + #[default] + Summary, +} + +/// Disk device filter. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(deny_unknown_fields)] +pub struct DeviceFilterConfig { + /// Device names. + pub devices: Vec, + /// Match type. + #[serde(default)] + pub match_type: MatchType, +} + +/// Network interface filter. +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(deny_unknown_fields)] +pub struct InterfaceFilterConfig { + /// Interface names. + pub interfaces: Vec, + /// Match type. + #[serde(default)] + pub match_type: MatchType, +} + +/// Filter match type. +#[derive(Clone, Copy, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum MatchType { + /// Exact string match. + #[default] + Strict, + /// Glob match with `*` and `?`. + Glob, + /// Regular expression match. 
+ Regexp, +} + +#[derive(Clone)] +pub(super) struct RuntimeConfig { + pub(super) root_path: PathBuf, + pub(super) validation: HostViewValidationMode, + pub(super) initial_delay: Duration, + pub(super) cpu_utilization: bool, + pub(super) memory_limit: bool, + pub(super) memory_shared: bool, + pub(super) memory_hugepages: bool, + pub(super) families: RuntimeFamilies, +} + +#[derive(Clone)] +pub(super) struct RuntimeFamilies { + pub(super) cpu: RuntimeFamily, + pub(super) memory: RuntimeFamily, + pub(super) paging: RuntimeFamily, + pub(super) system: RuntimeFamily, + pub(super) disk: RuntimeDiskFamily, + pub(super) filesystem: RuntimeFilesystemFamily, + pub(super) network: RuntimeNetworkFamily, + pub(super) processes: RuntimeFamily, +} + +#[derive(Clone)] +pub(super) struct RuntimeFamily { + pub(super) enabled: bool, + pub(super) interval: Duration, +} + +#[derive(Clone)] +pub(super) struct RuntimeDiskFamily { + pub(super) enabled: bool, + pub(super) interval: Duration, + pub(super) limit: bool, + pub(super) include: Option, + pub(super) exclude: Option, +} + +#[derive(Clone)] +pub(super) struct RuntimeFilesystemFamily { + pub(super) enabled: bool, + pub(super) interval: Duration, + pub(super) include_virtual_filesystems: bool, + pub(super) limit: bool, + pub(super) include_devices: Option, + pub(super) exclude_devices: Option, + pub(super) include_fs_types: Option, + pub(super) exclude_fs_types: Option, + pub(super) include_mount_points: Option, + pub(super) exclude_mount_points: Option, +} + +#[derive(Clone)] +pub(super) struct RuntimeNetworkFamily { + pub(super) enabled: bool, + pub(super) interval: Duration, + pub(super) include: Option, + pub(super) exclude: Option, +} + +#[derive(Clone)] +pub(crate) struct CompiledFilter { + match_type: MatchType, + values: Vec, + regex_set: Option, +} + +impl CompiledFilter { + pub(super) fn compile( + match_type: MatchType, + values: Vec, + ) -> Result, otap_df_config::error::Error> { + if values.is_empty() { + return 
Ok(None); + } + let regex_set = if match_type == MatchType::Regexp { + Some(RegexSet::new(&values).map_err(|err| { + otap_df_config::error::Error::InvalidUserConfig { + error: format!("invalid host metrics regexp filter: {err}"), + } + })?) + } else { + None + }; + Ok(Some(Self { + match_type, + values, + regex_set, + })) + } + + pub(crate) fn matches(&self, value: &str) -> bool { + match self.match_type { + MatchType::Strict => self.values.iter().any(|candidate| candidate == value), + MatchType::Glob => self + .values + .iter() + .any(|candidate| glob_matches(candidate.as_bytes(), value.as_bytes())), + MatchType::Regexp => self + .regex_set + .as_ref() + .is_some_and(|regex_set| regex_set.is_match(value)), + } + } +} + +fn glob_matches(pattern: &[u8], value: &[u8]) -> bool { + let (mut p, mut v) = (0, 0); + let mut star = None; + let mut star_value = 0; + + while v < value.len() { + if p < pattern.len() && (pattern[p] == b'?' || pattern[p] == value[v]) { + p += 1; + v += 1; + } else if p < pattern.len() && pattern[p] == b'*' { + star = Some(p); + p += 1; + star_value = v; + } else if let Some(star_pos) = star { + p = star_pos + 1; + star_value += 1; + v = star_value; + } else { + return false; + } + } + + while p < pattern.len() && pattern[p] == b'*' { + p += 1; + } + p == pattern.len() +} + +pub(super) fn validate_config(config: &Config) -> Result<(), otap_df_config::error::Error> { + if config.collection_interval.is_zero() { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: "collection_interval must be greater than zero".to_owned(), + }); + } + if config.families.enabled_count() == 0 { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: "at least one host metrics family must be enabled".to_owned(), + }); + } + if config.families.network.include_connection_count { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: "network include_connection_count is not supported in v1".to_owned(), + }); + } + if 
config.families.cpu.per_cpu { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: "cpu per_cpu is not supported in v1".to_owned(), + }); + } + validate_family_interval( + "cpu", + config.families.cpu.enabled, + config.families.cpu.interval, + )?; + validate_family_interval( + "memory", + config.families.memory.enabled, + config.families.memory.interval, + )?; + validate_family_interval( + "paging", + config.families.paging.enabled, + config.families.paging.interval, + )?; + validate_family_interval( + "system", + config.families.system.enabled, + config.families.system.interval, + )?; + validate_family_interval( + "disk", + config.families.disk.enabled, + config.families.disk.interval, + )?; + validate_family_interval( + "filesystem", + config.families.filesystem.enabled, + config.families.filesystem.interval, + )?; + validate_family_interval( + "network", + config.families.network.enabled, + config.families.network.interval, + )?; + validate_family_interval( + "processes", + config.families.processes.enabled, + config.families.processes.interval, + )?; + let _ = normalized_root_path(Some(effective_root_path(config)?))?; + Ok(()) +} + +pub(super) fn effective_root_path(config: &Config) -> Result<&Path, otap_df_config::error::Error> { + if let Some(root_path) = config.root_path.as_deref() { + let host_view_root = config.host_view.root_path.as_path(); + if host_view_root != Path::new("/") && root_path != host_view_root { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: "root_path and host_view.root_path cannot both be set to different values" + .to_owned(), + }); + } + Ok(root_path) + } else { + Ok(config.host_view.root_path.as_path()) + } +} + +fn validate_family_interval( + family: &'static str, + enabled: bool, + interval: Option, +) -> Result<(), otap_df_config::error::Error> { + if enabled && interval.is_some_and(|interval| interval.is_zero()) { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: 
format!("{family} interval must be greater than zero"), + }); + } + Ok(()) +} + +impl TryFrom for RuntimeConfig { + type Error = otap_df_config::error::Error; + + fn try_from(config: Config) -> Result { + validate_config(&config)?; + let root_path = normalized_root_path(Some(effective_root_path(&config)?))?; + let disk_include = config + .families + .disk + .include + .map(|filter| CompiledFilter::compile(filter.match_type, filter.devices)) + .transpose()? + .flatten(); + let disk_exclude = config + .families + .disk + .exclude + .map(|filter| CompiledFilter::compile(filter.match_type, filter.devices)) + .transpose()? + .flatten(); + let filesystem_include_devices = config + .families + .filesystem + .include_devices + .map(|filter| CompiledFilter::compile(filter.match_type, filter.devices)) + .transpose()? + .flatten(); + let filesystem_exclude_devices = config + .families + .filesystem + .exclude_devices + .map(|filter| CompiledFilter::compile(filter.match_type, filter.devices)) + .transpose()? + .flatten(); + let filesystem_include_fs_types = config + .families + .filesystem + .include_fs_types + .map(|filter| CompiledFilter::compile(filter.match_type, filter.fs_types)) + .transpose()? + .flatten(); + let filesystem_exclude_fs_types = config + .families + .filesystem + .exclude_fs_types + .map(|filter| CompiledFilter::compile(filter.match_type, filter.fs_types)) + .transpose()? + .flatten(); + let filesystem_include_mount_points = config + .families + .filesystem + .include_mount_points + .map(|filter| CompiledFilter::compile(filter.match_type, filter.mount_points)) + .transpose()? + .flatten(); + let filesystem_exclude_mount_points = config + .families + .filesystem + .exclude_mount_points + .map(|filter| CompiledFilter::compile(filter.match_type, filter.mount_points)) + .transpose()? + .flatten(); + let network_include = config + .families + .network + .include + .map(|filter| CompiledFilter::compile(filter.match_type, filter.interfaces)) + .transpose()? 
+ .flatten(); + let network_exclude = config + .families + .network + .exclude + .map(|filter| CompiledFilter::compile(filter.match_type, filter.interfaces)) + .transpose()? + .flatten(); + + Ok(Self { + root_path, + validation: config.host_view.validation, + initial_delay: config.initial_delay, + cpu_utilization: config.families.cpu.utilization, + memory_limit: config.families.memory.limit, + memory_shared: config.families.memory.shared, + memory_hugepages: config.families.memory.hugepages, + families: RuntimeFamilies { + cpu: RuntimeFamily::new_cpu(&config.families.cpu, config.collection_interval), + memory: RuntimeFamily::new_memory( + &config.families.memory, + config.collection_interval, + ), + paging: RuntimeFamily::new(&config.families.paging, config.collection_interval), + system: RuntimeFamily::new(&config.families.system, config.collection_interval), + disk: RuntimeDiskFamily { + enabled: config.families.disk.enabled, + interval: config + .families + .disk + .interval + .unwrap_or(config.collection_interval), + limit: config.families.disk.limit, + include: disk_include, + exclude: disk_exclude, + }, + filesystem: RuntimeFilesystemFamily { + enabled: config.families.filesystem.enabled, + interval: config + .families + .filesystem + .interval + .unwrap_or(config.collection_interval), + include_virtual_filesystems: config + .families + .filesystem + .include_virtual_filesystems, + limit: config.families.filesystem.limit, + include_devices: filesystem_include_devices, + exclude_devices: filesystem_exclude_devices, + include_fs_types: filesystem_include_fs_types, + exclude_fs_types: filesystem_exclude_fs_types, + include_mount_points: filesystem_include_mount_points, + exclude_mount_points: filesystem_exclude_mount_points, + }, + network: RuntimeNetworkFamily { + enabled: config.families.network.enabled, + interval: config + .families + .network + .interval + .unwrap_or(config.collection_interval), + include: network_include, + exclude: network_exclude, + }, + 
processes: RuntimeFamily { + enabled: config.families.processes.enabled, + interval: config + .families + .processes + .interval + .unwrap_or(config.collection_interval), + }, + }, + }) + } +} + +impl RuntimeFamily { + fn new(config: &FamilyConfig, default_interval: Duration) -> Self { + Self { + enabled: config.enabled, + interval: config.interval.unwrap_or(default_interval), + } + } + + fn new_cpu(config: &CpuFamilyConfig, default_interval: Duration) -> Self { + Self { + enabled: config.enabled, + interval: config.interval.unwrap_or(default_interval), + } + } + + fn new_memory(config: &MemoryFamilyConfig, default_interval: Duration) -> Self { + Self { + enabled: config.enabled, + interval: config.interval.unwrap_or(default_interval), + } + } +} + +pub(super) fn normalized_root_path( + root_path: Option<&Path>, +) -> Result { + let path = root_path.unwrap_or_else(|| Path::new("/")); + let path_text = path.to_string_lossy(); + if !path.is_absolute() && !path_text.starts_with('/') { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: format!("root_path must be absolute: {}", path.display()), + }); + } + + let mut normalized = PathBuf::from("/"); + for component in path.components() { + match component { + Component::RootDir => {} + Component::Normal(part) => normalized.push(part), + Component::CurDir | Component::ParentDir => { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: format!( + "root_path must not contain . or .. 
components: {}", + path.display() + ), + }); + } + Component::Prefix(_) => { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: format!("root_path must be a Unix absolute path: {}", path.display()), + }); + } + } + } + Ok(normalized) +} diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 3ead8bce28..31d22118d7 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -23,7 +23,6 @@ use otap_df_engine::error::{Error, ReceiverErrorKind, TypedError}; #[cfg(target_os = "linux")] use otap_df_engine::local::receiver as local; use otap_df_engine::node::NodeId; -#[cfg(target_os = "linux")] use otap_df_engine::receiver::ReceiverWrapper; #[cfg(target_os = "linux")] use otap_df_engine::terminal_state::TerminalState; @@ -38,20 +37,20 @@ use otap_df_telemetry::metrics::MetricSetSnapshot; #[cfg(target_os = "linux")] use otap_df_telemetry::{otel_info, otel_warn}; use otap_df_telemetry_macros::metric_set; -use regex::Regex; -use serde::{Deserialize, Serialize}; #[cfg(target_os = "linux")] use serde_json::Value; use std::collections::HashSet; -use std::path::{Component, Path, PathBuf}; +use std::path::PathBuf; use std::sync::Arc; use std::sync::{LazyLock, Mutex}; +#[cfg(any(target_os = "linux", test))] use std::time::Duration; #[cfg(target_os = "linux")] use std::time::Instant as StdInstant; #[cfg(target_os = "linux")] use tokio::time::{Instant, sleep_until}; +mod config; #[cfg(target_os = "linux")] mod otap_builder; #[cfg(target_os = "linux")] @@ -62,6 +61,20 @@ mod semconv; #[cfg(target_os = "linux")] use procfs::{HostSnapshot, ProcfsConfig, ProcfsFamilies, ProcfsSource}; +#[cfg(any(target_os = "linux", test))] +pub(crate) use config::CompiledFilter; +use config::RuntimeConfig; +#[cfg(any(target_os = "linux", 
test))] +use config::validate_config; +pub use config::{ + Config, CpuFamilyConfig, DeviceFilterConfig, DiskFamilyConfig, FamiliesConfig, FamilyConfig, + FilesystemFamilyConfig, FilesystemTypeFilterConfig, HostViewConfig, HostViewValidationMode, + InterfaceFilterConfig, MatchType, MemoryFamilyConfig, MountPointFilterConfig, + NetworkFamilyConfig, ProcessMode, ProcessesFamilyConfig, +}; +#[cfg(target_os = "linux")] +use config::{RuntimeFamily, effective_root_path, normalized_root_path}; + /// The URN for the host metrics receiver. pub const HOST_METRICS_RECEIVER_URN: &str = "urn:otel:receiver:host_metrics"; @@ -103,540 +116,6 @@ pub struct HostMetricsReceiverMetrics { pub send_failures: Counter, } -fn default_collection_interval() -> Duration { - Duration::from_secs(10) -} - -fn default_root_path() -> PathBuf { - PathBuf::from("/") -} - -/// Configuration for the host metrics receiver. -#[derive(Clone, Debug, Deserialize, Serialize)] -#[serde(deny_unknown_fields)] -pub struct Config { - /// Collection interval. - #[serde(default = "default_collection_interval", with = "humantime_serde")] - pub collection_interval: Duration, - - /// Delay before the first scrape. - #[serde(default, with = "humantime_serde")] - pub initial_delay: Duration, - - /// Optional legacy host root path. Prefer `host_view.root_path`. - #[serde(default)] - pub root_path: Option, - - /// Host filesystem view. - #[serde(default)] - pub host_view: HostViewConfig, - - /// Metric family configuration. - #[serde(default)] - pub families: FamiliesConfig, -} - -impl Default for Config { - fn default() -> Self { - Self { - collection_interval: default_collection_interval(), - initial_delay: Duration::ZERO, - root_path: None, - host_view: HostViewConfig::default(), - families: FamiliesConfig::default(), - } - } -} - -/// Host filesystem view configuration. 
-#[derive(Clone, Debug, Deserialize, Serialize)] -#[serde(default, deny_unknown_fields)] -pub struct HostViewConfig { - /// Root path for the observed host filesystem. - #[serde(default = "default_root_path")] - pub root_path: PathBuf, - /// Startup validation mode. - pub validation: HostViewValidationMode, -} - -impl Default for HostViewConfig { - fn default() -> Self { - Self { - root_path: default_root_path(), - validation: HostViewValidationMode::FailSelected, - } - } -} - -/// Host view startup validation mode. -#[derive(Clone, Copy, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] -#[serde(rename_all = "snake_case")] -pub enum HostViewValidationMode { - /// Fail startup if selected sources are unavailable. - #[default] - FailSelected, - /// Start and disable unavailable selected sources. - WarnSelected, - /// Skip startup validation. - None, -} - -/// Metric family configuration. -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -#[serde(default, deny_unknown_fields)] -pub struct FamiliesConfig { - /// CPU metrics. - pub cpu: CpuFamilyConfig, - /// Memory metrics. - pub memory: MemoryFamilyConfig, - /// Paging metrics. - pub paging: FamilyConfig, - /// System metrics. - pub system: FamilyConfig, - /// Disk metrics. - pub disk: DiskFamilyConfig, - /// Filesystem metrics. - pub filesystem: FilesystemFamilyConfig, - /// Network metrics. - pub network: NetworkFamilyConfig, - /// Process summary metrics. - pub processes: ProcessesFamilyConfig, -} - -impl FamiliesConfig { - fn enabled_count(&self) -> usize { - usize::from(self.cpu.enabled) - + usize::from(self.memory.enabled) - + usize::from(self.paging.enabled) - + usize::from(self.system.enabled) - + usize::from(self.disk.enabled) - + usize::from(self.filesystem.enabled) - + usize::from(self.network.enabled) - + usize::from(self.processes.enabled) - } -} - -/// CPU family config. 
-#[derive(Clone, Debug, Deserialize, Serialize)] -#[serde(default, deny_unknown_fields)] -pub struct CpuFamilyConfig { - /// Enable CPU metrics. - pub enabled: bool, - /// Family collection interval. Defaults to top-level `collection_interval`. - #[serde(default, with = "humantime_serde::option")] - pub interval: Option, - /// Emit per-logical-CPU time series. Not supported in v1. - pub per_cpu: bool, - /// Emit aggregate CPU utilization derived from CPU time deltas. - pub utilization: bool, -} - -impl Default for CpuFamilyConfig { - fn default() -> Self { - Self { - enabled: true, - interval: None, - per_cpu: false, - utilization: false, - } - } -} - -/// Common family config. -#[derive(Clone, Debug, Deserialize, Serialize)] -#[serde(default, deny_unknown_fields)] -pub struct FamilyConfig { - /// Enable this family. - pub enabled: bool, - /// Family collection interval. Defaults to top-level `collection_interval`. - #[serde(default, with = "humantime_serde::option")] - pub interval: Option, -} - -impl Default for FamilyConfig { - fn default() -> Self { - Self { - enabled: true, - interval: None, - } - } -} - -/// Memory family config. -#[derive(Clone, Debug, Deserialize, Serialize)] -#[serde(default, deny_unknown_fields)] -pub struct MemoryFamilyConfig { - /// Enable memory metrics. - pub enabled: bool, - /// Family collection interval. Defaults to top-level `collection_interval`. - #[serde(default, with = "humantime_serde::option")] - pub interval: Option, - /// Enable memory limit metrics. - pub limit: bool, - /// Enable Linux shared memory metric. - pub shared: bool, - /// Enable Linux hugepage metrics. - pub hugepages: bool, -} - -impl Default for MemoryFamilyConfig { - fn default() -> Self { - Self { - enabled: true, - interval: None, - limit: false, - shared: false, - hugepages: false, - } - } -} - -/// Disk family config. 
-#[derive(Clone, Debug, Deserialize, Serialize)] -#[serde(default, deny_unknown_fields)] -pub struct DiskFamilyConfig { - /// Enable disk metrics. - pub enabled: bool, - /// Family collection interval. Defaults to top-level `collection_interval`. - #[serde(default, with = "humantime_serde::option")] - pub interval: Option, - /// Enable disk limit metrics. - pub limit: bool, - /// Device include filter. - pub include: Option, - /// Device exclude filter. - pub exclude: Option, -} - -impl Default for DiskFamilyConfig { - fn default() -> Self { - Self { - enabled: true, - interval: None, - limit: false, - include: None, - exclude: None, - } - } -} - -/// Filesystem family config. -#[derive(Clone, Debug, Deserialize, Serialize)] -#[serde(default, deny_unknown_fields)] -pub struct FilesystemFamilyConfig { - /// Enable filesystem metrics. - pub enabled: bool, - /// Family collection interval. Defaults to top-level `collection_interval`. - #[serde(default, with = "humantime_serde::option")] - pub interval: Option, - /// Include virtual filesystems. - pub include_virtual_filesystems: bool, - /// Enable filesystem limit metrics. - pub limit: bool, - /// Device include filter. - pub include_devices: Option, - /// Device exclude filter. - pub exclude_devices: Option, - /// Filesystem type include filter. - pub include_fs_types: Option, - /// Filesystem type exclude filter. - pub exclude_fs_types: Option, - /// Mount point include filter. - pub include_mount_points: Option, - /// Mount point exclude filter. - pub exclude_mount_points: Option, -} - -impl Default for FilesystemFamilyConfig { - fn default() -> Self { - Self { - enabled: true, - interval: None, - include_virtual_filesystems: false, - limit: false, - include_devices: None, - exclude_devices: None, - include_fs_types: None, - exclude_fs_types: None, - include_mount_points: None, - exclude_mount_points: None, - } - } -} - -/// Filesystem type filter config. 
-#[derive(Clone, Debug, Deserialize, Serialize)] -#[serde(default, deny_unknown_fields)] -pub struct FilesystemTypeFilterConfig { - /// Filesystem types. - pub fs_types: Vec, - /// Match type. - pub match_type: MatchType, -} - -impl Default for FilesystemTypeFilterConfig { - fn default() -> Self { - Self { - fs_types: Vec::new(), - match_type: MatchType::Strict, - } - } -} - -/// Mount point filter config. -#[derive(Clone, Debug, Deserialize, Serialize)] -#[serde(default, deny_unknown_fields)] -pub struct MountPointFilterConfig { - /// Mount points. - pub mount_points: Vec, - /// Match type. - pub match_type: MatchType, -} - -impl Default for MountPointFilterConfig { - fn default() -> Self { - Self { - mount_points: Vec::new(), - match_type: MatchType::Strict, - } - } -} - -/// Network family config. -#[derive(Clone, Debug, Deserialize, Serialize)] -#[serde(default, deny_unknown_fields)] -pub struct NetworkFamilyConfig { - /// Enable network metrics. - pub enabled: bool, - /// Family collection interval. Defaults to top-level `collection_interval`. - #[serde(default, with = "humantime_serde::option")] - pub interval: Option, - /// Interface include filter. - pub include: Option, - /// Interface exclude filter. - pub exclude: Option, - /// Connection count is not supported in v1. - pub include_connection_count: bool, -} - -impl Default for NetworkFamilyConfig { - fn default() -> Self { - Self { - enabled: true, - interval: None, - include: None, - exclude: None, - include_connection_count: false, - } - } -} - -/// Process family config. -#[derive(Clone, Debug, Deserialize, Serialize)] -#[serde(default, deny_unknown_fields)] -pub struct ProcessesFamilyConfig { - /// Enable process summary metrics. - pub enabled: bool, - /// Family collection interval. Defaults to top-level `collection_interval`. - #[serde(default, with = "humantime_serde::option")] - pub interval: Option, - /// Only `summary` is supported in v1. 
- pub mode: ProcessMode, -} - -impl Default for ProcessesFamilyConfig { - fn default() -> Self { - Self { - enabled: true, - interval: None, - mode: ProcessMode::Summary, - } - } -} - -/// Process collection mode. -#[derive(Clone, Copy, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] -#[serde(rename_all = "snake_case")] -pub enum ProcessMode { - /// Aggregate host process summary. - #[default] - Summary, -} - -/// Disk device filter. -#[derive(Clone, Debug, Deserialize, Serialize)] -#[serde(deny_unknown_fields)] -pub struct DeviceFilterConfig { - /// Device names. - pub devices: Vec, - /// Match type. - #[serde(default)] - pub match_type: MatchType, -} - -/// Network interface filter. -#[derive(Clone, Debug, Deserialize, Serialize)] -#[serde(deny_unknown_fields)] -pub struct InterfaceFilterConfig { - /// Interface names. - pub interfaces: Vec, - /// Match type. - #[serde(default)] - pub match_type: MatchType, -} - -/// Filter match type. -#[derive(Clone, Copy, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] -#[serde(rename_all = "snake_case")] -pub enum MatchType { - /// Exact string match. - #[default] - Strict, - /// Glob match with `*` and `?`. - Glob, - /// Regular expression match. 
- Regexp, -} - -#[derive(Clone)] -struct RuntimeConfig { - root_path: PathBuf, - validation: HostViewValidationMode, - initial_delay: Duration, - cpu_utilization: bool, - memory_limit: bool, - memory_shared: bool, - memory_hugepages: bool, - families: RuntimeFamilies, -} - -#[derive(Clone)] -struct RuntimeFamilies { - cpu: RuntimeFamily, - memory: RuntimeFamily, - paging: RuntimeFamily, - system: RuntimeFamily, - disk: RuntimeDiskFamily, - filesystem: RuntimeFilesystemFamily, - network: RuntimeNetworkFamily, - processes: RuntimeFamily, -} - -#[derive(Clone)] -struct RuntimeFamily { - enabled: bool, - interval: Duration, -} - -#[derive(Clone)] -struct RuntimeDiskFamily { - enabled: bool, - interval: Duration, - limit: bool, - include: Option, - exclude: Option, -} - -#[derive(Clone)] -struct RuntimeFilesystemFamily { - enabled: bool, - interval: Duration, - include_virtual_filesystems: bool, - limit: bool, - include_devices: Option, - exclude_devices: Option, - include_fs_types: Option, - exclude_fs_types: Option, - include_mount_points: Option, - exclude_mount_points: Option, -} - -#[derive(Clone)] -struct RuntimeNetworkFamily { - enabled: bool, - interval: Duration, - include: Option, - exclude: Option, -} - -#[derive(Clone)] -pub(crate) struct CompiledFilter { - match_type: MatchType, - values: Vec, - regexes: Vec, -} - -impl CompiledFilter { - fn compile( - match_type: MatchType, - values: Vec, - ) -> Result, otap_df_config::error::Error> { - if values.is_empty() { - return Ok(None); - } - let regexes = if match_type == MatchType::Regexp { - let mut regexes = Vec::with_capacity(values.len()); - for value in &values { - regexes.push(Regex::new(value).map_err(|err| { - otap_df_config::error::Error::InvalidUserConfig { - error: format!("invalid host metrics regexp filter {value:?}: {err}"), - } - })?); - } - regexes - } else { - Vec::new() - }; - Ok(Some(Self { - match_type, - values, - regexes, - })) - } - - pub(crate) fn matches(&self, value: &str) -> bool { - 
match self.match_type { - MatchType::Strict => self.values.iter().any(|candidate| candidate == value), - MatchType::Glob => self - .values - .iter() - .any(|candidate| glob_matches(candidate.as_bytes(), value.as_bytes())), - MatchType::Regexp => self - .regexes - .iter() - .any(|candidate| candidate.is_match(value)), - } - } -} - -fn glob_matches(pattern: &[u8], value: &[u8]) -> bool { - let (mut p, mut v) = (0, 0); - let mut star = None; - let mut star_value = 0; - - while v < value.len() { - if p < pattern.len() && (pattern[p] == b'?' || pattern[p] == value[v]) { - p += 1; - v += 1; - } else if p < pattern.len() && pattern[p] == b'*' { - star = Some(p); - p += 1; - star_value = v; - } else if let Some(star_pos) = star { - p = star_pos + 1; - star_value += 1; - v = star_value; - } else { - return false; - } - } - - while p < pattern.len() && pattern[p] == b'*' { - p += 1; - } - p == pattern.len() -} - /// Host metrics receiver. pub struct HostMetricsReceiver { config: RuntimeConfig, @@ -649,50 +128,64 @@ pub struct HostMetricsReceiver { /// Declares the host metrics receiver as a local receiver factory. 
pub static HOST_METRICS_RECEIVER: ReceiverFactory = ReceiverFactory { name: HOST_METRICS_RECEIVER_URN, - create: |pipeline: PipelineContext, - #[cfg(target_os = "linux")] node: NodeId, - #[cfg(not(target_os = "linux"))] _node: NodeId, - #[cfg(target_os = "linux")] node_config: Arc, - #[cfg(not(target_os = "linux"))] _node_config: Arc, - #[cfg(target_os = "linux")] receiver_config: &ReceiverConfig, - #[cfg(not(target_os = "linux"))] _receiver_config: &ReceiverConfig| { - validate_supported_platform()?; - if pipeline.num_cores() > 1 { - return Err(otap_df_config::error::Error::InvalidUserConfig { - error: "host-wide collection must run in a one-core source pipeline; use receiver:host_metrics -> exporter:topic and fan out downstream".to_owned(), - }); - } - #[cfg(target_os = "linux")] - { - let mut receiver = HostMetricsReceiver::from_config(&node_config.config)?; - receiver.metrics = Some(pipeline.register_metrics::()); - Ok(ReceiverWrapper::local( - receiver, - node, - node_config, - receiver_config, - )) - } - #[cfg(not(target_os = "linux"))] - unreachable!("validate_supported_platform returned Ok on a non-Linux platform") - }, + create: create_host_metrics_receiver, wiring_contract: otap_df_engine::wiring_contract::WiringContract::UNRESTRICTED, - validate_config: |config| { - validate_supported_platform()?; - let config: Config = serde_json::from_value(config.clone()).map_err(|e| { - otap_df_config::error::Error::InvalidUserConfig { - error: e.to_string(), - } - })?; - RuntimeConfig::try_from(config).map(|_| ()) - }, + validate_config: validate_host_metrics_config, }; +#[cfg(target_os = "linux")] +fn create_host_metrics_receiver( + pipeline: PipelineContext, + node: NodeId, + node_config: Arc, + receiver_config: &ReceiverConfig, +) -> Result, otap_df_config::error::Error> { + if pipeline.num_cores() > 1 { + return Err(otap_df_config::error::Error::InvalidUserConfig { + error: "host-wide collection must run in a one-core source pipeline; use receiver:host_metrics 
-> exporter:topic and fan out downstream".to_owned(), + }); + } + let mut receiver = HostMetricsReceiver::from_config(&node_config.config)?; + receiver.metrics = Some(pipeline.register_metrics::()); + Ok(ReceiverWrapper::local( + receiver, + node, + node_config, + receiver_config, + )) +} + +#[cfg(not(target_os = "linux"))] +fn create_host_metrics_receiver( + _pipeline: PipelineContext, + _node: NodeId, + _node_config: Arc, + _receiver_config: &ReceiverConfig, +) -> Result, otap_df_config::error::Error> { + Err(unsupported_platform_error()) +} + +#[cfg(target_os = "linux")] +fn validate_host_metrics_config(config: &Value) -> Result<(), otap_df_config::error::Error> { + let config: Config = serde_json::from_value(config.clone()).map_err(|e| { + otap_df_config::error::Error::InvalidUserConfig { + error: e.to_string(), + } + })?; + RuntimeConfig::try_from(config).map(|_| ()) +} + +#[cfg(not(target_os = "linux"))] +fn validate_host_metrics_config( + _config: &serde_json::Value, +) -> Result<(), otap_df_config::error::Error> { + Err(unsupported_platform_error()) +} + #[cfg(target_os = "linux")] impl HostMetricsReceiver { /// Creates a new host metrics receiver. 
pub fn new(config: Config) -> Result { - validate_supported_platform()?; let root_path = normalized_root_path(Some(effective_root_path(&config)?))?; let lease = HostMetricsLease::acquire(root_path)?; let config = RuntimeConfig::try_from(config)?; @@ -715,109 +208,13 @@ impl HostMetricsReceiver { } } -fn validate_config(config: &Config) -> Result<(), otap_df_config::error::Error> { - if config.collection_interval.is_zero() { - return Err(otap_df_config::error::Error::InvalidUserConfig { - error: "collection_interval must be greater than zero".to_owned(), - }); - } - if config.families.enabled_count() == 0 { - return Err(otap_df_config::error::Error::InvalidUserConfig { - error: "at least one host metrics family must be enabled".to_owned(), - }); - } - if config.families.network.include_connection_count { - return Err(otap_df_config::error::Error::InvalidUserConfig { - error: "network include_connection_count is not supported in v1".to_owned(), - }); - } - if config.families.cpu.per_cpu { - return Err(otap_df_config::error::Error::InvalidUserConfig { - error: "cpu per_cpu is not supported in v1".to_owned(), - }); - } - validate_family_interval( - "cpu", - config.families.cpu.enabled, - config.families.cpu.interval, - )?; - validate_family_interval( - "memory", - config.families.memory.enabled, - config.families.memory.interval, - )?; - validate_family_interval( - "paging", - config.families.paging.enabled, - config.families.paging.interval, - )?; - validate_family_interval( - "system", - config.families.system.enabled, - config.families.system.interval, - )?; - validate_family_interval( - "disk", - config.families.disk.enabled, - config.families.disk.interval, - )?; - validate_family_interval( - "filesystem", - config.families.filesystem.enabled, - config.families.filesystem.interval, - )?; - validate_family_interval( - "network", - config.families.network.enabled, - config.families.network.interval, - )?; - validate_family_interval( - "processes", - 
config.families.processes.enabled, - config.families.processes.interval, - )?; - let _ = normalized_root_path(Some(effective_root_path(config)?))?; - Ok(()) -} - -fn validate_supported_platform() -> Result<(), otap_df_config::error::Error> { - if cfg!(target_os = "linux") { - Ok(()) - } else { - Err(otap_df_config::error::Error::InvalidUserConfig { - error: "host_metrics receiver is supported only on Linux".to_owned(), - }) - } -} - -fn effective_root_path(config: &Config) -> Result<&Path, otap_df_config::error::Error> { - if let Some(root_path) = config.root_path.as_deref() { - let host_view_root = config.host_view.root_path.as_path(); - if host_view_root != Path::new("/") && root_path != host_view_root { - return Err(otap_df_config::error::Error::InvalidUserConfig { - error: "root_path and host_view.root_path cannot both be set to different values" - .to_owned(), - }); - } - Ok(root_path) - } else { - Ok(config.host_view.root_path.as_path()) +#[cfg(not(target_os = "linux"))] +fn unsupported_platform_error() -> otap_df_config::error::Error { + otap_df_config::error::Error::InvalidUserConfig { + error: "host_metrics receiver is supported only on Linux".to_owned(), } } -fn validate_family_interval( - family: &'static str, - enabled: bool, - interval: Option, -) -> Result<(), otap_df_config::error::Error> { - if enabled && interval.is_some_and(|interval| interval.is_zero()) { - return Err(otap_df_config::error::Error::InvalidUserConfig { - error: format!("{family} interval must be greater than zero"), - }); - } - Ok(()) -} - #[cfg(target_os = "linux")] fn duration_nanos(duration: Duration) -> f64 { duration.as_secs_f64() * 1e9 @@ -852,175 +249,6 @@ fn due_family_count(due: ProcfsFamilies) -> u64 { + u64::from(due.processes) } -impl TryFrom for RuntimeConfig { - type Error = otap_df_config::error::Error; - - fn try_from(config: Config) -> Result { - validate_config(&config)?; - let root_path = normalized_root_path(Some(effective_root_path(&config)?))?; - let 
disk_include = config - .families - .disk - .include - .map(|filter| CompiledFilter::compile(filter.match_type, filter.devices)) - .transpose()? - .flatten(); - let disk_exclude = config - .families - .disk - .exclude - .map(|filter| CompiledFilter::compile(filter.match_type, filter.devices)) - .transpose()? - .flatten(); - let filesystem_include_devices = config - .families - .filesystem - .include_devices - .map(|filter| CompiledFilter::compile(filter.match_type, filter.devices)) - .transpose()? - .flatten(); - let filesystem_exclude_devices = config - .families - .filesystem - .exclude_devices - .map(|filter| CompiledFilter::compile(filter.match_type, filter.devices)) - .transpose()? - .flatten(); - let filesystem_include_fs_types = config - .families - .filesystem - .include_fs_types - .map(|filter| CompiledFilter::compile(filter.match_type, filter.fs_types)) - .transpose()? - .flatten(); - let filesystem_exclude_fs_types = config - .families - .filesystem - .exclude_fs_types - .map(|filter| CompiledFilter::compile(filter.match_type, filter.fs_types)) - .transpose()? - .flatten(); - let filesystem_include_mount_points = config - .families - .filesystem - .include_mount_points - .map(|filter| CompiledFilter::compile(filter.match_type, filter.mount_points)) - .transpose()? - .flatten(); - let filesystem_exclude_mount_points = config - .families - .filesystem - .exclude_mount_points - .map(|filter| CompiledFilter::compile(filter.match_type, filter.mount_points)) - .transpose()? - .flatten(); - let network_include = config - .families - .network - .include - .map(|filter| CompiledFilter::compile(filter.match_type, filter.interfaces)) - .transpose()? - .flatten(); - let network_exclude = config - .families - .network - .exclude - .map(|filter| CompiledFilter::compile(filter.match_type, filter.interfaces)) - .transpose()? 
- .flatten(); - - Ok(Self { - root_path, - validation: config.host_view.validation, - initial_delay: config.initial_delay, - cpu_utilization: config.families.cpu.utilization, - memory_limit: config.families.memory.limit, - memory_shared: config.families.memory.shared, - memory_hugepages: config.families.memory.hugepages, - families: RuntimeFamilies { - cpu: RuntimeFamily::new_cpu(&config.families.cpu, config.collection_interval), - memory: RuntimeFamily::new_memory( - &config.families.memory, - config.collection_interval, - ), - paging: RuntimeFamily::new(&config.families.paging, config.collection_interval), - system: RuntimeFamily::new(&config.families.system, config.collection_interval), - disk: RuntimeDiskFamily { - enabled: config.families.disk.enabled, - interval: config - .families - .disk - .interval - .unwrap_or(config.collection_interval), - limit: config.families.disk.limit, - include: disk_include, - exclude: disk_exclude, - }, - filesystem: RuntimeFilesystemFamily { - enabled: config.families.filesystem.enabled, - interval: config - .families - .filesystem - .interval - .unwrap_or(config.collection_interval), - include_virtual_filesystems: config - .families - .filesystem - .include_virtual_filesystems, - limit: config.families.filesystem.limit, - include_devices: filesystem_include_devices, - exclude_devices: filesystem_exclude_devices, - include_fs_types: filesystem_include_fs_types, - exclude_fs_types: filesystem_exclude_fs_types, - include_mount_points: filesystem_include_mount_points, - exclude_mount_points: filesystem_exclude_mount_points, - }, - network: RuntimeNetworkFamily { - enabled: config.families.network.enabled, - interval: config - .families - .network - .interval - .unwrap_or(config.collection_interval), - include: network_include, - exclude: network_exclude, - }, - processes: RuntimeFamily { - enabled: config.families.processes.enabled, - interval: config - .families - .processes - .interval - .unwrap_or(config.collection_interval), - 
}, - }, - }) - } -} - -impl RuntimeFamily { - fn new(config: &FamilyConfig, default_interval: Duration) -> Self { - Self { - enabled: config.enabled, - interval: config.interval.unwrap_or(default_interval), - } - } - - fn new_cpu(config: &CpuFamilyConfig, default_interval: Duration) -> Self { - Self { - enabled: config.enabled, - interval: config.interval.unwrap_or(default_interval), - } - } - - fn new_memory(config: &MemoryFamilyConfig, default_interval: Duration) -> Self { - Self { - enabled: config.enabled, - interval: config.interval.unwrap_or(default_interval), - } - } -} - #[cfg(target_os = "linux")] #[derive(Clone, Copy, Debug, Eq, PartialEq)] enum ScheduledFamilyKind { @@ -1192,38 +420,6 @@ impl Drop for HostMetricsLease { } } -fn normalized_root_path(root_path: Option<&Path>) -> Result { - let path = root_path.unwrap_or_else(|| Path::new("/")); - let path_text = path.to_string_lossy(); - if !path.is_absolute() && !path_text.starts_with('/') { - return Err(otap_df_config::error::Error::InvalidUserConfig { - error: format!("root_path must be absolute: {}", path.display()), - }); - } - - let mut normalized = PathBuf::from("/"); - for component in path.components() { - match component { - Component::RootDir => {} - Component::Normal(part) => normalized.push(part), - Component::CurDir | Component::ParentDir => { - return Err(otap_df_config::error::Error::InvalidUserConfig { - error: format!( - "root_path must not contain . or .. 
components: {}", - path.display() - ), - }); - } - Component::Prefix(_) => { - return Err(otap_df_config::error::Error::InvalidUserConfig { - error: format!("root_path must be a Unix absolute path: {}", path.display()), - }); - } - } - } - Ok(normalized) -} - #[cfg(target_os = "linux")] #[async_trait(?Send)] impl local::Receiver for HostMetricsReceiver { diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs deleted file mode 100644 index 36552c2d85..0000000000 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs.rs +++ /dev/null @@ -1,3796 +0,0 @@ -// Copyright The OpenTelemetry Authors -// SPDX-License-Identifier: Apache-2.0 - -//! Linux procfs-backed host metric source. - -use crate::receivers::host_metrics_receiver::semconv::{attr, metric}; -use crate::receivers::host_metrics_receiver::{CompiledFilter, HostViewValidationMode}; -use std::collections::{HashMap, HashSet}; -use std::fs::File; -use std::io::{self, Read}; -use std::path::{Path, PathBuf}; -use std::sync::mpsc; -use std::time::{Duration, SystemTime, UNIX_EPOCH}; - -const NANOS_PER_SEC: u64 = 1_000_000_000; -const BYTES_PER_KIB: u64 = 1024; -const DISKSTAT_SECTOR_BYTES: u64 = 512; -const FILESYSTEM_STAT_TIMEOUT: Duration = Duration::from_millis(100); -const COUNTER_KEY_SEPARATOR: char = '\x1f'; - -/// Procfs-backed source for host metrics. -pub struct ProcfsSource { - paths: ProcfsPaths, - config: ProcfsConfig, - buf: String, - clk_tck: f64, - previous_cpu: Option, - filesystem_worker: FilesystemStatWorker, - counter_tracker: CounterTracker, - boot_time_unix_nano: Option, - resource: Option, -} - -/// Procfs collection config. -pub struct ProcfsConfig { - /// CPU metrics. - pub cpu: bool, - /// Memory metrics. - pub memory: bool, - /// Paging metrics. - pub paging: bool, - /// System metrics. - pub system: bool, - /// Disk metrics. 
- pub disk: bool, - /// Filesystem metrics. - pub filesystem: bool, - /// Network metrics. - pub network: bool, - /// Process summary metrics. - pub processes: bool, - /// Derived aggregate CPU utilization. - pub cpu_utilization: bool, - /// Emit memory limit metric. - pub memory_limit: bool, - /// Emit Linux shared memory metric. - pub memory_shared: bool, - /// Emit Linux hugepage metrics. - pub memory_hugepages: bool, - /// Derived disk limit from sysfs block device size. - pub disk_limit: bool, - /// Include virtual filesystems. - pub filesystem_include_virtual: bool, - /// Emit filesystem limit metric. - pub filesystem_limit: bool, - /// Disk include filter. - pub disk_include: Option, - /// Disk exclude filter. - pub disk_exclude: Option, - /// Filesystem device include filter. - pub filesystem_include_devices: Option, - /// Filesystem device exclude filter. - pub filesystem_exclude_devices: Option, - /// Filesystem type include filter. - pub filesystem_include_fs_types: Option, - /// Filesystem type exclude filter. - pub filesystem_exclude_fs_types: Option, - /// Filesystem mount point include filter. - pub filesystem_include_mount_points: Option, - /// Filesystem mount point exclude filter. - pub filesystem_exclude_mount_points: Option, - /// Network include filter. - pub network_include: Option, - /// Network exclude filter. - pub network_exclude: Option, - /// Startup validation mode. - pub validation: HostViewValidationMode, -} - -/// Families due for one scrape. -#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] -pub struct ProcfsFamilies { - /// CPU metrics. - pub cpu: bool, - /// Memory metrics. - pub memory: bool, - /// Paging metrics. - pub paging: bool, - /// System metrics. - pub system: bool, - /// Disk metrics. - pub disk: bool, - /// Filesystem metrics. - pub filesystem: bool, - /// Network metrics. - pub network: bool, - /// Process summary metrics. 
- pub processes: bool, -} - -impl ProcfsFamilies { - fn enabled_by(self, config: &ProcfsConfig) -> Self { - Self { - cpu: self.cpu && config.cpu, - memory: self.memory && config.memory, - paging: self.paging && config.paging, - system: self.system && config.system, - disk: self.disk && config.disk, - filesystem: self.filesystem && config.filesystem, - network: self.network && config.network, - processes: self.processes && config.processes, - } - } -} - -impl ProcfsSource { - /// Creates a procfs source rooted at `/` or at a host root bind mount. - pub fn new(root_path: Option<&Path>, config: ProcfsConfig) -> io::Result { - let mut source = Self { - paths: ProcfsPaths::new(root_path), - config, - buf: String::with_capacity(16 * 1024), - clk_tck: clock_ticks_per_second(), - previous_cpu: None, - filesystem_worker: FilesystemStatWorker::new()?, - counter_tracker: CounterTracker::default(), - boot_time_unix_nano: None, - resource: None, - }; - source.apply_startup_validation()?; - Ok(source) - } - - /// Collects one host snapshot for the due family set. 
- pub fn scrape_due(&mut self, due: ProcfsFamilies) -> io::Result { - let due = due.enabled_by(&self.config); - let now_unix_nano = now_unix_nano(); - let clk_tck = self.clk_tck; - let mut partial_errors = 0; - let mut first_error = None; - let needs_start_time = due.cpu - || due.memory - || due.paging - || due.disk - || due.filesystem - || due.network - || due.processes; - let needs_stat = - due.cpu || due.processes || (needs_start_time && self.boot_time_unix_nano.is_none()); - let stat = match needs_stat - .then(|| self.read_path(PathKind::Stat)) - .transpose() - { - Ok(Some(proc_stat)) => parse_stat(proc_stat, clk_tck), - Ok(None) => StatSnapshot::default(), - Err(err) => { - record_partial_error(&mut partial_errors, &mut first_error, err); - StatSnapshot::default() - } - }; - if stat.boot_time_unix_nano != 0 { - self.boot_time_unix_nano = Some(stat.boot_time_unix_nano); - } - let start_time_unix_nano = self.boot_time_unix_nano.unwrap_or(now_unix_nano); - let cpu_utilization = if due.cpu && self.config.cpu_utilization { - let utilization = stat.cpu.and_then(|current| { - self.previous_cpu - .and_then(|previous| cpu_utilization(previous, current)) - }); - self.previous_cpu = stat.cpu; - utilization - } else { - None - }; - - let cpuinfo = match due - .cpu - .then(|| self.read_path(PathKind::Cpuinfo)) - .transpose() - { - Ok(Some(cpuinfo)) => parse_cpuinfo(cpuinfo), - Ok(None) => CpuInfo::default(), - Err(err) => { - record_partial_error(&mut partial_errors, &mut first_error, err); - CpuInfo::default() - } - }; - - let memory = match due - .memory - .then(|| self.read_path(PathKind::Meminfo)) - .transpose() - { - Ok(Some(meminfo)) => parse_meminfo(meminfo), - Ok(None) => None, - Err(err) => { - record_partial_error(&mut partial_errors, &mut first_error, err); - None - } - }; - - let uptime_seconds = match due - .system - .then(|| self.read_path(PathKind::Uptime)) - .transpose() - { - Ok(Some(uptime)) => parse_uptime(uptime), - Ok(None) => None, - Err(err) => { - 
record_partial_error(&mut partial_errors, &mut first_error, err); - None - } - }; - - let paging = match due - .paging - .then(|| self.read_path(PathKind::Vmstat)) - .transpose() - { - Ok(Some(vmstat)) => Some(parse_vmstat(vmstat)), - Ok(None) => None, - Err(err) => { - record_partial_error(&mut partial_errors, &mut first_error, err); - None - } - }; - - let swaps = match due - .paging - .then(|| self.read_path(PathKind::Swaps)) - .transpose() - { - Ok(Some(swaps)) => parse_swaps(swaps), - Ok(None) => Vec::new(), - Err(err) => { - record_partial_error(&mut partial_errors, &mut first_error, err); - Vec::new() - } - }; - - let disks = if due.disk { - let disk_include = self.config.disk_include.clone(); - let disk_exclude = self.config.disk_exclude.clone(); - match self.read_path(PathKind::Diskstats) { - Ok(diskstats) => { - let mut disks = - parse_diskstats(diskstats, disk_include.as_ref(), disk_exclude.as_ref()); - if self.config.disk_limit { - for disk in &mut disks { - disk.limit_bytes = self.read_disk_limit_bytes(&disk.name).ok(); - } - } - disks - } - Err(err) => { - record_partial_error(&mut partial_errors, &mut first_error, err); - Vec::new() - } - } - } else { - Vec::new() - }; - - let networks = if due.network { - let network_include = self.config.network_include.clone(); - let network_exclude = self.config.network_exclude.clone(); - match self.read_path(PathKind::NetDev) { - Ok(netdev) => { - parse_netdev(netdev, network_include.as_ref(), network_exclude.as_ref()) - } - Err(err) => { - record_partial_error(&mut partial_errors, &mut first_error, err); - Vec::new() - } - } - } else { - Vec::new() - }; - - let filesystems = if due.filesystem { - let include_virtual = self.config.filesystem_include_virtual; - let emit_limit = self.config.filesystem_limit; - let include_devices = self.config.filesystem_include_devices.clone(); - let exclude_devices = self.config.filesystem_exclude_devices.clone(); - let include_fs_types = 
self.config.filesystem_include_fs_types.clone(); - let exclude_fs_types = self.config.filesystem_exclude_fs_types.clone(); - let include_mount_points = self.config.filesystem_include_mount_points.clone(); - let exclude_mount_points = self.config.filesystem_exclude_mount_points.clone(); - match self.read_path(PathKind::Mountinfo) { - Ok(mountinfo) => { - let filters = FilesystemFilters { - include_devices: include_devices.as_ref(), - exclude_devices: exclude_devices.as_ref(), - include_fs_types: include_fs_types.as_ref(), - exclude_fs_types: exclude_fs_types.as_ref(), - include_mount_points: include_mount_points.as_ref(), - exclude_mount_points: exclude_mount_points.as_ref(), - }; - let mounts = parse_mountinfo(mountinfo, include_virtual, emit_limit, filters); - self.read_filesystems(mounts, &mut partial_errors, &mut first_error) - } - Err(err) => { - record_partial_error(&mut partial_errors, &mut first_error, err); - Vec::new() - } - } - } else { - Vec::new() - }; - - let resource = self.read_resource().clone(); - let counter_starts = self.counter_tracker.snapshot( - start_time_unix_nano, - now_unix_nano, - due.cpu.then_some(stat.cpu).flatten().as_ref(), - paging.as_ref(), - due.processes.then_some(stat.processes).as_ref(), - &disks, - &networks, - ); - - let snapshot = HostSnapshot { - now_unix_nano, - start_time_unix_nano, - counter_starts, - memory_limit: self.config.memory_limit, - memory_shared: self.config.memory_shared, - memory_hugepages: self.config.memory_hugepages, - cpu: due.cpu.then_some(stat.cpu).flatten(), - cpu_utilization, - cpuinfo, - memory, - uptime_seconds, - paging, - swaps, - processes: due.processes.then_some(stat.processes), - disks, - filesystems, - networks, - resource, - }; - if !snapshot.has_metrics() { - return Err(first_error - .unwrap_or_else(|| io::Error::other("host metrics scrape produced no metrics"))); - } - Ok(HostScrape { - snapshot, - partial_errors, - }) - } - - fn apply_startup_validation(&mut self) -> io::Result<()> { - 
match self.config.validation { - HostViewValidationMode::None => Ok(()), - HostViewValidationMode::FailSelected => self.validate_selected_paths(), - HostViewValidationMode::WarnSelected => { - self.disable_unavailable_sources(); - Ok(()) - } - } - } - - fn validate_selected_paths(&self) -> io::Result<()> { - if self.config.cpu || self.config.system || self.config.processes { - let _ = File::open(self.paths.path(PathKind::Stat))?; - } - if self.config.cpu { - let _ = File::open(self.paths.path(PathKind::Cpuinfo))?; - } - if self.config.memory { - let _ = File::open(self.paths.path(PathKind::Meminfo))?; - } - if self.config.system { - let _ = File::open(self.paths.path(PathKind::Uptime))?; - } - if self.config.paging { - let _ = File::open(self.paths.path(PathKind::Vmstat))?; - let _ = File::open(self.paths.path(PathKind::Swaps))?; - } - if self.config.disk { - let _ = File::open(self.paths.path(PathKind::Diskstats))?; - } - if self.config.filesystem { - let _ = File::open(self.paths.path(PathKind::Mountinfo))?; - } - if self.config.network { - let _ = File::open(self.paths.path(PathKind::NetDev))?; - } - Ok(()) - } - - fn disable_unavailable_sources(&mut self) { - if (self.config.cpu || self.config.system || self.config.processes) - && !self.source_available(PathKind::Stat) - { - self.config.cpu = false; - self.config.system = false; - self.config.processes = false; - } - if self.config.cpu && !self.source_available(PathKind::Cpuinfo) { - self.config.cpu = false; - } - if self.config.memory && !self.source_available(PathKind::Meminfo) { - self.config.memory = false; - } - if self.config.system && !self.source_available(PathKind::Uptime) { - self.config.system = false; - } - if self.config.paging - && (!self.source_available(PathKind::Vmstat) || !self.source_available(PathKind::Swaps)) - { - self.config.paging = false; - } - if self.config.disk && !self.source_available(PathKind::Diskstats) { - self.config.disk = false; - } - if self.config.filesystem && 
!self.source_available(PathKind::Mountinfo) { - self.config.filesystem = false; - } - if self.config.network && !self.source_available(PathKind::NetDev) { - self.config.network = false; - } - } - - fn source_available(&self, kind: PathKind) -> bool { - File::open(self.paths.path(kind)).is_ok() - } - - fn read_path(&mut self, kind: PathKind) -> io::Result<&str> { - self.buf.clear(); - let mut file = File::open(self.paths.path(kind))?; - let _ = file.read_to_string(&mut self.buf)?; - Ok(self.buf.as_str()) - } - - fn read_disk_limit_bytes(&mut self, disk_name: &str) -> io::Result { - self.buf.clear(); - let mut file = File::open(self.paths.sys_block.join(disk_name).join("size"))?; - let _ = file.read_to_string(&mut self.buf)?; - let sectors = parse_u64(self.buf.trim()); - Ok(sectors.saturating_mul(DISKSTAT_SECTOR_BYTES)) - } - - fn read_filesystems( - &mut self, - mounts: Vec, - partial_errors: &mut u64, - first_error: &mut Option, - ) -> Vec { - let mut filesystems = Vec::with_capacity(mounts.len()); - for mount in mounts { - let path = self.paths.host_path(&mount.mountpoint); - let stat = match self - .filesystem_worker - .statvfs(path, FILESYSTEM_STAT_TIMEOUT) - { - Ok(stat) => stat, - Err(err) => { - record_partial_error(partial_errors, first_error, err); - continue; - } - }; - let free = stat.available_bytes; - let reserved = stat.free_bytes.saturating_sub(stat.available_bytes); - let used = stat.total_bytes.saturating_sub(stat.free_bytes); - filesystems.push(FilesystemStats { - device: mount.device, - mountpoint: mount.mountpoint, - fs_type: mount.fs_type, - mode: mount.mode, - used, - free, - reserved, - limit_bytes: mount.emit_limit.then_some(stat.total_bytes), - }); - } - filesystems - } - - fn read_resource(&mut self) -> &HostResource { - if self.resource.is_none() { - let host_id = self - .read_trimmed_optional(PathKind::MachineId) - .or_else(|| self.read_trimmed_optional(PathKind::DbusMachineId)); - let host_name = 
self.read_trimmed_optional(PathKind::Hostname); - self.resource = Some(HostResource { - host_id, - host_name, - host_arch: host_arch(), - }); - } - self.resource.as_ref().expect("resource is initialized") - } - - fn read_trimmed_optional(&mut self, kind: PathKind) -> Option { - self.read_path(kind) - .ok() - .map(str::trim) - .filter(|value| !value.is_empty()) - .map(str::to_owned) - } -} - -#[derive(Clone, Debug)] -struct ProcfsPaths { - root: PathBuf, - stat: PathBuf, - cpuinfo: PathBuf, - meminfo: PathBuf, - uptime: PathBuf, - vmstat: PathBuf, - swaps: PathBuf, - diskstats: PathBuf, - mountinfo: PathBuf, - sys_block: PathBuf, - net_dev: PathBuf, - machine_id: PathBuf, - dbus_machine_id: PathBuf, - hostname: PathBuf, -} - -impl ProcfsPaths { - fn new(root_path: Option<&Path>) -> Self { - let root = root_path.unwrap_or_else(|| Path::new("/")); - let host_root = root_path.is_some_and(|path| path != Path::new("/")); - Self { - root: root.to_path_buf(), - stat: root.join("proc/stat"), - cpuinfo: root.join("proc/cpuinfo"), - meminfo: root.join("proc/meminfo"), - uptime: root.join("proc/uptime"), - vmstat: root.join("proc/vmstat"), - swaps: root.join("proc/swaps"), - diskstats: root.join("proc/diskstats"), - mountinfo: if host_root { - root.join("proc/1/mountinfo") - } else { - root.join("proc/self/mountinfo") - }, - sys_block: root.join("sys/block"), - machine_id: root.join("etc/machine-id"), - dbus_machine_id: root.join("var/lib/dbus/machine-id"), - hostname: root.join("proc/sys/kernel/hostname"), - net_dev: if host_root { - root.join("proc/1/net/dev") - } else { - root.join("proc/net/dev") - }, - } - } - - fn path(&self, kind: PathKind) -> &Path { - match kind { - PathKind::Stat => &self.stat, - PathKind::Cpuinfo => &self.cpuinfo, - PathKind::Meminfo => &self.meminfo, - PathKind::Uptime => &self.uptime, - PathKind::Vmstat => &self.vmstat, - PathKind::Swaps => &self.swaps, - PathKind::Diskstats => &self.diskstats, - PathKind::Mountinfo => &self.mountinfo, - 
PathKind::NetDev => &self.net_dev, - PathKind::MachineId => &self.machine_id, - PathKind::DbusMachineId => &self.dbus_machine_id, - PathKind::Hostname => &self.hostname, - } - } - - fn host_path(&self, host_absolute_path: &str) -> PathBuf { - let relative = host_absolute_path - .strip_prefix('/') - .unwrap_or(host_absolute_path); - self.root.join(relative) - } -} - -#[derive(Copy, Clone)] -enum PathKind { - Stat, - Cpuinfo, - Meminfo, - Uptime, - Vmstat, - Swaps, - Diskstats, - Mountinfo, - NetDev, - MachineId, - DbusMachineId, - Hostname, -} - -/// Result of one host metrics scrape. -pub struct HostScrape { - /// Collected host snapshot. - pub snapshot: HostSnapshot, - /// Number of source read errors skipped because other families succeeded. - pub partial_errors: u64, -} - -/// One host metrics snapshot. -#[derive(Default)] -pub struct HostSnapshot { - now_unix_nano: u64, - start_time_unix_nano: u64, - counter_starts: CounterStarts, - memory_limit: bool, - memory_shared: bool, - memory_hugepages: bool, - cpu: Option, - cpu_utilization: Option, - cpuinfo: CpuInfo, - memory: Option, - uptime_seconds: Option, - paging: Option, - swaps: Vec, - processes: Option, - disks: Vec, - filesystems: Vec, - networks: Vec, - resource: HostResource, -} - -impl HostSnapshot { - fn has_metrics(&self) -> bool { - self.cpu.is_some() - || self.cpu_utilization.is_some() - || self.cpuinfo.logical_count != 0 - || self.cpuinfo.physical_count != 0 - || !self.cpuinfo.frequencies_hz.is_empty() - || self.memory.is_some() - || self.uptime_seconds.is_some() - || self.paging.is_some() - || !self.swaps.is_empty() - || self.processes.is_some() - || !self.disks.is_empty() - || !self.filesystems.is_empty() - || !self.networks.is_empty() - } - - /// Converts a snapshot directly into an OTAP Arrow metrics batch. 
- pub fn into_otap_records( - self, - ) -> Result { - use crate::receivers::host_metrics_receiver::otap_builder::HostMetricsArrowBuilder; - let mut b = HostMetricsArrowBuilder::new(); - b.append_resource(&self.resource); - project_snapshot(&self, &mut b); - b.finish() - } -} - -#[derive(Clone, Default)] -pub(super) struct HostResource { - pub(super) host_id: Option, - pub(super) host_name: Option, - pub(super) host_arch: Option<&'static str>, -} - -fn project_snapshot( - snap: &HostSnapshot, - b: &mut crate::receivers::host_metrics_receiver::otap_builder::HostMetricsArrowBuilder, -) { - let now = snap.now_unix_nano; - let start = snap.start_time_unix_nano; - let cs = &snap.counter_starts; - - // ── CPU ────────────────────────────────────────────────────────────────── - if let Some(cpu) = snap.cpu { - let m = b.begin_counter_f64(metric::CPU_TIME, "s"); - for (mode, value) in [ - ("user", cpu.user), - ("nice", cpu.nice), - ("system", cpu.system), - ("idle", cpu.idle), - ("iowait", cpu.wait), - ("interrupt", cpu.interrupt), - ("steal", cpu.steal), - ] { - b.append_f64_sum_dp(m, cs.get(metric::CPU_TIME, mode, start), now, value, |w| { - w.str(attr::CPU_MODE, mode); - }); - } - } - if let Some(cpu) = snap.cpu_utilization { - let m = b.begin_gauge_f64(metric::CPU_UTILIZATION, "1"); - for (mode, value) in [ - ("user", cpu.user), - ("nice", cpu.nice), - ("system", cpu.system), - ("idle", cpu.idle), - ("iowait", cpu.wait), - ("interrupt", cpu.interrupt), - ("steal", cpu.steal), - ] { - b.append_f64_gauge_dp(m, now, value, |w| { - w.str(attr::CPU_MODE, mode); - }); - } - } - if snap.cpuinfo.logical_count != 0 { - let m = b.begin_updown_i64(metric::CPU_LOGICAL_COUNT, "{cpu}"); - b.append_i64_sum_dp( - m, - start, - now, - saturating_i64(snap.cpuinfo.logical_count), - |_| {}, - ); - } - if snap.cpuinfo.physical_count != 0 { - let m = b.begin_updown_i64(metric::CPU_PHYSICAL_COUNT, "{cpu}"); - b.append_i64_sum_dp( - m, - start, - now, - 
saturating_i64(snap.cpuinfo.physical_count), - |_| {}, - ); - } - if !snap.cpuinfo.frequencies_hz.is_empty() { - let m = b.begin_gauge_i64(metric::CPU_FREQUENCY, "Hz"); - for (idx, &freq) in snap.cpuinfo.frequencies_hz.iter().enumerate() { - let logical = i64::try_from(idx).unwrap_or(i64::MAX); - b.append_i64_gauge_dp(m, now, frequency_hz_i64(freq), |w| { - w.int(attr::CPU_LOGICAL_NUMBER, logical); - }); - } - } - - // ── Memory ─────────────────────────────────────────────────────────────── - if let Some(memory) = snap.memory { - let m = b.begin_updown_i64(metric::MEMORY_USAGE, "By"); - for (state, value) in [ - ("used", memory.used), - ("free", memory.free), - ("cached", memory.cached), - ("buffers", memory.buffered), - ] { - b.append_i64_sum_dp(m, start, now, saturating_i64(value), |w| { - w.str(attr::SYSTEM_MEMORY_STATE, state); - }); - } - if memory.total > 0 { - let m = b.begin_gauge_f64(metric::MEMORY_UTILIZATION, "1"); - let total = memory.total as f64; - for (state, value) in [ - ("used", memory.used), - ("free", memory.free), - ("cached", memory.cached), - ("buffers", memory.buffered), - ] { - b.append_f64_gauge_dp(m, now, value as f64 / total, |w| { - w.str(attr::SYSTEM_MEMORY_STATE, state); - }); - } - } - if memory.has_available { - let m = b.begin_updown_i64(metric::MEMORY_LINUX_AVAILABLE, "By"); - b.append_i64_sum_dp(m, start, now, saturating_i64(memory.available), |_| {}); - } - let m = b.begin_updown_i64(metric::MEMORY_LINUX_SLAB_USAGE, "By"); - for (state, value) in [ - ("reclaimable", memory.slab_reclaimable), - ("unreclaimable", memory.slab_unreclaimable), - ] { - b.append_i64_sum_dp(m, start, now, saturating_i64(value), |w| { - w.str(attr::SYSTEM_MEMORY_LINUX_SLAB_STATE, state); - }); - } - if snap.memory_limit { - let m = b.begin_updown_i64(metric::MEMORY_LIMIT, "By"); - b.append_i64_sum_dp(m, start, now, saturating_i64(memory.total), |_| {}); - } - if snap.memory_shared { - let m = b.begin_updown_i64(metric::MEMORY_LINUX_SHARED, "By"); - 
b.append_i64_sum_dp(m, start, now, saturating_i64(memory.shared), |_| {}); - } - if snap.memory_hugepages { - project_hugepages(b, start, now, &memory.hugepages); - } - } - - // ── System / uptime ────────────────────────────────────────────────────── - if let Some(uptime) = snap.uptime_seconds { - let m = b.begin_gauge_f64(metric::UPTIME, "s"); - b.append_f64_gauge_dp(m, now, uptime, |_| {}); - } - - // ── Paging ─────────────────────────────────────────────────────────────── - if let Some(paging) = snap.paging { - let m = b.begin_counter_i64(metric::PAGING_FAULTS, "{fault}"); - for (fault_type, value) in [ - ("minor", paging.minor_faults), - ("major", paging.major_faults), - ] { - b.append_i64_sum_dp( - m, - cs.get(metric::PAGING_FAULTS, fault_type, start), - now, - saturating_i64(value), - |w| { - w.str(attr::SYSTEM_PAGING_FAULT_TYPE, fault_type); - }, - ); - } - let m = b.begin_counter_i64(metric::PAGING_OPERATIONS, "{operation}"); - // Linux exposes swap operations and page-in/page-out counters separately. - // Semconv requires both direction and fault.type for this metric, so the - // receiver keeps the phase-1 mapping explicit here. 
- for (direction, fault_type, value) in [ - ("in", "major", paging.swap_in), - ("out", "major", paging.swap_out), - ("in", "minor", paging.page_in), - ("out", "minor", paging.page_out), - ] { - b.append_i64_sum_dp( - m, - cs.get_joined(metric::PAGING_OPERATIONS, direction, fault_type, start), - now, - saturating_i64(value), - |w| { - w.str(attr::SYSTEM_PAGING_DIRECTION, direction); - w.str(attr::SYSTEM_PAGING_FAULT_TYPE, fault_type); - }, - ); - } - } - if !snap.swaps.is_empty() { - let m = b.begin_updown_i64(metric::PAGING_USAGE, "By"); - for swap in &snap.swaps { - for (state, value) in [("used", swap.used), ("free", swap.free)] { - b.append_i64_sum_dp(m, start, now, saturating_i64(value), |w| { - w.str(attr::SYSTEM_DEVICE, &swap.name); - w.str(attr::SYSTEM_PAGING_STATE, state); - }); - } - } - } - if snap.swaps.iter().any(|swap| swap.size > 0) { - let m = b.begin_gauge_f64(metric::PAGING_UTILIZATION, "1"); - for swap in &snap.swaps { - let size = swap.size; - if size == 0 { - continue; - } - let total = size as f64; - for (state, value) in [("used", swap.used), ("free", swap.free)] { - b.append_f64_gauge_dp(m, now, value as f64 / total, |w| { - w.str(attr::SYSTEM_DEVICE, &swap.name); - w.str(attr::SYSTEM_PAGING_STATE, state); - }); - } - } - } - - // ── Processes ──────────────────────────────────────────────────────────── - if let Some(processes) = snap.processes { - let m = b.begin_updown_i64(metric::PROCESS_COUNT, "{process}"); - b.append_i64_sum_dp(m, start, now, saturating_i64(processes.running), |w| { - w.str(attr::PROCESS_STATE, "running"); - }); - // /proc/stat procs_blocked has no registered process.state value. - // Do not map it to sleeping; Linux blocked tasks are not the same state. 
- let m = b.begin_counter_i64(metric::PROCESS_CREATED, "{process}"); - b.append_i64_sum_dp( - m, - cs.get(metric::PROCESS_CREATED, "", start), - now, - saturating_i64(processes.created), - |_| {}, - ); - } - - // ── Disk ───────────────────────────────────────────────────────────────── - if snap.disks.iter().any(|disk| disk.limit_bytes.is_some()) { - let m = b.begin_updown_i64(metric::DISK_LIMIT, "By"); - for disk in &snap.disks { - let Some(limit_bytes) = disk.limit_bytes else { - continue; - }; - b.append_i64_sum_dp(m, start, now, saturating_i64(limit_bytes), |w| { - w.str(attr::SYSTEM_DEVICE, &disk.name); - }); - } - } - if !snap.disks.is_empty() { - let m = b.begin_counter_i64(metric::DISK_IO, "By"); - for disk in &snap.disks { - for (dir, value) in [("read", disk.read_bytes), ("write", disk.write_bytes)] { - b.append_i64_sum_dp( - m, - cs.get_joined(metric::DISK_IO, &disk.name, dir, start), - now, - saturating_i64(value), - |w| { - w.str(attr::SYSTEM_DEVICE, &disk.name); - w.str(attr::DISK_IO_DIRECTION, dir); - }, - ); - } - } - let m = b.begin_counter_i64(metric::DISK_OPERATIONS, "{operation}"); - for disk in &snap.disks { - for (dir, value) in [("read", disk.read_ops), ("write", disk.write_ops)] { - b.append_i64_sum_dp( - m, - cs.get_joined(metric::DISK_OPERATIONS, &disk.name, dir, start), - now, - saturating_i64(value), - |w| { - w.str(attr::SYSTEM_DEVICE, &disk.name); - w.str(attr::DISK_IO_DIRECTION, dir); - }, - ); - } - } - let m = b.begin_counter_f64(metric::DISK_IO_TIME, "s"); - for disk in &snap.disks { - b.append_f64_sum_dp( - m, - cs.get(metric::DISK_IO_TIME, &disk.name, start), - now, - disk.io_time_seconds, - |w| { - w.str(attr::SYSTEM_DEVICE, &disk.name); - }, - ); - } - let m = b.begin_counter_f64(metric::DISK_OPERATION_TIME, "s"); - for disk in &snap.disks { - for (dir, value) in [ - ("read", disk.read_time_seconds), - ("write", disk.write_time_seconds), - ] { - b.append_f64_sum_dp( - m, - cs.get_joined(metric::DISK_OPERATION_TIME, &disk.name, 
dir, start), - now, - value, - |w| { - w.str(attr::SYSTEM_DEVICE, &disk.name); - w.str(attr::DISK_IO_DIRECTION, dir); - }, - ); - } - } - let m = b.begin_counter_i64(metric::DISK_MERGED, "{operation}"); - for disk in &snap.disks { - for (dir, value) in [("read", disk.read_merged), ("write", disk.write_merged)] { - b.append_i64_sum_dp( - m, - cs.get_joined(metric::DISK_MERGED, &disk.name, dir, start), - now, - saturating_i64(value), - |w| { - w.str(attr::SYSTEM_DEVICE, &disk.name); - w.str(attr::DISK_IO_DIRECTION, dir); - }, - ); - } - } - } - - // ── Filesystem ─────────────────────────────────────────────────────────── - if !snap.filesystems.is_empty() { - let m = b.begin_updown_i64(metric::FILESYSTEM_USAGE, "By"); - for fs in &snap.filesystems { - for (state, value) in [ - ("used", fs.used), - ("free", fs.free), - ("reserved", fs.reserved), - ] { - b.append_i64_sum_dp(m, start, now, saturating_i64(value), |w| { - w.str(attr::SYSTEM_DEVICE, &fs.device); - w.str(attr::SYSTEM_FILESYSTEM_STATE, state); - w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); - w.str(attr::SYSTEM_FILESYSTEM_MODE, fs.mode); - w.str(attr::SYSTEM_FILESYSTEM_MOUNTPOINT, &fs.mountpoint); - }); - } - } - } - if snap - .filesystems - .iter() - .any(|fs| fs.used.saturating_add(fs.free).saturating_add(fs.reserved) > 0) - { - let m = b.begin_gauge_f64(metric::FILESYSTEM_UTILIZATION, "1"); - for fs in &snap.filesystems { - let total = fs.used.saturating_add(fs.free).saturating_add(fs.reserved); - if total > 0 { - let total_f = total as f64; - for (state, value) in [ - ("used", fs.used), - ("free", fs.free), - ("reserved", fs.reserved), - ] { - b.append_f64_gauge_dp(m, now, value as f64 / total_f, |w| { - w.str(attr::SYSTEM_DEVICE, &fs.device); - w.str(attr::SYSTEM_FILESYSTEM_STATE, state); - w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); - w.str(attr::SYSTEM_FILESYSTEM_MODE, fs.mode); - w.str(attr::SYSTEM_FILESYSTEM_MOUNTPOINT, &fs.mountpoint); - }); - } - } - } - } - if 
snap.filesystems.iter().any(|fs| fs.limit_bytes.is_some()) { - let m = b.begin_updown_i64(metric::FILESYSTEM_LIMIT, "By"); - for fs in &snap.filesystems { - let Some(limit_bytes) = fs.limit_bytes else { - continue; - }; - b.append_i64_sum_dp(m, start, now, saturating_i64(limit_bytes), |w| { - w.str(attr::SYSTEM_DEVICE, &fs.device); - w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); - w.str(attr::SYSTEM_FILESYSTEM_MODE, fs.mode); - w.str(attr::SYSTEM_FILESYSTEM_MOUNTPOINT, &fs.mountpoint); - }); - } - } - - // ── Network ────────────────────────────────────────────────────────────── - if !snap.networks.is_empty() { - let m = b.begin_counter_i64(metric::NETWORK_IO, "By"); - for net in &snap.networks { - for (dir, value) in [("receive", net.rx_bytes), ("transmit", net.tx_bytes)] { - b.append_i64_sum_dp( - m, - cs.get_joined(metric::NETWORK_IO, &net.name, dir, start), - now, - saturating_i64(value), - |w| { - w.str(attr::NETWORK_INTERFACE_NAME, &net.name); - w.str(attr::NETWORK_IO_DIRECTION, dir); - }, - ); - } - } - let m = b.begin_counter_i64(metric::NETWORK_PACKET_COUNT, "{packet}"); - for net in &snap.networks { - for (dir, value) in [("receive", net.rx_packets), ("transmit", net.tx_packets)] { - b.append_i64_sum_dp( - m, - cs.get_joined(metric::NETWORK_PACKET_COUNT, &net.name, dir, start), - now, - saturating_i64(value), - |w| { - // Semconv uses system.device here, while sibling network - // metrics use network.interface.name. 
- w.str(attr::SYSTEM_DEVICE, &net.name); - w.str(attr::NETWORK_IO_DIRECTION, dir); - }, - ); - } - } - let m = b.begin_counter_i64(metric::NETWORK_PACKET_DROPPED, "{packet}"); - for net in &snap.networks { - for (dir, value) in [("receive", net.rx_dropped), ("transmit", net.tx_dropped)] { - b.append_i64_sum_dp( - m, - cs.get_joined(metric::NETWORK_PACKET_DROPPED, &net.name, dir, start), - now, - saturating_i64(value), - |w| { - w.str(attr::NETWORK_INTERFACE_NAME, &net.name); - w.str(attr::NETWORK_IO_DIRECTION, dir); - }, - ); - } - } - let m = b.begin_counter_i64(metric::NETWORK_ERRORS, "{error}"); - for net in &snap.networks { - for (dir, value) in [("receive", net.rx_errors), ("transmit", net.tx_errors)] { - b.append_i64_sum_dp( - m, - cs.get_joined(metric::NETWORK_ERRORS, &net.name, dir, start), - now, - saturating_i64(value), - |w| { - w.str(attr::NETWORK_INTERFACE_NAME, &net.name); - w.str(attr::NETWORK_IO_DIRECTION, dir); - }, - ); - } - } - } -} - -fn project_hugepages( - b: &mut crate::receivers::host_metrics_receiver::otap_builder::HostMetricsArrowBuilder, - start: u64, - now: u64, - hugepages: &HugepageStats, -) { - let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_LIMIT, "{page}"); - b.append_i64_sum_dp(m, start, now, saturating_i64(hugepages.total), |_| {}); - let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_PAGE_SIZE, "By"); - b.append_i64_sum_dp( - m, - start, - now, - saturating_i64(hugepages.page_size_bytes), - |_| {}, - ); - let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_RESERVED, "{page}"); - b.append_i64_sum_dp(m, start, now, saturating_i64(hugepages.reserved), |_| {}); - let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_SURPLUS, "{page}"); - b.append_i64_sum_dp(m, start, now, saturating_i64(hugepages.surplus), |_| {}); - let used = hugepages.total.saturating_sub(hugepages.free); - let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_USAGE, "{page}"); - for (state, value) in [("used", used), ("free", 
hugepages.free)] { - b.append_i64_sum_dp(m, start, now, saturating_i64(value), |w| { - w.str(attr::SYSTEM_MEMORY_LINUX_HUGEPAGES_STATE, state); - }); - } - if hugepages.total > 0 { - let total = hugepages.total as f64; - let m = b.begin_gauge_f64(metric::MEMORY_LINUX_HUGEPAGES_UTILIZATION, "1"); - for (state, value) in [("used", used), ("free", hugepages.free)] { - b.append_f64_gauge_dp(m, now, value as f64 / total, |w| { - w.str(attr::SYSTEM_MEMORY_LINUX_HUGEPAGES_STATE, state); - }); - } - } -} - -#[derive(Default)] -struct CounterTracker { - states: HashMap, -} - -struct CounterState { - previous: f64, - start_time_unix_nano: u64, -} - -#[derive(Default)] -struct CounterStarts { - entries: Vec<(String, u64)>, -} - -impl CounterStarts { - fn get(&self, metric: &'static str, series: &str, default_start: u64) -> u64 { - self.entries - .iter() - .find_map(|(key, start)| counter_key_matches(key, metric, series).then_some(*start)) - .unwrap_or(default_start) - } - - fn get_joined( - &self, - metric: &'static str, - first: &str, - second: &'static str, - default_start: u64, - ) -> u64 { - self.entries - .iter() - .find_map(|(key, start)| { - counter_key_matches_joined(key, metric, first, second).then_some(*start) - }) - .unwrap_or(default_start) - } -} - -impl CounterTracker { - fn snapshot( - &mut self, - default_start: u64, - now: u64, - cpu: Option<&CpuTimes>, - paging: Option<&PagingStats>, - processes: Option<&ProcessStats>, - disks: &[DiskStats], - networks: &[NetworkStats], - ) -> CounterStarts { - let mut starts = CounterStarts::default(); - if let Some(cpu) = cpu { - self.observe_all( - metric::CPU_TIME, - default_start, - now, - &[ - ("user", cpu.user), - ("nice", cpu.nice), - ("system", cpu.system), - ("idle", cpu.idle), - ("iowait", cpu.wait), - ("interrupt", cpu.interrupt), - ("steal", cpu.steal), - ], - &mut starts, - ); - } - if let Some(paging) = paging { - self.observe_all( - metric::PAGING_FAULTS, - default_start, - now, - &[ - ("minor", 
paging.minor_faults as f64), - ("major", paging.major_faults as f64), - ], - &mut starts, - ); - for (direction, fault_type, value) in [ - ("in", "major", paging.swap_in), - ("out", "major", paging.swap_out), - ("in", "minor", paging.page_in), - ("out", "minor", paging.page_out), - ] { - self.observe_joined( - metric::PAGING_OPERATIONS, - direction, - fault_type, - value as f64, - default_start, - now, - &mut starts, - ); - } - } - if let Some(processes) = processes { - self.observe( - metric::PROCESS_CREATED, - "", - processes.created as f64, - default_start, - now, - &mut starts, - ); - } - for disk in disks { - self.observe_disk_all( - metric::DISK_IO, - default_start, - now, - &disk.name, - &[ - ("read", disk.read_bytes as f64), - ("write", disk.write_bytes as f64), - ], - &mut starts, - ); - self.observe_disk_all( - metric::DISK_OPERATIONS, - default_start, - now, - &disk.name, - &[ - ("read", disk.read_ops as f64), - ("write", disk.write_ops as f64), - ], - &mut starts, - ); - self.observe( - metric::DISK_IO_TIME, - &disk.name, - disk.io_time_seconds, - default_start, - now, - &mut starts, - ); - self.observe_disk_all( - metric::DISK_OPERATION_TIME, - default_start, - now, - &disk.name, - &[ - ("read", disk.read_time_seconds), - ("write", disk.write_time_seconds), - ], - &mut starts, - ); - self.observe_disk_all( - metric::DISK_MERGED, - default_start, - now, - &disk.name, - &[ - ("read", disk.read_merged as f64), - ("write", disk.write_merged as f64), - ], - &mut starts, - ); - } - for network in networks { - self.observe_network( - metric::NETWORK_IO, - default_start, - now, - network, - network.rx_bytes, - network.tx_bytes, - &mut starts, - ); - self.observe_network( - metric::NETWORK_PACKET_COUNT, - default_start, - now, - network, - network.rx_packets, - network.tx_packets, - &mut starts, - ); - self.observe_network( - metric::NETWORK_PACKET_DROPPED, - default_start, - now, - network, - network.rx_dropped, - network.tx_dropped, - &mut starts, - ); - 
self.observe_network( - metric::NETWORK_ERRORS, - default_start, - now, - network, - network.rx_errors, - network.tx_errors, - &mut starts, - ); - } - starts - } - - fn observe_all( - &mut self, - metric: &'static str, - default_start: u64, - now: u64, - values: &[(&str, f64)], - starts: &mut CounterStarts, - ) { - for (series, value) in values { - self.observe(metric, series, *value, default_start, now, starts); - } - } - - fn observe_disk_all( - &mut self, - metric: &'static str, - default_start: u64, - now: u64, - device: &str, - values: &[(&'static str, f64)], - starts: &mut CounterStarts, - ) { - for (direction, value) in values { - self.observe_joined( - metric, - device, - direction, - *value, - default_start, - now, - starts, - ); - } - } - - fn observe_network( - &mut self, - metric: &'static str, - default_start: u64, - now: u64, - network: &NetworkStats, - rx: u64, - tx: u64, - starts: &mut CounterStarts, - ) { - self.observe_joined( - metric, - &network.name, - "receive", - rx as f64, - default_start, - now, - starts, - ); - self.observe_joined( - metric, - &network.name, - "transmit", - tx as f64, - default_start, - now, - starts, - ); - } - - fn observe( - &mut self, - metric: &'static str, - series: &str, - value: f64, - default_start: u64, - now: u64, - starts: &mut CounterStarts, - ) { - self.observe_key( - counter_key(metric, series), - value, - default_start, - now, - starts, - ); - } - - fn observe_joined( - &mut self, - metric: &'static str, - first: &str, - second: &'static str, - value: f64, - default_start: u64, - now: u64, - starts: &mut CounterStarts, - ) { - self.observe_key( - counter_key_joined(metric, first, second), - value, - default_start, - now, - starts, - ); - } - - fn observe_key( - &mut self, - key: String, - value: f64, - default_start: u64, - now: u64, - starts: &mut CounterStarts, - ) { - let state = self.states.entry(key.clone()).or_insert(CounterState { - previous: value, - start_time_unix_nano: default_start, - }); - if 
state.start_time_unix_nano < default_start { - state.start_time_unix_nano = default_start; - } else if value < state.previous { - state.start_time_unix_nano = now; - } - state.previous = value; - starts.entries.push((key, state.start_time_unix_nano)); - } -} - -fn counter_key(metric: &'static str, series: &str) -> String { - let mut key = String::with_capacity(metric.len() + 1 + series.len()); - key.push_str(metric); - key.push(COUNTER_KEY_SEPARATOR); - key.push_str(series); - key -} - -fn counter_key_joined(metric: &'static str, first: &str, second: &'static str) -> String { - let mut key = String::with_capacity(metric.len() + 2 + first.len() + second.len()); - key.push_str(metric); - key.push(COUNTER_KEY_SEPARATOR); - key.push_str(first); - key.push(COUNTER_KEY_SEPARATOR); - key.push_str(second); - key -} - -fn counter_key_matches(key: &str, metric: &'static str, series: &str) -> bool { - key.strip_prefix(metric) - .and_then(|rest| rest.strip_prefix(COUNTER_KEY_SEPARATOR)) - == Some(series) -} - -fn counter_key_matches_joined( - key: &str, - metric: &'static str, - first: &str, - second: &'static str, -) -> bool { - let Some(series) = key - .strip_prefix(metric) - .and_then(|rest| rest.strip_prefix(COUNTER_KEY_SEPARATOR)) - else { - return false; - }; - series - .strip_prefix(first) - .and_then(|rest| rest.strip_prefix(COUNTER_KEY_SEPARATOR)) - == Some(second) -} - -fn host_arch() -> Option<&'static str> { - match std::env::consts::ARCH { - "aarch64" => Some("arm64"), - "arm" => Some("arm32"), - "powerpc" => Some("ppc32"), - "powerpc64" => Some("ppc64"), - "x86" => Some("x86"), - "x86_64" => Some("amd64"), - _ => None, - } -} - -#[derive(Copy, Clone, Default)] -struct CpuTimes { - user: f64, - nice: f64, - system: f64, - idle: f64, - wait: f64, - interrupt: f64, - steal: f64, -} - -#[derive(Clone, Default)] -struct CpuInfo { - logical_count: u64, - physical_count: u64, - frequencies_hz: Vec, -} - -#[derive(Copy, Clone, Default)] -struct StatSnapshot { - 
boot_time_unix_nano: u64, - cpu: Option, - processes: ProcessStats, -} - -#[derive(Copy, Clone, Default)] -struct MemoryStats { - total: u64, - used: u64, - free: u64, - available: u64, - has_available: bool, - cached: u64, - buffered: u64, - shared: u64, - slab_reclaimable: u64, - slab_unreclaimable: u64, - hugepages: HugepageStats, -} - -#[derive(Copy, Clone, Default)] -struct HugepageStats { - total: u64, - free: u64, - reserved: u64, - surplus: u64, - page_size_bytes: u64, -} - -#[derive(Copy, Clone, Default)] -struct PagingStats { - minor_faults: u64, - major_faults: u64, - page_in: u64, - page_out: u64, - swap_in: u64, - swap_out: u64, -} - -#[derive(Default)] -struct SwapStats { - name: String, - size: u64, - used: u64, - free: u64, -} - -#[derive(Copy, Clone, Default)] -struct ProcessStats { - running: u64, - blocked: u64, - created: u64, -} - -#[derive(Default)] -struct DiskStats { - name: String, - limit_bytes: Option, - read_bytes: u64, - write_bytes: u64, - read_ops: u64, - write_ops: u64, - read_merged: u64, - write_merged: u64, - read_time_seconds: f64, - write_time_seconds: f64, - io_time_seconds: f64, -} - -#[derive(Default)] -struct FilesystemStats { - device: String, - mountpoint: String, - fs_type: String, - mode: &'static str, - used: u64, - free: u64, - reserved: u64, - limit_bytes: Option, -} - -struct FilesystemStatWorker { - tx: mpsc::SyncSender, -} - -struct FilesystemStatRequest { - path: PathBuf, - response: mpsc::Sender>, -} - -struct FilesystemStat { - total_bytes: u64, - free_bytes: u64, - available_bytes: u64, -} - -impl FilesystemStatWorker { - fn new() -> io::Result { - let (tx, rx) = mpsc::sync_channel::(1); - let _handle = std::thread::Builder::new() - .name("host-metrics-statvfs".to_owned()) - .spawn(move || { - while let Ok(request) = rx.recv() { - let result = statvfs_bytes(&request.path); - let _ = request.response.send(result); - } - }) - .map_err(io::Error::other)?; - Ok(Self { tx }) - } - - fn statvfs(&self, path: PathBuf, 
timeout: Duration) -> io::Result { - let (response, rx) = mpsc::channel(); - self.tx - .try_send(FilesystemStatRequest { path, response }) - .map_err(|_| io::Error::new(io::ErrorKind::TimedOut, "statvfs worker is busy"))?; - rx.recv_timeout(timeout) - .map_err(|_| io::Error::new(io::ErrorKind::TimedOut, "statvfs timed out"))? - } -} - -fn statvfs_bytes(path: &Path) -> io::Result { - let stat = nix::sys::statvfs::statvfs(path).map_err(io::Error::other)?; - let block_size = stat.fragment_size(); - Ok(FilesystemStat { - total_bytes: stat.blocks().saturating_mul(block_size), - free_bytes: stat.blocks_free().saturating_mul(block_size), - available_bytes: stat.blocks_available().saturating_mul(block_size), - }) -} - -#[derive(Default)] -struct NetworkStats { - name: String, - rx_bytes: u64, - tx_bytes: u64, - rx_packets: u64, - tx_packets: u64, - rx_errors: u64, - tx_errors: u64, - rx_dropped: u64, - tx_dropped: u64, -} - -fn parse_stat(input: &str, clk_tck: f64) -> StatSnapshot { - let mut snapshot = StatSnapshot::default(); - for line in input.lines() { - if let Some(rest) = line.strip_prefix("cpu ") { - snapshot.cpu = parse_cpu_total(rest, clk_tck); - } else if let Some(value) = line.strip_prefix("btime ") { - snapshot.boot_time_unix_nano = parse_u64(value).saturating_mul(NANOS_PER_SEC); - } else if let Some(value) = line.strip_prefix("procs_running ") { - snapshot.processes.running = parse_u64(value); - } else if let Some(value) = line.strip_prefix("procs_blocked ") { - snapshot.processes.blocked = parse_u64(value); - } else if let Some(value) = line.strip_prefix("processes ") { - snapshot.processes.created = parse_u64(value); - } - } - snapshot -} - -fn parse_cpu_total(input: &str, clk_tck: f64) -> Option { - let mut fields = [0_u64; 10]; - let mut count = 0; - for (idx, token) in input.split_whitespace().take(fields.len()).enumerate() { - fields[idx] = parse_u64(token); - count += 1; - } - if count < 4 { - return None; - } - - let user = 
fields[0].saturating_sub(fields[8]); - let nice = fields[1].saturating_sub(fields[9]); - Some(CpuTimes { - user: ticks_to_seconds(user, clk_tck), - nice: ticks_to_seconds(nice, clk_tck), - system: ticks_to_seconds(fields[2], clk_tck), - idle: ticks_to_seconds(fields[3], clk_tck), - wait: ticks_to_seconds(fields[4], clk_tck), - interrupt: ticks_to_seconds(fields[5].saturating_add(fields[6]), clk_tck), - steal: ticks_to_seconds(fields[7], clk_tck), - }) -} - -fn cpu_utilization(previous: CpuTimes, current: CpuTimes) -> Option { - let user = counter_delta(previous.user, current.user)?; - let nice = counter_delta(previous.nice, current.nice)?; - let system = counter_delta(previous.system, current.system)?; - let idle = counter_delta(previous.idle, current.idle)?; - let wait = counter_delta(previous.wait, current.wait)?; - let interrupt = counter_delta(previous.interrupt, current.interrupt)?; - let steal = counter_delta(previous.steal, current.steal)?; - let total = user + nice + system + idle + wait + interrupt + steal; - (total > 0.0).then(|| CpuTimes { - user: user / total, - nice: nice / total, - system: system / total, - idle: idle / total, - wait: wait / total, - interrupt: interrupt / total, - steal: steal / total, - }) -} - -fn counter_delta(previous: f64, current: f64) -> Option { - (current >= previous).then_some(current - previous) -} - -fn parse_cpuinfo(input: &str) -> CpuInfo { - let mut logical_count = 0; - let mut frequencies_hz = Vec::new(); - let mut physical_cores = HashSet::new(); - let mut physical_id = None; - let mut core_id = None; - - for line in input.lines() { - let Some((key, value)) = line.split_once(':') else { - continue; - }; - let key = key.trim(); - let value = value.trim(); - match key { - "processor" => { - logical_count += 1; - if let (Some(physical), Some(core)) = (physical_id.take(), core_id.take()) { - let _ = physical_cores.insert((physical, core)); - } - } - "physical id" => physical_id = Some(parse_u64(value)), - "core id" => 
core_id = Some(parse_u64(value)),
            "cpu MHz" => {
                if let Ok(mhz) = value.parse::<f64>() {
                    frequencies_hz.push(mhz * 1_000_000.0);
                }
            }
            _ => {}
        }
    }
    // Flush the final processor's (package, core) pair.
    if let (Some(physical), Some(core)) = (physical_id, core_id) {
        let _ = physical_cores.insert((physical, core));
    }

    // Fall back to the logical count when topology information is absent
    // (e.g. some VMs omit "physical id"/"core id").
    let physical_count = u64::try_from(physical_cores.len())
        .ok()
        .filter(|count| *count != 0)
        .unwrap_or(logical_count);
    CpuInfo {
        logical_count,
        physical_count,
        frequencies_hz,
    }
}

/// Parses `/proc/meminfo` into byte-denominated memory statistics.
///
/// The kernel reports values in kB; they are scaled to bytes here except for
/// the hugepage *counts*, which stay in pages. When `MemAvailable` is absent
/// (pre-3.14 kernels) it is approximated as free + buffers + cached and
/// `has_available` records which case applied. Returns `None` when `MemTotal`
/// is missing or zero.
fn parse_meminfo(input: &str) -> Option<MemoryStats> {
    let mut total = 0;
    let mut free = 0;
    let mut available = None;
    let mut buffers = 0;
    let mut cached = 0;
    let mut shared = 0;
    let mut slab_reclaimable = 0;
    let mut slab_unreclaimable = 0;
    let mut hugepages = HugepageStats::default();

    for line in input.lines() {
        let mut fields = line.split_whitespace();
        let Some(key) = fields.next() else {
            continue;
        };
        let raw_value = fields.next().map(parse_u64).unwrap_or_default();
        // Saturate instead of a plain multiply so an absurd kernel value
        // cannot overflow in debug builds (matches the other /proc parsers).
        let value = raw_value.saturating_mul(BYTES_PER_KIB);
        match key.trim_end_matches(':') {
            "MemTotal" => total = value,
            "MemFree" => free = value,
            "MemAvailable" => available = Some(value),
            "Buffers" => buffers = value,
            "Cached" => cached = value,
            "Shmem" => shared = value,
            "SReclaimable" => slab_reclaimable = value,
            "SUnreclaim" => slab_unreclaimable = value,
            // Hugepage counts are page counts, not kB — keep the raw value.
            "HugePages_Total" => hugepages.total = raw_value,
            "HugePages_Free" => hugepages.free = raw_value,
            "HugePages_Rsvd" => hugepages.reserved = raw_value,
            "HugePages_Surp" => hugepages.surplus = raw_value,
            "Hugepagesize" => hugepages.page_size_bytes = value,
            _ => {}
        }
    }

    if total == 0 {
        return None;
    }
    let has_available = available.is_some();
    let available =
        available.unwrap_or_else(|| free.saturating_add(buffers).saturating_add(cached));
    Some(MemoryStats {
        total,
        used: total.saturating_sub(available),
        free,
        available,
        has_available,
        cached,
        buffered: buffers,
        shared,
        slab_reclaimable,
        slab_unreclaimable,
        hugepages,
    })
}

/// Parses `/proc/uptime`: the first whitespace-separated field is the uptime
/// in seconds.
fn parse_uptime(input: &str) -> Option<f64> {
    input.split_whitespace().next()?.parse().ok()
}

/// Parses `/proc/vmstat` into page-fault, paging, and swap counters.
///
/// `pgfault` is the total fault count, so minor faults are derived as
/// total − major.
fn parse_vmstat(input: &str) -> PagingStats {
    let mut total_faults = 0;
    let mut major_faults = 0;
    let mut page_in = 0;
    let mut page_out = 0;
    let mut swap_in = 0;
    let mut swap_out = 0;

    for line in input.lines() {
        let mut fields = line.split_whitespace();
        let Some(key) = fields.next() else {
            continue;
        };
        let value = fields.next().map(parse_u64).unwrap_or_default();
        match key {
            "pgfault" => total_faults = value,
            "pgmajfault" => major_faults = value,
            "pgpgin" => page_in = value,
            "pgpgout" => page_out = value,
            "pswpin" => swap_in = value,
            "pswpout" => swap_out = value,
            _ => {}
        }
    }

    PagingStats {
        minor_faults: total_faults.saturating_sub(major_faults),
        major_faults,
        page_in,
        page_out,
        swap_in,
        swap_out,
    }
}

/// Parses `/proc/swaps` (header line skipped) into per-device swap usage.
/// Sizes are reported in kB and converted to bytes.
fn parse_swaps(input: &str) -> Vec<SwapStats> {
    let mut swaps = Vec::new();
    for line in input.lines().skip(1) {
        let mut fields = line.split_whitespace();
        let Some(name) = fields.next() else {
            continue;
        };
        let _kind = fields.next();
        let Some(size_kib) = fields.next() else {
            continue;
        };
        let Some(used_kib) = fields.next() else {
            continue;
        };
        let size = parse_u64(size_kib).saturating_mul(BYTES_PER_KIB);
        let used = parse_u64(used_kib).saturating_mul(BYTES_PER_KIB);
        swaps.push(SwapStats {
            name: name.to_owned(),
            size,
            used,
            free: size.saturating_sub(used),
        });
    }
    swaps
}

/// Parses `/proc/diskstats` rows into per-device I/O counters.
///
/// Device-name filters are applied before any field parsing so malformed rows
/// for excluded devices are never touched. Sector counts are converted to
/// bytes using the fixed 512-byte diskstats sector; times are reported by the
/// kernel in milliseconds and converted to seconds.
fn parse_diskstats(
    input: &str,
    include: Option<&CompiledFilter>,
    exclude: Option<&CompiledFilter>,
) -> Vec<DiskStats> {
    let mut disks = Vec::new();
    for line in input.lines() {
        let mut fields = line.split_whitespace();
        let _major = fields.next();
        let _minor = fields.next();
        let Some(name) = fields.next() else {
            continue;
        };
        if !filter_allows(name, include, exclude) {
            continue;
        }
        let Some(read_ops) = fields.next() else {
            continue;
        };
        let Some(read_merged) = fields.next() else {
            continue;
        };
        let Some(read_sectors) = fields.next() else {
            continue;
        };
        let Some(read_ms) = fields.next() else {
            continue;
        };
        let Some(write_ops) = fields.next() else {
            continue;
        };
        let Some(write_merged) = fields.next() else {
            continue;
        };
        let Some(write_sectors) = fields.next() else {
            continue;
        };
        let Some(write_ms) = fields.next() else {
            continue;
        };
        let _in_progress = fields.next();
        let Some(io_ms) = fields.next() else {
            continue;
        };
        disks.push(DiskStats {
            name: name.to_owned(),
            limit_bytes: None,
            read_ops: parse_u64(read_ops),
            read_bytes: parse_u64(read_sectors).saturating_mul(DISKSTAT_SECTOR_BYTES),
            write_ops: parse_u64(write_ops),
            write_bytes: parse_u64(write_sectors).saturating_mul(DISKSTAT_SECTOR_BYTES),
            read_merged: parse_u64(read_merged),
            write_merged: parse_u64(write_merged),
            read_time_seconds: millis_to_seconds(parse_u64(read_ms)),
            write_time_seconds: millis_to_seconds(parse_u64(write_ms)),
            io_time_seconds: millis_to_seconds(parse_u64(io_ms)),
        });
    }
    disks
}

/// One mounted filesystem parsed from `/proc/<pid>/mountinfo`.
struct FilesystemMount {
    device: String,
    mountpoint: String,
    fs_type: String,
    /// `"ro"` or `"rw"`, derived from the mount options.
    mode: &'static str,
    /// Whether the filesystem-limit metric should be emitted for this mount.
    emit_limit: bool,
}

/// Include/exclude filters applied while parsing mountinfo.
#[derive(Clone, Copy, Default)]
struct FilesystemFilters<'a> {
    include_devices: Option<&'a CompiledFilter>,
    exclude_devices: Option<&'a CompiledFilter>,
    include_fs_types: Option<&'a CompiledFilter>,
    exclude_fs_types: Option<&'a CompiledFilter>,
    include_mount_points: Option<&'a CompiledFilter>,
    exclude_mount_points: Option<&'a CompiledFilter>,
}

/// Parses `/proc/<pid>/mountinfo` into filtered filesystem mounts.
/// (Definition continues past this span.)
fn parse_mountinfo(
    input: &str,
    include_virtual_filesystems: bool,
    emit_limit: bool,
    filters: FilesystemFilters<'_>,
) -> Vec<FilesystemMount> {
    let mut mounts = Vec::new();
    for line in input.lines() {
        // " - " separates the variable-length optional fields from
        // fstype/source/super-options.
        let Some(separator) = line.find(" - ") else {
            continue;
        };
        let mut pre_fields = line[..separator].split_whitespace();
        let _mount_id = pre_fields.next();
        let _parent_id = pre_fields.next();
        let
_major_minor = pre_fields.next();
        let _root = pre_fields.next();
        let Some(mountpoint) = pre_fields.next() else {
            continue;
        };
        let Some(options) = pre_fields.next() else {
            continue;
        };

        let mut post_fields = line[separator + 3..].split_whitespace();
        let Some(fs_type) = post_fields.next() else {
            continue;
        };
        let Some(device) = post_fields.next() else {
            continue;
        };
        if !include_virtual_filesystems && is_skipped_filesystem_type(fs_type) {
            continue;
        }
        if !filter_allows(fs_type, filters.include_fs_types, filters.exclude_fs_types) {
            continue;
        }
        // Filters match the unescaped form the user would write.
        let device = unescape_mountinfo(device);
        if !filter_allows(&device, filters.include_devices, filters.exclude_devices) {
            continue;
        }
        let mountpoint = unescape_mountinfo(mountpoint);
        if !filter_allows(
            &mountpoint,
            filters.include_mount_points,
            filters.exclude_mount_points,
        ) {
            continue;
        }
        mounts.push(FilesystemMount {
            device,
            mountpoint,
            fs_type: fs_type.to_owned(),
            mode: filesystem_mode(options),
            emit_limit,
        });
    }
    mounts
}

/// Returns `"ro"` when the comma-separated mount options contain a literal
/// `ro` flag, `"rw"` otherwise.
fn filesystem_mode(options: &str) -> &'static str {
    if options.split(',').any(|option| option == "ro") {
        "ro"
    } else {
        "rw"
    }
}

/// Filesystem types skipped by default: kernel pseudo-filesystems plus
/// network filesystems whose capacity is not local host capacity.
fn is_skipped_filesystem_type(fs_type: &str) -> bool {
    matches!(
        fs_type,
        "autofs"
            | "bpf"
            | "binfmt_misc"
            | "cgroup"
            | "cgroup2"
            | "debugfs"
            | "devtmpfs"
            | "fusectl"
            | "mqueue"
            | "nsfs"
            | "overlay"
            | "proc"
            | "pstore"
            | "squashfs"
            | "sysfs"
            | "tmpfs"
            | "tracefs"
            | "nfs"
            | "nfs4"
            | "cifs"
            | "smb3"
            | "9p"
    )
}

/// Decodes mountinfo's `\NNN` octal escapes (e.g. `\040` for space).
///
/// Malformed escapes (non-octal digits, value > 255, or an escape that would
/// split a multi-byte UTF-8 character) are copied through literally. The
/// `is_char_boundary` guard is required: slicing `input` at an arbitrary byte
/// offset panics when it lands inside a multi-byte character.
fn unescape_mountinfo(input: &str) -> String {
    let bytes = input.as_bytes();
    // Fast path: return the input unchanged when no escape candidate exists.
    let mut escaped = None;
    for idx in 0..bytes.len() {
        if bytes[idx] == b'\\' && idx + 4 <= bytes.len() {
            escaped = Some(idx);
            break;
        }
    }
    let Some(first_escape) = escaped else {
        return input.to_owned();
    };

    let mut output = Vec::with_capacity(input.len());
    output.extend_from_slice(&bytes[..first_escape]);
    let mut idx = first_escape;
    while idx < bytes.len() {
        // idx + 1 is always a char boundary (bytes[idx] is ASCII '\\'), but
        // idx + 4 may fall inside a multi-byte character.
        if bytes[idx] == b'\\' && idx + 4 <= bytes.len() && input.is_char_boundary(idx + 4) {
            let octal = &input[idx + 1..idx + 4];
            if let Ok(value) = u8::from_str_radix(octal, 8) {
                output.push(value);
                idx += 4;
                continue;
            }
        }
        output.push(bytes[idx]);
        idx += 1;
    }
    String::from_utf8_lossy(&output).into_owned()
}

/// Parses `/proc/net/dev` (two header lines skipped) into per-interface
/// counters, applying the interface-name filters before parsing values.
fn parse_netdev(
    input: &str,
    include: Option<&CompiledFilter>,
    exclude: Option<&CompiledFilter>,
) -> Vec<NetworkStats> {
    let mut interfaces = Vec::new();
    for line in input.lines().skip(2) {
        let Some((name, values)) = line.split_once(':') else {
            continue;
        };
        let name = name.trim();
        if !filter_allows(name, include, exclude) {
            continue;
        }
        let mut fields = values.split_whitespace();
        let Some(rx_bytes) = fields.next() else {
            continue;
        };
        let Some(rx_packets) = fields.next() else {
            continue;
        };
        let Some(rx_errors) = fields.next() else {
            continue;
        };
        let Some(rx_dropped) = fields.next() else {
            continue;
        };
        let _rx_fifo = fields.next();
        let _rx_frame = fields.next();
        let _rx_compressed = fields.next();
        let _rx_multicast = fields.next();
        let Some(tx_bytes) = fields.next() else {
            continue;
        };
        let Some(tx_packets) = fields.next() else {
            continue;
        };
        let Some(tx_errors) = fields.next() else {
            continue;
        };
        let Some(tx_dropped) = fields.next() else {
            continue;
        };
        interfaces.push(NetworkStats {
            name: name.to_owned(),
            rx_bytes: parse_u64(rx_bytes),
            rx_packets: parse_u64(rx_packets),
            tx_bytes: parse_u64(tx_bytes),
            tx_packets: parse_u64(tx_packets),
            rx_errors: parse_u64(rx_errors),
            tx_errors: parse_u64(tx_errors),
            rx_dropped: parse_u64(rx_dropped),
            tx_dropped: parse_u64(tx_dropped),
        });
    }
    interfaces
}

/// A value passes when it matches the include filter (or none is set) and
/// does not match the exclude filter.
fn filter_allows(
    value: &str,
    include: Option<&CompiledFilter>,
    exclude: Option<&CompiledFilter>,
) -> bool {
    include.is_none_or(|filter| filter.matches(value))
        && !exclude.is_some_and(|filter| filter.matches(value))
}

/// Bumps the partial-error counter, keeping only the first error for
/// reporting.
fn record_partial_error(
    partial_errors: &mut u64,
    first_error: &mut Option<io::Error>,
    err: io::Error,
) {
    *partial_errors = partial_errors.saturating_add(1);
    if first_error.is_none() {
        *first_error = Some(err);
    }
}

/// Converts a frequency in Hz (f64) to a non-negative i64, clamping NaN,
/// infinities, and non-positive values to 0 and huge values to `i64::MAX`.
fn frequency_hz_i64(value: f64) -> i64 {
    if !value.is_finite() || value <= 0.0 {
        return 0;
    }
    if value >= i64::MAX as f64 {
        return i64::MAX;
    }
    value.round() as i64
}

/// Lenient u64 parse: unparseable input yields 0, matching how /proc parsers
/// here treat missing fields.
fn parse_u64(input: &str) -> u64 {
    input.parse().unwrap_or_default()
}

/// Converts clock ticks to seconds using the system tick rate.
fn ticks_to_seconds(ticks: u64, clk_tck: f64) -> f64 {
    ticks as f64 / clk_tck
}

/// Converts milliseconds to seconds.
fn millis_to_seconds(ms: u64) -> f64 {
    ms as f64 / 1_000.0
}

/// Returns the kernel clock tick rate (USER_HZ), falling back to the common
/// default of 100 when `sysconf` fails.
#[allow(unsafe_code)]
fn clock_ticks_per_second() -> f64 {
    // SAFETY: _SC_CLK_TCK is a valid sysconf name; the call has no side effects.
    let ticks = unsafe { libc::sysconf(libc::_SC_CLK_TCK) };
    if ticks > 0 { ticks as f64 } else { 100.0 }
}

/// Current wall-clock time as nanoseconds since the Unix epoch; 0 when the
/// clock is before the epoch. Fully saturating so an extreme clock value
/// cannot overflow.
fn now_unix_nano() -> u64 {
    let Ok(duration) = SystemTime::now().duration_since(UNIX_EPOCH) else {
        return 0;
    };
    duration
        .as_secs()
        .saturating_mul(NANOS_PER_SEC)
        .saturating_add(u64::from(duration.subsec_nanos()))
}

/// Narrows a u64 to i64, clamping to `i64::MAX` on overflow.
fn saturating_i64(value: u64) -> i64 {
    i64::try_from(value).unwrap_or(i64::MAX)
}

#[cfg(test)]
mod tests {
    use super::*;
    use otap_df_pdata::proto::opentelemetry::common::v1::{AnyValue, KeyValue, any_value};
    use otap_df_pdata::proto::opentelemetry::metrics::v1::{
        AggregationTemporality, Metric, MetricsData, NumberDataPoint, metric as otlp_metric,
        number_data_point,
    };
    use otap_df_pdata::testing::round_trip::decode_metrics;
    #[cfg(feature = "dev-tools")]
    use std::collections::{BTreeMap, BTreeSet};
    #[cfg(feature = "dev-tools")]
    use weaver_common::{result::WResult, vdir::VirtualDirectoryPath};
    #[cfg(feature = "dev-tools")]
    use weaver_forge::registry::ResolvedRegistry;
    #[cfg(feature = "dev-tools")]
    use weaver_resolver::SchemaResolver;
    #[cfg(feature = "dev-tools")]
    use weaver_semconv::{
        attribute::{
            AttributeType, BasicRequirementLevelSpec,
PrimitiveOrArrayTypeSpec, RequirementLevel, - ValueSpec, - }, - group::{GroupType, InstrumentSpec}, - registry_repo::RegistryRepo, - }; - - #[test] - fn projection_uses_expected_metric_shapes() { - let data = projection_fixture_request(); - - let resource_metrics = data.resource_metrics.first().expect("resource metrics"); - let resource = resource_metrics.resource.as_ref().expect("resource"); - assert_has_attr(&resource.attributes, attr::OS_TYPE, "linux"); - assert_has_attr(&resource.attributes, attr::HOST_ID, "host-id"); - assert_has_attr(&resource.attributes, attr::HOST_NAME, "host-name"); - assert_has_attr(&resource.attributes, attr::HOST_ARCH, "amd64"); - - let metrics = &resource_metrics.scope_metrics[0].metrics; - assert_metric_shape(metrics, metric::CPU_TIME, "s", Some(true)); - assert_first_point_attr(metrics, metric::CPU_TIME, attr::CPU_MODE, "user"); - assert_sum_point_attr(metrics, metric::CPU_TIME, attr::CPU_MODE, "iowait"); - assert_metric_shape(metrics, metric::CPU_UTILIZATION, "1", None); - assert_first_point_attr(metrics, metric::CPU_UTILIZATION, attr::CPU_MODE, "user"); - assert_metric_shape(metrics, metric::CPU_LOGICAL_COUNT, "{cpu}", Some(false)); - assert_metric_shape(metrics, metric::CPU_PHYSICAL_COUNT, "{cpu}", Some(false)); - assert_metric_shape(metrics, metric::CPU_FREQUENCY, "Hz", None); - assert_first_point_int(metrics, metric::CPU_FREQUENCY, 2_400_000_000); - assert_first_point_attr_int(metrics, metric::CPU_FREQUENCY, attr::CPU_LOGICAL_NUMBER, 0); - assert_metric_shape(metrics, metric::MEMORY_USAGE, "By", Some(false)); - assert_first_point_attr( - metrics, - metric::MEMORY_USAGE, - attr::SYSTEM_MEMORY_STATE, - "used", - ); - assert_metric_shape(metrics, metric::MEMORY_UTILIZATION, "1", None); - assert_metric_shape(metrics, metric::MEMORY_LINUX_AVAILABLE, "By", Some(false)); - assert_metric_shape(metrics, metric::MEMORY_LINUX_SLAB_USAGE, "By", Some(false)); - assert_metric_shape(metrics, metric::MEMORY_LIMIT, "By", Some(false)); - 
assert_metric_shape(metrics, metric::MEMORY_LINUX_SHARED, "By", Some(false)); - assert_metric_shape( - metrics, - metric::MEMORY_LINUX_HUGEPAGES_LIMIT, - "{page}", - Some(false), - ); - assert_metric_shape( - metrics, - metric::MEMORY_LINUX_HUGEPAGES_PAGE_SIZE, - "By", - Some(false), - ); - assert_metric_shape( - metrics, - metric::MEMORY_LINUX_HUGEPAGES_RESERVED, - "{page}", - Some(false), - ); - assert_metric_shape( - metrics, - metric::MEMORY_LINUX_HUGEPAGES_SURPLUS, - "{page}", - Some(false), - ); - assert_metric_shape( - metrics, - metric::MEMORY_LINUX_HUGEPAGES_USAGE, - "{page}", - Some(false), - ); - assert_first_point_attr( - metrics, - metric::MEMORY_LINUX_HUGEPAGES_USAGE, - attr::SYSTEM_MEMORY_LINUX_HUGEPAGES_STATE, - "used", - ); - assert_metric_shape( - metrics, - metric::MEMORY_LINUX_HUGEPAGES_UTILIZATION, - "1", - None, - ); - assert_metric_shape(metrics, metric::UPTIME, "s", None); - assert_metric_shape(metrics, metric::PAGING_FAULTS, "{fault}", Some(true)); - assert_first_point_attr( - metrics, - metric::PAGING_FAULTS, - attr::SYSTEM_PAGING_FAULT_TYPE, - "minor", - ); - assert_metric_shape( - metrics, - metric::PAGING_OPERATIONS, - "{operation}", - Some(true), - ); - assert_sum_point_attr( - metrics, - metric::PAGING_OPERATIONS, - attr::SYSTEM_PAGING_DIRECTION, - "in", - ); - assert_sum_point_attr( - metrics, - metric::PAGING_OPERATIONS, - attr::SYSTEM_PAGING_FAULT_TYPE, - "minor", - ); - assert_metric_shape(metrics, metric::PAGING_USAGE, "By", Some(false)); - assert_first_point_attr( - metrics, - metric::PAGING_USAGE, - attr::SYSTEM_DEVICE, - "/dev/swap", - ); - assert_metric_shape(metrics, metric::PAGING_UTILIZATION, "1", None); - assert_metric_shape(metrics, metric::PROCESS_COUNT, "{process}", Some(false)); - assert_sum_point_attr( - metrics, - metric::PROCESS_COUNT, - attr::PROCESS_STATE, - "running", - ); - assert_metric_shape(metrics, metric::PROCESS_CREATED, "{process}", Some(true)); - assert_metric_shape(metrics, metric::DISK_IO, "By", 
Some(true)); - assert_first_point_attr(metrics, metric::DISK_IO, attr::DISK_IO_DIRECTION, "read"); - assert_metric_shape(metrics, metric::DISK_OPERATIONS, "{operation}", Some(true)); - assert_metric_shape(metrics, metric::DISK_IO_TIME, "s", Some(true)); - assert_first_point_attr(metrics, metric::DISK_IO_TIME, attr::SYSTEM_DEVICE, "sda"); - assert_metric_shape(metrics, metric::DISK_OPERATION_TIME, "s", Some(true)); - assert_metric_shape(metrics, metric::DISK_MERGED, "{operation}", Some(true)); - assert_metric_shape(metrics, metric::DISK_LIMIT, "By", Some(false)); - assert_first_point_attr(metrics, metric::DISK_LIMIT, attr::SYSTEM_DEVICE, "sda"); - assert_metric_shape(metrics, metric::FILESYSTEM_USAGE, "By", Some(false)); - assert_first_point_attr( - metrics, - metric::FILESYSTEM_USAGE, - attr::SYSTEM_FILESYSTEM_STATE, - "used", - ); - assert_metric_shape(metrics, metric::FILESYSTEM_UTILIZATION, "1", None); - assert_metric_shape(metrics, metric::FILESYSTEM_LIMIT, "By", Some(false)); - assert_no_first_point_attr( - metrics, - metric::FILESYSTEM_LIMIT, - attr::SYSTEM_FILESYSTEM_STATE, - ); - assert_metric_shape(metrics, metric::NETWORK_IO, "By", Some(true)); - assert_first_point_attr( - metrics, - metric::NETWORK_IO, - attr::NETWORK_INTERFACE_NAME, - "eth0", - ); - assert_metric_shape( - metrics, - metric::NETWORK_PACKET_COUNT, - "{packet}", - Some(true), - ); - assert_first_point_attr( - metrics, - metric::NETWORK_PACKET_COUNT, - attr::SYSTEM_DEVICE, - "eth0", - ); - assert_metric_shape( - metrics, - metric::NETWORK_PACKET_DROPPED, - "{packet}", - Some(true), - ); - assert_first_point_attr( - metrics, - metric::NETWORK_PACKET_DROPPED, - attr::NETWORK_INTERFACE_NAME, - "eth0", - ); - assert_metric_shape(metrics, metric::NETWORK_ERRORS, "{error}", Some(true)); - } - - #[cfg(feature = "dev-tools")] - #[test] - #[ignore = "dev-only semconv drift check; may access a local or remote semantic-conventions registry"] - fn emitted_phase1_metric_shapes_match_weaver_semconv() { - 
let registry = load_semconv_registry(); - let semconv_shapes = semconv_system_metric_shapes(®istry); - let emitted_shapes = emitted_phase1_metric_shapes(); - - for (name, emitted) in emitted_shapes { - let semconv = semconv_shapes - .get(&name) - .unwrap_or_else(|| panic!("missing semconv metric {name}")); - - assert_eq!(emitted.unit, semconv.unit, "unit mismatch for {name}"); - assert_eq!( - emitted.monotonic, semconv.monotonic, - "instrument/temporality mismatch for {name}" - ); - assert_eq!( - emitted.value_type, semconv.value_type, - "metric value type mismatch for {name}" - ); - - for attr in &semconv.attributes { - assert!( - emitted.attributes.contains(attr), - "missing semconv attribute {attr} on {name}" - ); - } - for attr in &emitted.attributes { - assert!( - semconv.all_attributes.contains(attr), - "unexpected semconv attribute {attr} on {name}" - ); - } - for (attr, emitted_kind) in &emitted.attribute_types { - let Some(semconv_kind) = semconv.attribute_types.get(attr) else { - continue; - }; - assert_eq!( - emitted_kind, semconv_kind, - "attribute value type mismatch for {attr} on {name}" - ); - } - for (attr, values) in &emitted.enum_values { - let Some(allowed_values) = semconv.enum_values.get(attr) else { - continue; - }; - for value in values { - if is_intentional_semconv_enum_value_gap(name.as_str(), attr.as_str(), value) { - continue; - } - assert!( - allowed_values.contains(value), - "unexpected enum value {attr}={value} on {name}" - ); - } - } - } - } - - #[test] - fn projection_uses_counter_start_overrides_for_reset_series() { - let data = decode_metrics( - HostSnapshot { - now_unix_nano: 2_000, - start_time_unix_nano: 1_000, - counter_starts: CounterStarts { - entries: vec![(counter_key(metric::PROCESS_CREATED, ""), 1_500)], - }, - processes: Some(ProcessStats { - created: 99, - ..ProcessStats::default() - }), - ..HostSnapshot::default() - } - .into_otap_records() - .expect("encode ok"), - ); - - let metrics = 
&data.resource_metrics[0].scope_metrics[0].metrics; - assert_first_sum_point_start(metrics, metric::PROCESS_CREATED, 1_500); - } - - #[test] - fn counter_tracker_rebaselines_reset_series_only() { - let mut tracker = CounterTracker::default(); - let disks = vec![DiskStats { - name: "sda".to_owned(), - read_bytes: 100, - write_bytes: 200, - ..DiskStats::default() - }]; - let starts = tracker.snapshot(10, 20, None, None, None, &disks, &[]); - - assert_eq!(starts.get_joined(metric::DISK_IO, "sda", "read", 10), 10); - assert_eq!(starts.get_joined(metric::DISK_IO, "sda", "write", 10), 10); - - let disks = vec![DiskStats { - name: "sda".to_owned(), - read_bytes: 50, - write_bytes: 250, - ..DiskStats::default() - }]; - let starts = tracker.snapshot(10, 30, None, None, None, &disks, &[]); - - assert_eq!(starts.get_joined(metric::DISK_IO, "sda", "read", 10), 30); - assert_eq!(starts.get_joined(metric::DISK_IO, "sda", "write", 10), 10); - } - - #[test] - fn counter_tracker_rebaselines_paging_operations_by_direction_and_fault_type() { - let mut tracker = CounterTracker::default(); - let paging = PagingStats { - swap_in: 100, - swap_out: 200, - page_in: 300, - page_out: 400, - ..PagingStats::default() - }; - let starts = tracker.snapshot(10, 20, None, Some(&paging), None, &[], &[]); - - assert_eq!( - starts.get_joined(metric::PAGING_OPERATIONS, "in", "major", 10), - 10 - ); - assert_eq!( - starts.get_joined(metric::PAGING_OPERATIONS, "out", "minor", 10), - 10 - ); - - let paging = PagingStats { - swap_in: 50, - swap_out: 250, - page_in: 350, - page_out: 450, - ..PagingStats::default() - }; - let starts = tracker.snapshot(10, 30, None, Some(&paging), None, &[], &[]); - - assert_eq!( - starts.get_joined(metric::PAGING_OPERATIONS, "in", "major", 10), - 30 - ); - assert_eq!( - starts.get_joined(metric::PAGING_OPERATIONS, "out", "major", 10), - 10 - ); - assert_eq!( - starts.get_joined(metric::PAGING_OPERATIONS, "in", "minor", 10), - 10 - ); - assert_eq!( - 
starts.get_joined(metric::PAGING_OPERATIONS, "out", "minor", 10), - 10 - ); - } - - #[test] - fn counter_keys_do_not_collide_with_pipe_in_series_values() { - let metric = metric::DISK_IO; - let device = "read|write"; - let joined = counter_key_joined(metric, device, "read"); - assert!(!counter_key_matches_joined( - &joined, - metric, - "read", - "write|read" - )); - assert!(counter_key_matches_joined(&joined, metric, device, "read")); - } - - #[test] - fn scrape_due_emits_successful_families_after_partial_read_error() { - let root = tempfile::tempdir().expect("tempdir"); - let proc = root.path().join("proc"); - std::fs::create_dir(&proc).expect("proc dir"); - std::fs::write( - proc.join("meminfo"), - "MemTotal: 1000 kB\nMemFree: 100 kB\nMemAvailable: 200 kB\n", - ) - .expect("meminfo"); - // Cumulative metrics read /proc/stat once to cache boot time. Provide - // btime here so this test only exercises the missing diskstats error. - std::fs::write(proc.join("stat"), "btime 1700000000\n").expect("stat"); - let mut source = ProcfsSource::new( - Some(root.path()), - ProcfsConfig { - cpu: false, - memory: true, - paging: false, - system: false, - disk: true, - filesystem: false, - network: false, - processes: false, - cpu_utilization: false, - memory_limit: false, - memory_shared: false, - memory_hugepages: false, - disk_limit: false, - filesystem_include_virtual: false, - filesystem_limit: false, - filesystem_include_devices: None, - filesystem_exclude_devices: None, - filesystem_include_fs_types: None, - filesystem_exclude_fs_types: None, - filesystem_include_mount_points: None, - filesystem_exclude_mount_points: None, - disk_include: None, - disk_exclude: None, - network_include: None, - network_exclude: None, - validation: HostViewValidationMode::None, - }, - ) - .expect("source"); - - let scrape = source - .scrape_due(ProcfsFamilies { - memory: true, - disk: true, - ..ProcfsFamilies::default() - }) - .expect("partial scrape"); - - assert_eq!(scrape.partial_errors, 
1); - assert!(scrape.snapshot.memory.is_some()); - assert!(scrape.snapshot.disks.is_empty()); - } - - #[test] - fn scrape_due_fails_when_all_due_families_fail() { - let root = tempfile::tempdir().expect("tempdir"); - let mut source = ProcfsSource::new( - Some(root.path()), - ProcfsConfig { - cpu: false, - memory: true, - paging: false, - system: false, - disk: false, - filesystem: false, - network: false, - processes: false, - cpu_utilization: false, - memory_limit: false, - memory_shared: false, - memory_hugepages: false, - disk_limit: false, - filesystem_include_virtual: false, - filesystem_limit: false, - filesystem_include_devices: None, - filesystem_exclude_devices: None, - filesystem_include_fs_types: None, - filesystem_exclude_fs_types: None, - filesystem_include_mount_points: None, - filesystem_exclude_mount_points: None, - disk_include: None, - disk_exclude: None, - network_include: None, - network_exclude: None, - validation: HostViewValidationMode::None, - }, - ) - .expect("source"); - - assert!( - source - .scrape_due(ProcfsFamilies { - memory: true, - ..ProcfsFamilies::default() - }) - .is_err() - ); - } - - #[test] - fn scrape_due_reads_opt_in_disk_limit_from_sysfs() { - let root = tempfile::tempdir().expect("tempdir"); - let proc = root.path().join("proc"); - let sys_sda = root.path().join("sys/block/sda"); - std::fs::create_dir(&proc).expect("proc dir"); - std::fs::create_dir_all(&sys_sda).expect("sys block dir"); - std::fs::write( - proc.join("diskstats"), - "8 0 sda 1 0 2 3 4 0 5 6 0 0 0 0 0 0 0 0\n", - ) - .expect("diskstats"); - std::fs::write(sys_sda.join("size"), "4096\n").expect("disk size"); - let mut source = ProcfsSource::new( - Some(root.path()), - ProcfsConfig { - cpu: false, - memory: false, - paging: false, - system: false, - disk: true, - filesystem: false, - network: false, - processes: false, - cpu_utilization: false, - memory_limit: false, - memory_shared: false, - memory_hugepages: false, - disk_limit: true, - 
filesystem_include_virtual: false, - filesystem_limit: false, - filesystem_include_devices: None, - filesystem_exclude_devices: None, - filesystem_include_fs_types: None, - filesystem_exclude_fs_types: None, - filesystem_include_mount_points: None, - filesystem_exclude_mount_points: None, - disk_include: None, - disk_exclude: None, - network_include: None, - network_exclude: None, - validation: HostViewValidationMode::None, - }, - ) - .expect("source"); - - let scrape = source - .scrape_due(ProcfsFamilies { - disk: true, - ..ProcfsFamilies::default() - }) - .expect("disk scrape"); - - assert_eq!(scrape.snapshot.disks.len(), 1); - assert_eq!( - scrape.snapshot.disks[0].limit_bytes, - Some(4096 * DISKSTAT_SECTOR_BYTES) - ); - } - - #[test] - fn scrape_due_uses_boot_time_for_counter_only_family_ticks() { - let root = tempfile::tempdir().expect("tempdir"); - let proc = root.path().join("proc"); - let proc_one = proc.join("1"); - std::fs::create_dir_all(proc_one.join("net")).expect("proc dirs"); - std::fs::write(proc.join("stat"), "btime 123\n").expect("stat"); - std::fs::write( - proc.join("diskstats"), - "8 0 sda 1 0 2 3 4 0 5 6 0 0 0 0 0 0 0 0\n", - ) - .expect("diskstats"); - std::fs::write( - proc_one.join("net/dev"), - "Inter-| Receive | Transmit\n\ - face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n\ - eth0: 10 1 0 0 0 0 0 0 20 2 0 0 0 0 0 0\n", - ) - .expect("netdev"); - std::fs::write( - proc.join("vmstat"), - "pgfault 10\npgmajfault 1\npgpgin 2\npgpgout 3\npswpin 4\npswpout 5\n", - ) - .expect("vmstat"); - std::fs::write(proc.join("swaps"), "Filename Type Size Used Priority\n").expect("swaps"); - - let mut source = ProcfsSource::new( - Some(root.path()), - ProcfsConfig { - cpu: false, - memory: false, - paging: true, - system: false, - disk: true, - filesystem: false, - network: true, - processes: false, - cpu_utilization: false, - memory_limit: false, - memory_shared: false, - 
memory_hugepages: false, - disk_limit: false, - filesystem_include_virtual: false, - filesystem_limit: false, - filesystem_include_devices: None, - filesystem_exclude_devices: None, - filesystem_include_fs_types: None, - filesystem_exclude_fs_types: None, - filesystem_include_mount_points: None, - filesystem_exclude_mount_points: None, - disk_include: None, - disk_exclude: None, - network_include: None, - network_exclude: None, - validation: HostViewValidationMode::None, - }, - ) - .expect("source"); - - let expected_start = 123 * NANOS_PER_SEC; - let disk_scrape = source - .scrape_due(ProcfsFamilies { - disk: true, - ..ProcfsFamilies::default() - }) - .expect("disk scrape"); - assert_eq!(disk_scrape.snapshot.start_time_unix_nano, expected_start); - assert_eq!(disk_scrape.snapshot.disks.len(), 1); - - std::fs::remove_file(proc.join("stat")).expect("remove stat after cache"); - - let network_scrape = source - .scrape_due(ProcfsFamilies { - network: true, - ..ProcfsFamilies::default() - }) - .expect("network scrape"); - assert_eq!(network_scrape.snapshot.start_time_unix_nano, expected_start); - assert_eq!(network_scrape.snapshot.networks.len(), 1); - - let paging_scrape = source - .scrape_due(ProcfsFamilies { - paging: true, - ..ProcfsFamilies::default() - }) - .expect("paging scrape"); - assert_eq!(paging_scrape.snapshot.start_time_unix_nano, expected_start); - assert!(paging_scrape.snapshot.paging.is_some()); - } - - #[test] - fn scrape_due_reads_filesystem_usage_from_mountinfo() { - let root = tempfile::tempdir().expect("tempdir"); - let proc_one = root.path().join("proc/1"); - std::fs::create_dir_all(&proc_one).expect("proc one dir"); - std::fs::write( - proc_one.join("mountinfo"), - "36 25 8:1 / / rw,relatime - ext4 /dev/sda1 rw\n", - ) - .expect("mountinfo"); - let mut source = ProcfsSource::new( - Some(root.path()), - ProcfsConfig { - cpu: false, - memory: false, - paging: false, - system: false, - disk: false, - filesystem: true, - network: false, - 
processes: false, - cpu_utilization: false, - memory_limit: false, - memory_shared: false, - memory_hugepages: false, - disk_limit: false, - filesystem_include_virtual: false, - filesystem_limit: true, - filesystem_include_devices: None, - filesystem_exclude_devices: None, - filesystem_include_fs_types: None, - filesystem_exclude_fs_types: None, - filesystem_include_mount_points: None, - filesystem_exclude_mount_points: None, - disk_include: None, - disk_exclude: None, - network_include: None, - network_exclude: None, - validation: HostViewValidationMode::None, - }, - ) - .expect("source"); - - let scrape = source - .scrape_due(ProcfsFamilies { - filesystem: true, - ..ProcfsFamilies::default() - }) - .expect("filesystem scrape"); - - assert_eq!(scrape.snapshot.filesystems.len(), 1); - assert_eq!(scrape.snapshot.filesystems[0].device, "/dev/sda1"); - assert_eq!(scrape.snapshot.filesystems[0].mountpoint, "/"); - assert_eq!(scrape.snapshot.filesystems[0].fs_type, "ext4"); - assert!(scrape.snapshot.filesystems[0].limit_bytes.is_some()); - } - - #[test] - fn cpu_parser_accepts_missing_newer_fields() { - let cpu = parse_cpu_total("10 20 30 40", 10.0).expect("cpu row"); - assert_eq!(cpu.user, 1.0); - assert_eq!(cpu.nice, 2.0); - assert_eq!(cpu.system, 3.0); - assert_eq!(cpu.idle, 4.0); - assert_eq!(cpu.steal, 0.0); - } - - #[test] - fn cpu_parser_removes_guest_from_user_and_nice() { - let cpu = parse_cpu_total("100 50 30 40 5 2 3 7 10 4", 10.0).expect("cpu row"); - assert_eq!(cpu.user, 9.0); - assert_eq!(cpu.nice, 4.6); - assert_eq!(cpu.interrupt, 0.5); - } - - #[test] - fn cpu_utilization_uses_counter_deltas() { - let utilization = cpu_utilization( - CpuTimes { - user: 1.0, - idle: 1.0, - ..CpuTimes::default() - }, - CpuTimes { - user: 3.0, - idle: 2.0, - ..CpuTimes::default() - }, - ) - .expect("utilization"); - - assert_eq!(utilization.user, 2.0 / 3.0); - assert_eq!(utilization.idle, 1.0 / 3.0); - } - - #[test] - fn cpu_utilization_skips_counter_resets() { - assert!( - 
cpu_utilization( - CpuTimes { - user: 2.0, - ..CpuTimes::default() - }, - CpuTimes { - user: 1.0, - ..CpuTimes::default() - }, - ) - .is_none() - ); - } - - #[test] - fn clock_ticks_per_second_uses_positive_system_value() { - assert!(clock_ticks_per_second() > 0.0); - } - - #[test] - fn memavailable_fallback_uses_free_buffers_cached() { - let memory = - parse_meminfo("MemTotal: 1000 kB\nMemFree: 100 kB\nBuffers: 20 kB\nCached: 30 kB\n") - .expect("memory"); - assert!(!memory.has_available); - assert_eq!(memory.available, 150 * BYTES_PER_KIB); - assert_eq!(memory.used, 850 * BYTES_PER_KIB); - } - - #[test] - fn meminfo_parser_reads_shared_memory() { - let memory = - parse_meminfo("MemTotal: 1000 kB\nMemFree: 100 kB\nShmem: 12 kB\n").expect("memory"); - assert_eq!(memory.shared, 12 * BYTES_PER_KIB); - } - - #[test] - fn meminfo_parser_reads_hugepage_stats() { - let memory = parse_meminfo( - "MemTotal: 1000 kB\n\ - MemFree: 100 kB\n\ - HugePages_Total: 8\n\ - HugePages_Free: 3\n\ - HugePages_Rsvd: 2\n\ - HugePages_Surp: 1\n\ - Hugepagesize: 2048 kB\n", - ) - .expect("memory"); - - assert_eq!(memory.hugepages.total, 8); - assert_eq!(memory.hugepages.free, 3); - assert_eq!(memory.hugepages.reserved, 2); - assert_eq!(memory.hugepages.surplus, 1); - assert_eq!(memory.hugepages.page_size_bytes, 2048 * BYTES_PER_KIB); - } - - #[test] - fn uptime_parser_reads_first_field() { - assert_eq!(parse_uptime("123.45 67.89"), Some(123.45)); - } - - #[test] - fn vmstat_parser_derives_minor_faults() { - let paging = - parse_vmstat("pgfault 100\npgmajfault 7\npgpgin 5\npgpgout 6\npswpin 3\npswpout 4\n"); - assert_eq!(paging.minor_faults, 93); - assert_eq!(paging.major_faults, 7); - assert_eq!(paging.page_in, 5); - assert_eq!(paging.page_out, 6); - assert_eq!(paging.swap_in, 3); - assert_eq!(paging.swap_out, 4); - } - - #[test] - fn swaps_parser_reads_device_usage() { - let swaps = - parse_swaps("Filename Type Size Used Priority\n/dev/sda2 partition 200 50 -2\n"); - 
assert_eq!(swaps.len(), 1); - assert_eq!(swaps[0].name, "/dev/sda2"); - assert_eq!(swaps[0].used, 50 * BYTES_PER_KIB); - assert_eq!(swaps[0].free, 150 * BYTES_PER_KIB); - } - - #[test] - fn diskstats_parser_accepts_flush_columns() { - let disks = parse_diskstats("8 0 sda 1 0 2 3 4 0 5 6 0 0 0 0 0 0 0 0\n", None, None); - assert_eq!(disks.len(), 1); - assert_eq!(disks[0].name, "sda"); - assert_eq!(disks[0].read_bytes, 1024); - assert_eq!(disks[0].write_bytes, 2560); - } - - #[test] - fn diskstats_parser_applies_filters_before_parsing_values() { - let exclude = CompiledFilter::compile( - crate::receivers::host_metrics_receiver::MatchType::Glob, - vec!["loop*".to_owned()], - ) - .expect("valid") - .expect("filter"); - let disks = parse_diskstats( - "7 0 loop0 broken row\n8 0 sda 1 0 2 3 4 0 5 6 0 0 0 0 0 0 0 0\n", - None, - Some(&exclude), - ); - - assert_eq!(disks.len(), 1); - assert_eq!(disks[0].name, "sda"); - } - - #[test] - fn mountinfo_parser_skips_virtual_filesystems_by_default() { - let mounts = parse_mountinfo( - "36 25 8:1 / / rw,relatime - ext4 /dev/sda1 rw\n37 25 0:32 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw\n", - false, - true, - FilesystemFilters::default(), - ); - - assert_eq!(mounts.len(), 1); - assert_eq!(mounts[0].device, "/dev/sda1"); - assert_eq!(mounts[0].mountpoint, "/"); - assert_eq!(mounts[0].fs_type, "ext4"); - assert_eq!(mounts[0].mode, "rw"); - assert!(mounts[0].emit_limit); - } - - #[test] - fn mountinfo_parser_unescapes_paths() { - let mounts = parse_mountinfo( - "36 25 8:1 / /mnt/data\\040disk rw,relatime - ext4 /dev/disk\\040one rw\n", - false, - false, - FilesystemFilters::default(), - ); - - assert_eq!(mounts.len(), 1); - assert_eq!(mounts[0].device, "/dev/disk one"); - assert_eq!(mounts[0].mountpoint, "/mnt/data disk"); - } - - #[test] - fn mountinfo_parser_preserves_utf8_while_unescaping_paths() { - let mounts = parse_mountinfo( - "36 25 8:1 / /mnt/caf\u{00e9}\\040disk rw,relatime - ext4 /dev/disk\\040\u{00e9} rw\n", - 
false, - false, - FilesystemFilters::default(), - ); - - assert_eq!(mounts.len(), 1); - assert_eq!(mounts[0].device, "/dev/disk \u{00e9}"); - assert_eq!(mounts[0].mountpoint, "/mnt/caf\u{00e9} disk"); - } - - #[test] - fn mountinfo_parser_applies_filesystem_filters() { - let include_mounts = CompiledFilter::compile( - crate::receivers::host_metrics_receiver::MatchType::Glob, - vec!["/data*".to_owned()], - ) - .expect("valid") - .expect("filter"); - let exclude_fs_types = CompiledFilter::compile( - crate::receivers::host_metrics_receiver::MatchType::Strict, - vec!["xfs".to_owned()], - ) - .expect("valid") - .expect("filter"); - let mounts = parse_mountinfo( - "36 25 8:1 / / rw,relatime - ext4 /dev/sda1 rw\n37 25 8:2 / /data rw,relatime - ext4 /dev/sdb1 rw\n38 25 8:3 / /data2 rw,relatime - xfs /dev/sdc1 rw\n", - false, - false, - FilesystemFilters { - include_mount_points: Some(&include_mounts), - exclude_fs_types: Some(&exclude_fs_types), - ..FilesystemFilters::default() - }, - ); - - assert_eq!(mounts.len(), 1); - assert_eq!(mounts[0].device, "/dev/sdb1"); - assert_eq!(mounts[0].mountpoint, "/data"); - } - - #[test] - fn netdev_parser_reads_device_counters() { - let interfaces = parse_netdev( - "Inter-| Receive | Transmit\n face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n eth0: 10 2 0 0 0 0 0 0 30 4 0 0 0 0 0 0\n", - None, - None, - ); - assert_eq!(interfaces.len(), 1); - assert_eq!(interfaces[0].name, "eth0"); - assert_eq!(interfaces[0].rx_bytes, 10); - assert_eq!(interfaces[0].tx_packets, 4); - } - - #[test] - fn netdev_parser_applies_interface_filters() { - let include = CompiledFilter::compile( - crate::receivers::host_metrics_receiver::MatchType::Strict, - vec!["eth0".to_owned()], - ) - .expect("valid") - .expect("filter"); - let interfaces = parse_netdev( - "Inter-| Receive | Transmit\n face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier 
compressed\n lo: 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0\n eth0: 10 2 3 4 0 0 0 0 30 4 5 6 0 0 0 0\n", - Some(&include), - None, - ); - - assert_eq!(interfaces.len(), 1); - assert_eq!(interfaces[0].name, "eth0"); - assert_eq!(interfaces[0].rx_errors, 3); - assert_eq!(interfaces[0].tx_dropped, 6); - } - - #[test] - fn root_path_uses_host_pid_one_netdev() { - let paths = ProcfsPaths::new(Some(Path::new("/host"))); - assert_eq!(paths.net_dev, PathBuf::from("/host/proc/1/net/dev")); - assert_eq!(paths.mountinfo, PathBuf::from("/host/proc/1/mountinfo")); - } - - #[test] - fn root_slash_uses_current_proc_netdev() { - let paths = ProcfsPaths::new(Some(Path::new("/"))); - assert_eq!(paths.net_dev, PathBuf::from("/proc/net/dev")); - assert_eq!(paths.mountinfo, PathBuf::from("/proc/self/mountinfo")); - } - - #[test] - fn host_arch_uses_semconv_values() { - if let Some(arch) = host_arch() { - assert!(matches!( - arch, - "amd64" | "arm32" | "arm64" | "ppc32" | "ppc64" | "x86" - )); - } - } - - #[cfg(feature = "dev-tools")] - #[derive(Debug)] - struct MetricShape { - unit: String, - monotonic: Option, - attributes: BTreeSet, - all_attributes: BTreeSet, - attribute_types: BTreeMap, - enum_values: BTreeMap>, - value_type: Option, - } - - #[cfg(feature = "dev-tools")] - #[derive(Clone, Copy, Debug, Eq, PartialEq)] - enum MetricValueKind { - Int, - Double, - } - - #[cfg(feature = "dev-tools")] - #[derive(Clone, Copy, Debug, Eq, PartialEq)] - enum AttributeValueKind { - Int, - Double, - String, - Bool, - } - - #[cfg(feature = "dev-tools")] - fn load_semconv_registry() -> ResolvedRegistry { - let registry_path = std::env::var("OTAP_HOST_METRICS_SEMCONV_REGISTRY") - .map(|path| { - path.parse::() - .expect("valid OTAP_HOST_METRICS_SEMCONV_REGISTRY") - }) - .unwrap_or_else(|_| VirtualDirectoryPath::GitRepo { - url: "https://github.com/open-telemetry/semantic-conventions.git".to_owned(), - sub_folder: Some("model".to_owned()), - refspec: Some(format!( - "v{}", - 
crate::receivers::host_metrics_receiver::semconv::VERSION - )), - }); - - let registry_repo = - RegistryRepo::try_new("main", ®istry_path).expect("semantic convention registry"); - let registry = match SchemaResolver::load_semconv_repository(registry_repo, false) { - WResult::Ok(registry) | WResult::OkWithNFEs(registry, _) => registry, - WResult::FatalErr(err) => panic!("failed to load semantic convention registry: {err}"), - }; - let resolved_schema = match SchemaResolver::resolve(registry, true) { - WResult::Ok(schema) | WResult::OkWithNFEs(schema, _) => schema, - WResult::FatalErr(err) => { - panic!("failed to resolve semantic convention registry: {err}"); - } - }; - - ResolvedRegistry::try_from_resolved_registry( - &resolved_schema.registry, - resolved_schema.catalog(), - ) - .expect("resolved semantic convention registry") - } - - #[cfg(feature = "dev-tools")] - fn semconv_system_metric_shapes(registry: &ResolvedRegistry) -> BTreeMap { - registry - .groups - .iter() - .filter(|group| group.r#type == GroupType::Metric) - .filter_map(|group| { - let name = group.metric_name.as_ref()?; - if !name.starts_with("system.") { - return None; - } - - let monotonic = match group.instrument.as_ref()? 
{ - InstrumentSpec::Counter => Some(true), - InstrumentSpec::UpDownCounter => Some(false), - InstrumentSpec::Gauge | InstrumentSpec::Histogram => None, - }; - let attributes = group - .attributes - .iter() - .filter(|attr| !is_opt_in_requirement(&attr.requirement_level)) - .map(|attr| attr.name.clone()) - .collect(); - let all_attributes = group - .attributes - .iter() - .map(|attr| attr.name.clone()) - .collect(); - let enum_values = group - .attributes - .iter() - .filter_map(|attr| match &attr.r#type { - AttributeType::Enum { members } => Some(( - attr.name.clone(), - members - .iter() - .map(|member| value_spec_string(&member.value)) - .collect(), - )), - _ => None, - }) - .collect(); - let attribute_types = group - .attributes - .iter() - .filter_map(|attr| { - attribute_value_kind(&attr.r#type).map(|kind| (attr.name.clone(), kind)) - }) - .collect(); - - Some(( - name.clone(), - MetricShape { - unit: group.unit.clone().unwrap_or_default(), - monotonic, - attributes, - all_attributes, - attribute_types, - enum_values, - value_type: semconv_metric_value_type(group.annotations.as_ref()), - }, - )) - }) - .collect() - } - - #[cfg(feature = "dev-tools")] - fn semconv_metric_value_type( - annotations: Option<&BTreeMap>, - ) -> Option { - let code_generation = annotations?.get("code_generation")?.0.as_mapping()?; - let value_type = code_generation.iter().find_map(|(key, value)| { - (key.as_str() == Some("metric_value_type")).then(|| value.as_str())? 
- })?; - match value_type { - "int" => Some(MetricValueKind::Int), - "double" => Some(MetricValueKind::Double), - _ => None, - } - } - - #[cfg(feature = "dev-tools")] - fn value_spec_string(value: &ValueSpec) -> String { - match value { - ValueSpec::Int(value) => value.to_string(), - ValueSpec::Double(value) => value.to_string(), - ValueSpec::String(value) => value.clone(), - ValueSpec::Bool(value) => value.to_string(), - } - } - - #[cfg(feature = "dev-tools")] - fn attribute_value_kind(attribute_type: &AttributeType) -> Option { - match attribute_type { - AttributeType::PrimitiveOrArray(PrimitiveOrArrayTypeSpec::Int) => { - Some(AttributeValueKind::Int) - } - AttributeType::PrimitiveOrArray(PrimitiveOrArrayTypeSpec::Double) => { - Some(AttributeValueKind::Double) - } - AttributeType::PrimitiveOrArray(PrimitiveOrArrayTypeSpec::String) => { - Some(AttributeValueKind::String) - } - AttributeType::PrimitiveOrArray(PrimitiveOrArrayTypeSpec::Boolean) => { - Some(AttributeValueKind::Bool) - } - AttributeType::Enum { members } => { - members.first().map(|member| value_spec_kind(&member.value)) - } - _ => None, - } - } - - #[cfg(feature = "dev-tools")] - fn value_spec_kind(value: &ValueSpec) -> AttributeValueKind { - match value { - ValueSpec::Int(_) => AttributeValueKind::Int, - ValueSpec::Double(_) => AttributeValueKind::Double, - ValueSpec::String(_) => AttributeValueKind::String, - ValueSpec::Bool(_) => AttributeValueKind::Bool, - } - } - - #[cfg(feature = "dev-tools")] - fn is_intentional_semconv_enum_value_gap(_name: &str, _attr: &str, _value: &str) -> bool { - false - } - - #[cfg(feature = "dev-tools")] - fn is_opt_in_requirement(requirement_level: &RequirementLevel) -> bool { - matches!( - requirement_level, - RequirementLevel::Basic(BasicRequirementLevelSpec::OptIn) - | RequirementLevel::OptIn { .. 
} - ) - } - - #[cfg(feature = "dev-tools")] - fn emitted_phase1_metric_shapes() -> BTreeMap { - let metrics = projection_fixture_metrics(); - let mut shapes = BTreeMap::new(); - for metric in &metrics { - let (monotonic, points) = match metric.data.as_ref().expect("metric data") { - otlp_metric::Data::Sum(sum) => (Some(sum.is_monotonic), &sum.data_points), - otlp_metric::Data::Gauge(gauge) => (None, &gauge.data_points), - _ => panic!("unsupported metric data for {}", metric.name), - }; - let value_type = metric_value_type(points); - let shape = shapes - .entry(metric.name.clone()) - .or_insert_with(|| MetricShape { - unit: metric.unit.clone(), - monotonic, - attributes: BTreeSet::new(), - all_attributes: BTreeSet::new(), - attribute_types: BTreeMap::new(), - enum_values: BTreeMap::new(), - value_type, - }); - assert_eq!( - shape.unit, metric.unit, - "unit mismatch across {}", - metric.name - ); - assert_eq!( - shape.monotonic, monotonic, - "instrument/temporality mismatch across {}", - metric.name - ); - assert_eq!( - shape.value_type, value_type, - "value type mismatch across {}", - metric.name - ); - for attr in points.iter().flat_map(|point| point.attributes.iter()) { - let _ = shape.attributes.insert(attr.key.clone()); - if let Some(value) = any_value_string(attr.value.as_ref()) { - let _ = shape - .enum_values - .entry(attr.key.clone()) - .or_default() - .insert(value); - } - if let Some(kind) = any_value_kind(attr.value.as_ref()) { - let previous = shape.attribute_types.insert(attr.key.clone(), kind); - assert!( - previous.is_none() || previous == Some(kind), - "mixed attribute value types for {} on {}", - attr.key, - metric.name - ); - } - } - } - shapes - } - - #[cfg(feature = "dev-tools")] - fn metric_value_type(points: &[NumberDataPoint]) -> Option { - let mut value_type = None; - for point in points { - let point_value_type = match point.value { - Some(number_data_point::Value::AsInt(_)) => MetricValueKind::Int, - 
Some(number_data_point::Value::AsDouble(_)) => MetricValueKind::Double, - None => continue, - }; - if value_type - .replace(point_value_type) - .is_some_and(|current| current != point_value_type) - { - panic!("mixed int/double data points"); - } - } - value_type - } - - #[cfg(feature = "dev-tools")] - fn any_value_string(value: Option<&AnyValue>) -> Option { - match value?.value.as_ref()? { - any_value::Value::StringValue(value) => Some(value.clone()), - any_value::Value::IntValue(value) => Some(value.to_string()), - any_value::Value::DoubleValue(value) => Some(value.to_string()), - any_value::Value::BoolValue(value) => Some(value.to_string()), - _ => None, - } - } - - #[cfg(feature = "dev-tools")] - fn any_value_kind(value: Option<&AnyValue>) -> Option { - match value?.value.as_ref()? { - any_value::Value::StringValue(_) => Some(AttributeValueKind::String), - any_value::Value::IntValue(_) => Some(AttributeValueKind::Int), - any_value::Value::DoubleValue(_) => Some(AttributeValueKind::Double), - any_value::Value::BoolValue(_) => Some(AttributeValueKind::Bool), - _ => None, - } - } - - fn projection_fixture_request() -> MetricsData { - decode_metrics( - HostSnapshot { - now_unix_nano: 2_000, - start_time_unix_nano: 1_000, - counter_starts: CounterStarts::default(), - memory_limit: true, - memory_shared: true, - memory_hugepages: true, - cpu: Some(CpuTimes { - user: 1.0, - nice: 2.0, - system: 3.0, - idle: 4.0, - wait: 5.0, - interrupt: 6.0, - steal: 7.0, - }), - cpu_utilization: Some(CpuTimes { - user: 0.1, - nice: 0.1, - system: 0.2, - idle: 0.3, - wait: 0.1, - interrupt: 0.1, - steal: 0.1, - }), - cpuinfo: CpuInfo { - logical_count: 2, - physical_count: 1, - frequencies_hz: vec![2_400_000_000.0], - }, - memory: Some(MemoryStats { - total: 100, - used: 80, - free: 10, - available: 20, - has_available: true, - cached: 5, - buffered: 5, - shared: 7, - slab_reclaimable: 3, - slab_unreclaimable: 2, - hugepages: HugepageStats { - total: 10, - free: 4, - reserved: 2, - 
surplus: 1, - page_size_bytes: 2 * BYTES_PER_KIB, - }, - }), - uptime_seconds: Some(42.0), - paging: Some(PagingStats { - minor_faults: 9, - major_faults: 1, - page_in: 4, - page_out: 5, - swap_in: 2, - swap_out: 3, - }), - swaps: vec![SwapStats { - name: "/dev/swap".to_owned(), - size: 100, - used: 25, - free: 75, - }], - processes: Some(ProcessStats { - running: 4, - blocked: 1, - created: 99, - }), - disks: vec![DiskStats { - name: "sda".to_owned(), - limit_bytes: Some(123), - read_bytes: 10, - write_bytes: 20, - read_ops: 1, - write_ops: 2, - read_merged: 3, - write_merged: 4, - read_time_seconds: 0.5, - write_time_seconds: 0.6, - io_time_seconds: 0.7, - }], - filesystems: vec![FilesystemStats { - device: "/dev/sda1".to_owned(), - mountpoint: "/".to_owned(), - fs_type: "ext4".to_owned(), - mode: "rw", - used: 60, - free: 30, - reserved: 10, - limit_bytes: Some(100), - }], - networks: vec![NetworkStats { - name: "eth0".to_owned(), - rx_bytes: 10, - tx_bytes: 20, - rx_packets: 1, - tx_packets: 2, - rx_errors: 3, - tx_errors: 4, - rx_dropped: 5, - tx_dropped: 6, - }], - resource: HostResource { - host_id: Some("host-id".to_owned()), - host_name: Some("host-name".to_owned()), - host_arch: Some("amd64"), - }, - } - .into_otap_records() - .expect("encode ok"), - ) - } - - #[cfg(feature = "dev-tools")] - fn projection_fixture_metrics() -> Vec { - projection_fixture_request() - .resource_metrics - .into_iter() - .next() - .expect("resource metrics") - .scope_metrics - .into_iter() - .next() - .expect("scope metrics") - .metrics - } - - fn assert_metric_shape( - metrics: &[Metric], - name: &'static str, - unit: &'static str, - monotonic_sum: Option, - ) { - let metric = metric_by_name(metrics, name); - assert_eq!(metric.unit, unit); - match metric.data.as_ref().expect("metric data") { - otlp_metric::Data::Sum(sum) => { - let expected_monotonic = - monotonic_sum.unwrap_or_else(|| panic!("{name} should be a gauge")); - assert_eq!( - sum.aggregation_temporality, - 
AggregationTemporality::Cumulative as i32 - ); - assert_eq!(sum.is_monotonic, expected_monotonic); - assert!( - sum.data_points - .iter() - .all(|point| point.start_time_unix_nano == 1_000) - ); - } - otlp_metric::Data::Gauge(gauge) => { - assert!(monotonic_sum.is_none(), "{name} should be a cumulative sum"); - assert!( - gauge - .data_points - .iter() - .all(|point| point.start_time_unix_nano == 0) - ); - } - _ => panic!("unexpected data kind for {name}"), - } - } - - fn assert_first_point_attr( - metrics: &[Metric], - name: &'static str, - key: &'static str, - value: &'static str, - ) { - let metric = metric_by_name(metrics, name); - let point = match metric.data.as_ref().expect("metric data") { - otlp_metric::Data::Sum(sum) => sum.data_points.first(), - otlp_metric::Data::Gauge(gauge) => gauge.data_points.first(), - _ => None, - } - .expect("data point"); - assert_has_attr(&point.attributes, key, value); - } - - fn assert_sum_point_attr( - metrics: &[Metric], - name: &'static str, - key: &'static str, - value: &'static str, - ) { - let metric = metric_by_name(metrics, name); - let otlp_metric::Data::Sum(sum) = metric.data.as_ref().expect("metric data") else { - panic!("{name} should be a cumulative sum"); - }; - assert!( - sum.data_points - .iter() - .any(|point| has_attr(&point.attributes, key, value)), - "missing point attribute {key}={value}" - ); - } - - fn assert_first_point_int(metrics: &[Metric], name: &'static str, expected: i64) { - let metric = metric_by_name(metrics, name); - let point = match metric.data.as_ref().expect("metric data") { - otlp_metric::Data::Sum(sum) => sum.data_points.first(), - otlp_metric::Data::Gauge(gauge) => gauge.data_points.first(), - _ => None, - } - .expect("data point"); - assert_eq!( - point.value, - Some(number_data_point::Value::AsInt(expected)), - "{name} first point should be int" - ); - } - - fn assert_first_point_attr_int( - metrics: &[Metric], - name: &'static str, - key: &'static str, - expected: i64, - ) { - let 
metric = metric_by_name(metrics, name); - let point = match metric.data.as_ref().expect("metric data") { - otlp_metric::Data::Sum(sum) => sum.data_points.first(), - otlp_metric::Data::Gauge(gauge) => gauge.data_points.first(), - _ => None, - } - .expect("data point"); - assert!( - point.attributes.iter().any(|attr| { - attr.key == key - && matches!( - attr.value.as_ref().and_then(|value| value.value.as_ref()), - Some(any_value::Value::IntValue(actual)) if *actual == expected - ) - }), - "missing int attribute {key}={expected}" - ); - } - - fn assert_no_first_point_attr(metrics: &[Metric], name: &'static str, key: &'static str) { - let metric = metric_by_name(metrics, name); - let point = match metric.data.as_ref().expect("metric data") { - otlp_metric::Data::Sum(sum) => sum.data_points.first(), - otlp_metric::Data::Gauge(gauge) => gauge.data_points.first(), - _ => None, - } - .expect("data point"); - assert!( - !point.attributes.iter().any(|attr| attr.key == key), - "unexpected attribute {key}" - ); - } - - fn assert_first_sum_point_start(metrics: &[Metric], name: &'static str, expected_start: u64) { - let metric = metric_by_name(metrics, name); - let otlp_metric::Data::Sum(sum) = metric.data.as_ref().expect("metric data") else { - panic!("{name} should be a cumulative sum"); - }; - let point = sum.data_points.first().expect("data point"); - assert_eq!(point.start_time_unix_nano, expected_start); - } - - fn metric_by_name<'a>(metrics: &'a [Metric], name: &'static str) -> &'a Metric { - metrics - .iter() - .find(|metric| metric.name == name) - .unwrap_or_else(|| panic!("missing metric {name}")) - } - - fn assert_has_attr(attributes: &[KeyValue], key: &'static str, value: &'static str) { - assert!( - has_attr(attributes, key, value), - "missing attribute {key}={value}" - ); - } - - fn has_attr(attributes: &[KeyValue], key: &'static str, value: &'static str) -> bool { - attributes.iter().any(|attr| { - attr.key == key - && matches!( - 
attr.value.as_ref().and_then(|value| value.value.as_ref()), - Some(any_value::Value::StringValue(actual)) if actual == value - ) - }) - } -} diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/mod.rs new file mode 100644 index 0000000000..73431f0879 --- /dev/null +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/mod.rs @@ -0,0 +1,535 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Linux procfs-backed host metric source. + +mod paths; +mod projection; +mod readings; + +use crate::receivers::host_metrics_receiver::{CompiledFilter, HostViewValidationMode}; +use paths::{PathKind, ProcfsPaths}; +use projection::{CounterTracker, HostScrape, host_arch}; +pub(crate) use projection::{HostResource, HostSnapshot}; +use readings::*; +use std::fs::File; +use std::io::{self, Read}; +use std::path::Path; +use std::time::{Duration, Instant}; + +const NANOS_PER_SEC: u64 = 1_000_000_000; +const BYTES_PER_KIB: u64 = 1024; +const DISKSTAT_SECTOR_BYTES: u64 = 512; +const FILESYSTEM_STAT_TIMEOUT: Duration = Duration::from_millis(100); +const FILESYSTEM_SCRAPE_TIMEOUT: Duration = Duration::from_secs(1); +const COUNTER_KEY_SEPARATOR: char = '\x1f'; + +/// Procfs-backed source for host metrics. +pub struct ProcfsSource { + paths: ProcfsPaths, + config: ProcfsConfig, + buf: String, + clk_tck: f64, + previous_cpu: Option, + filesystem_worker: FilesystemStatWorker, + counter_tracker: CounterTracker, + boot_time_unix_nano: Option, + resource: Option, +} + +/// Procfs collection config. +pub struct ProcfsConfig { + /// CPU metrics. + pub cpu: bool, + /// Memory metrics. + pub memory: bool, + /// Paging metrics. + pub paging: bool, + /// System metrics. + pub system: bool, + /// Disk metrics. + pub disk: bool, + /// Filesystem metrics. + pub filesystem: bool, + /// Network metrics. 
+ pub network: bool, + /// Process summary metrics. + pub processes: bool, + /// Derived aggregate CPU utilization. + pub cpu_utilization: bool, + /// Emit memory limit metric. + pub memory_limit: bool, + /// Emit Linux shared memory metric. + pub memory_shared: bool, + /// Emit Linux hugepage metrics. + pub memory_hugepages: bool, + /// Derived disk limit from sysfs block device size. + pub disk_limit: bool, + /// Include virtual filesystems. + pub filesystem_include_virtual: bool, + /// Emit filesystem limit metric. + pub filesystem_limit: bool, + /// Disk include filter. + pub disk_include: Option, + /// Disk exclude filter. + pub disk_exclude: Option, + /// Filesystem device include filter. + pub filesystem_include_devices: Option, + /// Filesystem device exclude filter. + pub filesystem_exclude_devices: Option, + /// Filesystem type include filter. + pub filesystem_include_fs_types: Option, + /// Filesystem type exclude filter. + pub filesystem_exclude_fs_types: Option, + /// Filesystem mount point include filter. + pub filesystem_include_mount_points: Option, + /// Filesystem mount point exclude filter. + pub filesystem_exclude_mount_points: Option, + /// Network include filter. + pub network_include: Option, + /// Network exclude filter. + pub network_exclude: Option, + /// Startup validation mode. + pub validation: HostViewValidationMode, +} + +/// Families due for one scrape. +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub struct ProcfsFamilies { + /// CPU metrics. + pub cpu: bool, + /// Memory metrics. + pub memory: bool, + /// Paging metrics. + pub paging: bool, + /// System metrics. + pub system: bool, + /// Disk metrics. + pub disk: bool, + /// Filesystem metrics. + pub filesystem: bool, + /// Network metrics. + pub network: bool, + /// Process summary metrics. 
+ pub processes: bool, +} + +impl ProcfsFamilies { + fn enabled_by(self, config: &ProcfsConfig) -> Self { + Self { + cpu: self.cpu && config.cpu, + memory: self.memory && config.memory, + paging: self.paging && config.paging, + system: self.system && config.system, + disk: self.disk && config.disk, + filesystem: self.filesystem && config.filesystem, + network: self.network && config.network, + processes: self.processes && config.processes, + } + } +} + +impl ProcfsSource { + /// Creates a procfs source rooted at `/` or at a host root bind mount. + pub fn new(root_path: Option<&Path>, config: ProcfsConfig) -> io::Result { + let mut source = Self { + paths: ProcfsPaths::new(root_path), + config, + buf: String::with_capacity(16 * 1024), + clk_tck: clock_ticks_per_second(), + previous_cpu: None, + filesystem_worker: FilesystemStatWorker::new()?, + counter_tracker: CounterTracker::default(), + boot_time_unix_nano: None, + resource: None, + }; + source.apply_startup_validation()?; + Ok(source) + } + + /// Collects one host snapshot for the due family set. 
+ pub fn scrape_due(&mut self, due: ProcfsFamilies) -> io::Result { + let due = due.enabled_by(&self.config); + let now_unix_nano = now_unix_nano(); + let clk_tck = self.clk_tck; + let mut partial_errors = 0; + let mut first_error = None; + let needs_start_time = due.cpu + || due.memory + || due.paging + || due.disk + || due.filesystem + || due.network + || due.processes; + let needs_stat = + due.cpu || due.processes || (needs_start_time && self.boot_time_unix_nano.is_none()); + let stat = match needs_stat + .then(|| self.read_path(PathKind::Stat)) + .transpose() + { + Ok(Some(proc_stat)) => parse_stat(proc_stat, clk_tck), + Ok(None) => StatSnapshot::default(), + Err(err) => { + record_partial_error(&mut partial_errors, &mut first_error, err); + StatSnapshot::default() + } + }; + if stat.boot_time_unix_nano != 0 { + self.boot_time_unix_nano = Some(stat.boot_time_unix_nano); + } + let start_time_unix_nano = self.boot_time_unix_nano.unwrap_or(now_unix_nano); + let cpu_utilization = if due.cpu && self.config.cpu_utilization { + let utilization = stat.cpu.and_then(|current| { + self.previous_cpu + .and_then(|previous| cpu_utilization(previous, current)) + }); + self.previous_cpu = stat.cpu; + utilization + } else { + None + }; + + let cpuinfo = match due + .cpu + .then(|| self.read_path(PathKind::Cpuinfo)) + .transpose() + { + Ok(Some(cpuinfo)) => parse_cpuinfo(cpuinfo), + Ok(None) => CpuInfo::default(), + Err(err) => { + record_partial_error(&mut partial_errors, &mut first_error, err); + CpuInfo::default() + } + }; + + let memory = match due + .memory + .then(|| self.read_path(PathKind::Meminfo)) + .transpose() + { + Ok(Some(meminfo)) => parse_meminfo(meminfo), + Ok(None) => None, + Err(err) => { + record_partial_error(&mut partial_errors, &mut first_error, err); + None + } + }; + + let uptime_seconds = match due + .system + .then(|| self.read_path(PathKind::Uptime)) + .transpose() + { + Ok(Some(uptime)) => parse_uptime(uptime), + Ok(None) => None, + Err(err) => { + 
record_partial_error(&mut partial_errors, &mut first_error, err); + None + } + }; + + let paging = match due + .paging + .then(|| self.read_path(PathKind::Vmstat)) + .transpose() + { + Ok(Some(vmstat)) => Some(parse_vmstat(vmstat)), + Ok(None) => None, + Err(err) => { + record_partial_error(&mut partial_errors, &mut first_error, err); + None + } + }; + + let swaps = match due + .paging + .then(|| self.read_path(PathKind::Swaps)) + .transpose() + { + Ok(Some(swaps)) => parse_swaps(swaps), + Ok(None) => Vec::new(), + Err(err) => { + record_partial_error(&mut partial_errors, &mut first_error, err); + Vec::new() + } + }; + + let disks = if due.disk { + let disk_include = self.config.disk_include.clone(); + let disk_exclude = self.config.disk_exclude.clone(); + match self.read_path(PathKind::Diskstats) { + Ok(diskstats) => { + let mut disks = + parse_diskstats(diskstats, disk_include.as_ref(), disk_exclude.as_ref()); + if self.config.disk_limit { + for disk in &mut disks { + disk.limit_bytes = self.read_disk_limit_bytes(&disk.name).ok(); + } + } + Some(disks) + } + Err(err) => { + record_partial_error(&mut partial_errors, &mut first_error, err); + None + } + } + } else { + None + }; + + let networks = if due.network { + let network_include = self.config.network_include.clone(); + let network_exclude = self.config.network_exclude.clone(); + match self.read_path(PathKind::NetDev) { + Ok(netdev) => Some(parse_netdev( + netdev, + network_include.as_ref(), + network_exclude.as_ref(), + )), + Err(err) => { + record_partial_error(&mut partial_errors, &mut first_error, err); + None + } + } + } else { + None + }; + + let filesystems = if due.filesystem { + let include_virtual = self.config.filesystem_include_virtual; + let emit_limit = self.config.filesystem_limit; + let include_devices = self.config.filesystem_include_devices.clone(); + let exclude_devices = self.config.filesystem_exclude_devices.clone(); + let include_fs_types = self.config.filesystem_include_fs_types.clone(); 
+ let exclude_fs_types = self.config.filesystem_exclude_fs_types.clone(); + let include_mount_points = self.config.filesystem_include_mount_points.clone(); + let exclude_mount_points = self.config.filesystem_exclude_mount_points.clone(); + match self.read_path(PathKind::Mountinfo) { + Ok(mountinfo) => { + let filters = FilesystemFilters { + include_devices: include_devices.as_ref(), + exclude_devices: exclude_devices.as_ref(), + include_fs_types: include_fs_types.as_ref(), + exclude_fs_types: exclude_fs_types.as_ref(), + include_mount_points: include_mount_points.as_ref(), + exclude_mount_points: exclude_mount_points.as_ref(), + }; + let mounts = parse_mountinfo(mountinfo, include_virtual, emit_limit, filters); + self.read_filesystems(mounts, &mut partial_errors, &mut first_error) + } + Err(err) => { + record_partial_error(&mut partial_errors, &mut first_error, err); + Vec::new() + } + } + } else { + Vec::new() + }; + + let resource = self.read_resource().clone(); + let counter_starts = self.counter_tracker.snapshot( + start_time_unix_nano, + now_unix_nano, + due.cpu.then_some(stat.cpu).flatten().as_ref(), + paging.as_ref(), + due.processes.then_some(stat.processes).as_ref(), + disks.as_deref(), + networks.as_deref(), + ); + + let snapshot = HostSnapshot { + now_unix_nano, + start_time_unix_nano, + counter_starts, + memory_limit: self.config.memory_limit, + memory_shared: self.config.memory_shared, + memory_hugepages: self.config.memory_hugepages, + cpu: due.cpu.then_some(stat.cpu).flatten(), + cpu_utilization, + cpuinfo, + memory, + uptime_seconds, + paging, + swaps, + processes: due.processes.then_some(stat.processes), + disks: disks.unwrap_or_default(), + filesystems, + networks: networks.unwrap_or_default(), + resource, + }; + if !snapshot.has_metrics() { + return Err(first_error + .unwrap_or_else(|| io::Error::other("host metrics scrape produced no metrics"))); + } + Ok(HostScrape { + snapshot, + partial_errors, + }) + } + + fn apply_startup_validation(&mut 
self) -> io::Result<()> { + match self.config.validation { + HostViewValidationMode::None => Ok(()), + HostViewValidationMode::FailSelected => self.validate_selected_paths(), + HostViewValidationMode::WarnSelected => { + self.disable_unavailable_sources(); + Ok(()) + } + } + } + + fn validate_selected_paths(&self) -> io::Result<()> { + if self.config.cpu || self.config.system || self.config.processes { + let _ = File::open(self.paths.path(PathKind::Stat))?; + } + if self.config.cpu { + let _ = File::open(self.paths.path(PathKind::Cpuinfo))?; + } + if self.config.memory { + let _ = File::open(self.paths.path(PathKind::Meminfo))?; + } + if self.config.system { + let _ = File::open(self.paths.path(PathKind::Uptime))?; + } + if self.config.paging { + let _ = File::open(self.paths.path(PathKind::Vmstat))?; + let _ = File::open(self.paths.path(PathKind::Swaps))?; + } + if self.config.disk { + let _ = File::open(self.paths.path(PathKind::Diskstats))?; + } + if self.config.filesystem { + let _ = File::open(self.paths.path(PathKind::Mountinfo))?; + } + if self.config.network { + let _ = File::open(self.paths.path(PathKind::NetDev))?; + } + Ok(()) + } + + fn disable_unavailable_sources(&mut self) { + if (self.config.cpu || self.config.system || self.config.processes) + && !self.source_available(PathKind::Stat) + { + self.config.cpu = false; + self.config.system = false; + self.config.processes = false; + } + if self.config.cpu && !self.source_available(PathKind::Cpuinfo) { + self.config.cpu = false; + } + if self.config.memory && !self.source_available(PathKind::Meminfo) { + self.config.memory = false; + } + if self.config.system && !self.source_available(PathKind::Uptime) { + self.config.system = false; + } + if self.config.paging + && (!self.source_available(PathKind::Vmstat) || !self.source_available(PathKind::Swaps)) + { + self.config.paging = false; + } + if self.config.disk && !self.source_available(PathKind::Diskstats) { + self.config.disk = false; + } + if 
self.config.filesystem && !self.source_available(PathKind::Mountinfo) { + self.config.filesystem = false; + } + if self.config.network && !self.source_available(PathKind::NetDev) { + self.config.network = false; + } + } + + fn source_available(&self, kind: PathKind) -> bool { + File::open(self.paths.path(kind)).is_ok() + } + + fn read_path(&mut self, kind: PathKind) -> io::Result<&str> { + self.buf.clear(); + let mut file = File::open(self.paths.path(kind))?; + let _ = file.read_to_string(&mut self.buf)?; + Ok(self.buf.as_str()) + } + + fn read_disk_limit_bytes(&mut self, disk_name: &str) -> io::Result { + self.buf.clear(); + let mut file = File::open(self.paths.sys_block.join(disk_name).join("size"))?; + let _ = file.read_to_string(&mut self.buf)?; + let sectors = parse_u64(self.buf.trim()); + Ok(sectors.saturating_mul(DISKSTAT_SECTOR_BYTES)) + } + + fn read_filesystems( + &mut self, + mounts: Vec, + partial_errors: &mut u64, + first_error: &mut Option, + ) -> Vec { + let started = Instant::now(); + let mut filesystems = Vec::with_capacity(mounts.len()); + for mount in mounts { + let Some(remaining) = FILESYSTEM_SCRAPE_TIMEOUT.checked_sub(started.elapsed()) else { + record_partial_error( + partial_errors, + first_error, + io::Error::new( + io::ErrorKind::TimedOut, + "filesystem scrape budget exhausted", + ), + ); + break; + }; + let path = self.paths.host_path(&mount.mountpoint); + let stat = match self + .filesystem_worker + .statvfs(path, remaining.min(FILESYSTEM_STAT_TIMEOUT)) + { + Ok(stat) => stat, + Err(err) => { + record_partial_error(partial_errors, first_error, err); + continue; + } + }; + let free = stat.available_bytes; + let reserved = stat.free_bytes.saturating_sub(stat.available_bytes); + let used = stat.total_bytes.saturating_sub(stat.free_bytes); + filesystems.push(FilesystemStats { + device: mount.device, + mountpoint: mount.mountpoint, + fs_type: mount.fs_type, + mode: mount.mode, + used, + free, + reserved, + limit_bytes: 
mount.emit_limit.then_some(stat.total_bytes), + }); + } + filesystems + } + + fn read_resource(&mut self) -> &HostResource { + if self.resource.is_none() { + let host_id = self + .read_trimmed_optional(PathKind::MachineId) + .or_else(|| self.read_trimmed_optional(PathKind::DbusMachineId)); + let host_name = self.read_trimmed_optional(PathKind::Hostname); + self.resource = Some(HostResource { + host_id, + host_name, + host_arch: host_arch(), + }); + } + self.resource.as_ref().expect("resource is initialized") + } + + fn read_trimmed_optional(&mut self, kind: PathKind) -> Option { + self.read_path(kind) + .ok() + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(str::to_owned) + } +} + +#[cfg(test)] +mod tests; diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/paths.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/paths.rs new file mode 100644 index 0000000000..350f01adff --- /dev/null +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/paths.rs @@ -0,0 +1,93 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +use std::path::{Path, PathBuf}; + +#[derive(Clone, Debug)] +pub(super) struct ProcfsPaths { + root: PathBuf, + stat: PathBuf, + cpuinfo: PathBuf, + meminfo: PathBuf, + uptime: PathBuf, + vmstat: PathBuf, + swaps: PathBuf, + diskstats: PathBuf, + pub(super) mountinfo: PathBuf, + pub(super) sys_block: PathBuf, + pub(super) net_dev: PathBuf, + machine_id: PathBuf, + dbus_machine_id: PathBuf, + hostname: PathBuf, +} + +impl ProcfsPaths { + pub(super) fn new(root_path: Option<&Path>) -> Self { + let root = root_path.unwrap_or_else(|| Path::new("/")); + let host_root = root_path.is_some_and(|path| path != Path::new("/")); + Self { + root: root.to_path_buf(), + stat: root.join("proc/stat"), + cpuinfo: root.join("proc/cpuinfo"), + meminfo: root.join("proc/meminfo"), + uptime: root.join("proc/uptime"), + vmstat: 
root.join("proc/vmstat"), + swaps: root.join("proc/swaps"), + diskstats: root.join("proc/diskstats"), + mountinfo: if host_root { + root.join("proc/1/mountinfo") + } else { + root.join("proc/self/mountinfo") + }, + sys_block: root.join("sys/block"), + machine_id: root.join("etc/machine-id"), + dbus_machine_id: root.join("var/lib/dbus/machine-id"), + hostname: root.join("proc/sys/kernel/hostname"), + net_dev: if host_root { + root.join("proc/1/net/dev") + } else { + root.join("proc/net/dev") + }, + } + } + + pub(super) fn path(&self, kind: PathKind) -> &Path { + match kind { + PathKind::Stat => &self.stat, + PathKind::Cpuinfo => &self.cpuinfo, + PathKind::Meminfo => &self.meminfo, + PathKind::Uptime => &self.uptime, + PathKind::Vmstat => &self.vmstat, + PathKind::Swaps => &self.swaps, + PathKind::Diskstats => &self.diskstats, + PathKind::Mountinfo => &self.mountinfo, + PathKind::NetDev => &self.net_dev, + PathKind::MachineId => &self.machine_id, + PathKind::DbusMachineId => &self.dbus_machine_id, + PathKind::Hostname => &self.hostname, + } + } + + pub(super) fn host_path(&self, host_absolute_path: &str) -> PathBuf { + let relative = host_absolute_path + .strip_prefix('/') + .unwrap_or(host_absolute_path); + self.root.join(relative) + } +} + +#[derive(Copy, Clone)] +pub(super) enum PathKind { + Stat, + Cpuinfo, + Meminfo, + Uptime, + Vmstat, + Swaps, + Diskstats, + Mountinfo, + NetDev, + MachineId, + DbusMachineId, + Hostname, +} diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/projection.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/projection.rs new file mode 100644 index 0000000000..f037526dae --- /dev/null +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/projection.rs @@ -0,0 +1,975 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +use 
crate::receivers::host_metrics_receiver::otap_builder::HostMetricsArrowBuilder; +use crate::receivers::host_metrics_receiver::semconv::{attr, metric}; +use std::collections::{HashMap, HashSet}; + +use super::COUNTER_KEY_SEPARATOR; +use super::readings::{ + CpuInfo, CpuTimes, DiskStats, FilesystemStats, HugepageStats, MemoryStats, NetworkStats, + PagingStats, ProcessStats, SwapStats, frequency_hz_i64, saturating_i64, +}; + +/// Result of one host metrics scrape. +pub(crate) struct HostScrape { + /// Collected host snapshot. + pub snapshot: HostSnapshot, + /// Number of source read errors skipped because other families succeeded. + pub partial_errors: u64, +} + +/// One host metrics snapshot. +#[derive(Default)] +pub(crate) struct HostSnapshot { + pub(super) now_unix_nano: u64, + pub(super) start_time_unix_nano: u64, + pub(super) counter_starts: CounterStarts, + pub(super) memory_limit: bool, + pub(super) memory_shared: bool, + pub(super) memory_hugepages: bool, + pub(super) cpu: Option, + pub(super) cpu_utilization: Option, + pub(super) cpuinfo: CpuInfo, + pub(super) memory: Option, + pub(super) uptime_seconds: Option, + pub(super) paging: Option, + pub(super) swaps: Vec, + pub(super) processes: Option, + pub(super) disks: Vec, + pub(super) filesystems: Vec, + pub(super) networks: Vec, + pub(super) resource: HostResource, +} + +impl HostSnapshot { + pub(super) fn has_metrics(&self) -> bool { + self.cpu.is_some() + || self.cpu_utilization.is_some() + || self.cpuinfo.logical_count != 0 + || self.cpuinfo.physical_count != 0 + || !self.cpuinfo.frequencies_hz.is_empty() + || self.memory.is_some() + || self.uptime_seconds.is_some() + || self.paging.is_some() + || !self.swaps.is_empty() + || self.processes.is_some() + || !self.disks.is_empty() + || !self.filesystems.is_empty() + || !self.networks.is_empty() + } + + /// Converts a snapshot directly into an OTAP Arrow metrics batch. 
+ pub fn into_otap_records( + self, + ) -> Result { + let mut b = HostMetricsArrowBuilder::new(); + b.append_resource(&self.resource); + project_snapshot(&self, &mut b); + b.finish() + } +} + +#[derive(Clone, Default)] +pub(crate) struct HostResource { + pub(crate) host_id: Option, + pub(crate) host_name: Option, + pub(crate) host_arch: Option<&'static str>, +} + +pub(super) fn project_snapshot(snap: &HostSnapshot, b: &mut HostMetricsArrowBuilder) { + let now = snap.now_unix_nano; + let start = snap.start_time_unix_nano; + let cs = &snap.counter_starts; + + // ── CPU ────────────────────────────────────────────────────────────────── + if let Some(cpu) = snap.cpu { + let m = b.begin_counter_f64(metric::CPU_TIME, "s"); + for (mode, value) in [ + ("user", cpu.user), + ("nice", cpu.nice), + ("system", cpu.system), + ("idle", cpu.idle), + ("iowait", cpu.wait), + ("interrupt", cpu.interrupt), + ("steal", cpu.steal), + ] { + b.append_f64_sum_dp(m, cs.get(metric::CPU_TIME, mode, start), now, value, |w| { + w.str(attr::CPU_MODE, mode); + }); + } + } + if let Some(cpu) = snap.cpu_utilization { + let m = b.begin_gauge_f64(metric::CPU_UTILIZATION, "1"); + for (mode, value) in [ + ("user", cpu.user), + ("nice", cpu.nice), + ("system", cpu.system), + ("idle", cpu.idle), + ("iowait", cpu.wait), + ("interrupt", cpu.interrupt), + ("steal", cpu.steal), + ] { + b.append_f64_gauge_dp(m, now, value, |w| { + w.str(attr::CPU_MODE, mode); + }); + } + } + if snap.cpuinfo.logical_count != 0 { + let m = b.begin_updown_i64(metric::CPU_LOGICAL_COUNT, "{cpu}"); + b.append_i64_sum_dp( + m, + start, + now, + saturating_i64(snap.cpuinfo.logical_count), + |_| {}, + ); + } + if snap.cpuinfo.physical_count != 0 { + let m = b.begin_updown_i64(metric::CPU_PHYSICAL_COUNT, "{cpu}"); + b.append_i64_sum_dp( + m, + start, + now, + saturating_i64(snap.cpuinfo.physical_count), + |_| {}, + ); + } + if !snap.cpuinfo.frequencies_hz.is_empty() { + let m = b.begin_gauge_i64(metric::CPU_FREQUENCY, "Hz"); + for 
(idx, &freq) in snap.cpuinfo.frequencies_hz.iter().enumerate() { + let logical = i64::try_from(idx).unwrap_or(i64::MAX); + b.append_i64_gauge_dp(m, now, frequency_hz_i64(freq), |w| { + w.int(attr::CPU_LOGICAL_NUMBER, logical); + }); + } + } + + // ── Memory ─────────────────────────────────────────────────────────────── + if let Some(memory) = snap.memory { + let m = b.begin_updown_i64(metric::MEMORY_USAGE, "By"); + for (state, value) in [ + ("used", memory.used), + ("free", memory.free), + ("cached", memory.cached), + ("buffers", memory.buffered), + ] { + b.append_i64_sum_dp(m, start, now, saturating_i64(value), |w| { + w.str(attr::SYSTEM_MEMORY_STATE, state); + }); + } + if memory.total > 0 { + let m = b.begin_gauge_f64(metric::MEMORY_UTILIZATION, "1"); + let total = memory.total as f64; + for (state, value) in [ + ("used", memory.used), + ("free", memory.free), + ("cached", memory.cached), + ("buffers", memory.buffered), + ] { + b.append_f64_gauge_dp(m, now, value as f64 / total, |w| { + w.str(attr::SYSTEM_MEMORY_STATE, state); + }); + } + } + if memory.has_available { + let m = b.begin_updown_i64(metric::MEMORY_LINUX_AVAILABLE, "By"); + b.append_i64_sum_dp(m, start, now, saturating_i64(memory.available), |_| {}); + } + let m = b.begin_updown_i64(metric::MEMORY_LINUX_SLAB_USAGE, "By"); + for (state, value) in [ + ("reclaimable", memory.slab_reclaimable), + ("unreclaimable", memory.slab_unreclaimable), + ] { + b.append_i64_sum_dp(m, start, now, saturating_i64(value), |w| { + w.str(attr::SYSTEM_MEMORY_LINUX_SLAB_STATE, state); + }); + } + if snap.memory_limit { + let m = b.begin_updown_i64(metric::MEMORY_LIMIT, "By"); + b.append_i64_sum_dp(m, start, now, saturating_i64(memory.total), |_| {}); + } + if snap.memory_shared { + let m = b.begin_updown_i64(metric::MEMORY_LINUX_SHARED, "By"); + b.append_i64_sum_dp(m, start, now, saturating_i64(memory.shared), |_| {}); + } + if snap.memory_hugepages { + project_hugepages(b, start, now, &memory.hugepages); + } + } + + // 
── System / uptime ────────────────────────────────────────────────────── + if let Some(uptime) = snap.uptime_seconds { + let m = b.begin_gauge_f64(metric::UPTIME, "s"); + b.append_f64_gauge_dp(m, now, uptime, |_| {}); + } + + // ── Paging ─────────────────────────────────────────────────────────────── + if let Some(paging) = snap.paging { + let m = b.begin_counter_i64(metric::PAGING_FAULTS, "{fault}"); + for (fault_type, value) in [ + ("minor", paging.minor_faults), + ("major", paging.major_faults), + ] { + b.append_i64_sum_dp( + m, + cs.get(metric::PAGING_FAULTS, fault_type, start), + now, + saturating_i64(value), + |w| { + w.str(attr::SYSTEM_PAGING_FAULT_TYPE, fault_type); + }, + ); + } + let m = b.begin_counter_i64(metric::PAGING_OPERATIONS, "{operation}"); + // Linux exposes swap operations and page-in/page-out counters separately. + // Semconv requires both direction and fault.type for this metric, so the + // receiver keeps the phase-1 mapping explicit here. + for (direction, fault_type, value) in [ + ("in", "major", paging.swap_in), + ("out", "major", paging.swap_out), + ("in", "minor", paging.page_in), + ("out", "minor", paging.page_out), + ] { + b.append_i64_sum_dp( + m, + cs.get_joined(metric::PAGING_OPERATIONS, direction, fault_type, start), + now, + saturating_i64(value), + |w| { + w.str(attr::SYSTEM_PAGING_DIRECTION, direction); + w.str(attr::SYSTEM_PAGING_FAULT_TYPE, fault_type); + }, + ); + } + } + if !snap.swaps.is_empty() { + let m = b.begin_updown_i64(metric::PAGING_USAGE, "By"); + for swap in &snap.swaps { + for (state, value) in [("used", swap.used), ("free", swap.free)] { + b.append_i64_sum_dp(m, start, now, saturating_i64(value), |w| { + w.str(attr::SYSTEM_DEVICE, &swap.name); + w.str(attr::SYSTEM_PAGING_STATE, state); + }); + } + } + } + if snap.swaps.iter().any(|swap| swap.size > 0) { + let m = b.begin_gauge_f64(metric::PAGING_UTILIZATION, "1"); + for swap in &snap.swaps { + let size = swap.size; + if size == 0 { + continue; + } + let total 
= size as f64; + for (state, value) in [("used", swap.used), ("free", swap.free)] { + b.append_f64_gauge_dp(m, now, value as f64 / total, |w| { + w.str(attr::SYSTEM_DEVICE, &swap.name); + w.str(attr::SYSTEM_PAGING_STATE, state); + }); + } + } + } + + // ── Processes ──────────────────────────────────────────────────────────── + if let Some(processes) = snap.processes { + let m = b.begin_updown_i64(metric::PROCESS_COUNT, "{process}"); + b.append_i64_sum_dp(m, start, now, saturating_i64(processes.running), |w| { + w.str(attr::PROCESS_STATE, "running"); + }); + // /proc/stat procs_blocked has no registered process.state value. + // Do not map it to sleeping; Linux blocked tasks are not the same state. + let m = b.begin_counter_i64(metric::PROCESS_CREATED, "{process}"); + b.append_i64_sum_dp( + m, + cs.get(metric::PROCESS_CREATED, "", start), + now, + saturating_i64(processes.created), + |_| {}, + ); + } + + // ── Disk ───────────────────────────────────────────────────────────────── + if snap.disks.iter().any(|disk| disk.limit_bytes.is_some()) { + let m = b.begin_updown_i64(metric::DISK_LIMIT, "By"); + for disk in &snap.disks { + let Some(limit_bytes) = disk.limit_bytes else { + continue; + }; + b.append_i64_sum_dp(m, start, now, saturating_i64(limit_bytes), |w| { + w.str(attr::SYSTEM_DEVICE, &disk.name); + }); + } + } + if !snap.disks.is_empty() { + let m = b.begin_counter_i64(metric::DISK_IO, "By"); + for disk in &snap.disks { + for (dir, value) in [("read", disk.read_bytes), ("write", disk.write_bytes)] { + b.append_i64_sum_dp( + m, + cs.get_joined(metric::DISK_IO, &disk.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str(attr::SYSTEM_DEVICE, &disk.name); + w.str(attr::DISK_IO_DIRECTION, dir); + }, + ); + } + } + let m = b.begin_counter_i64(metric::DISK_OPERATIONS, "{operation}"); + for disk in &snap.disks { + for (dir, value) in [("read", disk.read_ops), ("write", disk.write_ops)] { + b.append_i64_sum_dp( + m, + 
cs.get_joined(metric::DISK_OPERATIONS, &disk.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str(attr::SYSTEM_DEVICE, &disk.name); + w.str(attr::DISK_IO_DIRECTION, dir); + }, + ); + } + } + let m = b.begin_counter_f64(metric::DISK_IO_TIME, "s"); + for disk in &snap.disks { + b.append_f64_sum_dp( + m, + cs.get(metric::DISK_IO_TIME, &disk.name, start), + now, + disk.io_time_seconds, + |w| { + w.str(attr::SYSTEM_DEVICE, &disk.name); + }, + ); + } + let m = b.begin_counter_f64(metric::DISK_OPERATION_TIME, "s"); + for disk in &snap.disks { + for (dir, value) in [ + ("read", disk.read_time_seconds), + ("write", disk.write_time_seconds), + ] { + b.append_f64_sum_dp( + m, + cs.get_joined(metric::DISK_OPERATION_TIME, &disk.name, dir, start), + now, + value, + |w| { + w.str(attr::SYSTEM_DEVICE, &disk.name); + w.str(attr::DISK_IO_DIRECTION, dir); + }, + ); + } + } + let m = b.begin_counter_i64(metric::DISK_MERGED, "{operation}"); + for disk in &snap.disks { + for (dir, value) in [("read", disk.read_merged), ("write", disk.write_merged)] { + b.append_i64_sum_dp( + m, + cs.get_joined(metric::DISK_MERGED, &disk.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str(attr::SYSTEM_DEVICE, &disk.name); + w.str(attr::DISK_IO_DIRECTION, dir); + }, + ); + } + } + } + + // ── Filesystem ─────────────────────────────────────────────────────────── + if !snap.filesystems.is_empty() { + let m = b.begin_updown_i64(metric::FILESYSTEM_USAGE, "By"); + for fs in &snap.filesystems { + for (state, value) in [ + ("used", fs.used), + ("free", fs.free), + ("reserved", fs.reserved), + ] { + b.append_i64_sum_dp(m, start, now, saturating_i64(value), |w| { + w.str(attr::SYSTEM_DEVICE, &fs.device); + w.str(attr::SYSTEM_FILESYSTEM_STATE, state); + w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); + w.str(attr::SYSTEM_FILESYSTEM_MODE, fs.mode); + w.str(attr::SYSTEM_FILESYSTEM_MOUNTPOINT, &fs.mountpoint); + }); + } + } + } + if snap + .filesystems + .iter() + .any(|fs| 
fs.used.saturating_add(fs.free).saturating_add(fs.reserved) > 0) + { + let m = b.begin_gauge_f64(metric::FILESYSTEM_UTILIZATION, "1"); + for fs in &snap.filesystems { + let total = fs.used.saturating_add(fs.free).saturating_add(fs.reserved); + if total > 0 { + let total_f = total as f64; + for (state, value) in [ + ("used", fs.used), + ("free", fs.free), + ("reserved", fs.reserved), + ] { + b.append_f64_gauge_dp(m, now, value as f64 / total_f, |w| { + w.str(attr::SYSTEM_DEVICE, &fs.device); + w.str(attr::SYSTEM_FILESYSTEM_STATE, state); + w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); + w.str(attr::SYSTEM_FILESYSTEM_MODE, fs.mode); + w.str(attr::SYSTEM_FILESYSTEM_MOUNTPOINT, &fs.mountpoint); + }); + } + } + } + } + if snap.filesystems.iter().any(|fs| fs.limit_bytes.is_some()) { + let m = b.begin_updown_i64(metric::FILESYSTEM_LIMIT, "By"); + for fs in &snap.filesystems { + let Some(limit_bytes) = fs.limit_bytes else { + continue; + }; + b.append_i64_sum_dp(m, start, now, saturating_i64(limit_bytes), |w| { + w.str(attr::SYSTEM_DEVICE, &fs.device); + w.str(attr::SYSTEM_FILESYSTEM_TYPE, &fs.fs_type); + w.str(attr::SYSTEM_FILESYSTEM_MODE, fs.mode); + w.str(attr::SYSTEM_FILESYSTEM_MOUNTPOINT, &fs.mountpoint); + }); + } + } + + // ── Network ────────────────────────────────────────────────────────────── + if !snap.networks.is_empty() { + let m = b.begin_counter_i64(metric::NETWORK_IO, "By"); + for net in &snap.networks { + for (dir, value) in [("receive", net.rx_bytes), ("transmit", net.tx_bytes)] { + b.append_i64_sum_dp( + m, + cs.get_joined(metric::NETWORK_IO, &net.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str(attr::NETWORK_INTERFACE_NAME, &net.name); + w.str(attr::NETWORK_IO_DIRECTION, dir); + }, + ); + } + } + let m = b.begin_counter_i64(metric::NETWORK_PACKET_COUNT, "{packet}"); + for net in &snap.networks { + for (dir, value) in [("receive", net.rx_packets), ("transmit", net.tx_packets)] { + b.append_i64_sum_dp( + m, + 
cs.get_joined(metric::NETWORK_PACKET_COUNT, &net.name, dir, start), + now, + saturating_i64(value), + |w| { + // Semconv uses system.device here, while sibling network + // metrics use network.interface.name. + w.str(attr::SYSTEM_DEVICE, &net.name); + w.str(attr::NETWORK_IO_DIRECTION, dir); + }, + ); + } + } + let m = b.begin_counter_i64(metric::NETWORK_PACKET_DROPPED, "{packet}"); + for net in &snap.networks { + for (dir, value) in [("receive", net.rx_dropped), ("transmit", net.tx_dropped)] { + b.append_i64_sum_dp( + m, + cs.get_joined(metric::NETWORK_PACKET_DROPPED, &net.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str(attr::NETWORK_INTERFACE_NAME, &net.name); + w.str(attr::NETWORK_IO_DIRECTION, dir); + }, + ); + } + } + let m = b.begin_counter_i64(metric::NETWORK_ERRORS, "{error}"); + for net in &snap.networks { + for (dir, value) in [("receive", net.rx_errors), ("transmit", net.tx_errors)] { + b.append_i64_sum_dp( + m, + cs.get_joined(metric::NETWORK_ERRORS, &net.name, dir, start), + now, + saturating_i64(value), + |w| { + w.str(attr::NETWORK_INTERFACE_NAME, &net.name); + w.str(attr::NETWORK_IO_DIRECTION, dir); + }, + ); + } + } + } +} + +pub(super) fn project_hugepages( + b: &mut HostMetricsArrowBuilder, + start: u64, + now: u64, + hugepages: &HugepageStats, +) { + let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_LIMIT, "{page}"); + b.append_i64_sum_dp(m, start, now, saturating_i64(hugepages.total), |_| {}); + let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_PAGE_SIZE, "By"); + b.append_i64_sum_dp( + m, + start, + now, + saturating_i64(hugepages.page_size_bytes), + |_| {}, + ); + let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_RESERVED, "{page}"); + b.append_i64_sum_dp(m, start, now, saturating_i64(hugepages.reserved), |_| {}); + let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_SURPLUS, "{page}"); + b.append_i64_sum_dp(m, start, now, saturating_i64(hugepages.surplus), |_| {}); + let used = 
hugepages.total.saturating_sub(hugepages.free); + let m = b.begin_updown_i64(metric::MEMORY_LINUX_HUGEPAGES_USAGE, "{page}"); + for (state, value) in [("used", used), ("free", hugepages.free)] { + b.append_i64_sum_dp(m, start, now, saturating_i64(value), |w| { + w.str(attr::SYSTEM_MEMORY_LINUX_HUGEPAGES_STATE, state); + }); + } + if hugepages.total > 0 { + let total = hugepages.total as f64; + let m = b.begin_gauge_f64(metric::MEMORY_LINUX_HUGEPAGES_UTILIZATION, "1"); + for (state, value) in [("used", used), ("free", hugepages.free)] { + b.append_f64_gauge_dp(m, now, value as f64 / total, |w| { + w.str(attr::SYSTEM_MEMORY_LINUX_HUGEPAGES_STATE, state); + }); + } + } +} + +#[derive(Default)] +pub(super) struct CounterTracker { + states: HashMap, +} + +pub(super) struct CounterState { + previous: f64, + start_time_unix_nano: u64, +} + +#[derive(Default)] +pub(super) struct CounterStarts { + pub(super) entries: Vec<(String, u64)>, +} + +impl CounterStarts { + fn get(&self, metric: &'static str, series: &str, default_start: u64) -> u64 { + self.entries + .iter() + .find_map(|(key, start)| counter_key_matches(key, metric, series).then_some(*start)) + .unwrap_or(default_start) + } + + pub(super) fn get_joined( + &self, + metric: &'static str, + first: &str, + second: &'static str, + default_start: u64, + ) -> u64 { + self.entries + .iter() + .find_map(|(key, start)| { + counter_key_matches_joined(key, metric, first, second).then_some(*start) + }) + .unwrap_or(default_start) + } +} + +impl CounterTracker { + pub(super) fn snapshot( + &mut self, + default_start: u64, + now: u64, + cpu: Option<&CpuTimes>, + paging: Option<&PagingStats>, + processes: Option<&ProcessStats>, + disks: Option<&[DiskStats]>, + networks: Option<&[NetworkStats]>, + ) -> CounterStarts { + let mut starts = CounterStarts::default(); + if let Some(cpu) = cpu { + self.observe_all( + metric::CPU_TIME, + default_start, + now, + &[ + ("user", cpu.user), + ("nice", cpu.nice), + ("system", cpu.system), + 
("idle", cpu.idle), + ("iowait", cpu.wait), + ("interrupt", cpu.interrupt), + ("steal", cpu.steal), + ], + &mut starts, + ); + } + if let Some(paging) = paging { + self.observe_all( + metric::PAGING_FAULTS, + default_start, + now, + &[ + ("minor", paging.minor_faults as f64), + ("major", paging.major_faults as f64), + ], + &mut starts, + ); + for (direction, fault_type, value) in [ + ("in", "major", paging.swap_in), + ("out", "major", paging.swap_out), + ("in", "minor", paging.page_in), + ("out", "minor", paging.page_out), + ] { + self.observe_joined( + metric::PAGING_OPERATIONS, + direction, + fault_type, + value as f64, + default_start, + now, + &mut starts, + ); + } + } + if let Some(processes) = processes { + self.observe( + metric::PROCESS_CREATED, + "", + processes.created as f64, + default_start, + now, + &mut starts, + ); + } + if let Some(disks) = disks { + let first_disk_entry = starts.entries.len(); + for disk in disks { + self.observe_disk_all( + metric::DISK_IO, + default_start, + now, + &disk.name, + &[ + ("read", disk.read_bytes as f64), + ("write", disk.write_bytes as f64), + ], + &mut starts, + ); + self.observe_disk_all( + metric::DISK_OPERATIONS, + default_start, + now, + &disk.name, + &[ + ("read", disk.read_ops as f64), + ("write", disk.write_ops as f64), + ], + &mut starts, + ); + self.observe( + metric::DISK_IO_TIME, + &disk.name, + disk.io_time_seconds, + default_start, + now, + &mut starts, + ); + self.observe_disk_all( + metric::DISK_OPERATION_TIME, + default_start, + now, + &disk.name, + &[ + ("read", disk.read_time_seconds), + ("write", disk.write_time_seconds), + ], + &mut starts, + ); + self.observe_disk_all( + metric::DISK_MERGED, + default_start, + now, + &disk.name, + &[ + ("read", disk.read_merged as f64), + ("write", disk.write_merged as f64), + ], + &mut starts, + ); + } + self.prune_scraped_family( + &starts.entries[first_disk_entry..], + &[ + metric::DISK_IO, + metric::DISK_OPERATIONS, + metric::DISK_IO_TIME, + 
metric::DISK_OPERATION_TIME, + metric::DISK_MERGED, + ], + ); + } + if let Some(networks) = networks { + let first_network_entry = starts.entries.len(); + for network in networks { + self.observe_network( + metric::NETWORK_IO, + default_start, + now, + network, + network.rx_bytes, + network.tx_bytes, + &mut starts, + ); + self.observe_network( + metric::NETWORK_PACKET_COUNT, + default_start, + now, + network, + network.rx_packets, + network.tx_packets, + &mut starts, + ); + self.observe_network( + metric::NETWORK_PACKET_DROPPED, + default_start, + now, + network, + network.rx_dropped, + network.tx_dropped, + &mut starts, + ); + self.observe_network( + metric::NETWORK_ERRORS, + default_start, + now, + network, + network.rx_errors, + network.tx_errors, + &mut starts, + ); + } + self.prune_scraped_family( + &starts.entries[first_network_entry..], + &[ + metric::NETWORK_IO, + metric::NETWORK_PACKET_COUNT, + metric::NETWORK_PACKET_DROPPED, + metric::NETWORK_ERRORS, + ], + ); + } + starts + } + + fn prune_scraped_family( + &mut self, + current_entries: &[(String, u64)], + metrics: &[&'static str], + ) { + let current_keys = current_entries + .iter() + .map(|(key, _)| key.as_str()) + .collect::>(); + self.states.retain(|key, _| { + !metrics + .iter() + .any(|metric| counter_key_is_metric(key, metric)) + || current_keys.contains(key.as_str()) + }); + } + + fn observe_all( + &mut self, + metric: &'static str, + default_start: u64, + now: u64, + values: &[(&str, f64)], + starts: &mut CounterStarts, + ) { + for (series, value) in values { + self.observe(metric, series, *value, default_start, now, starts); + } + } + + fn observe_disk_all( + &mut self, + metric: &'static str, + default_start: u64, + now: u64, + device: &str, + values: &[(&'static str, f64)], + starts: &mut CounterStarts, + ) { + for (direction, value) in values { + self.observe_joined( + metric, + device, + direction, + *value, + default_start, + now, + starts, + ); + } + } + + fn observe_network( + &mut self, 
+ metric: &'static str, + default_start: u64, + now: u64, + network: &NetworkStats, + rx: u64, + tx: u64, + starts: &mut CounterStarts, + ) { + self.observe_joined( + metric, + &network.name, + "receive", + rx as f64, + default_start, + now, + starts, + ); + self.observe_joined( + metric, + &network.name, + "transmit", + tx as f64, + default_start, + now, + starts, + ); + } + + fn observe( + &mut self, + metric: &'static str, + series: &str, + value: f64, + default_start: u64, + now: u64, + starts: &mut CounterStarts, + ) { + self.observe_key( + counter_key(metric, series), + value, + default_start, + now, + starts, + ); + } + + fn observe_joined( + &mut self, + metric: &'static str, + first: &str, + second: &'static str, + value: f64, + default_start: u64, + now: u64, + starts: &mut CounterStarts, + ) { + self.observe_key( + counter_key_joined(metric, first, second), + value, + default_start, + now, + starts, + ); + } + + fn observe_key( + &mut self, + key: String, + value: f64, + default_start: u64, + now: u64, + starts: &mut CounterStarts, + ) { + let state = self.states.entry(key.clone()).or_insert(CounterState { + previous: value, + start_time_unix_nano: default_start, + }); + if state.start_time_unix_nano < default_start { + state.start_time_unix_nano = default_start; + } else if value < state.previous { + state.start_time_unix_nano = now; + } + state.previous = value; + starts.entries.push((key, state.start_time_unix_nano)); + } +} + +pub(super) fn counter_key(metric: &'static str, series: &str) -> String { + let mut key = String::with_capacity(metric.len() + 1 + series.len()); + key.push_str(metric); + key.push(COUNTER_KEY_SEPARATOR); + key.push_str(series); + key +} + +pub(super) fn counter_key_joined( + metric: &'static str, + first: &str, + second: &'static str, +) -> String { + let mut key = String::with_capacity(metric.len() + 2 + first.len() + second.len()); + key.push_str(metric); + key.push(COUNTER_KEY_SEPARATOR); + key.push_str(first); + 
key.push(COUNTER_KEY_SEPARATOR); + key.push_str(second); + key +} + +pub(super) fn counter_key_matches(key: &str, metric: &'static str, series: &str) -> bool { + key.strip_prefix(metric) + .and_then(|rest| rest.strip_prefix(COUNTER_KEY_SEPARATOR)) + == Some(series) +} + +pub(super) fn counter_key_matches_joined( + key: &str, + metric: &'static str, + first: &str, + second: &'static str, +) -> bool { + let Some(series) = key + .strip_prefix(metric) + .and_then(|rest| rest.strip_prefix(COUNTER_KEY_SEPARATOR)) + else { + return false; + }; + series + .strip_prefix(first) + .and_then(|rest| rest.strip_prefix(COUNTER_KEY_SEPARATOR)) + == Some(second) +} + +fn counter_key_is_metric(key: &str, metric: &'static str) -> bool { + key.strip_prefix(metric) + .is_some_and(|rest| rest.starts_with(COUNTER_KEY_SEPARATOR)) +} + +pub(super) fn host_arch() -> Option<&'static str> { + match std::env::consts::ARCH { + "aarch64" => Some("arm64"), + "arm" => Some("arm32"), + "powerpc" => Some("ppc32"), + "powerpc64" => Some("ppc64"), + "x86" => Some("x86"), + "x86_64" => Some("amd64"), + _ => None, + } +} diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/readings.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/readings.rs new file mode 100644 index 0000000000..c0691b9e80 --- /dev/null +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/readings.rs @@ -0,0 +1,768 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +use crate::receivers::host_metrics_receiver::CompiledFilter; +use std::collections::HashSet; +use std::io; +use std::path::{Path, PathBuf}; +use std::sync::mpsc; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +use super::{BYTES_PER_KIB, DISKSTAT_SECTOR_BYTES, NANOS_PER_SEC}; + +#[derive(Copy, Clone, Default)] +pub(super) struct CpuTimes { + pub(super) user: f64, + pub(super) nice: f64, + pub(super) system: f64, + pub(super) 
idle: f64, + pub(super) wait: f64, + pub(super) interrupt: f64, + pub(super) steal: f64, +} + +#[derive(Clone, Default)] +pub(super) struct CpuInfo { + pub(super) logical_count: u64, + pub(super) physical_count: u64, + pub(super) frequencies_hz: Vec, +} + +#[derive(Copy, Clone, Default)] +pub(super) struct StatSnapshot { + pub(super) boot_time_unix_nano: u64, + pub(super) cpu: Option, + pub(super) processes: ProcessStats, +} + +#[derive(Copy, Clone, Default)] +pub(super) struct MemoryStats { + pub(super) total: u64, + pub(super) used: u64, + pub(super) free: u64, + pub(super) available: u64, + pub(super) has_available: bool, + pub(super) cached: u64, + pub(super) buffered: u64, + pub(super) shared: u64, + pub(super) slab_reclaimable: u64, + pub(super) slab_unreclaimable: u64, + pub(super) hugepages: HugepageStats, +} + +#[derive(Copy, Clone, Default)] +pub(super) struct HugepageStats { + pub(super) total: u64, + pub(super) free: u64, + pub(super) reserved: u64, + pub(super) surplus: u64, + pub(super) page_size_bytes: u64, +} + +#[derive(Copy, Clone, Default)] +pub(super) struct PagingStats { + pub(super) minor_faults: u64, + pub(super) major_faults: u64, + pub(super) page_in: u64, + pub(super) page_out: u64, + pub(super) swap_in: u64, + pub(super) swap_out: u64, +} + +#[derive(Default)] +pub(super) struct SwapStats { + pub(super) name: String, + pub(super) size: u64, + pub(super) used: u64, + pub(super) free: u64, +} + +#[derive(Copy, Clone, Default)] +pub(super) struct ProcessStats { + pub(super) running: u64, + pub(super) blocked: u64, + pub(super) created: u64, +} + +#[derive(Default)] +pub(super) struct DiskStats { + pub(super) name: String, + pub(super) limit_bytes: Option, + pub(super) read_bytes: u64, + pub(super) write_bytes: u64, + pub(super) read_ops: u64, + pub(super) write_ops: u64, + pub(super) read_merged: u64, + pub(super) write_merged: u64, + pub(super) read_time_seconds: f64, + pub(super) write_time_seconds: f64, + pub(super) io_time_seconds: f64, 
+} + +#[derive(Default)] +pub(super) struct FilesystemStats { + pub(super) device: String, + pub(super) mountpoint: String, + pub(super) fs_type: String, + pub(super) mode: &'static str, + pub(super) used: u64, + pub(super) free: u64, + pub(super) reserved: u64, + pub(super) limit_bytes: Option, +} + +/// Dedicated worker thread for `statvfs` calls. +/// +/// Intentionally not using `tokio::fs` / `spawn_blocking`: `statvfs` can block +/// indefinitely on unhealthy remote or FUSE mounts, and Tokio cannot cancel an +/// in-flight blocking task. With `spawn_blocking`, repeated scrapes during a +/// hang could accumulate stuck tasks on Tokio's global blocking pool, affecting +/// unrelated callers. +/// +/// The dedicated thread plus `sync_channel(1)` caps the blast radius at one +/// worker thread and at most one queued request per receiver. Callers still use +/// a per-mount timeout, and once the queue is full, later scrapes fail fast +/// instead of creating more blocking work. This fits the dfengine +/// thread-per-core / core-locality model better than offloading to Tokio's +/// shared blocking pool. 
+pub(super) struct FilesystemStatWorker { + tx: mpsc::SyncSender, +} + +pub(super) struct FilesystemStatRequest { + path: PathBuf, + response: mpsc::Sender>, +} + +pub(super) struct FilesystemStat { + pub(super) total_bytes: u64, + pub(super) free_bytes: u64, + pub(super) available_bytes: u64, +} + +impl FilesystemStatWorker { + pub(super) fn new() -> io::Result { + let (tx, rx) = mpsc::sync_channel::(1); + let _handle = std::thread::Builder::new() + .name("host-metrics-statvfs".to_owned()) + .spawn(move || { + while let Ok(request) = rx.recv() { + let result = statvfs_bytes(&request.path); + let _ = request.response.send(result); + } + }) + .map_err(io::Error::other)?; + Ok(Self { tx }) + } + + #[cfg(test)] + pub(super) fn disconnected_for_test() -> Self { + let (tx, rx) = mpsc::sync_channel::(1); + drop(rx); + Self { tx } + } + + pub(super) fn statvfs(&self, path: PathBuf, timeout: Duration) -> io::Result { + let (response, rx) = mpsc::channel(); + self.tx + .try_send(FilesystemStatRequest { path, response }) + .map_err(|err| match err { + mpsc::TrySendError::Full(_) => { + io::Error::new(io::ErrorKind::TimedOut, "statvfs worker is busy") + } + mpsc::TrySendError::Disconnected(_) => { + io::Error::new(io::ErrorKind::BrokenPipe, "statvfs worker stopped") + } + })?; + rx.recv_timeout(timeout).map_err(|err| match err { + mpsc::RecvTimeoutError::Timeout => { + io::Error::new(io::ErrorKind::TimedOut, "statvfs timed out") + } + mpsc::RecvTimeoutError::Disconnected => { + io::Error::new(io::ErrorKind::BrokenPipe, "statvfs worker stopped") + } + })? 
+ } +} + +fn statvfs_bytes(path: &Path) -> io::Result { + let stat = nix::sys::statvfs::statvfs(path).map_err(io::Error::other)?; + let block_size = stat.fragment_size(); + Ok(FilesystemStat { + total_bytes: stat.blocks().saturating_mul(block_size), + free_bytes: stat.blocks_free().saturating_mul(block_size), + available_bytes: stat.blocks_available().saturating_mul(block_size), + }) +} + +#[derive(Default)] +pub(super) struct NetworkStats { + pub(super) name: String, + pub(super) rx_bytes: u64, + pub(super) tx_bytes: u64, + pub(super) rx_packets: u64, + pub(super) tx_packets: u64, + pub(super) rx_errors: u64, + pub(super) tx_errors: u64, + pub(super) rx_dropped: u64, + pub(super) tx_dropped: u64, +} + +pub(super) fn parse_stat(input: &str, clk_tck: f64) -> StatSnapshot { + let mut snapshot = StatSnapshot::default(); + for line in input.lines() { + if let Some(rest) = line.strip_prefix("cpu ") { + snapshot.cpu = parse_cpu_total(rest, clk_tck); + } else if let Some(value) = line.strip_prefix("btime ") { + snapshot.boot_time_unix_nano = parse_u64(value).saturating_mul(NANOS_PER_SEC); + } else if let Some(value) = line.strip_prefix("procs_running ") { + snapshot.processes.running = parse_u64(value); + } else if let Some(value) = line.strip_prefix("procs_blocked ") { + snapshot.processes.blocked = parse_u64(value); + } else if let Some(value) = line.strip_prefix("processes ") { + snapshot.processes.created = parse_u64(value); + } + } + snapshot +} + +pub(super) fn parse_cpu_total(input: &str, clk_tck: f64) -> Option { + let mut fields = [0_u64; 10]; + let mut count = 0; + for (idx, token) in input.split_whitespace().take(fields.len()).enumerate() { + fields[idx] = parse_u64(token); + count += 1; + } + if count < 4 { + return None; + } + + let user = fields[0].saturating_sub(fields[8]); + let nice = fields[1].saturating_sub(fields[9]); + Some(CpuTimes { + user: ticks_to_seconds(user, clk_tck), + nice: ticks_to_seconds(nice, clk_tck), + system: 
ticks_to_seconds(fields[2], clk_tck),
+ idle: ticks_to_seconds(fields[3], clk_tck),
+ wait: ticks_to_seconds(fields[4], clk_tck),
+ interrupt: ticks_to_seconds(fields[5].saturating_add(fields[6]), clk_tck),
+ steal: ticks_to_seconds(fields[7], clk_tck),
+ })
+}
+
+/// Converts two cumulative CPU-time snapshots into per-mode utilization
+/// fractions that sum to 1. Returns `None` if any counter went backwards
+/// (e.g. after a reset) or if no CPU time elapsed between the snapshots.
+pub(super) fn cpu_utilization(previous: CpuTimes, current: CpuTimes) -> Option {
+ // Any backwards counter short-circuits to None via `?`.
+ let user = counter_delta(previous.user, current.user)?;
+ let nice = counter_delta(previous.nice, current.nice)?;
+ let system = counter_delta(previous.system, current.system)?;
+ let idle = counter_delta(previous.idle, current.idle)?;
+ let wait = counter_delta(previous.wait, current.wait)?;
+ let interrupt = counter_delta(previous.interrupt, current.interrupt)?;
+ let steal = counter_delta(previous.steal, current.steal)?;
+ let total = user + nice + system + idle + wait + interrupt + steal;
+ // Normalize each mode's delta to a fraction of the total elapsed CPU time.
+ (total > 0.0).then(|| CpuTimes {
+ user: user / total,
+ nice: nice / total,
+ system: system / total,
+ idle: idle / total,
+ wait: wait / total,
+ interrupt: interrupt / total,
+ steal: steal / total,
+ })
+}
+
+/// Returns `current - previous`, or `None` when the counter went backwards.
+pub(super) fn counter_delta(previous: f64, current: f64) -> Option {
+ (current >= previous).then_some(current - previous)
+}
+
+pub(super) fn parse_cpuinfo(input: &str) -> CpuInfo {
+ let mut logical_count = 0;
+ let mut frequencies_hz = Vec::new();
+ let mut physical_cores = HashSet::new();
+ let mut physical_id = None;
+ let mut core_id = None;
+
+ for line in input.lines() {
+ let Some((key, value)) = line.split_once(':') else {
+ continue;
+ };
+ let key = key.trim();
+ let value = value.trim();
+ match key {
+ // Each "processor" entry starts a new logical CPU; flush the previous
+ // (physical id, core id) pair so distinct physical cores are counted once.
+ "processor" => {
+ logical_count += 1;
+ if let (Some(physical), Some(core)) = (physical_id.take(), core_id.take()) {
+ let _ = physical_cores.insert((physical, core));
+ }
+ }
+ "physical id" => physical_id = Some(parse_u64(value)),
+ "core id" => core_id = Some(parse_u64(value)),
+ // /proc/cpuinfo reports MHz; convert to Hz.
+ "cpu MHz" => {
+ if let Ok(mhz) = value.parse::() {
+ frequencies_hz.push(mhz * 1_000_000.0);
+ }
+ }
+ _ => {}
+ }
+ }
+ if let 
(Some(physical), Some(core)) = (physical_id, core_id) {
+ let _ = physical_cores.insert((physical, core));
+ }
+
+ // Fall back to the logical count when topology fields are absent
+ // (common in virtualized environments).
+ let physical_count = u64::try_from(physical_cores.len())
+ .ok()
+ .filter(|count| *count != 0)
+ .unwrap_or(logical_count);
+ CpuInfo {
+ logical_count,
+ physical_count,
+ frequencies_hz,
+ }
+}
+
+/// Parses /proc/meminfo. Returns `None` when MemTotal is missing or zero.
+pub(super) fn parse_meminfo(input: &str) -> Option {
+ let mut total = 0;
+ let mut free = 0;
+ let mut available = None;
+ let mut buffers = 0;
+ let mut cached = 0;
+ let mut shared = 0;
+ let mut slab_reclaimable = 0;
+ let mut slab_unreclaimable = 0;
+ let mut hugepages = HugepageStats::default();
+
+ for line in input.lines() {
+ let mut fields = line.split_whitespace();
+ let Some(key) = fields.next() else {
+ continue;
+ };
+ let raw_value = fields.next().map(parse_u64).unwrap_or_default();
+ let value = raw_value * BYTES_PER_KIB;
+ match key.trim_end_matches(':') {
+ "MemTotal" => total = value,
+ "MemFree" => free = value,
+ "MemAvailable" => available = Some(value),
+ "Buffers" => buffers = value,
+ "Cached" => cached = value,
+ "Shmem" => shared = value,
+ "SReclaimable" => slab_reclaimable = value,
+ "SUnreclaim" => slab_unreclaimable = value,
+ // HugePages_* are page counts, not KiB; keep the raw value.
+ "HugePages_Total" => hugepages.total = raw_value,
+ "HugePages_Free" => hugepages.free = raw_value,
+ "HugePages_Rsvd" => hugepages.reserved = raw_value,
+ "HugePages_Surp" => hugepages.surplus = raw_value,
+ "Hugepagesize" => hugepages.page_size_bytes = value,
+ _ => {}
+ }
+ }
+
+ if total == 0 {
+ return None;
+ }
+ let has_available = available.is_some();
+ // Older kernels lack MemAvailable; approximate it as free + buffers + cached.
+ let available =
+ available.unwrap_or_else(|| free.saturating_add(buffers).saturating_add(cached));
+ Some(MemoryStats {
+ total,
+ used: total.saturating_sub(available),
+ free,
+ available,
+ has_available,
+ cached,
+ buffered: buffers,
+ shared,
+ slab_reclaimable,
+ slab_unreclaimable,
+ hugepages,
+ })
+}
+
+/// Parses the first field of /proc/uptime: seconds since boot.
+pub(super) fn parse_uptime(input: &str) -> Option {
+ input.split_whitespace().next()?.parse().ok()
+}
+
+pub(super) fn 
parse_vmstat(input: &str) -> PagingStats {
+ let mut total_faults = 0;
+ let mut major_faults = 0;
+ let mut page_in = 0;
+ let mut page_out = 0;
+ let mut swap_in = 0;
+ let mut swap_out = 0;
+
+ for line in input.lines() {
+ let mut fields = line.split_whitespace();
+ let Some(key) = fields.next() else {
+ continue;
+ };
+ let value = fields.next().map(parse_u64).unwrap_or_default();
+ match key {
+ "pgfault" => total_faults = value,
+ "pgmajfault" => major_faults = value,
+ "pgpgin" => page_in = value,
+ "pgpgout" => page_out = value,
+ "pswpin" => swap_in = value,
+ "pswpout" => swap_out = value,
+ _ => {}
+ }
+ }
+
+ // /proc/vmstat's pgfault counts all faults; minor = total - major.
+ PagingStats {
+ minor_faults: total_faults.saturating_sub(major_faults),
+ major_faults,
+ page_in,
+ page_out,
+ swap_in,
+ swap_out,
+ }
+}
+
+/// Parses /proc/swaps (header line skipped); sizes are reported in KiB.
+pub(super) fn parse_swaps(input: &str) -> Vec {
+ let mut swaps = Vec::new();
+ for line in input.lines().skip(1) {
+ let mut fields = line.split_whitespace();
+ let Some(name) = fields.next() else {
+ continue;
+ };
+ let _kind = fields.next();
+ let Some(size_kib) = fields.next() else {
+ continue;
+ };
+ let Some(used_kib) = fields.next() else {
+ continue;
+ };
+ let size = parse_u64(size_kib).saturating_mul(BYTES_PER_KIB);
+ let used = parse_u64(used_kib).saturating_mul(BYTES_PER_KIB);
+ swaps.push(SwapStats {
+ name: name.to_owned(),
+ size,
+ used,
+ free: size.saturating_sub(used),
+ });
+ }
+ swaps
+}
+
+/// Parses /proc/diskstats into per-device counters, applying name filters.
+pub(super) fn parse_diskstats(
+ input: &str,
+ include: Option<&CompiledFilter>,
+ exclude: Option<&CompiledFilter>,
+) -> Vec {
+ let mut disks = Vec::new();
+ for line in input.lines() {
+ let mut fields = line.split_whitespace();
+ let _major = fields.next();
+ let _minor = fields.next();
+ let Some(name) = fields.next() else {
+ continue;
+ };
+ if !filter_allows(name, include, exclude) {
+ continue;
+ }
+ // Field order per the kernel diskstats format: reads completed,
+ // reads merged, sectors read, ms reading, then the write counterparts.
+ let Some(read_ops) = fields.next() else {
+ continue;
+ };
+ let Some(read_merged) = fields.next() else {
+ continue;
+ };
+ let Some(read_sectors) = fields.next() else {
+ continue;
+ };
+ let Some(read_ms) = fields.next() else { + continue; + }; + let Some(write_ops) = fields.next() else { + continue; + }; + let Some(write_merged) = fields.next() else { + continue; + }; + let Some(write_sectors) = fields.next() else { + continue; + }; + let Some(write_ms) = fields.next() else { + continue; + }; + let _in_progress = fields.next(); + let Some(io_ms) = fields.next() else { + continue; + }; + disks.push(DiskStats { + name: name.to_owned(), + limit_bytes: None, + read_ops: parse_u64(read_ops), + read_bytes: parse_u64(read_sectors).saturating_mul(DISKSTAT_SECTOR_BYTES), + write_ops: parse_u64(write_ops), + write_bytes: parse_u64(write_sectors).saturating_mul(DISKSTAT_SECTOR_BYTES), + read_merged: parse_u64(read_merged), + write_merged: parse_u64(write_merged), + read_time_seconds: millis_to_seconds(parse_u64(read_ms)), + write_time_seconds: millis_to_seconds(parse_u64(write_ms)), + io_time_seconds: millis_to_seconds(parse_u64(io_ms)), + }); + } + disks +} + +pub(super) struct FilesystemMount { + pub(super) device: String, + pub(super) mountpoint: String, + pub(super) fs_type: String, + pub(super) mode: &'static str, + pub(super) emit_limit: bool, +} + +#[derive(Clone, Copy, Default)] +pub(super) struct FilesystemFilters<'a> { + pub(super) include_devices: Option<&'a CompiledFilter>, + pub(super) exclude_devices: Option<&'a CompiledFilter>, + pub(super) include_fs_types: Option<&'a CompiledFilter>, + pub(super) exclude_fs_types: Option<&'a CompiledFilter>, + pub(super) include_mount_points: Option<&'a CompiledFilter>, + pub(super) exclude_mount_points: Option<&'a CompiledFilter>, +} + +pub(super) fn parse_mountinfo( + input: &str, + include_virtual_filesystems: bool, + emit_limit: bool, + filters: FilesystemFilters<'_>, +) -> Vec { + let mut mounts = Vec::new(); + for line in input.lines() { + let Some(separator) = line.find(" - ") else { + continue; + }; + let mut pre_fields = line[..separator].split_whitespace(); + let _mount_id = pre_fields.next(); + 
let _parent_id = pre_fields.next();
+ let _major_minor = pre_fields.next();
+ let _root = pre_fields.next();
+ let Some(mountpoint) = pre_fields.next() else {
+ continue;
+ };
+ let Some(options) = pre_fields.next() else {
+ continue;
+ };
+
+ let mut post_fields = line[separator + 3..].split_whitespace();
+ let Some(fs_type) = post_fields.next() else {
+ continue;
+ };
+ let Some(device) = post_fields.next() else {
+ continue;
+ };
+ if !include_virtual_filesystems && is_skipped_filesystem_type(fs_type) {
+ continue;
+ }
+ if !filter_allows(fs_type, filters.include_fs_types, filters.exclude_fs_types) {
+ continue;
+ }
+ let device = unescape_mountinfo(device);
+ if !filter_allows(&device, filters.include_devices, filters.exclude_devices) {
+ continue;
+ }
+ let mountpoint = unescape_mountinfo(mountpoint);
+ if !filter_allows(
+ &mountpoint,
+ filters.include_mount_points,
+ filters.exclude_mount_points,
+ ) {
+ continue;
+ }
+ mounts.push(FilesystemMount {
+ device,
+ mountpoint,
+ fs_type: fs_type.to_owned(),
+ mode: filesystem_mode(options),
+ emit_limit,
+ });
+ }
+ mounts
+}
+
+/// Returns "ro" when the mount options include a standalone "ro" flag,
+/// otherwise "rw".
+pub(super) fn filesystem_mode(options: &str) -> &'static str {
+ if options.split(',').any(|option| option == "ro") {
+ "ro"
+ } else {
+ "rw"
+ }
+}
+
+/// Virtual, pseudo, and network filesystem types skipped by default.
+pub(super) fn is_skipped_filesystem_type(fs_type: &str) -> bool {
+ if fs_type == "fuse" || fs_type == "fuseblk" || fs_type.starts_with("fuse.") {
+ return true;
+ }
+ matches!(
+ fs_type,
+ "autofs"
+ | "bpf"
+ | "binfmt_misc"
+ | "cgroup"
+ | "cgroup2"
+ | "debugfs"
+ | "devtmpfs"
+ | "fusectl"
+ | "mqueue"
+ | "nsfs"
+ | "overlay"
+ | "proc"
+ | "pstore"
+ | "squashfs"
+ | "sysfs"
+ | "tmpfs"
+ | "tracefs"
+ | "nfs"
+ | "nfs4"
+ | "cifs"
+ | "smb3"
+ | "9p"
+ )
+}
+
+/// Decodes mountinfo's backslash + three-octal-digit escapes (e.g. `\040`
+/// for space) back to the original bytes.
+pub(super) fn unescape_mountinfo(input: &str) -> String {
+ let bytes = input.as_bytes();
+ // First scan for an escape so the common unescaped case avoids allocating.
+ let mut escaped = None;
+ for idx in 0..bytes.len() {
+ if bytes[idx] == b'\\' && idx + 4 <= bytes.len() {
+ escaped = Some(idx);
+ break;
+ }
+ }
+ let Some(first_escape) = 
escaped else {
+ return input.to_owned();
+ };
+
+ let mut output = Vec::with_capacity(input.len());
+ output.extend_from_slice(&bytes[..first_escape]);
+ let mut idx = first_escape;
+ while idx < bytes.len() {
+ if bytes[idx] == b'\\' && idx + 4 <= bytes.len() {
+ let octal = &input[idx + 1..idx + 4];
+ if let Ok(value) = u8::from_str_radix(octal, 8) {
+ output.push(value);
+ idx += 4;
+ continue;
+ }
+ }
+ output.push(bytes[idx]);
+ idx += 1;
+ }
+ String::from_utf8_lossy(&output).into_owned()
+}
+
+/// Parses /proc/net/dev; the first two lines are column headers.
+pub(super) fn parse_netdev(
+ input: &str,
+ include: Option<&CompiledFilter>,
+ exclude: Option<&CompiledFilter>,
+) -> Vec {
+ let mut interfaces = Vec::new();
+ for line in input.lines().skip(2) {
+ let Some((name, values)) = line.split_once(':') else {
+ continue;
+ };
+ let name = name.trim();
+ if !filter_allows(name, include, exclude) {
+ continue;
+ }
+ let mut fields = values.split_whitespace();
+ let Some(rx_bytes) = fields.next() else {
+ continue;
+ };
+ let Some(rx_packets) = fields.next() else {
+ continue;
+ };
+ let Some(rx_errors) = fields.next() else {
+ continue;
+ };
+ let Some(rx_dropped) = fields.next() else {
+ continue;
+ };
+ // Skip the fifo/frame/compressed/multicast receive columns.
+ let _rx_fifo = fields.next();
+ let _rx_frame = fields.next();
+ let _rx_compressed = fields.next();
+ let _rx_multicast = fields.next();
+ let Some(tx_bytes) = fields.next() else {
+ continue;
+ };
+ let Some(tx_packets) = fields.next() else {
+ continue;
+ };
+ let Some(tx_errors) = fields.next() else {
+ continue;
+ };
+ let Some(tx_dropped) = fields.next() else {
+ continue;
+ };
+ interfaces.push(NetworkStats {
+ name: name.to_owned(),
+ rx_bytes: parse_u64(rx_bytes),
+ rx_packets: parse_u64(rx_packets),
+ tx_bytes: parse_u64(tx_bytes),
+ tx_packets: parse_u64(tx_packets),
+ rx_errors: parse_u64(rx_errors),
+ tx_errors: parse_u64(tx_errors),
+ rx_dropped: parse_u64(rx_dropped),
+ tx_dropped: parse_u64(tx_dropped),
+ });
+ }
+ interfaces
+}
+
+/// True when the value passes the optional include filter and is not
+/// matched by the optional exclude filter.
+pub(super) fn filter_allows(
+ value: &str,
+ include: 
Option<&CompiledFilter>,
+ exclude: Option<&CompiledFilter>,
+) -> bool {
+ include.is_none_or(|filter| filter.matches(value))
+ && !exclude.is_some_and(|filter| filter.matches(value))
+}
+
+/// Increments the partial-error counter and retains only the first error
+/// for reporting.
+pub(super) fn record_partial_error(
+ partial_errors: &mut u64,
+ first_error: &mut Option,
+ err: io::Error,
+) {
+ *partial_errors = partial_errors.saturating_add(1);
+ if first_error.is_none() {
+ *first_error = Some(err);
+ }
+}
+
+/// Converts a frequency to i64: non-finite or non-positive values become 0,
+/// values beyond the i64 range clamp to i64::MAX.
+pub(super) fn frequency_hz_i64(value: f64) -> i64 {
+ if !value.is_finite() || value <= 0.0 {
+ return 0;
+ }
+ if value >= i64::MAX as f64 {
+ return i64::MAX;
+ }
+ value.round() as i64
+}
+
+/// Lenient parse: malformed fields become 0.
+pub(super) fn parse_u64(input: &str) -> u64 {
+ input.parse().unwrap_or_default()
+}
+
+/// Converts clock ticks to seconds using the given ticks-per-second rate.
+pub(super) fn ticks_to_seconds(ticks: u64, clk_tck: f64) -> f64 {
+ ticks as f64 / clk_tck
+}
+
+/// Converts milliseconds to fractional seconds.
+pub(super) fn millis_to_seconds(ms: u64) -> f64 {
+ ms as f64 / 1_000.0
+}
+
+#[allow(unsafe_code)]
+pub(super) fn clock_ticks_per_second() -> f64 {
+ // SAFETY: _SC_CLK_TCK is a valid sysconf name; the call has no side effects. 
+ let ticks = unsafe { libc::sysconf(libc::_SC_CLK_TCK) }; + if ticks > 0 { ticks as f64 } else { 100.0 } +} + +pub(super) fn now_unix_nano() -> u64 { + let Ok(duration) = SystemTime::now().duration_since(UNIX_EPOCH) else { + return 0; + }; + duration.as_secs().saturating_mul(NANOS_PER_SEC) + u64::from(duration.subsec_nanos()) +} + +pub(super) fn saturating_i64(value: u64) -> i64 { + i64::try_from(value).unwrap_or(i64::MAX) +} diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs new file mode 100644 index 0000000000..c2b6d40d7f --- /dev/null +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs @@ -0,0 +1,1742 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +use super::*; +use crate::receivers::host_metrics_receiver::semconv::{attr, metric}; +use otap_df_pdata::proto::opentelemetry::common::v1::{KeyValue, any_value}; +use otap_df_pdata::proto::opentelemetry::metrics::v1::{ + AggregationTemporality, Metric, MetricsData, metric as otlp_metric, number_data_point, +}; +use otap_df_pdata::testing::round_trip::decode_metrics; +use projection::{CounterStarts, counter_key, counter_key_joined, counter_key_matches_joined}; +#[cfg(feature = "dev-tools")] +use std::collections::{BTreeMap, BTreeSet}; +use std::io::ErrorKind; +use std::path::PathBuf; +#[cfg(feature = "dev-tools")] +use weaver_common::{result::WResult, vdir::VirtualDirectoryPath}; +#[cfg(feature = "dev-tools")] +use weaver_forge::registry::ResolvedRegistry; +#[cfg(feature = "dev-tools")] +use weaver_resolver::SchemaResolver; +#[cfg(feature = "dev-tools")] +use weaver_semconv::{ + attribute::{ + AttributeType, BasicRequirementLevelSpec, PrimitiveOrArrayTypeSpec, RequirementLevel, + ValueSpec, + }, + group::{GroupType, InstrumentSpec}, + registry_repo::RegistryRepo, +}; + +#[test] +fn 
projection_uses_expected_metric_shapes() { + let data = projection_fixture_request(); + + let resource_metrics = data.resource_metrics.first().expect("resource metrics"); + let resource = resource_metrics.resource.as_ref().expect("resource"); + assert_has_attr(&resource.attributes, attr::OS_TYPE, "linux"); + assert_has_attr(&resource.attributes, attr::HOST_ID, "host-id"); + assert_has_attr(&resource.attributes, attr::HOST_NAME, "host-name"); + assert_has_attr(&resource.attributes, attr::HOST_ARCH, "amd64"); + + let metrics = &resource_metrics.scope_metrics[0].metrics; + assert_metric_shape(metrics, metric::CPU_TIME, "s", Some(true)); + assert_first_point_attr(metrics, metric::CPU_TIME, attr::CPU_MODE, "user"); + assert_sum_point_attr(metrics, metric::CPU_TIME, attr::CPU_MODE, "iowait"); + assert_metric_shape(metrics, metric::CPU_UTILIZATION, "1", None); + assert_first_point_attr(metrics, metric::CPU_UTILIZATION, attr::CPU_MODE, "user"); + assert_metric_shape(metrics, metric::CPU_LOGICAL_COUNT, "{cpu}", Some(false)); + assert_metric_shape(metrics, metric::CPU_PHYSICAL_COUNT, "{cpu}", Some(false)); + assert_metric_shape(metrics, metric::CPU_FREQUENCY, "Hz", None); + assert_first_point_int(metrics, metric::CPU_FREQUENCY, 2_400_000_000); + assert_first_point_attr_int(metrics, metric::CPU_FREQUENCY, attr::CPU_LOGICAL_NUMBER, 0); + assert_metric_shape(metrics, metric::MEMORY_USAGE, "By", Some(false)); + assert_first_point_attr( + metrics, + metric::MEMORY_USAGE, + attr::SYSTEM_MEMORY_STATE, + "used", + ); + assert_metric_shape(metrics, metric::MEMORY_UTILIZATION, "1", None); + assert_metric_shape(metrics, metric::MEMORY_LINUX_AVAILABLE, "By", Some(false)); + assert_metric_shape(metrics, metric::MEMORY_LINUX_SLAB_USAGE, "By", Some(false)); + assert_metric_shape(metrics, metric::MEMORY_LIMIT, "By", Some(false)); + assert_metric_shape(metrics, metric::MEMORY_LINUX_SHARED, "By", Some(false)); + assert_metric_shape( + metrics, + metric::MEMORY_LINUX_HUGEPAGES_LIMIT, + 
"{page}", + Some(false), + ); + assert_metric_shape( + metrics, + metric::MEMORY_LINUX_HUGEPAGES_PAGE_SIZE, + "By", + Some(false), + ); + assert_metric_shape( + metrics, + metric::MEMORY_LINUX_HUGEPAGES_RESERVED, + "{page}", + Some(false), + ); + assert_metric_shape( + metrics, + metric::MEMORY_LINUX_HUGEPAGES_SURPLUS, + "{page}", + Some(false), + ); + assert_metric_shape( + metrics, + metric::MEMORY_LINUX_HUGEPAGES_USAGE, + "{page}", + Some(false), + ); + assert_first_point_attr( + metrics, + metric::MEMORY_LINUX_HUGEPAGES_USAGE, + attr::SYSTEM_MEMORY_LINUX_HUGEPAGES_STATE, + "used", + ); + assert_metric_shape( + metrics, + metric::MEMORY_LINUX_HUGEPAGES_UTILIZATION, + "1", + None, + ); + assert_metric_shape(metrics, metric::UPTIME, "s", None); + assert_metric_shape(metrics, metric::PAGING_FAULTS, "{fault}", Some(true)); + assert_first_point_attr( + metrics, + metric::PAGING_FAULTS, + attr::SYSTEM_PAGING_FAULT_TYPE, + "minor", + ); + assert_metric_shape( + metrics, + metric::PAGING_OPERATIONS, + "{operation}", + Some(true), + ); + assert_sum_point_attr( + metrics, + metric::PAGING_OPERATIONS, + attr::SYSTEM_PAGING_DIRECTION, + "in", + ); + assert_sum_point_attr( + metrics, + metric::PAGING_OPERATIONS, + attr::SYSTEM_PAGING_FAULT_TYPE, + "minor", + ); + assert_metric_shape(metrics, metric::PAGING_USAGE, "By", Some(false)); + assert_first_point_attr( + metrics, + metric::PAGING_USAGE, + attr::SYSTEM_DEVICE, + "/dev/swap", + ); + assert_metric_shape(metrics, metric::PAGING_UTILIZATION, "1", None); + assert_metric_shape(metrics, metric::PROCESS_COUNT, "{process}", Some(false)); + assert_sum_point_attr( + metrics, + metric::PROCESS_COUNT, + attr::PROCESS_STATE, + "running", + ); + assert_metric_shape(metrics, metric::PROCESS_CREATED, "{process}", Some(true)); + assert_metric_shape(metrics, metric::DISK_IO, "By", Some(true)); + assert_first_point_attr(metrics, metric::DISK_IO, attr::DISK_IO_DIRECTION, "read"); + assert_metric_shape(metrics, metric::DISK_OPERATIONS, 
"{operation}", Some(true)); + assert_metric_shape(metrics, metric::DISK_IO_TIME, "s", Some(true)); + assert_first_point_attr(metrics, metric::DISK_IO_TIME, attr::SYSTEM_DEVICE, "sda"); + assert_metric_shape(metrics, metric::DISK_OPERATION_TIME, "s", Some(true)); + assert_metric_shape(metrics, metric::DISK_MERGED, "{operation}", Some(true)); + assert_metric_shape(metrics, metric::DISK_LIMIT, "By", Some(false)); + assert_first_point_attr(metrics, metric::DISK_LIMIT, attr::SYSTEM_DEVICE, "sda"); + assert_metric_shape(metrics, metric::FILESYSTEM_USAGE, "By", Some(false)); + assert_first_point_attr( + metrics, + metric::FILESYSTEM_USAGE, + attr::SYSTEM_FILESYSTEM_STATE, + "used", + ); + assert_metric_shape(metrics, metric::FILESYSTEM_UTILIZATION, "1", None); + assert_metric_shape(metrics, metric::FILESYSTEM_LIMIT, "By", Some(false)); + assert_no_first_point_attr( + metrics, + metric::FILESYSTEM_LIMIT, + attr::SYSTEM_FILESYSTEM_STATE, + ); + assert_metric_shape(metrics, metric::NETWORK_IO, "By", Some(true)); + assert_first_point_attr( + metrics, + metric::NETWORK_IO, + attr::NETWORK_INTERFACE_NAME, + "eth0", + ); + assert_metric_shape( + metrics, + metric::NETWORK_PACKET_COUNT, + "{packet}", + Some(true), + ); + assert_first_point_attr( + metrics, + metric::NETWORK_PACKET_COUNT, + attr::SYSTEM_DEVICE, + "eth0", + ); + assert_metric_shape( + metrics, + metric::NETWORK_PACKET_DROPPED, + "{packet}", + Some(true), + ); + assert_first_point_attr( + metrics, + metric::NETWORK_PACKET_DROPPED, + attr::NETWORK_INTERFACE_NAME, + "eth0", + ); + assert_metric_shape(metrics, metric::NETWORK_ERRORS, "{error}", Some(true)); +} + +#[cfg(feature = "dev-tools")] +#[test] +#[ignore = "dev-only semconv drift check; may access a local or remote semantic-conventions registry"] +fn emitted_phase1_metric_shapes_match_weaver_semconv() { + let registry = load_semconv_registry(); + let semconv_shapes = semconv_system_metric_shapes(®istry); + let emitted_shapes = emitted_phase1_metric_shapes(); + + 
for (name, emitted) in emitted_shapes { + let semconv = semconv_shapes + .get(&name) + .unwrap_or_else(|| panic!("missing semconv metric {name}")); + + assert_eq!(emitted.unit, semconv.unit, "unit mismatch for {name}"); + assert_eq!( + emitted.monotonic, semconv.monotonic, + "instrument/temporality mismatch for {name}" + ); + assert_eq!( + emitted.value_type, semconv.value_type, + "metric value type mismatch for {name}" + ); + + for attr in &semconv.attributes { + assert!( + emitted.attributes.contains(attr), + "missing semconv attribute {attr} on {name}" + ); + } + for attr in &emitted.attributes { + assert!( + semconv.all_attributes.contains(attr), + "unexpected semconv attribute {attr} on {name}" + ); + } + for (attr, emitted_kind) in &emitted.attribute_types { + let Some(semconv_kind) = semconv.attribute_types.get(attr) else { + continue; + }; + assert_eq!( + emitted_kind, semconv_kind, + "attribute value type mismatch for {attr} on {name}" + ); + } + for (attr, values) in &emitted.enum_values { + let Some(allowed_values) = semconv.enum_values.get(attr) else { + continue; + }; + for value in values { + if is_intentional_semconv_enum_value_gap(name.as_str(), attr.as_str(), value) { + continue; + } + assert!( + allowed_values.contains(value), + "unexpected enum value {attr}={value} on {name}" + ); + } + } + } +} + +#[test] +fn projection_uses_counter_start_overrides_for_reset_series() { + let data = decode_metrics( + HostSnapshot { + now_unix_nano: 2_000, + start_time_unix_nano: 1_000, + counter_starts: CounterStarts { + entries: vec![(counter_key(metric::PROCESS_CREATED, ""), 1_500)], + }, + processes: Some(ProcessStats { + created: 99, + ..ProcessStats::default() + }), + ..HostSnapshot::default() + } + .into_otap_records() + .expect("encode ok"), + ); + + let metrics = &data.resource_metrics[0].scope_metrics[0].metrics; + assert_first_sum_point_start(metrics, metric::PROCESS_CREATED, 1_500); +} + +#[test] +fn counter_tracker_rebaselines_reset_series_only() { + 
let mut tracker = CounterTracker::default(); + let disks = vec![DiskStats { + name: "sda".to_owned(), + read_bytes: 100, + write_bytes: 200, + ..DiskStats::default() + }]; + let starts = tracker.snapshot(10, 20, None, None, None, Some(&disks), None); + + assert_eq!(starts.get_joined(metric::DISK_IO, "sda", "read", 10), 10); + assert_eq!(starts.get_joined(metric::DISK_IO, "sda", "write", 10), 10); + + let disks = vec![DiskStats { + name: "sda".to_owned(), + read_bytes: 50, + write_bytes: 250, + ..DiskStats::default() + }]; + let starts = tracker.snapshot(10, 30, None, None, None, Some(&disks), None); + + assert_eq!(starts.get_joined(metric::DISK_IO, "sda", "read", 10), 30); + assert_eq!(starts.get_joined(metric::DISK_IO, "sda", "write", 10), 10); +} + +#[test] +fn counter_tracker_rebaselines_paging_operations_by_direction_and_fault_type() { + let mut tracker = CounterTracker::default(); + let paging = PagingStats { + swap_in: 100, + swap_out: 200, + page_in: 300, + page_out: 400, + ..PagingStats::default() + }; + let starts = tracker.snapshot(10, 20, None, Some(&paging), None, None, None); + + assert_eq!( + starts.get_joined(metric::PAGING_OPERATIONS, "in", "major", 10), + 10 + ); + assert_eq!( + starts.get_joined(metric::PAGING_OPERATIONS, "out", "minor", 10), + 10 + ); + + let paging = PagingStats { + swap_in: 50, + swap_out: 250, + page_in: 350, + page_out: 450, + ..PagingStats::default() + }; + let starts = tracker.snapshot(10, 30, None, Some(&paging), None, None, None); + + assert_eq!( + starts.get_joined(metric::PAGING_OPERATIONS, "in", "major", 10), + 30 + ); + assert_eq!( + starts.get_joined(metric::PAGING_OPERATIONS, "out", "major", 10), + 10 + ); + assert_eq!( + starts.get_joined(metric::PAGING_OPERATIONS, "in", "minor", 10), + 10 + ); + assert_eq!( + starts.get_joined(metric::PAGING_OPERATIONS, "out", "minor", 10), + 10 + ); +} + +#[test] +fn counter_tracker_prunes_disappeared_disk_series_only_when_disk_is_scraped() { + let mut tracker = 
CounterTracker::default(); + let disks = vec![DiskStats { + name: "sda".to_owned(), + read_bytes: 100, + write_bytes: 200, + ..DiskStats::default() + }]; + let starts = tracker.snapshot(10, 20, None, None, None, Some(&disks), None); + assert_eq!(starts.get_joined(metric::DISK_IO, "sda", "read", 10), 10); + + let _ = tracker.snapshot(20, 30, None, None, None, None, None); + let disks = vec![DiskStats { + name: "sda".to_owned(), + read_bytes: 150, + write_bytes: 250, + ..DiskStats::default() + }]; + let starts = tracker.snapshot(30, 40, None, None, None, Some(&disks), None); + assert_eq!(starts.get_joined(metric::DISK_IO, "sda", "read", 30), 30); + + let empty_disks = Vec::new(); + let _ = tracker.snapshot(40, 50, None, None, None, Some(&empty_disks), None); + let disks = vec![DiskStats { + name: "sda".to_owned(), + read_bytes: 200, + write_bytes: 300, + ..DiskStats::default() + }]; + let starts = tracker.snapshot(50, 60, None, None, None, Some(&disks), None); + assert_eq!(starts.get_joined(metric::DISK_IO, "sda", "read", 50), 50); +} + +#[test] +fn counter_tracker_prunes_disappeared_network_series_only_when_network_is_scraped() { + let mut tracker = CounterTracker::default(); + let networks = vec![NetworkStats { + name: "veth0".to_owned(), + rx_bytes: 100, + tx_bytes: 200, + ..NetworkStats::default() + }]; + let starts = tracker.snapshot(10, 20, None, None, None, None, Some(&networks)); + assert_eq!( + starts.get_joined(metric::NETWORK_IO, "veth0", "receive", 10), + 10 + ); + + let _ = tracker.snapshot(20, 30, None, None, None, None, None); + let networks = vec![NetworkStats { + name: "veth0".to_owned(), + rx_bytes: 150, + tx_bytes: 250, + ..NetworkStats::default() + }]; + let starts = tracker.snapshot(30, 40, None, None, None, None, Some(&networks)); + assert_eq!( + starts.get_joined(metric::NETWORK_IO, "veth0", "receive", 30), + 30 + ); + + let empty_networks = Vec::new(); + let _ = tracker.snapshot(40, 50, None, None, None, None, Some(&empty_networks)); + let 
networks = vec![NetworkStats { + name: "veth0".to_owned(), + rx_bytes: 200, + tx_bytes: 300, + ..NetworkStats::default() + }]; + let starts = tracker.snapshot(50, 60, None, None, None, None, Some(&networks)); + assert_eq!( + starts.get_joined(metric::NETWORK_IO, "veth0", "receive", 50), + 50 + ); +} + +#[test] +fn counter_keys_do_not_collide_with_pipe_in_series_values() { + let metric = metric::DISK_IO; + let device = "read|write"; + let joined = counter_key_joined(metric, device, "read"); + assert!(!counter_key_matches_joined( + &joined, + metric, + "read", + "write|read" + )); + assert!(counter_key_matches_joined(&joined, metric, device, "read")); +} + +#[test] +fn scrape_due_emits_successful_families_after_partial_read_error() { + let root = tempfile::tempdir().expect("tempdir"); + let proc = root.path().join("proc"); + std::fs::create_dir(&proc).expect("proc dir"); + std::fs::write( + proc.join("meminfo"), + "MemTotal: 1000 kB\nMemFree: 100 kB\nMemAvailable: 200 kB\n", + ) + .expect("meminfo"); + // Cumulative metrics read /proc/stat once to cache boot time. Provide + // btime here so this test only exercises the missing diskstats error. 
+ std::fs::write(proc.join("stat"), "btime 1700000000\n").expect("stat"); + let mut source = ProcfsSource::new( + Some(root.path()), + ProcfsConfig { + cpu: false, + memory: true, + paging: false, + system: false, + disk: true, + filesystem: false, + network: false, + processes: false, + cpu_utilization: false, + memory_limit: false, + memory_shared: false, + memory_hugepages: false, + disk_limit: false, + filesystem_include_virtual: false, + filesystem_limit: false, + filesystem_include_devices: None, + filesystem_exclude_devices: None, + filesystem_include_fs_types: None, + filesystem_exclude_fs_types: None, + filesystem_include_mount_points: None, + filesystem_exclude_mount_points: None, + disk_include: None, + disk_exclude: None, + network_include: None, + network_exclude: None, + validation: HostViewValidationMode::None, + }, + ) + .expect("source"); + + let scrape = source + .scrape_due(ProcfsFamilies { + memory: true, + disk: true, + ..ProcfsFamilies::default() + }) + .expect("partial scrape"); + + assert_eq!(scrape.partial_errors, 1); + assert!(scrape.snapshot.memory.is_some()); + assert!(scrape.snapshot.disks.is_empty()); +} + +#[test] +fn scrape_due_preserves_disk_counter_state_after_diskstats_read_error() { + let root = tempfile::tempdir().expect("tempdir"); + let proc = root.path().join("proc"); + std::fs::create_dir_all(&proc).expect("proc dir"); + std::fs::write(proc.join("stat"), "btime 1700000000\n").expect("stat"); + std::fs::write( + proc.join("meminfo"), + "MemTotal: 1000 kB\nMemFree: 100 kB\nMemAvailable: 200 kB\n", + ) + .expect("meminfo"); + std::fs::write( + proc.join("diskstats"), + "8 0 sda 1 0 100 0 2 0 200 0 0 0 0 0 0 0 0\n", + ) + .expect("diskstats"); + let mut source = ProcfsSource::new( + Some(root.path()), + ProcfsConfig { + cpu: false, + memory: true, + paging: false, + system: false, + disk: true, + filesystem: false, + network: false, + processes: false, + cpu_utilization: false, + memory_limit: false, + memory_shared: false, + 
memory_hugepages: false, + disk_limit: false, + filesystem_include_virtual: false, + filesystem_limit: false, + filesystem_include_devices: None, + filesystem_exclude_devices: None, + filesystem_include_fs_types: None, + filesystem_exclude_fs_types: None, + filesystem_include_mount_points: None, + filesystem_exclude_mount_points: None, + disk_include: None, + disk_exclude: None, + network_include: None, + network_exclude: None, + validation: HostViewValidationMode::None, + }, + ) + .expect("source"); + + let first = source + .scrape_due(ProcfsFamilies { + disk: true, + ..ProcfsFamilies::default() + }) + .expect("first disk scrape"); + let first_start = first + .snapshot + .counter_starts + .get_joined(metric::DISK_IO, "sda", "read", 0); + + std::fs::remove_file(proc.join("diskstats")).expect("remove diskstats"); + let partial = source + .scrape_due(ProcfsFamilies { + memory: true, + disk: true, + ..ProcfsFamilies::default() + }) + .expect("partial scrape"); + assert_eq!(partial.partial_errors, 1); + assert!(partial.snapshot.disks.is_empty()); + + std::fs::write( + proc.join("diskstats"), + "8 0 sda 1 0 50 0 2 0 100 0 0 0 0 0 0 0 0\n", + ) + .expect("diskstats after reset"); + let after_error = source + .scrape_due(ProcfsFamilies { + disk: true, + ..ProcfsFamilies::default() + }) + .expect("disk scrape after read error"); + let reset_start = + after_error + .snapshot + .counter_starts + .get_joined(metric::DISK_IO, "sda", "read", 0); + assert!( + reset_start > first_start, + "disk counter state should survive read errors so the later reset is detected" + ); +} + +#[test] +fn filesystem_stat_worker_reports_disconnect_as_broken_pipe() { + let worker = FilesystemStatWorker::disconnected_for_test(); + match worker.statvfs(PathBuf::from("/"), Duration::from_millis(1)) { + Ok(_) => panic!("worker is disconnected"), + Err(err) => assert_eq!(err.kind(), ErrorKind::BrokenPipe), + } +} + +#[test] +fn scrape_due_fails_when_all_due_families_fail() { + let root = 
tempfile::tempdir().expect("tempdir"); + let mut source = ProcfsSource::new( + Some(root.path()), + ProcfsConfig { + cpu: false, + memory: true, + paging: false, + system: false, + disk: false, + filesystem: false, + network: false, + processes: false, + cpu_utilization: false, + memory_limit: false, + memory_shared: false, + memory_hugepages: false, + disk_limit: false, + filesystem_include_virtual: false, + filesystem_limit: false, + filesystem_include_devices: None, + filesystem_exclude_devices: None, + filesystem_include_fs_types: None, + filesystem_exclude_fs_types: None, + filesystem_include_mount_points: None, + filesystem_exclude_mount_points: None, + disk_include: None, + disk_exclude: None, + network_include: None, + network_exclude: None, + validation: HostViewValidationMode::None, + }, + ) + .expect("source"); + + assert!( + source + .scrape_due(ProcfsFamilies { + memory: true, + ..ProcfsFamilies::default() + }) + .is_err() + ); +} + +#[test] +fn scrape_due_reads_opt_in_disk_limit_from_sysfs() { + let root = tempfile::tempdir().expect("tempdir"); + let proc = root.path().join("proc"); + let sys_sda = root.path().join("sys/block/sda"); + std::fs::create_dir(&proc).expect("proc dir"); + std::fs::create_dir_all(&sys_sda).expect("sys block dir"); + std::fs::write( + proc.join("diskstats"), + "8 0 sda 1 0 2 3 4 0 5 6 0 0 0 0 0 0 0 0\n", + ) + .expect("diskstats"); + std::fs::write(sys_sda.join("size"), "4096\n").expect("disk size"); + let mut source = ProcfsSource::new( + Some(root.path()), + ProcfsConfig { + cpu: false, + memory: false, + paging: false, + system: false, + disk: true, + filesystem: false, + network: false, + processes: false, + cpu_utilization: false, + memory_limit: false, + memory_shared: false, + memory_hugepages: false, + disk_limit: true, + filesystem_include_virtual: false, + filesystem_limit: false, + filesystem_include_devices: None, + filesystem_exclude_devices: None, + filesystem_include_fs_types: None, + 
filesystem_exclude_fs_types: None, + filesystem_include_mount_points: None, + filesystem_exclude_mount_points: None, + disk_include: None, + disk_exclude: None, + network_include: None, + network_exclude: None, + validation: HostViewValidationMode::None, + }, + ) + .expect("source"); + + let scrape = source + .scrape_due(ProcfsFamilies { + disk: true, + ..ProcfsFamilies::default() + }) + .expect("disk scrape"); + + assert_eq!(scrape.snapshot.disks.len(), 1); + assert_eq!( + scrape.snapshot.disks[0].limit_bytes, + Some(4096 * DISKSTAT_SECTOR_BYTES) + ); +} + +#[test] +fn scrape_due_uses_boot_time_for_counter_only_family_ticks() { + let root = tempfile::tempdir().expect("tempdir"); + let proc = root.path().join("proc"); + let proc_one = proc.join("1"); + std::fs::create_dir_all(proc_one.join("net")).expect("proc dirs"); + std::fs::write(proc.join("stat"), "btime 123\n").expect("stat"); + std::fs::write( + proc.join("diskstats"), + "8 0 sda 1 0 2 3 4 0 5 6 0 0 0 0 0 0 0 0\n", + ) + .expect("diskstats"); + std::fs::write( + proc_one.join("net/dev"), + "Inter-| Receive | Transmit\n\ + face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n\ + eth0: 10 1 0 0 0 0 0 0 20 2 0 0 0 0 0 0\n", + ) + .expect("netdev"); + std::fs::write( + proc.join("vmstat"), + "pgfault 10\npgmajfault 1\npgpgin 2\npgpgout 3\npswpin 4\npswpout 5\n", + ) + .expect("vmstat"); + std::fs::write(proc.join("swaps"), "Filename Type Size Used Priority\n").expect("swaps"); + + let mut source = ProcfsSource::new( + Some(root.path()), + ProcfsConfig { + cpu: false, + memory: false, + paging: true, + system: false, + disk: true, + filesystem: false, + network: true, + processes: false, + cpu_utilization: false, + memory_limit: false, + memory_shared: false, + memory_hugepages: false, + disk_limit: false, + filesystem_include_virtual: false, + filesystem_limit: false, + filesystem_include_devices: None, + filesystem_exclude_devices: None, + 
filesystem_include_fs_types: None, + filesystem_exclude_fs_types: None, + filesystem_include_mount_points: None, + filesystem_exclude_mount_points: None, + disk_include: None, + disk_exclude: None, + network_include: None, + network_exclude: None, + validation: HostViewValidationMode::None, + }, + ) + .expect("source"); + + let expected_start = 123 * NANOS_PER_SEC; + let disk_scrape = source + .scrape_due(ProcfsFamilies { + disk: true, + ..ProcfsFamilies::default() + }) + .expect("disk scrape"); + assert_eq!(disk_scrape.snapshot.start_time_unix_nano, expected_start); + assert_eq!(disk_scrape.snapshot.disks.len(), 1); + + std::fs::remove_file(proc.join("stat")).expect("remove stat after cache"); + + let network_scrape = source + .scrape_due(ProcfsFamilies { + network: true, + ..ProcfsFamilies::default() + }) + .expect("network scrape"); + assert_eq!(network_scrape.snapshot.start_time_unix_nano, expected_start); + assert_eq!(network_scrape.snapshot.networks.len(), 1); + + let paging_scrape = source + .scrape_due(ProcfsFamilies { + paging: true, + ..ProcfsFamilies::default() + }) + .expect("paging scrape"); + assert_eq!(paging_scrape.snapshot.start_time_unix_nano, expected_start); + assert!(paging_scrape.snapshot.paging.is_some()); +} + +#[test] +fn scrape_due_reads_filesystem_usage_from_mountinfo() { + let root = tempfile::tempdir().expect("tempdir"); + let proc_one = root.path().join("proc/1"); + std::fs::create_dir_all(&proc_one).expect("proc one dir"); + std::fs::write( + proc_one.join("mountinfo"), + "36 25 8:1 / / rw,relatime - ext4 /dev/sda1 rw\n", + ) + .expect("mountinfo"); + let mut source = ProcfsSource::new( + Some(root.path()), + ProcfsConfig { + cpu: false, + memory: false, + paging: false, + system: false, + disk: false, + filesystem: true, + network: false, + processes: false, + cpu_utilization: false, + memory_limit: false, + memory_shared: false, + memory_hugepages: false, + disk_limit: false, + filesystem_include_virtual: false, + filesystem_limit: 
true, + filesystem_include_devices: None, + filesystem_exclude_devices: None, + filesystem_include_fs_types: None, + filesystem_exclude_fs_types: None, + filesystem_include_mount_points: None, + filesystem_exclude_mount_points: None, + disk_include: None, + disk_exclude: None, + network_include: None, + network_exclude: None, + validation: HostViewValidationMode::None, + }, + ) + .expect("source"); + + let scrape = source + .scrape_due(ProcfsFamilies { + filesystem: true, + ..ProcfsFamilies::default() + }) + .expect("filesystem scrape"); + + assert_eq!(scrape.snapshot.filesystems.len(), 1); + assert_eq!(scrape.snapshot.filesystems[0].device, "/dev/sda1"); + assert_eq!(scrape.snapshot.filesystems[0].mountpoint, "/"); + assert_eq!(scrape.snapshot.filesystems[0].fs_type, "ext4"); + assert!(scrape.snapshot.filesystems[0].limit_bytes.is_some()); +} + +#[test] +fn cpu_parser_accepts_missing_newer_fields() { + let cpu = parse_cpu_total("10 20 30 40", 10.0).expect("cpu row"); + assert_eq!(cpu.user, 1.0); + assert_eq!(cpu.nice, 2.0); + assert_eq!(cpu.system, 3.0); + assert_eq!(cpu.idle, 4.0); + assert_eq!(cpu.steal, 0.0); +} + +#[test] +fn cpu_parser_removes_guest_from_user_and_nice() { + let cpu = parse_cpu_total("100 50 30 40 5 2 3 7 10 4", 10.0).expect("cpu row"); + assert_eq!(cpu.user, 9.0); + assert_eq!(cpu.nice, 4.6); + assert_eq!(cpu.interrupt, 0.5); +} + +#[test] +fn cpu_utilization_uses_counter_deltas() { + let utilization = cpu_utilization( + CpuTimes { + user: 1.0, + idle: 1.0, + ..CpuTimes::default() + }, + CpuTimes { + user: 3.0, + idle: 2.0, + ..CpuTimes::default() + }, + ) + .expect("utilization"); + + assert_eq!(utilization.user, 2.0 / 3.0); + assert_eq!(utilization.idle, 1.0 / 3.0); +} + +#[test] +fn cpu_utilization_skips_counter_resets() { + assert!( + cpu_utilization( + CpuTimes { + user: 2.0, + ..CpuTimes::default() + }, + CpuTimes { + user: 1.0, + ..CpuTimes::default() + }, + ) + .is_none() + ); +} + +#[test] +fn 
clock_ticks_per_second_uses_positive_system_value() { + assert!(clock_ticks_per_second() > 0.0); +} + +#[test] +fn memavailable_fallback_uses_free_buffers_cached() { + let memory = + parse_meminfo("MemTotal: 1000 kB\nMemFree: 100 kB\nBuffers: 20 kB\nCached: 30 kB\n") + .expect("memory"); + assert!(!memory.has_available); + assert_eq!(memory.available, 150 * BYTES_PER_KIB); + assert_eq!(memory.used, 850 * BYTES_PER_KIB); +} + +#[test] +fn meminfo_parser_reads_shared_memory() { + let memory = + parse_meminfo("MemTotal: 1000 kB\nMemFree: 100 kB\nShmem: 12 kB\n").expect("memory"); + assert_eq!(memory.shared, 12 * BYTES_PER_KIB); +} + +#[test] +fn meminfo_parser_reads_hugepage_stats() { + let memory = parse_meminfo( + "MemTotal: 1000 kB\n\ + MemFree: 100 kB\n\ + HugePages_Total: 8\n\ + HugePages_Free: 3\n\ + HugePages_Rsvd: 2\n\ + HugePages_Surp: 1\n\ + Hugepagesize: 2048 kB\n", + ) + .expect("memory"); + + assert_eq!(memory.hugepages.total, 8); + assert_eq!(memory.hugepages.free, 3); + assert_eq!(memory.hugepages.reserved, 2); + assert_eq!(memory.hugepages.surplus, 1); + assert_eq!(memory.hugepages.page_size_bytes, 2048 * BYTES_PER_KIB); +} + +#[test] +fn uptime_parser_reads_first_field() { + assert_eq!(parse_uptime("123.45 67.89"), Some(123.45)); +} + +#[test] +fn vmstat_parser_derives_minor_faults() { + let paging = + parse_vmstat("pgfault 100\npgmajfault 7\npgpgin 5\npgpgout 6\npswpin 3\npswpout 4\n"); + assert_eq!(paging.minor_faults, 93); + assert_eq!(paging.major_faults, 7); + assert_eq!(paging.page_in, 5); + assert_eq!(paging.page_out, 6); + assert_eq!(paging.swap_in, 3); + assert_eq!(paging.swap_out, 4); +} + +#[test] +fn swaps_parser_reads_device_usage() { + let swaps = parse_swaps("Filename Type Size Used Priority\n/dev/sda2 partition 200 50 -2\n"); + assert_eq!(swaps.len(), 1); + assert_eq!(swaps[0].name, "/dev/sda2"); + assert_eq!(swaps[0].used, 50 * BYTES_PER_KIB); + assert_eq!(swaps[0].free, 150 * BYTES_PER_KIB); +} + +#[test] +fn 
diskstats_parser_accepts_flush_columns() { + let disks = parse_diskstats("8 0 sda 1 0 2 3 4 0 5 6 0 0 0 0 0 0 0 0\n", None, None); + assert_eq!(disks.len(), 1); + assert_eq!(disks[0].name, "sda"); + assert_eq!(disks[0].read_bytes, 1024); + assert_eq!(disks[0].write_bytes, 2560); +} + +#[test] +fn diskstats_parser_applies_filters_before_parsing_values() { + let exclude = CompiledFilter::compile( + crate::receivers::host_metrics_receiver::MatchType::Glob, + vec!["loop*".to_owned()], + ) + .expect("valid") + .expect("filter"); + let disks = parse_diskstats( + "7 0 loop0 broken row\n8 0 sda 1 0 2 3 4 0 5 6 0 0 0 0 0 0 0 0\n", + None, + Some(&exclude), + ); + + assert_eq!(disks.len(), 1); + assert_eq!(disks[0].name, "sda"); +} + +#[test] +fn mountinfo_parser_skips_virtual_filesystems_by_default() { + let mounts = parse_mountinfo( + "36 25 8:1 / / rw,relatime - ext4 /dev/sda1 rw\n\ + 37 25 0:32 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw\n\ + 38 25 0:33 / /mnt/fuse rw,relatime - fuse.sshfs sshfs rw\n\ + 39 25 0:34 / /mnt/fuseblk rw,relatime - fuseblk /dev/fuse rw\n\ + 40 25 0:35 / /mnt/nfs rw,relatime - nfs server:/export rw\n", + false, + true, + FilesystemFilters::default(), + ); + + assert_eq!(mounts.len(), 1); + assert_eq!(mounts[0].device, "/dev/sda1"); + assert_eq!(mounts[0].mountpoint, "/"); + assert_eq!(mounts[0].fs_type, "ext4"); + assert_eq!(mounts[0].mode, "rw"); + assert!(mounts[0].emit_limit); +} + +#[test] +fn mountinfo_parser_unescapes_paths() { + let mounts = parse_mountinfo( + "36 25 8:1 / /mnt/data\\040disk rw,relatime - ext4 /dev/disk\\040one rw\n", + false, + false, + FilesystemFilters::default(), + ); + + assert_eq!(mounts.len(), 1); + assert_eq!(mounts[0].device, "/dev/disk one"); + assert_eq!(mounts[0].mountpoint, "/mnt/data disk"); +} + +#[test] +fn mountinfo_parser_preserves_utf8_while_unescaping_paths() { + let mounts = parse_mountinfo( + "36 25 8:1 / /mnt/caf\u{00e9}\\040disk rw,relatime - ext4 /dev/disk\\040\u{00e9} rw\n", + false, 
+ false, + FilesystemFilters::default(), + ); + + assert_eq!(mounts.len(), 1); + assert_eq!(mounts[0].device, "/dev/disk \u{00e9}"); + assert_eq!(mounts[0].mountpoint, "/mnt/caf\u{00e9} disk"); +} + +#[test] +fn mountinfo_parser_applies_filesystem_filters() { + let include_mounts = CompiledFilter::compile( + crate::receivers::host_metrics_receiver::MatchType::Glob, + vec!["/data*".to_owned()], + ) + .expect("valid") + .expect("filter"); + let exclude_fs_types = CompiledFilter::compile( + crate::receivers::host_metrics_receiver::MatchType::Strict, + vec!["xfs".to_owned()], + ) + .expect("valid") + .expect("filter"); + let mounts = parse_mountinfo( + "36 25 8:1 / / rw,relatime - ext4 /dev/sda1 rw\n37 25 8:2 / /data rw,relatime - ext4 /dev/sdb1 rw\n38 25 8:3 / /data2 rw,relatime - xfs /dev/sdc1 rw\n", + false, + false, + FilesystemFilters { + include_mount_points: Some(&include_mounts), + exclude_fs_types: Some(&exclude_fs_types), + ..FilesystemFilters::default() + }, + ); + + assert_eq!(mounts.len(), 1); + assert_eq!(mounts[0].device, "/dev/sdb1"); + assert_eq!(mounts[0].mountpoint, "/data"); +} + +#[test] +fn netdev_parser_reads_device_counters() { + let interfaces = parse_netdev( + "Inter-| Receive | Transmit\n face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n eth0: 10 2 0 0 0 0 0 0 30 4 0 0 0 0 0 0\n", + None, + None, + ); + assert_eq!(interfaces.len(), 1); + assert_eq!(interfaces[0].name, "eth0"); + assert_eq!(interfaces[0].rx_bytes, 10); + assert_eq!(interfaces[0].tx_packets, 4); +} + +#[test] +fn netdev_parser_applies_interface_filters() { + let include = CompiledFilter::compile( + crate::receivers::host_metrics_receiver::MatchType::Strict, + vec!["eth0".to_owned()], + ) + .expect("valid") + .expect("filter"); + let interfaces = parse_netdev( + "Inter-| Receive | Transmit\n face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n lo: 
1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0\n eth0: 10 2 3 4 0 0 0 0 30 4 5 6 0 0 0 0\n", + Some(&include), + None, + ); + + assert_eq!(interfaces.len(), 1); + assert_eq!(interfaces[0].name, "eth0"); + assert_eq!(interfaces[0].rx_errors, 3); + assert_eq!(interfaces[0].tx_dropped, 6); +} + +#[test] +fn root_path_uses_host_pid_one_netdev() { + let paths = ProcfsPaths::new(Some(Path::new("/host"))); + assert_eq!(paths.net_dev, PathBuf::from("/host/proc/1/net/dev")); + assert_eq!(paths.mountinfo, PathBuf::from("/host/proc/1/mountinfo")); +} + +#[test] +fn root_slash_uses_current_proc_netdev() { + let paths = ProcfsPaths::new(Some(Path::new("/"))); + assert_eq!(paths.net_dev, PathBuf::from("/proc/net/dev")); + assert_eq!(paths.mountinfo, PathBuf::from("/proc/self/mountinfo")); +} + +#[test] +fn host_arch_uses_semconv_values() { + if let Some(arch) = host_arch() { + assert!(matches!( + arch, + "amd64" | "arm32" | "arm64" | "ppc32" | "ppc64" | "x86" + )); + } +} + +#[cfg(feature = "dev-tools")] +#[derive(Debug)] +struct MetricShape { + unit: String, + monotonic: Option, + attributes: BTreeSet, + all_attributes: BTreeSet, + attribute_types: BTreeMap, + enum_values: BTreeMap>, + value_type: Option, +} + +#[cfg(feature = "dev-tools")] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum MetricValueKind { + Int, + Double, +} + +#[cfg(feature = "dev-tools")] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum AttributeValueKind { + Int, + Double, + String, + Bool, +} + +#[cfg(feature = "dev-tools")] +fn load_semconv_registry() -> ResolvedRegistry { + let registry_path = std::env::var("OTAP_HOST_METRICS_SEMCONV_REGISTRY") + .map(|path| { + path.parse::() + .expect("valid OTAP_HOST_METRICS_SEMCONV_REGISTRY") + }) + .unwrap_or_else(|_| VirtualDirectoryPath::GitRepo { + url: "https://github.com/open-telemetry/semantic-conventions.git".to_owned(), + sub_folder: Some("model".to_owned()), + refspec: Some(format!( + "v{}", + crate::receivers::host_metrics_receiver::semconv::VERSION + )), + }); + + 
let registry_repo = + RegistryRepo::try_new("main", ®istry_path).expect("semantic convention registry"); + let registry = match SchemaResolver::load_semconv_repository(registry_repo, false) { + WResult::Ok(registry) | WResult::OkWithNFEs(registry, _) => registry, + WResult::FatalErr(err) => panic!("failed to load semantic convention registry: {err}"), + }; + let resolved_schema = match SchemaResolver::resolve(registry, true) { + WResult::Ok(schema) | WResult::OkWithNFEs(schema, _) => schema, + WResult::FatalErr(err) => { + panic!("failed to resolve semantic convention registry: {err}"); + } + }; + + ResolvedRegistry::try_from_resolved_registry( + &resolved_schema.registry, + resolved_schema.catalog(), + ) + .expect("resolved semantic convention registry") +} + +#[cfg(feature = "dev-tools")] +fn semconv_system_metric_shapes(registry: &ResolvedRegistry) -> BTreeMap { + registry + .groups + .iter() + .filter(|group| group.r#type == GroupType::Metric) + .filter_map(|group| { + let name = group.metric_name.as_ref()?; + if !name.starts_with("system.") { + return None; + } + + let monotonic = match group.instrument.as_ref()? 
{ + InstrumentSpec::Counter => Some(true), + InstrumentSpec::UpDownCounter => Some(false), + InstrumentSpec::Gauge | InstrumentSpec::Histogram => None, + }; + let attributes = group + .attributes + .iter() + .filter(|attr| !is_opt_in_requirement(&attr.requirement_level)) + .map(|attr| attr.name.clone()) + .collect(); + let all_attributes = group + .attributes + .iter() + .map(|attr| attr.name.clone()) + .collect(); + let enum_values = group + .attributes + .iter() + .filter_map(|attr| match &attr.r#type { + AttributeType::Enum { members } => Some(( + attr.name.clone(), + members + .iter() + .map(|member| value_spec_string(&member.value)) + .collect(), + )), + _ => None, + }) + .collect(); + let attribute_types = group + .attributes + .iter() + .filter_map(|attr| { + attribute_value_kind(&attr.r#type).map(|kind| (attr.name.clone(), kind)) + }) + .collect(); + + Some(( + name.clone(), + MetricShape { + unit: group.unit.clone().unwrap_or_default(), + monotonic, + attributes, + all_attributes, + attribute_types, + enum_values, + value_type: semconv_metric_value_type(group.annotations.as_ref()), + }, + )) + }) + .collect() +} + +#[cfg(feature = "dev-tools")] +fn semconv_metric_value_type( + annotations: Option<&BTreeMap>, +) -> Option { + let code_generation = annotations?.get("code_generation")?.0.as_mapping()?; + let value_type = code_generation.iter().find_map(|(key, value)| { + (key.as_str() == Some("metric_value_type")).then(|| value.as_str())? 
+ })?; + match value_type { + "int" => Some(MetricValueKind::Int), + "double" => Some(MetricValueKind::Double), + _ => None, + } +} + +#[cfg(feature = "dev-tools")] +fn value_spec_string(value: &ValueSpec) -> String { + match value { + ValueSpec::Int(value) => value.to_string(), + ValueSpec::Double(value) => value.to_string(), + ValueSpec::String(value) => value.clone(), + ValueSpec::Bool(value) => value.to_string(), + } +} + +#[cfg(feature = "dev-tools")] +fn attribute_value_kind(attribute_type: &AttributeType) -> Option { + match attribute_type { + AttributeType::PrimitiveOrArray(PrimitiveOrArrayTypeSpec::Int) => { + Some(AttributeValueKind::Int) + } + AttributeType::PrimitiveOrArray(PrimitiveOrArrayTypeSpec::Double) => { + Some(AttributeValueKind::Double) + } + AttributeType::PrimitiveOrArray(PrimitiveOrArrayTypeSpec::String) => { + Some(AttributeValueKind::String) + } + AttributeType::PrimitiveOrArray(PrimitiveOrArrayTypeSpec::Boolean) => { + Some(AttributeValueKind::Bool) + } + AttributeType::Enum { members } => { + members.first().map(|member| value_spec_kind(&member.value)) + } + _ => None, + } +} + +#[cfg(feature = "dev-tools")] +fn value_spec_kind(value: &ValueSpec) -> AttributeValueKind { + match value { + ValueSpec::Int(_) => AttributeValueKind::Int, + ValueSpec::Double(_) => AttributeValueKind::Double, + ValueSpec::String(_) => AttributeValueKind::String, + ValueSpec::Bool(_) => AttributeValueKind::Bool, + } +} + +#[cfg(feature = "dev-tools")] +fn is_intentional_semconv_enum_value_gap(_name: &str, _attr: &str, _value: &str) -> bool { + false +} + +#[cfg(feature = "dev-tools")] +fn is_opt_in_requirement(requirement_level: &RequirementLevel) -> bool { + matches!( + requirement_level, + RequirementLevel::Basic(BasicRequirementLevelSpec::OptIn) | RequirementLevel::OptIn { .. 
} + ) +} + +#[cfg(feature = "dev-tools")] +fn emitted_phase1_metric_shapes() -> BTreeMap { + let metrics = projection_fixture_metrics(); + let mut shapes = BTreeMap::new(); + for metric in &metrics { + let (monotonic, points) = match metric.data.as_ref().expect("metric data") { + otlp_metric::Data::Sum(sum) => (Some(sum.is_monotonic), &sum.data_points), + otlp_metric::Data::Gauge(gauge) => (None, &gauge.data_points), + _ => panic!("unsupported metric data for {}", metric.name), + }; + let value_type = metric_value_type(points); + let shape = shapes + .entry(metric.name.clone()) + .or_insert_with(|| MetricShape { + unit: metric.unit.clone(), + monotonic, + attributes: BTreeSet::new(), + all_attributes: BTreeSet::new(), + attribute_types: BTreeMap::new(), + enum_values: BTreeMap::new(), + value_type, + }); + assert_eq!( + shape.unit, metric.unit, + "unit mismatch across {}", + metric.name + ); + assert_eq!( + shape.monotonic, monotonic, + "instrument/temporality mismatch across {}", + metric.name + ); + assert_eq!( + shape.value_type, value_type, + "value type mismatch across {}", + metric.name + ); + for attr in points.iter().flat_map(|point| point.attributes.iter()) { + let _ = shape.attributes.insert(attr.key.clone()); + if let Some(value) = any_value_string(attr.value.as_ref()) { + let _ = shape + .enum_values + .entry(attr.key.clone()) + .or_default() + .insert(value); + } + if let Some(kind) = any_value_kind(attr.value.as_ref()) { + let previous = shape.attribute_types.insert(attr.key.clone(), kind); + assert!( + previous.is_none() || previous == Some(kind), + "mixed attribute value types for {} on {}", + attr.key, + metric.name + ); + } + } + } + shapes +} + +#[cfg(feature = "dev-tools")] +fn metric_value_type(points: &[NumberDataPoint]) -> Option { + let mut value_type = None; + for point in points { + let point_value_type = match point.value { + Some(number_data_point::Value::AsInt(_)) => MetricValueKind::Int, + Some(number_data_point::Value::AsDouble(_)) => 
MetricValueKind::Double, + None => continue, + }; + if value_type + .replace(point_value_type) + .is_some_and(|current| current != point_value_type) + { + panic!("mixed int/double data points"); + } + } + value_type +} + +#[cfg(feature = "dev-tools")] +fn any_value_string(value: Option<&AnyValue>) -> Option { + match value?.value.as_ref()? { + any_value::Value::StringValue(value) => Some(value.clone()), + any_value::Value::IntValue(value) => Some(value.to_string()), + any_value::Value::DoubleValue(value) => Some(value.to_string()), + any_value::Value::BoolValue(value) => Some(value.to_string()), + _ => None, + } +} + +#[cfg(feature = "dev-tools")] +fn any_value_kind(value: Option<&AnyValue>) -> Option { + match value?.value.as_ref()? { + any_value::Value::StringValue(_) => Some(AttributeValueKind::String), + any_value::Value::IntValue(_) => Some(AttributeValueKind::Int), + any_value::Value::DoubleValue(_) => Some(AttributeValueKind::Double), + any_value::Value::BoolValue(_) => Some(AttributeValueKind::Bool), + _ => None, + } +} + +fn projection_fixture_request() -> MetricsData { + decode_metrics( + HostSnapshot { + now_unix_nano: 2_000, + start_time_unix_nano: 1_000, + counter_starts: CounterStarts::default(), + memory_limit: true, + memory_shared: true, + memory_hugepages: true, + cpu: Some(CpuTimes { + user: 1.0, + nice: 2.0, + system: 3.0, + idle: 4.0, + wait: 5.0, + interrupt: 6.0, + steal: 7.0, + }), + cpu_utilization: Some(CpuTimes { + user: 0.1, + nice: 0.1, + system: 0.2, + idle: 0.3, + wait: 0.1, + interrupt: 0.1, + steal: 0.1, + }), + cpuinfo: CpuInfo { + logical_count: 2, + physical_count: 1, + frequencies_hz: vec![2_400_000_000.0], + }, + memory: Some(MemoryStats { + total: 100, + used: 80, + free: 10, + available: 20, + has_available: true, + cached: 5, + buffered: 5, + shared: 7, + slab_reclaimable: 3, + slab_unreclaimable: 2, + hugepages: HugepageStats { + total: 10, + free: 4, + reserved: 2, + surplus: 1, + page_size_bytes: 2 * BYTES_PER_KIB, + }, + 
}), + uptime_seconds: Some(42.0), + paging: Some(PagingStats { + minor_faults: 9, + major_faults: 1, + page_in: 4, + page_out: 5, + swap_in: 2, + swap_out: 3, + }), + swaps: vec![SwapStats { + name: "/dev/swap".to_owned(), + size: 100, + used: 25, + free: 75, + }], + processes: Some(ProcessStats { + running: 4, + blocked: 1, + created: 99, + }), + disks: vec![DiskStats { + name: "sda".to_owned(), + limit_bytes: Some(123), + read_bytes: 10, + write_bytes: 20, + read_ops: 1, + write_ops: 2, + read_merged: 3, + write_merged: 4, + read_time_seconds: 0.5, + write_time_seconds: 0.6, + io_time_seconds: 0.7, + }], + filesystems: vec![FilesystemStats { + device: "/dev/sda1".to_owned(), + mountpoint: "/".to_owned(), + fs_type: "ext4".to_owned(), + mode: "rw", + used: 60, + free: 30, + reserved: 10, + limit_bytes: Some(100), + }], + networks: vec![NetworkStats { + name: "eth0".to_owned(), + rx_bytes: 10, + tx_bytes: 20, + rx_packets: 1, + tx_packets: 2, + rx_errors: 3, + tx_errors: 4, + rx_dropped: 5, + tx_dropped: 6, + }], + resource: HostResource { + host_id: Some("host-id".to_owned()), + host_name: Some("host-name".to_owned()), + host_arch: Some("amd64"), + }, + } + .into_otap_records() + .expect("encode ok"), + ) +} + +#[cfg(feature = "dev-tools")] +fn projection_fixture_metrics() -> Vec { + projection_fixture_request() + .resource_metrics + .into_iter() + .next() + .expect("resource metrics") + .scope_metrics + .into_iter() + .next() + .expect("scope metrics") + .metrics +} + +fn assert_metric_shape( + metrics: &[Metric], + name: &'static str, + unit: &'static str, + monotonic_sum: Option, +) { + let metric = metric_by_name(metrics, name); + assert_eq!(metric.unit, unit); + match metric.data.as_ref().expect("metric data") { + otlp_metric::Data::Sum(sum) => { + let expected_monotonic = + monotonic_sum.unwrap_or_else(|| panic!("{name} should be a gauge")); + assert_eq!( + sum.aggregation_temporality, + AggregationTemporality::Cumulative as i32 + ); + 
assert_eq!(sum.is_monotonic, expected_monotonic); + assert!( + sum.data_points + .iter() + .all(|point| point.start_time_unix_nano == 1_000) + ); + } + otlp_metric::Data::Gauge(gauge) => { + assert!(monotonic_sum.is_none(), "{name} should be a cumulative sum"); + assert!( + gauge + .data_points + .iter() + .all(|point| point.start_time_unix_nano == 0) + ); + } + _ => panic!("unexpected data kind for {name}"), + } +} + +fn assert_first_point_attr( + metrics: &[Metric], + name: &'static str, + key: &'static str, + value: &'static str, +) { + let metric = metric_by_name(metrics, name); + let point = match metric.data.as_ref().expect("metric data") { + otlp_metric::Data::Sum(sum) => sum.data_points.first(), + otlp_metric::Data::Gauge(gauge) => gauge.data_points.first(), + _ => None, + } + .expect("data point"); + assert_has_attr(&point.attributes, key, value); +} + +fn assert_sum_point_attr( + metrics: &[Metric], + name: &'static str, + key: &'static str, + value: &'static str, +) { + let metric = metric_by_name(metrics, name); + let otlp_metric::Data::Sum(sum) = metric.data.as_ref().expect("metric data") else { + panic!("{name} should be a cumulative sum"); + }; + assert!( + sum.data_points + .iter() + .any(|point| has_attr(&point.attributes, key, value)), + "missing point attribute {key}={value}" + ); +} + +fn assert_first_point_int(metrics: &[Metric], name: &'static str, expected: i64) { + let metric = metric_by_name(metrics, name); + let point = match metric.data.as_ref().expect("metric data") { + otlp_metric::Data::Sum(sum) => sum.data_points.first(), + otlp_metric::Data::Gauge(gauge) => gauge.data_points.first(), + _ => None, + } + .expect("data point"); + assert_eq!( + point.value, + Some(number_data_point::Value::AsInt(expected)), + "{name} first point should be int" + ); +} + +fn assert_first_point_attr_int( + metrics: &[Metric], + name: &'static str, + key: &'static str, + expected: i64, +) { + let metric = metric_by_name(metrics, name); + let point = match 
metric.data.as_ref().expect("metric data") { + otlp_metric::Data::Sum(sum) => sum.data_points.first(), + otlp_metric::Data::Gauge(gauge) => gauge.data_points.first(), + _ => None, + } + .expect("data point"); + assert!( + point.attributes.iter().any(|attr| { + attr.key == key + && matches!( + attr.value.as_ref().and_then(|value| value.value.as_ref()), + Some(any_value::Value::IntValue(actual)) if *actual == expected + ) + }), + "missing int attribute {key}={expected}" + ); +} + +fn assert_no_first_point_attr(metrics: &[Metric], name: &'static str, key: &'static str) { + let metric = metric_by_name(metrics, name); + let point = match metric.data.as_ref().expect("metric data") { + otlp_metric::Data::Sum(sum) => sum.data_points.first(), + otlp_metric::Data::Gauge(gauge) => gauge.data_points.first(), + _ => None, + } + .expect("data point"); + assert!( + !point.attributes.iter().any(|attr| attr.key == key), + "unexpected attribute {key}" + ); +} + +fn assert_first_sum_point_start(metrics: &[Metric], name: &'static str, expected_start: u64) { + let metric = metric_by_name(metrics, name); + let otlp_metric::Data::Sum(sum) = metric.data.as_ref().expect("metric data") else { + panic!("{name} should be a cumulative sum"); + }; + let point = sum.data_points.first().expect("data point"); + assert_eq!(point.start_time_unix_nano, expected_start); +} + +fn metric_by_name<'a>(metrics: &'a [Metric], name: &'static str) -> &'a Metric { + metrics + .iter() + .find(|metric| metric.name == name) + .unwrap_or_else(|| panic!("missing metric {name}")) +} + +fn assert_has_attr(attributes: &[KeyValue], key: &'static str, value: &'static str) { + assert!( + has_attr(attributes, key, value), + "missing attribute {key}={value}" + ); +} + +fn has_attr(attributes: &[KeyValue], key: &'static str, value: &'static str) -> bool { + attributes.iter().any(|attr| { + attr.key == key + && matches!( + attr.value.as_ref().and_then(|value| value.value.as_ref()), + 
Some(any_value::Value::StringValue(actual)) if actual == value + ) + }) +} From 0b4821e01efad70a198016429ec4748eb4074d2e Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Wed, 6 May 2026 20:56:53 -0700 Subject: [PATCH 56/60] Split remote filesystem filtering --- .../receivers/host_metrics_receiver/README.md | 1 + .../receivers/host_metrics_receiver/config.rs | 8 +++ .../receivers/host_metrics_receiver/mod.rs | 3 + .../host_metrics_receiver/procfs/mod.rs | 11 +++- .../host_metrics_receiver/procfs/readings.rs | 17 +++-- .../host_metrics_receiver/procfs/tests.rs | 65 ++++++++++++++++++- 6 files changed, 96 insertions(+), 9 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/README.md b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/README.md index 9646b59117..051d886c84 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/README.md +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/README.md @@ -117,6 +117,7 @@ groups: | `families.disk.limit` | bool | `false` | Emits disk capacity from sysfs. | | `families.filesystem.limit` | bool | `false` | Emits filesystem capacity. | | `families.filesystem.include_virtual_filesystems` | bool | `false` | Includes virtual filesystems such as tmpfs. | +| `families.filesystem.include_remote_filesystems` | bool | `false` | Includes remote and userspace filesystems such as NFS, CIFS, 9p, and FUSE. | Families are `cpu`, `memory`, `paging`, `system`, `disk`, `filesystem`, `network`, and `processes`. 
diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/config.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/config.rs index e93948a651..4508e55f79 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/config.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/config.rs @@ -236,6 +236,8 @@ pub struct FilesystemFamilyConfig { pub interval: Option, /// Include virtual filesystems. pub include_virtual_filesystems: bool, + /// Include remote and userspace filesystems. + pub include_remote_filesystems: bool, /// Enable filesystem limit metrics. pub limit: bool, /// Device include filter. @@ -258,6 +260,7 @@ impl Default for FilesystemFamilyConfig { enabled: true, interval: None, include_virtual_filesystems: false, + include_remote_filesystems: false, limit: false, include_devices: None, exclude_devices: None, @@ -447,6 +450,7 @@ pub(super) struct RuntimeFilesystemFamily { pub(super) enabled: bool, pub(super) interval: Duration, pub(super) include_virtual_filesystems: bool, + pub(super) include_remote_filesystems: bool, pub(super) limit: bool, pub(super) include_devices: Option, pub(super) exclude_devices: Option, @@ -746,6 +750,10 @@ impl TryFrom for RuntimeConfig { .families .filesystem .include_virtual_filesystems, + include_remote_filesystems: config + .families + .filesystem + .include_remote_filesystems, limit: config.families.filesystem.limit, include_devices: filesystem_include_devices, exclude_devices: filesystem_exclude_devices, diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 31d22118d7..00f554a152 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -450,6 +450,7 @@ impl 
local::Receiver for HostMetricsReceiver { memory_hugepages: config.memory_hugepages, disk_limit: config.families.disk.limit, filesystem_include_virtual: config.families.filesystem.include_virtual_filesystems, + filesystem_include_remote: config.families.filesystem.include_remote_filesystems, filesystem_limit: config.families.filesystem.limit, disk_include: config.families.disk.include.clone(), disk_exclude: config.families.disk.exclude.clone(), @@ -766,6 +767,7 @@ mod tests { "filesystem": { "interval": "5m", "include_virtual_filesystems": true, + "include_remote_filesystems": true, "limit": true, "exclude_fs_types": { "fs_types": ["tmpfs"], @@ -785,6 +787,7 @@ mod tests { Some(Duration::from_secs(300)) ); assert!(config.families.filesystem.include_virtual_filesystems); + assert!(config.families.filesystem.include_remote_filesystems); assert!(config.families.filesystem.limit); assert_eq!( config diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/mod.rs index 73431f0879..07f5315bc5 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/mod.rs @@ -67,6 +67,8 @@ pub struct ProcfsConfig { pub disk_limit: bool, /// Include virtual filesystems. pub filesystem_include_virtual: bool, + /// Include remote and userspace filesystems. + pub filesystem_include_remote: bool, /// Emit filesystem limit metric. pub filesystem_limit: bool, /// Disk include filter. 
@@ -297,6 +299,7 @@ impl ProcfsSource { let filesystems = if due.filesystem { let include_virtual = self.config.filesystem_include_virtual; + let include_remote = self.config.filesystem_include_remote; let emit_limit = self.config.filesystem_limit; let include_devices = self.config.filesystem_include_devices.clone(); let exclude_devices = self.config.filesystem_exclude_devices.clone(); @@ -314,7 +317,13 @@ impl ProcfsSource { include_mount_points: include_mount_points.as_ref(), exclude_mount_points: exclude_mount_points.as_ref(), }; - let mounts = parse_mountinfo(mountinfo, include_virtual, emit_limit, filters); + let mounts = parse_mountinfo( + mountinfo, + include_virtual, + include_remote, + emit_limit, + filters, + ); self.read_filesystems(mounts, &mut partial_errors, &mut first_error) } Err(err) => { diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/readings.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/readings.rs index c0691b9e80..00987a0d20 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/readings.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/readings.rs @@ -523,6 +523,7 @@ pub(super) struct FilesystemFilters<'a> { pub(super) fn parse_mountinfo( input: &str, include_virtual_filesystems: bool, + include_remote_filesystems: bool, emit_limit: bool, filters: FilesystemFilters<'_>, ) -> Vec { @@ -550,7 +551,10 @@ pub(super) fn parse_mountinfo( let Some(device) = post_fields.next() else { continue; }; - if !include_virtual_filesystems && is_skipped_filesystem_type(fs_type) { + if !include_virtual_filesystems && is_virtual_filesystem_type(fs_type) { + continue; + } + if !include_remote_filesystems && is_remote_filesystem_type(fs_type) { continue; } if !filter_allows(fs_type, filters.include_fs_types, filters.exclude_fs_types) { @@ -587,10 +591,14 @@ pub(super) fn filesystem_mode(options: &str) -> 
&'static str { } } -pub(super) fn is_skipped_filesystem_type(fs_type: &str) -> bool { +pub(super) fn is_remote_filesystem_type(fs_type: &str) -> bool { if fs_type == "fuse" || fs_type == "fuseblk" || fs_type.starts_with("fuse.") { return true; } + matches!(fs_type, "nfs" | "nfs4" | "cifs" | "smb3" | "9p") +} + +pub(super) fn is_virtual_filesystem_type(fs_type: &str) -> bool { matches!( fs_type, "autofs" @@ -610,11 +618,6 @@ pub(super) fn is_skipped_filesystem_type(fs_type: &str) -> bool { | "sysfs" | "tmpfs" | "tracefs" - | "nfs" - | "nfs4" - | "cifs" - | "smb3" - | "9p" ) } diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs index c2b6d40d7f..cbd5fa7766 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs @@ -483,6 +483,7 @@ fn scrape_due_emits_successful_families_after_partial_read_error() { memory_hugepages: false, disk_limit: false, filesystem_include_virtual: false, + filesystem_include_remote: false, filesystem_limit: false, filesystem_include_devices: None, filesystem_exclude_devices: None, @@ -545,6 +546,7 @@ fn scrape_due_preserves_disk_counter_state_after_diskstats_read_error() { memory_hugepages: false, disk_limit: false, filesystem_include_virtual: false, + filesystem_include_remote: false, filesystem_limit: false, filesystem_include_devices: None, filesystem_exclude_devices: None, @@ -634,6 +636,7 @@ fn scrape_due_fails_when_all_due_families_fail() { memory_hugepages: false, disk_limit: false, filesystem_include_virtual: false, + filesystem_include_remote: false, filesystem_limit: false, filesystem_include_devices: None, filesystem_exclude_devices: None, @@ -690,6 +693,7 @@ fn scrape_due_reads_opt_in_disk_limit_from_sysfs() { memory_hugepages: false, disk_limit: true, 
filesystem_include_virtual: false, + filesystem_include_remote: false, filesystem_limit: false, filesystem_include_devices: None, filesystem_exclude_devices: None, @@ -763,6 +767,7 @@ fn scrape_due_uses_boot_time_for_counter_only_family_ticks() { memory_hugepages: false, disk_limit: false, filesystem_include_virtual: false, + filesystem_include_remote: false, filesystem_limit: false, filesystem_include_devices: None, filesystem_exclude_devices: None, @@ -837,6 +842,7 @@ fn scrape_due_reads_filesystem_usage_from_mountinfo() { memory_hugepages: false, disk_limit: false, filesystem_include_virtual: false, + filesystem_include_remote: false, filesystem_limit: true, filesystem_include_devices: None, filesystem_exclude_devices: None, @@ -1018,7 +1024,7 @@ fn diskstats_parser_applies_filters_before_parsing_values() { } #[test] -fn mountinfo_parser_skips_virtual_filesystems_by_default() { +fn mountinfo_parser_skips_virtual_and_remote_filesystems_by_default() { let mounts = parse_mountinfo( "36 25 8:1 / / rw,relatime - ext4 /dev/sda1 rw\n\ 37 25 0:32 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw\n\ @@ -1026,6 +1032,7 @@ fn mountinfo_parser_skips_virtual_filesystems_by_default() { 39 25 0:34 / /mnt/fuseblk rw,relatime - fuseblk /dev/fuse rw\n\ 40 25 0:35 / /mnt/nfs rw,relatime - nfs server:/export rw\n", false, + false, true, FilesystemFilters::default(), ); @@ -1038,12 +1045,66 @@ fn mountinfo_parser_skips_virtual_filesystems_by_default() { assert!(mounts[0].emit_limit); } +#[test] +fn mountinfo_parser_keeps_remote_filesystems_separate_from_virtual_filesystems() { + let mountinfo = "36 25 8:1 / / rw,relatime - ext4 /dev/sda1 rw\n\ + 37 25 0:32 / /run rw,nosuid,nodev - tmpfs tmpfs rw\n\ + 38 25 0:33 / /mnt/fuse rw,relatime - fuse.sshfs sshfs rw\n\ + 39 25 0:34 / /mnt/fuseblk rw,relatime - fuseblk /dev/fuse rw\n\ + 40 25 0:35 / /mnt/nfs rw,relatime - nfs server:/export rw\n\ + 41 25 0:36 / /mnt/cifs rw,relatime - cifs //server/share rw\n\ + 42 25 0:37 / /mnt/9p 
rw,relatime - 9p hostshare rw\n"; + + let virtual_only = parse_mountinfo(mountinfo, true, false, false, FilesystemFilters::default()); + assert_eq!(virtual_only.len(), 2); + assert_eq!(virtual_only[0].fs_type, "ext4"); + assert_eq!(virtual_only[1].fs_type, "tmpfs"); + + let remote_only = parse_mountinfo(mountinfo, false, true, false, FilesystemFilters::default()); + assert_eq!(remote_only.len(), 6); + assert_eq!(remote_only[0].fs_type, "ext4"); + assert_eq!(remote_only[1].fs_type, "fuse.sshfs"); + assert_eq!(remote_only[2].fs_type, "fuseblk"); + assert_eq!(remote_only[3].fs_type, "nfs"); + assert_eq!(remote_only[4].fs_type, "cifs"); + assert_eq!(remote_only[5].fs_type, "9p"); + + let all_included = parse_mountinfo(mountinfo, true, true, false, FilesystemFilters::default()); + assert_eq!(all_included.len(), 7); +} + +#[test] +fn mountinfo_parser_applies_filters_after_remote_filesystem_opt_in() { + let include_fs_types = CompiledFilter::compile( + crate::receivers::host_metrics_receiver::MatchType::Strict, + vec!["nfs".to_owned()], + ) + .expect("valid") + .expect("filter"); + let mounts = parse_mountinfo( + "36 25 8:1 / / rw,relatime - ext4 /dev/sda1 rw\n\ + 37 25 0:35 / /mnt/nfs rw,relatime - nfs server:/export rw\n\ + 38 25 0:36 / /mnt/cifs rw,relatime - cifs //server/share rw\n", + false, + true, + false, + FilesystemFilters { + include_fs_types: Some(&include_fs_types), + ..FilesystemFilters::default() + }, + ); + + assert_eq!(mounts.len(), 1); + assert_eq!(mounts[0].fs_type, "nfs"); +} + #[test] fn mountinfo_parser_unescapes_paths() { let mounts = parse_mountinfo( "36 25 8:1 / /mnt/data\\040disk rw,relatime - ext4 /dev/disk\\040one rw\n", false, false, + false, FilesystemFilters::default(), ); @@ -1058,6 +1119,7 @@ fn mountinfo_parser_preserves_utf8_while_unescaping_paths() { "36 25 8:1 / /mnt/caf\u{00e9}\\040disk rw,relatime - ext4 /dev/disk\\040\u{00e9} rw\n", false, false, + false, FilesystemFilters::default(), ); @@ -1084,6 +1146,7 @@ fn 
mountinfo_parser_applies_filesystem_filters() { "36 25 8:1 / / rw,relatime - ext4 /dev/sda1 rw\n37 25 8:2 / /data rw,relatime - ext4 /dev/sdb1 rw\n38 25 8:3 / /data2 rw,relatime - xfs /dev/sdc1 rw\n", false, false, + false, FilesystemFilters { include_mount_points: Some(&include_mounts), exclude_fs_types: Some(&exclude_fs_types), From a8e9862ec89bc4ed2ceba0b98cea71dbc2ae0d4b Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Wed, 6 May 2026 21:22:31 -0700 Subject: [PATCH 57/60] Fix host metrics dev-tools imports --- .../src/receivers/host_metrics_receiver/procfs/tests.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs index cbd5fa7766..e38bfc0f11 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs @@ -3,7 +3,11 @@ use super::*; use crate::receivers::host_metrics_receiver::semconv::{attr, metric}; +#[cfg(feature = "dev-tools")] +use otap_df_pdata::proto::opentelemetry::common::v1::AnyValue; use otap_df_pdata::proto::opentelemetry::common::v1::{KeyValue, any_value}; +#[cfg(feature = "dev-tools")] +use otap_df_pdata::proto::opentelemetry::metrics::v1::NumberDataPoint; use otap_df_pdata::proto::opentelemetry::metrics::v1::{ AggregationTemporality, Metric, MetricsData, metric as otlp_metric, number_data_point, }; From 6eff210c85ed70aaf846dc282c9dc52dd680da1d Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Wed, 6 May 2026 21:58:35 -0700 Subject: [PATCH 58/60] Skip more virtual filesystems --- .../receivers/host_metrics_receiver/procfs/readings.rs | 8 ++++++++ .../src/receivers/host_metrics_receiver/procfs/tests.rs | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git 
a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/readings.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/readings.rs index 00987a0d20..16c7aa4a35 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/readings.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/readings.rs @@ -606,14 +606,22 @@ pub(super) fn is_virtual_filesystem_type(fs_type: &str) -> bool { | "binfmt_misc" | "cgroup" | "cgroup2" + | "configfs" | "debugfs" + | "devpts" | "devtmpfs" + | "efivarfs" | "fusectl" + | "hugetlbfs" | "mqueue" | "nsfs" | "overlay" | "proc" | "pstore" + | "ramfs" + | "rpc_pipefs" + | "securityfs" + | "selinuxfs" | "squashfs" | "sysfs" | "tmpfs" diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs index e38bfc0f11..f4d8a37ae5 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs @@ -1034,7 +1034,9 @@ fn mountinfo_parser_skips_virtual_and_remote_filesystems_by_default() { 37 25 0:32 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw\n\ 38 25 0:33 / /mnt/fuse rw,relatime - fuse.sshfs sshfs rw\n\ 39 25 0:34 / /mnt/fuseblk rw,relatime - fuseblk /dev/fuse rw\n\ - 40 25 0:35 / /mnt/nfs rw,relatime - nfs server:/export rw\n", + 40 25 0:35 / /mnt/nfs rw,relatime - nfs server:/export rw\n\ + 41 25 0:36 / /dev/pts rw,nosuid,noexec,relatime - devpts devpts rw\n\ + 42 25 0:37 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime - securityfs securityfs rw\n", false, false, true, From 6626c5f14729b0244465e51acb5d78379a42c076 Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Wed, 6 May 2026 22:20:20 -0700 Subject: [PATCH 59/60] Stabilize host metrics start time 
fallback --- .../host_metrics_receiver/procfs/mod.rs | 15 ++- .../host_metrics_receiver/procfs/tests.rs | 116 ++++++++++++++++++ 2 files changed, 129 insertions(+), 2 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/mod.rs index 07f5315bc5..8cfc52cc37 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/mod.rs @@ -34,6 +34,7 @@ pub struct ProcfsSource { filesystem_worker: FilesystemStatWorker, counter_tracker: CounterTracker, boot_time_unix_nano: Option, + fallback_start_time_unix_nano: u64, resource: Option, } @@ -143,6 +144,7 @@ impl ProcfsSource { filesystem_worker: FilesystemStatWorker::new()?, counter_tracker: CounterTracker::default(), boot_time_unix_nano: None, + fallback_start_time_unix_nano: now_unix_nano(), resource: None, }; source.apply_startup_validation()?; @@ -179,7 +181,9 @@ impl ProcfsSource { if stat.boot_time_unix_nano != 0 { self.boot_time_unix_nano = Some(stat.boot_time_unix_nano); } - let start_time_unix_nano = self.boot_time_unix_nano.unwrap_or(now_unix_nano); + let start_time_unix_nano = self + .boot_time_unix_nano + .unwrap_or(self.fallback_start_time_unix_nano); let cpu_utilization = if due.cpu && self.config.cpu_utilization { let utilization = stat.cpu.and_then(|current| { self.previous_cpu @@ -388,7 +392,14 @@ impl ProcfsSource { } fn validate_selected_paths(&self) -> io::Result<()> { - if self.config.cpu || self.config.system || self.config.processes { + if self.config.cpu + || self.config.memory + || self.config.paging + || self.config.disk + || self.config.filesystem + || self.config.network + || self.config.processes + { let _ = File::open(self.paths.path(PathKind::Stat))?; } if self.config.cpu { diff --git 
a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs index f4d8a37ae5..dc6fd25b58 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs @@ -611,6 +611,122 @@ fn scrape_due_preserves_disk_counter_state_after_diskstats_read_error() { ); } +#[test] +fn scrape_due_uses_stable_fallback_start_time_when_stat_is_unavailable() { + let root = tempfile::tempdir().expect("tempdir"); + let proc = root.path().join("proc"); + std::fs::create_dir(&proc).expect("proc dir"); + std::fs::write( + proc.join("diskstats"), + "8 0 sda 1 0 100 0 2 0 200 0 0 0 0 0 0 0 0\n", + ) + .expect("diskstats"); + let mut source = ProcfsSource::new( + Some(root.path()), + ProcfsConfig { + cpu: false, + memory: false, + paging: false, + system: false, + disk: true, + filesystem: false, + network: false, + processes: false, + cpu_utilization: false, + memory_limit: false, + memory_shared: false, + memory_hugepages: false, + disk_limit: false, + filesystem_include_virtual: false, + filesystem_include_remote: false, + filesystem_limit: false, + filesystem_include_devices: None, + filesystem_exclude_devices: None, + filesystem_include_fs_types: None, + filesystem_exclude_fs_types: None, + filesystem_include_mount_points: None, + filesystem_exclude_mount_points: None, + disk_include: None, + disk_exclude: None, + network_include: None, + network_exclude: None, + validation: HostViewValidationMode::None, + }, + ) + .expect("source"); + + let first = source + .scrape_due(ProcfsFamilies { + disk: true, + ..ProcfsFamilies::default() + }) + .expect("first disk scrape"); + std::thread::sleep(Duration::from_millis(1)); + let second = source + .scrape_due(ProcfsFamilies { + disk: true, + ..ProcfsFamilies::default() + }) + .expect("second disk 
scrape"); + + assert_eq!(first.partial_errors, 1); + assert_eq!(second.partial_errors, 1); + assert_eq!( + first.snapshot.start_time_unix_nano, + second.snapshot.start_time_unix_nano + ); +} + +#[test] +fn validation_requires_stat_for_cumulative_families() { + let root = tempfile::tempdir().expect("tempdir"); + let proc = root.path().join("proc"); + std::fs::create_dir(&proc).expect("proc dir"); + std::fs::write( + proc.join("diskstats"), + "8 0 sda 1 0 100 0 2 0 200 0 0 0 0 0 0 0 0\n", + ) + .expect("diskstats"); + + let err = match ProcfsSource::new( + Some(root.path()), + ProcfsConfig { + cpu: false, + memory: false, + paging: false, + system: false, + disk: true, + filesystem: false, + network: false, + processes: false, + cpu_utilization: false, + memory_limit: false, + memory_shared: false, + memory_hugepages: false, + disk_limit: false, + filesystem_include_virtual: false, + filesystem_include_remote: false, + filesystem_limit: false, + filesystem_include_devices: None, + filesystem_exclude_devices: None, + filesystem_include_fs_types: None, + filesystem_exclude_fs_types: None, + filesystem_include_mount_points: None, + filesystem_exclude_mount_points: None, + disk_include: None, + disk_exclude: None, + network_include: None, + network_exclude: None, + validation: HostViewValidationMode::FailSelected, + }, + ) { + Ok(_) => panic!("missing stat should fail validation for cumulative disk metrics"), + Err(err) => err, + }; + + assert_eq!(err.kind(), ErrorKind::NotFound); +} + #[test] fn filesystem_stat_worker_reports_disconnect_as_broken_pipe() { let worker = FilesystemStatWorker::disconnected_for_test(); From 2b1bbbacc83c824d6d1732a44e2915502a13788b Mon Sep 17 00:00:00 2001 From: Lalit Bhasin Date: Wed, 6 May 2026 23:10:34 -0700 Subject: [PATCH 60/60] Add cooperative host metrics scrape yields --- .../receivers/host_metrics_receiver/mod.rs | 4 +- .../host_metrics_receiver/procfs/mod.rs | 10 +- .../host_metrics_receiver/procfs/tests.rs | 129 +++++++++++------- 
3 files changed, 92 insertions(+), 51 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs index 00f554a152..e59c025cf6 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/mod.rs @@ -520,12 +520,13 @@ impl local::Receiver for HostMetricsReceiver { metrics.families_scraped.add(due_family_count(due)); metrics.scrape_lag_ns.record(duration_nanos(now.saturating_duration_since(scheduled_due))); } - match source.scrape_due(due) { + match source.scrape_due(due).await { Ok(scrape) => { if let Some(metrics) = metrics.as_mut() { metrics.partial_errors.add(scrape.partial_errors); metrics.source_read_errors.add(scrape.partial_errors); } + tokio::task::consume_budget().await; let pdata = match encode_snapshot(scrape.snapshot) { Ok(pdata) => pdata, Err(err) => { @@ -541,6 +542,7 @@ impl local::Receiver for HostMetricsReceiver { }); } }; + tokio::task::consume_budget().await; match effect_handler.try_send_message_with_source_node(pdata) { Ok(()) => { if let Some(metrics) = metrics.as_mut() { diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/mod.rs index 8cfc52cc37..8a3329387a 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/mod.rs @@ -152,7 +152,7 @@ impl ProcfsSource { } /// Collects one host snapshot for the due family set. 
- pub fn scrape_due(&mut self, due: ProcfsFamilies) -> io::Result { + pub async fn scrape_due(&mut self, due: ProcfsFamilies) -> io::Result { let due = due.enabled_by(&self.config); let now_unix_nano = now_unix_nano(); let clk_tck = self.clk_tck; @@ -260,6 +260,8 @@ impl ProcfsSource { } }; + tokio::task::consume_budget().await; + let disks = if due.disk { let disk_include = self.config.disk_include.clone(); let disk_exclude = self.config.disk_exclude.clone(); @@ -283,6 +285,8 @@ impl ProcfsSource { None }; + tokio::task::consume_budget().await; + let networks = if due.network { let network_include = self.config.network_include.clone(); let network_exclude = self.config.network_exclude.clone(); @@ -301,6 +305,8 @@ impl ProcfsSource { None }; + tokio::task::consume_budget().await; + let filesystems = if due.filesystem { let include_virtual = self.config.filesystem_include_virtual; let include_remote = self.config.filesystem_include_remote; @@ -339,6 +345,8 @@ impl ProcfsSource { Vec::new() }; + tokio::task::consume_budget().await; + let resource = self.read_resource().clone(); let counter_starts = self.counter_tracker.snapshot( start_time_unix_nano, diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs index dc6fd25b58..799c197fb0 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/host_metrics_receiver/procfs/tests.rs @@ -15,7 +15,7 @@ use otap_df_pdata::testing::round_trip::decode_metrics; use projection::{CounterStarts, counter_key, counter_key_joined, counter_key_matches_joined}; #[cfg(feature = "dev-tools")] use std::collections::{BTreeMap, BTreeSet}; -use std::io::ErrorKind; +use std::io::{self, ErrorKind}; use std::path::PathBuf; #[cfg(feature = "dev-tools")] use weaver_common::{result::WResult, 
vdir::VirtualDirectoryPath}; @@ -33,6 +33,13 @@ use weaver_semconv::{ registry_repo::RegistryRepo, }; +fn block_on_scrape(source: &mut ProcfsSource, due: ProcfsFamilies) -> io::Result { + tokio::runtime::Builder::new_current_thread() + .build() + .expect("runtime") + .block_on(source.scrape_due(due)) +} + #[test] fn projection_uses_expected_metric_shapes() { let data = projection_fixture_request(); @@ -504,13 +511,15 @@ fn scrape_due_emits_successful_families_after_partial_read_error() { ) .expect("source"); - let scrape = source - .scrape_due(ProcfsFamilies { + let scrape = block_on_scrape( + &mut source, + ProcfsFamilies { memory: true, disk: true, ..ProcfsFamilies::default() - }) - .expect("partial scrape"); + }, + ) + .expect("partial scrape"); assert_eq!(scrape.partial_errors, 1); assert!(scrape.snapshot.memory.is_some()); @@ -567,25 +576,29 @@ fn scrape_due_preserves_disk_counter_state_after_diskstats_read_error() { ) .expect("source"); - let first = source - .scrape_due(ProcfsFamilies { + let first = block_on_scrape( + &mut source, + ProcfsFamilies { disk: true, ..ProcfsFamilies::default() - }) - .expect("first disk scrape"); + }, + ) + .expect("first disk scrape"); let first_start = first .snapshot .counter_starts .get_joined(metric::DISK_IO, "sda", "read", 0); std::fs::remove_file(proc.join("diskstats")).expect("remove diskstats"); - let partial = source - .scrape_due(ProcfsFamilies { + let partial = block_on_scrape( + &mut source, + ProcfsFamilies { memory: true, disk: true, ..ProcfsFamilies::default() - }) - .expect("partial scrape"); + }, + ) + .expect("partial scrape"); assert_eq!(partial.partial_errors, 1); assert!(partial.snapshot.disks.is_empty()); @@ -594,12 +607,14 @@ fn scrape_due_preserves_disk_counter_state_after_diskstats_read_error() { "8 0 sda 1 0 50 0 2 0 100 0 0 0 0 0 0 0 0\n", ) .expect("diskstats after reset"); - let after_error = source - .scrape_due(ProcfsFamilies { + let after_error = block_on_scrape( + &mut source, + ProcfsFamilies { 
disk: true, ..ProcfsFamilies::default() - }) - .expect("disk scrape after read error"); + }, + ) + .expect("disk scrape after read error"); let reset_start = after_error .snapshot @@ -655,19 +670,23 @@ fn scrape_due_uses_stable_fallback_start_time_when_stat_is_unavailable() { ) .expect("source"); - let first = source - .scrape_due(ProcfsFamilies { + let first = block_on_scrape( + &mut source, + ProcfsFamilies { disk: true, ..ProcfsFamilies::default() - }) - .expect("first disk scrape"); + }, + ) + .expect("first disk scrape"); std::thread::sleep(Duration::from_millis(1)); - let second = source - .scrape_due(ProcfsFamilies { + let second = block_on_scrape( + &mut source, + ProcfsFamilies { disk: true, ..ProcfsFamilies::default() - }) - .expect("second disk scrape"); + }, + ) + .expect("second disk scrape"); assert_eq!(first.partial_errors, 1); assert_eq!(second.partial_errors, 1); @@ -774,12 +793,14 @@ fn scrape_due_fails_when_all_due_families_fail() { .expect("source"); assert!( - source - .scrape_due(ProcfsFamilies { + block_on_scrape( + &mut source, + ProcfsFamilies { memory: true, ..ProcfsFamilies::default() - }) - .is_err() + }, + ) + .is_err() ); } @@ -830,12 +851,14 @@ fn scrape_due_reads_opt_in_disk_limit_from_sysfs() { ) .expect("source"); - let scrape = source - .scrape_due(ProcfsFamilies { + let scrape = block_on_scrape( + &mut source, + ProcfsFamilies { disk: true, ..ProcfsFamilies::default() - }) - .expect("disk scrape"); + }, + ) + .expect("disk scrape"); assert_eq!(scrape.snapshot.disks.len(), 1); assert_eq!( @@ -905,32 +928,38 @@ fn scrape_due_uses_boot_time_for_counter_only_family_ticks() { .expect("source"); let expected_start = 123 * NANOS_PER_SEC; - let disk_scrape = source - .scrape_due(ProcfsFamilies { + let disk_scrape = block_on_scrape( + &mut source, + ProcfsFamilies { disk: true, ..ProcfsFamilies::default() - }) - .expect("disk scrape"); + }, + ) + .expect("disk scrape"); assert_eq!(disk_scrape.snapshot.start_time_unix_nano, 
expected_start); assert_eq!(disk_scrape.snapshot.disks.len(), 1); std::fs::remove_file(proc.join("stat")).expect("remove stat after cache"); - let network_scrape = source - .scrape_due(ProcfsFamilies { + let network_scrape = block_on_scrape( + &mut source, + ProcfsFamilies { network: true, ..ProcfsFamilies::default() - }) - .expect("network scrape"); + }, + ) + .expect("network scrape"); assert_eq!(network_scrape.snapshot.start_time_unix_nano, expected_start); assert_eq!(network_scrape.snapshot.networks.len(), 1); - let paging_scrape = source - .scrape_due(ProcfsFamilies { + let paging_scrape = block_on_scrape( + &mut source, + ProcfsFamilies { paging: true, ..ProcfsFamilies::default() - }) - .expect("paging scrape"); + }, + ) + .expect("paging scrape"); assert_eq!(paging_scrape.snapshot.start_time_unix_nano, expected_start); assert!(paging_scrape.snapshot.paging.is_some()); } @@ -979,12 +1008,14 @@ fn scrape_due_reads_filesystem_usage_from_mountinfo() { ) .expect("source"); - let scrape = source - .scrape_due(ProcfsFamilies { + let scrape = block_on_scrape( + &mut source, + ProcfsFamilies { filesystem: true, ..ProcfsFamilies::default() - }) - .expect("filesystem scrape"); + }, + ) + .expect("filesystem scrape"); assert_eq!(scrape.snapshot.filesystems.len(), 1); assert_eq!(scrape.snapshot.filesystems[0].device, "/dev/sda1");