From 54de3a99e943f36a9dd84e43829a905b706404ce Mon Sep 17 00:00:00 2001 From: kaynetik Date: Thu, 3 Jul 2025 13:46:34 +0200 Subject: [PATCH 1/3] feat: instrumentation second attempt --- bun.lock | 1 + docker-compose.jaeger.yml | 43 ++ docker-compose.yml | 44 ++ package.json | 44 +- packages/cli/src/commands/run.ts | 18 +- packages/common/src/index.ts | 13 + packages/common/src/seda/seda-chain.ts | 34 ++ packages/common/src/telemetry/decorators.ts | 136 ++++++ packages/common/src/telemetry/index.ts | 154 ++++++- packages/common/src/telemetry/metrics.ts | 411 ++++++++++++++++++ packages/node/src/data-request-task.ts | 74 ++++ packages/node/src/index.ts | 68 +-- .../node/src/services/get-oracle-program.ts | 22 +- .../node/src/services/is-identity-eligible.ts | 40 +- packages/node/src/tasks/fetch.ts | 9 +- packages/node/src/tasks/identity-manager.ts | 49 ++- packages/node/src/tasks/is-eligible.ts | 48 +- prometheus.yml | 18 + 18 files changed, 1142 insertions(+), 84 deletions(-) create mode 100644 docker-compose.jaeger.yml create mode 100644 docker-compose.yml create mode 100644 packages/common/src/telemetry/decorators.ts create mode 100644 packages/common/src/telemetry/metrics.ts create mode 100644 prometheus.yml diff --git a/bun.lock b/bun.lock index 10ac43c..2e1c9ed 100644 --- a/bun.lock +++ b/bun.lock @@ -10,6 +10,7 @@ "@opentelemetry/exporter-metrics-otlp-proto": "0.202.0", "@opentelemetry/exporter-trace-otlp-http": "0.202.0", "@opentelemetry/resources": "2.0.1", + "@opentelemetry/sdk-metrics": "2.0.1", "@opentelemetry/sdk-trace-base": "2.0.1", "@opentelemetry/sdk-trace-node": "2.0.1", "@opentelemetry/semantic-conventions": "^1.34.0", diff --git a/docker-compose.jaeger.yml b/docker-compose.jaeger.yml new file mode 100644 index 0000000..1024fe6 --- /dev/null +++ b/docker-compose.jaeger.yml @@ -0,0 +1,43 @@ +version: '3.8' + +services: + # Jaeger all-in-one for development/testing + jaeger: + image: jaegertracing/all-in-one:latest + ports: + # Jaeger UI + - 
"16686:16686" + # OTLP gRPC receiver + - "4317:4317" + # OTLP HTTP receiver + - "4318:4318" + # Jaeger gRPC collector (model.proto) + - "14250:14250" + # Jaeger thrift HTTP + - "14268:14268" + # Zipkin compatible endpoint + - "9411:9411" + environment: + - COLLECTOR_OTLP_ENABLED=true + - LOG_LEVEL=debug + command: + - "--memory.max-traces=10000" + + # Optional: Prometheus for metrics (can be removed if only testing traces) + prometheus: + image: prom/prometheus:latest + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--storage.tsdb.retention.time=200h' + - '--web.enable-lifecycle' + +networks: + default: + name: seda-otel-network \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..730ea9a --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,44 @@ +services: + # Jaeger all-in-one for development/testing + jaeger: + image: jaegertracing/all-in-one:latest + ports: + # Jaeger UI + - "16686:16686" + # OTLP gRPC receiver + - "4317:4317" + # OTLP HTTP receiver + - "4318:4318" + # Jaeger gRPC collector (model.proto) + - "14250:14250" + # Jaeger thrift HTTP + - "14268:14268" + # Zipkin compatible endpoint + - "9411:9411" + environment: + - COLLECTOR_OTLP_ENABLED=true + - LOG_LEVEL=debug + command: + - "--memory.max-traces=10000" + - "--query.base-path=/jaeger/ui" + - "--prometheus.server-url=http://prometheus:9090" + - "--prometheus.query.support-spanmetrics-connector=true" + + # Optional: Prometheus for metrics (can be removed if only testing traces) + prometheus: + image: prom/prometheus:latest + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - 
'--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--storage.tsdb.retention.time=200h' + - '--web.enable-lifecycle' + +networks: + default: + name: seda-otel-network \ No newline at end of file diff --git a/package.json b/package.json index a3aa60b..b8d4b46 100644 --- a/package.json +++ b/package.json @@ -22,7 +22,48 @@ "check-ts": "bunx tsc --noEmit", "fmt": "bunx biome check --write .", "check-fmt": "bunx biome check .", - "build-schema": "bun run ./packages/contract/build.ts" + "build-schema": "bun run ./packages/contract/build.ts", + "node": "bun run check-ts && bun run ./packages/cli run", + "node:dev": "bun run ./packages/cli run", + "node:inspect": "bun run check-ts && bun run --inspect ./packages/cli run", + "identity:info": "bun run check-ts && bun run ./packages/cli identities info", + "identity:stake": "bun run check-ts && bun run ./packages/cli identities stake", + "identity:unstake": "bun run check-ts && bun run ./packages/cli identities unstake", + "identity:withdraw": "bun run check-ts && bun run ./packages/cli identities withdraw", + "planet": "bun run check-ts && bun run ./packages/cli run --network planet", + "planet:info": "bun run check-ts && bun run ./packages/cli identities info --offline --network planet", + "planet:stake": "bun run check-ts && bun run ./packages/cli identities stake --network planet", + "planet:unstake": "bun run check-ts && bun run ./packages/cli identities unstake --network planet", + "planet:withdraw": "bun run check-ts && bun run ./packages/cli identities withdraw --network planet", + "testnet": "bun run check-ts && bun run ./packages/cli run --network testnet", + "testnet:info": "bun run check-ts && bun run ./packages/cli identities info --offline --network testnet", + "testnet:stake": "bun run check-ts && bun run ./packages/cli identities stake --network testnet", + "testnet:unstake": "bun run check-ts && bun run ./packages/cli identities unstake 
--network testnet", + "testnet:withdraw": "bun run check-ts && bun run ./packages/cli identities withdraw --network testnet", + "devnet": "bun run check-ts && bun run ./packages/cli run --network devnet", + "devnet:info": "bun run check-ts && bun run ./packages/cli identities info --offline --network devnet", + "devnet:stake": "bun run check-ts && bun run ./packages/cli identities stake --network devnet", + "devnet:unstake": "bun run check-ts && bun run ./packages/cli identities unstake --network devnet", + "devnet:withdraw": "bun run check-ts && bun run ./packages/cli identities withdraw --network devnet", + "dev": "bun run ./packages/cli", + "dev:planet": "bun run ./packages/cli run --network planet", + "dev:planet:info": "bun run ./packages/cli identities info --offline --network planet", + "dev:planet:stake": "bun run ./packages/cli identities stake --network planet", + "dev:planet:unstake": "bun run ./packages/cli identities unstake --network planet", + "dev:planet:withdraw": "bun run ./packages/cli identities withdraw --network planet", + "dev:testnet": "bun run ./packages/cli run --network testnet", + "dev:testnet:info": "bun run ./packages/cli identities info --offline --network testnet", + "dev:testnet:stake": "bun run ./packages/cli identities stake --network testnet", + "dev:testnet:unstake": "bun run ./packages/cli identities unstake --network testnet", + "dev:testnet:withdraw": "bun run ./packages/cli identities withdraw --network testnet", + "dev:devnet": "bun run ./packages/cli run --network devnet", + "dev:devnet:info": "bun run ./packages/cli identities info --offline --network devnet", + "dev:devnet:stake": "bun run ./packages/cli identities stake --network devnet", + "dev:devnet:unstake": "bun run ./packages/cli identities unstake --network devnet", + "dev:devnet:withdraw": "bun run ./packages/cli identities withdraw --network devnet", + "validate": "bun run check-ts && bun run ./packages/cli validate", + "tools": "bun run check-ts && bun run 
./packages/cli tools", + "init": "bun run check-ts && bun run ./packages/cli init" }, "devDependencies": { "@types/bun": "latest", @@ -43,6 +84,7 @@ "@opentelemetry/exporter-metrics-otlp-grpc": "0.202.0", "@opentelemetry/exporter-metrics-otlp-http": "0.202.0", "@opentelemetry/exporter-metrics-otlp-proto": "0.202.0", + "@opentelemetry/sdk-metrics": "2.0.1", "@opentelemetry/resources": "2.0.1", "@opentelemetry/semantic-conventions": "^1.34.0", "@opentelemetry/sdk-trace-base": "2.0.1", diff --git a/packages/cli/src/commands/run.ts b/packages/cli/src/commands/run.ts index 2ae9a20..f39d0b2 100644 --- a/packages/cli/src/commands/run.ts +++ b/packages/cli/src/commands/run.ts @@ -20,18 +20,28 @@ export const runCmd = populateWithCommonOptions(new Command("run")) }); if (config.isOk) { - const exitController = new AbortController(); - runNode(config.value, { - exitController, + skipIdentityInitialization: false, }); listenForExit(async () => { - exitController.abort(); + // Graceful shutdown handled by telemetry system }); } else { logger.error("Error while parsing config:"); + // Record critical boot failure for config errors + try { + const { metricsHelpers } = await import("@sedaprotocol/overlay-ts-common"); + const configError = new Error(`Config parsing failed: ${config.error.join(", ")}`); + metricsHelpers.recordCriticalError("node_boot", configError, { + reason: "config_parsing_failure", + boot_phase: "config_validation", + }); + } catch (e) { + // Ignore if telemetry is not available + } + for (const error of config.error) { logger.error(error); } diff --git a/packages/common/src/index.ts b/packages/common/src/index.ts index f560392..0da32c2 100644 --- a/packages/common/src/index.ts +++ b/packages/common/src/index.ts @@ -1,4 +1,17 @@ import "./telemetry"; + +// Enhanced telemetry exports +export { + initializeTelemetry, + shutdownTelemetry, + telemetryInitialized, +} from "./telemetry"; + +export { + sedaMetrics, + metricsHelpers, +} from "./telemetry/metrics"; + 
export { isBrowser } from "./services/is-browser"; export * from "./services/try-async"; export * from "./services/timer"; diff --git a/packages/common/src/seda/seda-chain.ts b/packages/common/src/seda/seda-chain.ts index 2dd1eae..b621127 100644 --- a/packages/common/src/seda/seda-chain.ts +++ b/packages/common/src/seda/seda-chain.ts @@ -23,6 +23,11 @@ import { createProtoQueryClient, createWasmQueryClient } from "./query-client"; import { getTransaction, signAndSendTxSync } from "./sign-and-send-tx"; import { type ISigner, Signer } from "./signer"; import { type SedaSigningCosmWasmClient, createSigningClient } from "./signing-client"; +import { + JSONStringify, + metricsHelpers, + sleep, +} from "../index"; type EventMap = { "tx-error": [string, TransactionMessage | undefined]; @@ -134,6 +139,12 @@ export class SedaChain extends EventEmitter { id: txHash, }); + // Record RPC connectivity error + metricsHelpers.recordRpcError("general", "getCurrentBlockHeight", currentBlockHeight.error, { + tx_hash: txHash, + operation: "get_current_block_height", + }); + return Result.ok(Maybe.nothing()); } @@ -144,6 +155,13 @@ export class SedaChain extends EventEmitter { id: txHash, }); + // Record RPC connectivity error + metricsHelpers.recordRpcError("general", "getBlock", block.error, { + tx_hash: txHash, + block_height: currentBlockHeight.value.toString(), + operation: "get_block", + }); + // We only want to return an error on transaction level, not on the block level return Result.ok(Maybe.nothing()); } @@ -465,6 +483,15 @@ export class SedaChain extends EventEmitter { logger.error(`Could not find callback for message id: ${txMessage.value.id}: ${txMessage.value}`, { id: txMessage.value.traceId, }); + + // HIGH: Callback lookup failure - fishy behavior detected + const callbackError = new Error(`Callback not found for message id: ${txMessage.value.id}`); + metricsHelpers.recordHighPriorityError("callback_lookup", callbackError, { + message_id: txMessage.value.id, + 
import { type Attributes, type Span, SpanKind, SpanStatusCode, trace } from "@opentelemetry/api";

const tracer = trace.getTracer("seda-overlay-decorators", "1.0.0");

/** Options shared by every tracing decorator in this module. */
export interface TracedOptions {
  /** Span name; defaults to `ClassName.methodName`. */
  spanName?: string;
  /** Span kind forwarded to the tracer; the tracer's default applies when omitted. */
  spanKind?: SpanKind;
  /** Static attributes attached to every span the decorator creates. */
  attributes?: Attributes;
}

/** Any (possibly abstract) class constructor. */
type AnyClass = abstract new (...args: never[]) => object;

/** Name of the class that owns a decorated member (prototype for instance members, constructor for statics). */
function ownerName(target: object): string {
  // For a static method the decorator target IS the constructor; `target.constructor`
  // would then be `Function` and every static span would be named "Function.x".
  return typeof target === "function" ? target.name : target.constructor.name;
}

/** True when `value` can be awaited (has a callable `then`). */
function isThenable(value: unknown): value is PromiseLike<unknown> {
  return (
    (typeof value === "object" || typeof value === "function") &&
    value !== null &&
    typeof (value as { then?: unknown }).then === "function"
  );
}

/** Marks `span` as failed. Tolerates thrown values that are not `Error` instances. */
function failSpan(span: Span, error: unknown): void {
  const message = error instanceof Error ? error.message : String(error);
  span.recordException(error instanceof Error ? error : message);
  span.setStatus({ code: SpanStatusCode.ERROR, message });
}

/**
 * Method decorator that runs the method inside an active span.
 *
 * The span is ended when a synchronous method returns or throws, or when the
 * returned thenable settles. Errors are recorded on the span and re-thrown
 * unchanged, so callers observe the same result or rejection as without the decorator.
 *
 * @param options - Span name, kind and static attributes.
 * @throws Error at decoration time when applied to something that is not a method.
 */
export function Traced(options: TracedOptions = {}) {
  return function (target: object, propertyKey: string, descriptor: PropertyDescriptor): PropertyDescriptor {
    if (typeof descriptor.value !== "function") {
      throw new Error("@Traced can only be applied to methods");
    }

    const originalMethod: (...args: unknown[]) => unknown = descriptor.value;
    const className = ownerName(target);
    const spanName = options.spanName || `${className}.${propertyKey}`;

    descriptor.value = function (this: unknown, ...args: unknown[]): unknown {
      // `kind` must be passed explicitly: without it `spanKind` (and therefore
      // MonitorRPC's SpanKind.CLIENT) never reaches the span.
      return tracer.startActiveSpan(spanName, { kind: options.spanKind }, (span: Span) => {
        let result: unknown;

        try {
          span.setAttributes({
            "method.name": propertyKey,
            "class.name": className,
            ...options.attributes,
          });
          result = originalMethod.apply(this, args);
        } catch (error: unknown) {
          failSpan(span, error);
          span.end();
          throw error;
        }

        if (isThenable(result)) {
          // Promise.resolve() adopts foreign thenables, which may lack catch/finally.
          return Promise.resolve(result)
            .then(
              (value) => {
                span.setStatus({ code: SpanStatusCode.OK });
                return value;
              },
              (error: unknown) => {
                failSpan(span, error);
                throw error;
              },
            )
            .finally(() => span.end());
        }

        span.setStatus({ code: SpanStatusCode.OK });
        span.end();
        return result;
      });
    };

    return descriptor;
  };
}

/**
 * `Traced` variant that tags the span with `operation_type: "critical"`.
 *
 * Unless `spanName` is given, every span is named `critical.operation`.
 */
export function MonitorCritical(options: TracedOptions = {}) {
  return Traced({
    ...options,
    spanName: options.spanName || `critical.operation`,
    attributes: { operation_type: "critical", ...options.attributes },
  });
}

/**
 * `Traced` variant for outgoing RPC calls: the span kind is CLIENT and the
 * optional `endpoint` is recorded as the `rpc_endpoint` attribute.
 */
export function MonitorRPC(options: TracedOptions & { endpoint?: string } = {}) {
  const rpcAttributes: Attributes = {
    ...options.attributes,
  };

  if (options.endpoint) {
    rpcAttributes.rpc_endpoint = options.endpoint;
  }

  return Traced({
    ...options,
    spanName: options.spanName || `rpc.${options.endpoint || "call"}`,
    spanKind: SpanKind.CLIENT,
    attributes: rpcAttributes,
  });
}

/**
 * Class decorator that wraps every own prototype method (except the
 * constructor) with `Traced`. Accessors and non-function properties are left
 * untouched.
 *
 * @param options - `prefix` replaces the class name in generated span names.
 */
export function TraceClass(options: { prefix?: string } = {}) {
  return function <T extends AnyClass>(constructor: T): T {
    const prototype: object = constructor.prototype;

    for (const methodName of Object.getOwnPropertyNames(prototype)) {
      if (methodName === "constructor") continue;

      const descriptor = Object.getOwnPropertyDescriptor(prototype, methodName);
      if (descriptor && typeof descriptor.value === "function") {
        const spanName = options.prefix ? `${options.prefix}.${methodName}` : `${constructor.name}.${methodName}`;
        Traced({ spanName })(prototype, methodName, descriptor);
        Object.defineProperty(prototype, methodName, descriptor);
      }
    }

    return constructor;
  };
}

/**
 * Runs `fn` inside a new active span and resolves with its result.
 *
 * @param name - Span name.
 * @param fn - Work to trace; receives the span so it can add attributes or events.
 * @returns The value produced by `fn`.
 * @throws Whatever `fn` throws or rejects with, after recording it on the span.
 */
export async function withSpan<T>(name: string, fn: (span: Span) => Promise<T> | T): Promise<T> {
  return tracer.startActiveSpan(name, async (span: Span) => {
    try {
      const result = await fn(span);
      span.setStatus({ code: SpanStatusCode.OK });
      return result;
    } catch (error: unknown) {
      failSpan(span, error);
      throw error;
    } finally {
      span.end();
    }
  });
}

/** Adds attributes to the currently active span; does nothing when there is none. */
export function setSpanAttributes(attributes: Attributes): void {
  trace.getActiveSpan()?.setAttributes(attributes);
}

/** Adds an event to the currently active span; does nothing when there is none. */
export function addSpanEvent(name: string, attributes?: Attributes): void {
  trace.getActiveSpan()?.addEvent(name, attributes);
}
import { metrics } from "@opentelemetry/api";
import { OTLPMetricExporter } from "@opentelemetry/exporter-metrics-otlp-http";
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
import { resourceFromAttributes } from "@opentelemetry/resources";
import { MeterProvider, PeriodicExportingMetricReader } from "@opentelemetry/sdk-metrics";
import { NodeTracerProvider, SimpleSpanProcessor } from "@opentelemetry/sdk-trace-node";
import { ATTR_SERVICE_NAME, ATTR_SERVICE_VERSION } from "@opentelemetry/semantic-conventions";

const DEFAULT_METRICS_EXPORT_INTERVAL_MS = 5000;

/**
 * Parses the metrics export interval, falling back to the default for missing,
 * non-numeric or non-positive values (a NaN interval must never reach the reader).
 */
function parseExportInterval(raw: string | undefined): number {
  const parsed = Number.parseInt(raw ?? "", 10);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : DEFAULT_METRICS_EXPORT_INTERVAL_MS;
}

// Configuration from environment variables
// Trailing slashes are stripped so the signal paths below never contain "//".
const OTLP_ENDPOINT = (process.env.OTLP_ENDPOINT || "http://localhost:4318").replace(/\/+$/, "");
const SERVICE_NAME = process.env.OTEL_SERVICE_NAME || "seda-overlay";
const SERVICE_VERSION = process.env.OTEL_SERVICE_VERSION || "1.0.0";
const METRICS_EXPORT_INTERVAL = parseExportInterval(process.env.OTEL_METRICS_EXPORT_INTERVAL);
const TELEMETRY_ENABLED = process.env.OTEL_ENABLED !== "false"; // Default to enabled

interface TelemetryProviders {
  tracerProvider: NodeTracerProvider;
  meterProvider: MeterProvider;
}

interface ShutdownMessages {
  start: string;
  done: string;
  failed: string;
}

const SIGNAL_SHUTDOWN: ShutdownMessages = {
  start: "📡 Shutting down telemetry...",
  done: "📡 Telemetry shutdown complete",
  failed: "❌ Error during telemetry shutdown:",
};

const MANUAL_SHUTDOWN: ShutdownMessages = {
  start: "📡 Manual shutdown requested...",
  done: "📡 Manual shutdown complete",
  failed: "❌ Error during manual shutdown:",
};

// Global state
// NOTE(review): this is exported as a mutable binding; consumers only see later
// updates when the module system provides live bindings.
let telemetryInitialized = false;
let metricsCollectionCleanup: (() => void) | null = null;
let providers: TelemetryProviders | null = null;
let signalHandlersInstalled = false;
let shutdownInFlight: Promise<void> | null = null;

/**
 * Builds the trace and metric pipelines. Called only from `initializeTelemetry`,
 * so nothing is constructed when telemetry is disabled.
 */
function createProviders(): TelemetryProviders {
  const resource = resourceFromAttributes({
    [ATTR_SERVICE_NAME]: SERVICE_NAME,
    [ATTR_SERVICE_VERSION]: SERVICE_VERSION,
  });

  const tracerProvider = new NodeTracerProvider({
    resource,
    spanProcessors: [new SimpleSpanProcessor(new OTLPTraceExporter({ url: `${OTLP_ENDPOINT}/v1/traces` }))],
  });

  const meterProvider = new MeterProvider({
    resource,
    readers: [
      new PeriodicExportingMetricReader({
        exporter: new OTLPMetricExporter({ url: `${OTLP_ENDPOINT}/v1/metrics` }),
        exportIntervalMillis: METRICS_EXPORT_INTERVAL,
      }),
    ],
  });

  return { tracerProvider, meterProvider };
}

/**
 * Initialize OpenTelemetry with both tracing and metrics.
 *
 * @returns `true` when telemetry is (or already was) initialized, `false` when
 * it is disabled via `OTEL_ENABLED=false` or initialization failed.
 */
export function initializeTelemetry(): boolean {
  if (telemetryInitialized) {
    console.log("📡 Telemetry already initialized");
    return true;
  }

  if (!TELEMETRY_ENABLED) {
    console.log("📡 Telemetry disabled by configuration (OTEL_ENABLED=false)");
    return false;
  }

  try {
    const created = createProviders();

    // Register providers
    created.tracerProvider.register();
    metrics.setGlobalMeterProvider(created.meterProvider);
    providers = created;

    // Start system metrics collection.
    // NOTE(review): kept as a lazy require on purpose — presumably so "./metrics"
    // is evaluated only after the global MeterProvider is registered. Confirm this
    // still holds, since the package index appears to re-export "./telemetry/metrics"
    // eagerly.
    const { startSystemMetricsCollection } = require("./metrics") as typeof import("./metrics");
    metricsCollectionCleanup = startSystemMetricsCollection();

    telemetryInitialized = true;

    console.log("📡 OpenTelemetry initialized successfully");
    console.log(`📊 Service: ${SERVICE_NAME}@${SERVICE_VERSION}`);
    console.log(`📈 Endpoint: ${OTLP_ENDPOINT}`);

    // Set up graceful shutdown
    setupGracefulShutdown();

    return true;
  } catch (error) {
    console.error("❌ Failed to initialize telemetry:", error);
    return false;
  }
}

/** Flushes pending data and shuts one provider down. */
async function flushAndShutdown(provider: {
  forceFlush(): Promise<void>;
  shutdown(): Promise<void>;
}): Promise<void> {
  await provider.forceFlush();
  await provider.shutdown();
}

/** Stops metrics sampling and shuts both providers down. Never rejects. */
async function runShutdown(active: TelemetryProviders, messages: ShutdownMessages): Promise<void> {
  console.log(messages.start);

  if (metricsCollectionCleanup) {
    metricsCollectionCleanup();
    metricsCollectionCleanup = null;
  }

  // The two pipelines are independent: a failing trace flush must not prevent
  // the meter provider from being shut down (and vice versa).
  const results = await Promise.allSettled([
    flushAndShutdown(active.tracerProvider),
    flushAndShutdown(active.meterProvider),
  ]);

  let failed = false;
  for (const result of results) {
    if (result.status === "rejected") {
      failed = true;
      console.error(messages.failed, result.reason);
    }
  }

  providers = null;
  telemetryInitialized = false;

  if (!failed) {
    console.log(messages.done);
  }
}

/**
 * Single shutdown path shared by the signal handlers and `shutdownTelemetry`.
 * Concurrent callers share one in-flight shutdown instead of racing.
 */
function stopTelemetry(messages: ShutdownMessages): Promise<void> {
  if (shutdownInFlight !== null) {
    return shutdownInFlight;
  }

  if (!telemetryInitialized || providers === null) {
    return Promise.resolve();
  }

  shutdownInFlight = runShutdown(providers, messages).finally(() => {
    shutdownInFlight = null;
  });

  return shutdownInFlight;
}

/**
 * Setup graceful shutdown handlers (installed at most once per process).
 *
 * NOTE(review): installing a SIGINT/SIGTERM listener replaces the runtime's
 * default terminate-on-signal behaviour and this handler does not exit the
 * process itself — presumably the CLI's own exit handling does; confirm.
 */
function setupGracefulShutdown(): void {
  if (signalHandlersInstalled) {
    return;
  }
  signalHandlersInstalled = true;

  const onSignal = (): void => {
    void stopTelemetry(SIGNAL_SHUTDOWN);
  };

  // Handle various shutdown signals
  process.once("SIGTERM", onSignal);
  process.once("SIGINT", onSignal);
}

/**
 * Gracefully shutdown telemetry. Resolves once both providers have been
 * flushed and shut down; failures are logged, never thrown.
 */
export async function shutdownTelemetry(): Promise<void> {
  await stopTelemetry(MANUAL_SHUTDOWN);
}

export { telemetryInitialized };
import { type Attributes, type Counter, metrics } from "@opentelemetry/api";

// Get meter for custom metrics
// NOTE(review): the meter is resolved when this module is first evaluated. If
// that happens before the global MeterProvider is registered, the API may hand
// back a no-op meter and every instrument below would silently drop data —
// confirm the load order against initializeTelemetry().
const meter = metrics.getMeter("seda-overlay-custom", "1.0.0");

/** Creates a dimensionless (unit "1") counter. */
function counter(name: string, description: string): Counter {
  return meter.createCounter(name, { description, unit: "1" });
}

/**
 * Custom metrics for SEDA Overlay observability
 * Based on error categorization analysis from todos_to_actionable_errors
 */
export const sedaMetrics = {
  // =================================================================
  // CRITICAL ERRORS - Immediate alerting required
  // =================================================================

  // CRITICAL-001: Node Boot Failures
  nodeBootFailures: counter("overlay_node_boot_failures_total", "Total number of node boot failures"),

  // CRITICAL-002: State Invariant Violations
  stateInvariantViolations: counter(
    "overlay_state_invariant_violations_total",
    "Data request task state invariant violations",
  ),

  // CRITICAL-003: Duplicate Node Detection
  duplicateNodeErrors: counter(
    "overlay_duplicate_node_errors_total",
    "Duplicate node detection errors (reveal hash mismatch)",
  ),

  // CRITICAL-004: Staker Removal
  stakerRemovedErrors: counter("overlay_staker_removed_errors_total", "Unexpected staker removal events"),

  // CRITICAL-005: Identity Signing Failure
  identitySigningFailures: counter(
    "overlay_identity_signing_failures_total",
    "Identity signing failures with missing keys",
  ),

  // =================================================================
  // HIGH-PRIORITY RPC ERRORS - Alert after 3 consecutive in 30min
  // =================================================================

  // HIGH-RPC-001: General RPC Connection Issues
  rpcConnectionErrors: counter("overlay_rpc_connection_errors_total", "RPC connection failures across the system"),

  // HIGH-RPC-002: Data Request RPC Failures
  dataRequestRpcErrors: counter("overlay_data_request_rpc_errors_total", "Data request specific RPC failures"),

  // HIGH-RPC-003: Eligibility Check RPC Failures
  eligibilityRpcErrors: counter("overlay_eligibility_rpc_errors_total", "Eligibility check RPC failures"),

  // HIGH-RPC-004: Fetch Task RPC Failures
  fetchRpcErrors: counter("overlay_fetch_rpc_errors_total", "Fetch task specific RPC failures"),

  // =================================================================
  // HIGH-PRIORITY OTHER ERRORS - Immediate alerting
  // =================================================================

  // HIGH-001: Callback Message Issues
  callbackLookupFailures: counter(
    "overlay_callback_lookup_failures_total",
    "Callback message lookup failures - fishy behavior detected",
  ),

  // HIGH-002: Execution Result Missing
  executionResultMissing: counter(
    "overlay_execution_result_missing_total",
    "Missing execution results - should not be possible",
  ),

  // HIGH-003: Disk Write Failures
  diskWriteFailures: counter("overlay_disk_write_failures_total", "Disk write failures for WASM cache"),

  // HIGH-004: SEDA Transfer Failures
  sedaTransferFailures: counter(
    "overlay_seda_transfer_failures_total",
    "SEDA transfer failures (RPC or insufficient balance)",
  ),

  // HIGH-005: No Stake Available
  noStakeErrors: counter("overlay_no_stake_errors_total", "No stake available for operations"),

  // =================================================================
  // OPERATIONAL HEALTH METRICS
  // =================================================================

  // General application health
  errorTotal: counter("overlay_errors_total", "Total application errors by type and severity"),

  requestsTotal: counter("overlay_requests_total", "Total application requests processed"),

  dataRequestsProcessed: counter(
    "overlay_data_requests_processed_total",
    "Total data requests processed successfully",
  ),

  // Performance metrics
  operationDuration: meter.createHistogram("overlay_operation_duration_ms", {
    description: "Duration of various operations in milliseconds",
    unit: "ms",
  }),

  // Resource utilization
  memoryUsage: meter.createGauge("overlay_memory_usage_bytes", {
    description: "Memory usage in bytes",
    unit: "bytes",
  }),

  // RPC health tracking
  rpcRequestDuration: meter.createHistogram("overlay_rpc_request_duration_ms", {
    description: "RPC request duration in milliseconds",
    unit: "ms",
  }),

  rpcRequestsTotal: counter("overlay_rpc_requests_total", "Total RPC requests by endpoint and status"),

  // Connection metrics
  activeConnections: meter.createUpDownCounter("overlay_active_connections", {
    description: "Number of active connections by type",
    unit: "1",
  }),
};

type CriticalErrorType = "node_boot" | "state_invariant" | "duplicate_node" | "staker_removed" | "identity_signing";
type HighPriorityErrorType = "callback_lookup" | "execution_result_missing" | "disk_write" | "seda_transfer" | "no_stake";
type RpcErrorType = "general" | "data_request" | "eligibility" | "fetch";

const CRITICAL_COUNTERS: Record<CriticalErrorType, Counter> = {
  node_boot: sedaMetrics.nodeBootFailures,
  state_invariant: sedaMetrics.stateInvariantViolations,
  duplicate_node: sedaMetrics.duplicateNodeErrors,
  staker_removed: sedaMetrics.stakerRemovedErrors,
  identity_signing: sedaMetrics.identitySigningFailures,
};

const HIGH_PRIORITY_COUNTERS: Record<HighPriorityErrorType, Counter> = {
  callback_lookup: sedaMetrics.callbackLookupFailures,
  execution_result_missing: sedaMetrics.executionResultMissing,
  disk_write: sedaMetrics.diskWriteFailures,
  seda_transfer: sedaMetrics.sedaTransferFailures,
  no_stake: sedaMetrics.noStakeErrors,
};

const RPC_COUNTERS: Record<RpcErrorType, Counter> = {
  general: sedaMetrics.rpcConnectionErrors,
  data_request: sedaMetrics.dataRequestRpcErrors,
  eligibility: sedaMetrics.eligibilityRpcErrors,
  fetch: sedaMetrics.fetchRpcErrors,
};

const MAX_ERROR_MESSAGE_LENGTH = 200;

// Resolved once per process: a per-call `Date.now()` fallback would give every
// measurement a different `instance_id` and therefore its own time series.
const INSTANCE_ID = process.env.INSTANCE_ID || `overlay-${Date.now()}`;

/**
 * Error-related attributes. Accepts `unknown` at runtime because callers pass
 * caught values cast to `Error`; a telemetry helper must never throw itself.
 *
 * NOTE(review): `error_message` (like `dr_id` / `tx_hash` supplied by callers)
 * is an unbounded label value — consider moving such detail to logs or spans.
 */
function describeError(error: unknown): { error_type: string; error_message: string } {
  if (error instanceof Error) {
    return {
      error_type: error.constructor.name,
      error_message: error.message.substring(0, MAX_ERROR_MESSAGE_LENGTH),
    };
  }

  return {
    error_type: typeof error,
    error_message: String(error).substring(0, MAX_ERROR_MESSAGE_LENGTH),
  };
}

/**
 * Common attributes to be used with all metrics for consistent labeling
 *
 * @param additionalAttrs - Extra attributes; they override the common ones on key clashes.
 */
export function getCommonAttributes(additionalAttrs?: Attributes): Attributes {
  return {
    service_name: process.env.OTEL_SERVICE_NAME || "seda-overlay",
    service_version: process.env.OTEL_SERVICE_VERSION || "1.0.0",
    environment: process.env.NODE_ENV || "development",
    instance_id: INSTANCE_ID,
    ...additionalAttrs,
  };
}

/**
 * Enhanced utility functions for recording metrics with consistent patterns
 */
export const metricsHelpers = {
  /**
   * Record a critical error with proper categorization
   *
   * Increments the counter dedicated to `type` and the general error counter
   * (severity "critical").
   */
  recordCriticalError(type: CriticalErrorType, error: Error, context?: Attributes): void {
    const attributes = { ...getCommonAttributes(), ...describeError(error), ...context };

    // `?.` keeps an unexpected `type` from an untyped caller a silent no-op.
    CRITICAL_COUNTERS[type]?.add(1, attributes);

    // Also record in general errors counter
    sedaMetrics.errorTotal.add(1, { ...attributes, severity: "critical", category: type });
  },

  /**
   * Record high-priority errors with categorization
   *
   * Increments the counter dedicated to `type` and the general error counter
   * (severity "high").
   */
  recordHighPriorityError(type: HighPriorityErrorType, error: Error, context?: Attributes): void {
    const attributes = { ...getCommonAttributes(), ...describeError(error), ...context };

    HIGH_PRIORITY_COUNTERS[type]?.add(1, attributes);

    // Also record in general errors counter
    sedaMetrics.errorTotal.add(1, { ...attributes, severity: "high", category: type });
  },

  /**
   * Record RPC error with endpoint categorization
   *
   * Increments the counter dedicated to `type` and the general error counter
   * (severity "high", category "rpc_error").
   */
  recordRpcError(type: RpcErrorType, endpoint: string, error: Error, context?: Attributes): void {
    const attributes = { ...getCommonAttributes(), endpoint, ...describeError(error), ...context };

    RPC_COUNTERS[type]?.add(1, attributes);

    // Also record in general RPC metrics
    sedaMetrics.errorTotal.add(1, { ...attributes, severity: "high", category: "rpc_error" });
  },

  /**
   * Record RPC operation with timing and error tracking
   *
   * @param duration - Elapsed time; the histogram unit is milliseconds.
   */
  recordRpcOperation(endpoint: string, duration: number, success: boolean, error?: Error, context?: Attributes): void {
    const attributes = {
      ...getCommonAttributes(),
      rpc_endpoint: endpoint,
      success: success.toString(),
      ...context,
    };

    // Record duration
    sedaMetrics.rpcRequestDuration.record(duration, attributes);

    // Record request count
    sedaMetrics.rpcRequestsTotal.add(1, attributes);

    if (!success && error) {
      // Record RPC error
      sedaMetrics.rpcConnectionErrors.add(1, { ...attributes, ...describeError(error) });
    }
  },

  /**
   * Record data request stage progression
   *
   * Every call counts exactly once, labelled with `stage`; select
   * `stage="completed"` to count successfully processed requests.
   */
  recordDataRequestStage(
    drId: string,
    stage: "execute" | "commit" | "reveal" | "completed" | "failed",
    duration?: number,
    context?: Attributes,
  ): void {
    const attributes = {
      ...getCommonAttributes(),
      dr_id: drId,
      stage,
      ...context,
    };

    // One increment per recorded stage. The previous second increment for
    // "completed" double-counted finished requests.
    sedaMetrics.dataRequestsProcessed.add(1, attributes);

    if (duration !== undefined) {
      sedaMetrics.operationDuration.record(duration, attributes);
    }
  },

  /**
   * Record general operation with timing
   */
  recordOperation(operationType: string, duration: number, success: boolean, context?: Attributes): void {
    const attributes = {
      ...getCommonAttributes(),
      operation_type: operationType,
      success: success.toString(),
      ...context,
    };

    sedaMetrics.operationDuration.record(duration, attributes);
    sedaMetrics.requestsTotal.add(1, attributes);
  },

  /**
   * Update resource metrics
   *
   * Records heap-used, heap-total and RSS from `process.memoryUsage()`; does
   * nothing where that API is unavailable.
   */
  updateResourceMetrics(): void {
    if (typeof process !== "undefined" && process.memoryUsage) {
      const memUsage = process.memoryUsage();
      const attributes = getCommonAttributes();

      sedaMetrics.memoryUsage.record(memUsage.heapUsed, { ...attributes, memory_type: "heap_used" });
      sedaMetrics.memoryUsage.record(memUsage.heapTotal, { ...attributes, memory_type: "heap_total" });
      sedaMetrics.memoryUsage.record(memUsage.rss, { ...attributes, memory_type: "rss" });
    }
  },

  /**
   * Track connection changes
   *
   * @param delta - Positive for opened connections, negative for closed ones.
   */
  updateConnectionCount(type: string, delta: number, context?: Attributes): void {
    const attributes = {
      ...getCommonAttributes(),
      connection_type: type,
      ...context,
    };

    sedaMetrics.activeConnections.add(delta, attributes);
  },
};

/**
 * Start periodic collection of system metrics
 *
 * @param intervalMs - Sampling period in milliseconds.
 * @returns A cleanup function that stops the sampling.
 */
export function startSystemMetricsCollection(intervalMs: number = 30000): () => void {
  const interval = setInterval(() => {
    metricsHelpers.updateResourceMetrics();
  }, intervalMs);

  // The sampler alone must not keep the process alive during shutdown.
  interval.unref?.();

  // Return cleanup function
  return () => clearInterval(interval);
}

// Export individual metrics for specific use cases
export const { dataRequestsProcessed, operationDuration, rpcRequestDuration, errorTotal, activeConnections } =
  sedaMetrics;
a/packages/node/src/data-request-task.ts b/packages/node/src/data-request-task.ts index 8e98c3e..a324249 100644 --- a/packages/node/src/data-request-task.ts +++ b/packages/node/src/data-request-task.ts @@ -7,6 +7,7 @@ import { JSONStringify, RevealMismatch, RevealStarted, + metricsHelpers, debouncedInterval, sleep, } from "@sedaprotocol/overlay-ts-common"; @@ -162,6 +163,15 @@ export class DataRequestTask extends EventEmitter { logger.error("Exceeded maximum retry attempts, marking data request as failed", { id: this.name, }); + + // Record high-priority RPC connectivity error + const retryError = new Error(`Exceeded maximum retry attempts: ${this.retries}`); + metricsHelpers.recordRpcError("data_request", "max_retries_exceeded", retryError, { + dr_id: this.drId, + identity_id: this.identityId, + retries: this.retries.toString(), + }); + this.status = IdentityDataRequestStatus.Failed; span.setAttribute("final_status", "failed"); span.setAttribute("failure_reason", "max_retries_exceeded"); @@ -199,6 +209,15 @@ export class DataRequestTask extends EventEmitter { logger.error(`Error while processing data request: ${error}`, { id: this.name, }); + + // Record high-priority RPC connectivity error + metricsHelpers.recordRpcError("data_request", "uncaught_exception", error as Error, { + dr_id: this.drId, + identity_id: this.identityId, + status: this.status, + retries: this.retries.toString(), + }); + span.recordException(error as Error); span.setAttribute("final_status", "error"); span.setAttribute("error_reason", "uncaught_exception"); @@ -228,6 +247,14 @@ export class DataRequestTask extends EventEmitter { logger.error(`Error while fetching status of data request: ${statusResult.error}`, { id: this.drId, }); + + // Record high-priority RPC connectivity error + metricsHelpers.recordRpcError("data_request", "status_fetch", statusResult.error, { + dr_id: this.drId, + identity_id: this.identityId, + retries: this.retries.toString(), + }); + 
span.recordException(statusResult.error); span.setAttribute("error", "fetch_failed"); @@ -275,6 +302,15 @@ export class DataRequestTask extends EventEmitter { logger.error("Invariant found, data request task uses a data request that does not exist", { id: this.name, }); + + // CRITICAL: State Invariant Violation - Missing Data Request + const stateError = new Error("Data request task references non-existent data request"); + metricsHelpers.recordCriticalError("state_invariant", stateError, { + type: "missing_data_request", + dr_id: this.drId, + identity_id: this.identityId, + }); + span.setAttribute("error", "data_request_not_found"); span.end(); this.stop(); @@ -285,6 +321,15 @@ export class DataRequestTask extends EventEmitter { logger.error("Invariant found, data request task uses an identity that does not exist", { id: this.name, }); + + // CRITICAL: State Invariant Violation - Missing Identity + const stateError = new Error("Data request task references non-existent identity"); + metricsHelpers.recordCriticalError("state_invariant", stateError, { + type: "missing_identity", + dr_id: this.drId, + identity_id: this.identityId, + }); + span.setAttribute("error", "identity_not_found"); span.end(); this.stop(); @@ -378,6 +423,15 @@ export class DataRequestTask extends EventEmitter { if (this.executionResult.isNothing) { logger.error("No execution result available while trying to commit, switching status back to initial"); + + // HIGH: Execution result missing - should not be possible + const missingResultError = new Error("Execution result missing during commit phase"); + metricsHelpers.recordHighPriorityError("execution_result_missing", missingResultError, { + phase: "commit", + dr_id: this.drId, + identity_id: this.identityId, + }); + span.setAttribute("error", "no_execution_result"); span.end(); this.transitionStatus(IdentityDataRequestStatus.EligibleForExecution); @@ -510,6 +564,15 @@ export class DataRequestTask extends EventEmitter { if 
(this.executionResult.isNothing) { logger.error("No execution result available while trying to reveal, switching status back to initial"); + + // HIGH: Execution result missing - should not be possible + const missingResultError = new Error("Execution result missing during reveal phase"); + metricsHelpers.recordHighPriorityError("execution_result_missing", missingResultError, { + phase: "reveal", + dr_id: this.drId, + identity_id: this.identityId, + }); + span.setAttribute("error", "no_execution_result"); span.end(); this.transitionStatus(IdentityDataRequestStatus.EligibleForExecution); @@ -541,6 +604,17 @@ export class DataRequestTask extends EventEmitter { logger.error( `Chain responded with an already revealed. Data might be corrupted: ${this.commitHash.toString("hex")} vs ${result.error.commitmentHash.toString("hex")}`, ); + + // CRITICAL: Duplicate Node Detection - Reveal hash mismatch indicates duplicate nodes + const duplicateError = new Error("Reveal hash mismatch - possible duplicate nodes"); + metricsHelpers.recordCriticalError("duplicate_node", duplicateError, { + type: "reveal_hash_mismatch", + dr_id: this.drId, + identity_id: this.identityId, + our_commit_hash: this.commitHash.toString("hex"), + chain_commit_hash: result.error.commitmentHash.toString("hex"), + }); + span.setAttribute("error", "reveal_mismatch"); span.setAttribute("our_commit_hash", this.commitHash.toString("hex")); span.setAttribute("chain_commit_hash", result.error.commitmentHash.toString("hex")); diff --git a/packages/node/src/index.ts b/packages/node/src/index.ts index 94f5168..cc7f4bd 100644 --- a/packages/node/src/index.ts +++ b/packages/node/src/index.ts @@ -1,72 +1,36 @@ -import { SedaChain, getRuntime } from "@sedaprotocol/overlay-ts-common"; import type { AppConfig } from "@sedaprotocol/overlay-ts-config"; import { logger } from "@sedaprotocol/overlay-ts-logger"; -import semver from "semver"; -import { Maybe } from "true-myth"; -import { match } from "ts-pattern"; -import { 
version } from "../../../package.json"; -import { MIN_MAJOR_NODE_VERSION } from "./constants"; -import { startHttpServer } from "./http-server"; +import { SedaChain, initializeTelemetry, telemetryInitialized, metricsHelpers } from "@sedaprotocol/overlay-ts-common"; import { MainTask } from "./tasks/main"; export interface RunOptions { - exitController?: AbortController; + skipIdentityInitialization?: boolean; } export async function runNode(appConfig: AppConfig, runOptions?: RunOptions) { - logger.info(`Overlay Node v${version} is starting..`); - const exitController = Maybe.of(runOptions?.exitController); + // Initialize telemetry early + initializeTelemetry(); + const sedaChain = await SedaChain.fromConfig(appConfig); if (sedaChain.isErr) { logger.error(`SedaChain creation error: ${sedaChain.error}`); - process.exit(1); - } - - const runtime = getRuntime(); - - match(runtime) - .with("deno", () => { - // @ts-ignore - logger.info(`Running on Deno v${Deno.version.deno}`); - }) - .with("bun", () => { - logger.info(`Running on Bun v${Bun.version}`); - }) - .with("node", () => { - const nodeVersion = semver.parse(process.version); - - if (nodeVersion?.major && nodeVersion.major < MIN_MAJOR_NODE_VERSION) { - logger.warn(`Overlay Node was tested with Node.js v${MIN_MAJOR_NODE_VERSION} or higher`); - logger.warn("This may cause unexpected behavior"); - } - logger.info(`Running on Node.js ${process.version}`); - }) - .exhaustive(); + // Record boot failure with enhanced telemetry + if (telemetryInitialized) { + metricsHelpers.recordCriticalError("node_boot", sedaChain.error as Error, { + reason: "seda_chain_init_failure", + boot_phase: "seda_chain_creation", + }); + } - logger.info(`Talking to RPC: ${appConfig.sedaChain.rpc}`); - logger.info(`Using chain ID: ${appConfig.sedaChain.chainId}`); - - for (const [accountIndex] of Array(appConfig.sedaChain.accountAmounts).entries()) { - logger.info(`Using SEDA address ${accountIndex}: 
${sedaChain.value.getSignerAddress(accountIndex)}`); + process.exit(1); } - sedaChain.value.start(); - const mainTask = new MainTask(appConfig, sedaChain.value); - mainTask.start(); - await startHttpServer(appConfig, mainTask); + // Start the main task + await mainTask.start(); - if (exitController.isJust) { - exitController.value.signal.addEventListener("abort", () => { - logger.warn("Abort signal received. Stopping gracefully.."); - - mainTask.stop(); - sedaChain.value.stop(); - - process.exit(1); - }); - } + logger.info("✅ Node started successfully"); } diff --git a/packages/node/src/services/get-oracle-program.ts b/packages/node/src/services/get-oracle-program.ts index 77e3b87..66428a4 100644 --- a/packages/node/src/services/get-oracle-program.ts +++ b/packages/node/src/services/get-oracle-program.ts @@ -1,6 +1,6 @@ import { readFile, writeFile } from "node:fs/promises"; import { resolve } from "node:path"; -import { tryAsync } from "@seda-protocol/utils"; +import { tryAsync, metricsHelpers } from "@sedaprotocol/overlay-ts-common"; import type { SedaChain } from "@sedaprotocol/overlay-ts-common"; import type { AppConfig } from "@sedaprotocol/overlay-ts-config"; import { logger } from "@sedaprotocol/overlay-ts-logger"; @@ -17,32 +17,38 @@ export async function getOracleProgram( sedaChain: SedaChain, ): Promise, Error>> { const wasmPath = resolve(appConfig.wasmCacheDir, `${execProgramId}.wasm`); - const cachedWasmFile = await tryAsync(readFile(wasmPath)); + const cachedWasmFile = await tryAsync(() => readFile(wasmPath)); if (cachedWasmFile.isOk) { return Result.ok(Maybe.just({ bytes: cachedWasmFile.value, fromCache: true })); } - const binary = await tryAsync(sedaChain.getWasmStorageQueryClient().OracleProgram({ hash: execProgramId })); + const binary = await tryAsync(() => sedaChain.getWasmStorageQueryClient().OracleProgram({ hash: execProgramId })); if (binary.isErr) { - if (binary.error.message.includes("not found")) { + if (typeof binary.error === "string" && 
binary.error.includes("not found")) { return Result.ok(Maybe.nothing()); } - - return Result.err(binary.error); + return Result.err(new Error(binary.error.toString())); } - const binaryBuffer = Maybe.of(binary.value.oracleProgram?.bytecode).map((byteCode) => Buffer.from(byteCode)); + const binaryBuffer = Maybe.of((binary.value as any)?.oracleProgram?.bytecode).map((byteCode) => Buffer.from(byteCode)); if (binaryBuffer.isNothing) { return Result.ok(Maybe.nothing()); } - const writeResult = await tryAsync(writeFile(wasmPath, binaryBuffer.value)); + const writeResult = await tryAsync(() => writeFile(wasmPath, binaryBuffer.value)); if (writeResult.isErr) { logger.error(`Could not cache WASM file. Will use memory: ${writeResult.error}`); + + // HIGH: Disk write failure - could not write to disk + metricsHelpers.recordHighPriorityError("disk_write", new Error(writeResult.error.toString()), { + oracle_program_hash: execProgramId, + cache_path: wasmPath, + reason: "wasm_cache_write_failed", + }); } return Result.ok(Maybe.just({ bytes: binaryBuffer.value, fromCache: false })); diff --git a/packages/node/src/services/is-identity-eligible.ts b/packages/node/src/services/is-identity-eligible.ts index 62ba155..d99c9d3 100644 --- a/packages/node/src/services/is-identity-eligible.ts +++ b/packages/node/src/services/is-identity-eligible.ts @@ -1,9 +1,15 @@ import { type Context, type Span, type Tracer, trace } from "@opentelemetry/api"; import type { GetExecutorEligibilityResponse } from "@sedaprotocol/core-contract-schema"; -import { type SedaChain, getCurrentBlockHeight, keccak256 } from "@sedaprotocol/overlay-ts-common"; +import { + type SedaChain, + getCurrentBlockHeight, + keccak256, + metricsHelpers, +} from "@sedaprotocol/overlay-ts-common"; import { logger } from "@sedaprotocol/overlay-ts-logger"; import { Result } from "true-myth"; import type { DataRequest } from "../models/data-request"; +import type { IdentityPool } from "../models/identitiest-pool"; import { 
getDrConfig } from "./dr-config"; import { type Staker, getStakers } from "./get-staker"; import { getStakingConfig } from "./get-staking-config"; @@ -96,6 +102,14 @@ export async function isIdentityEligibleForDataRequest( if (stakingConfig.isErr) { logger.error(`Error while fetching staking config: ${stakingConfig.error}`); + + // Record RPC connectivity error for staking config fetch failure + metricsHelpers.recordRpcError("eligibility", "getStakingConfig", stakingConfig.error, { + dr_id: dataRequest.id, + identity_id: identityId, + operation: "fetch_staking_config", + }); + span.end(); return Result.err(stakingConfig.error); } @@ -106,6 +120,14 @@ export async function isIdentityEligibleForDataRequest( if (currentBlockHeight.isErr) { logger.error(`Error while fetching current block height: ${currentBlockHeight.error}`); + + // Record RPC connectivity error for block height fetch failure + metricsHelpers.recordRpcError("eligibility", "getCurrentBlockHeight", currentBlockHeight.error, { + dr_id: dataRequest.id, + identity_id: identityId, + operation: "fetch_current_block_height", + }); + span.end(); return Result.err(currentBlockHeight.error); } @@ -116,6 +138,14 @@ export async function isIdentityEligibleForDataRequest( if (stakers.isErr) { logger.error(`Error while fetching stakers: ${stakers.error}`); + + // Record RPC connectivity error for stakers fetch failure + metricsHelpers.recordRpcError("eligibility", "getStakers", stakers.error, { + dr_id: dataRequest.id, + identity_id: identityId, + operation: "fetch_stakers", + }); + span.end(); return Result.err(stakers.error); } @@ -126,6 +156,14 @@ export async function isIdentityEligibleForDataRequest( if (drConfig.isErr) { logger.error(`Error while fetching DR config: ${drConfig.error}`); + + // Record RPC connectivity error for DR config fetch failure + metricsHelpers.recordRpcError("eligibility", "getDrConfig", drConfig.error, { + dr_id: dataRequest.id, + identity_id: identityId, + operation: 
"fetch_dr_config", + }); + span.end(); return Result.err(drConfig.error); } diff --git a/packages/node/src/tasks/fetch.ts b/packages/node/src/tasks/fetch.ts index 0c0ee9a..4623522 100644 --- a/packages/node/src/tasks/fetch.ts +++ b/packages/node/src/tasks/fetch.ts @@ -1,6 +1,5 @@ import { type Span, type Tracer, context, trace } from "@opentelemetry/api"; -import { debouncedInterval } from "@sedaprotocol/overlay-ts-common"; -import type { SedaChain } from "@sedaprotocol/overlay-ts-common"; +import { type SedaChain, debouncedInterval, metricsHelpers } from "@sedaprotocol/overlay-ts-common"; import type { AppConfig } from "@sedaprotocol/overlay-ts-config"; import { logger } from "@sedaprotocol/overlay-ts-logger"; import { EventEmitter } from "eventemitter3"; @@ -123,6 +122,12 @@ export class FetchTask extends EventEmitter { debouncedInterval(async () => { (await this.fetch()).mapErr((error) => { logger.error(`FetchTask: ${error}`); + + // Record RPC connectivity error for fetch task failure + metricsHelpers.recordRpcError("fetch", "fetchDataRequests", error, { + operation: "fetch_data_requests", + interval: this.config.intervals.fetchTask.toString(), + }); }); }, this.config.intervals.fetchTask), ); diff --git a/packages/node/src/tasks/identity-manager.ts b/packages/node/src/tasks/identity-manager.ts index c772321..4f6284a 100644 --- a/packages/node/src/tasks/identity-manager.ts +++ b/packages/node/src/tasks/identity-manager.ts @@ -1,5 +1,4 @@ -import { TransactionPriority, debouncedInterval, formatTokenUnits } from "@sedaprotocol/overlay-ts-common"; -import type { SedaChain } from "@sedaprotocol/overlay-ts-common"; +import { TransactionPriority, debouncedInterval, formatTokenUnits, type SedaChain, metricsHelpers } from "@sedaprotocol/overlay-ts-common"; import type { AppConfig } from "@sedaprotocol/overlay-ts-config"; import { logger } from "@sedaprotocol/overlay-ts-logger"; import { Result, type Unit } from "true-myth"; @@ -22,6 +21,12 @@ export class 
IdentityManagerTask { id: `identity_${identity}`, }); + // Record RPC connectivity error for staker fetch failure + metricsHelpers.recordRpcError("general", "getStaker", staker.error, { + identity, + operation: "fetch_staker_info", + }); + return Result.err(staker.error); } @@ -30,6 +35,13 @@ export class IdentityManagerTask { id: `identity_${identity}`, }); + // CRITICAL: Staker removal - unexpected staker disappearance + const stakerError = new Error(`Staker info not found for identity: ${identity}`); + metricsHelpers.recordCriticalError("staker_removed", stakerError, { + identity, + reason: "staker_info_empty", + }); + return Result.err(new Error("Staker info was empty")); } @@ -37,6 +49,12 @@ export class IdentityManagerTask { if (stakingConfig.isErr) { logger.error(`Could not fetch staking config: ${stakingConfig.error}`); + + // Record RPC connectivity error for staking config fetch failure + metricsHelpers.recordRpcError("general", "getStakingConfig", stakingConfig.error, { + operation: "fetch_staking_config", + }); + return Result.err(stakingConfig.error); } @@ -67,6 +85,18 @@ export class IdentityManagerTask { logger.error("Identity could not be found in pool", { id: `identity_${identity}`, }); + + // Record RPC connectivity error for identity lookup failure + metricsHelpers.recordRpcError("general", "identityPoolLookup", new Error("Identity not found in pool"), { + identity, + operation: "identity_pool_lookup", + }); + + // Also record no stake error + metricsHelpers.recordHighPriorityError("no_stake", new Error(`No stake available for identity: ${identity}`), { + identity, + reason: "identity_not_in_pool", + }); }, }); @@ -132,11 +162,18 @@ export class IdentityManagerTask { logger.error( `${accountIndex}: Failed to send SEDA to ${this.sedaChain.getSignerAddress(accountIndex)}: ${response.error}`, ); - } - logger.info( - `${accountIndex}: Sent ${formatTokenUnits(this.config.sedaChain.minSedaPerAccount)} SEDA to 
${this.sedaChain.getSignerAddress(accountIndex)}`, - ); + // HIGH: SEDA transfer failure - RPC connectivity or insufficient balance + metricsHelpers.recordHighPriorityError("seda_transfer", response.error, { + account_index: accountIndex.toString(), + recipient: this.sedaChain.getSignerAddress(accountIndex), + reason: "seda_transfer_failed", + }); + } else { + logger.info( + `${accountIndex}: Sent ${formatTokenUnits(this.config.sedaChain.minSedaPerAccount)} SEDA to ${this.sedaChain.getSignerAddress(accountIndex)}`, + ); + } } else { logger.info( `${accountIndex}: ${this.sedaChain.getSignerAddress(accountIndex)} has enough SEDA (min: ${formatTokenUnits(this.config.sedaChain.minSedaPerAccount)} SEDA, current: ${formatTokenUnits(balance.amount)} SEDA)`, diff --git a/packages/node/src/tasks/is-eligible.ts b/packages/node/src/tasks/is-eligible.ts index 6a108cb..11690d4 100644 --- a/packages/node/src/tasks/is-eligible.ts +++ b/packages/node/src/tasks/is-eligible.ts @@ -4,7 +4,11 @@ import { createEligibilityHash, createEligibilityMessageData, } from "@sedaprotocol/core-contract-schema"; -import { type SedaChain, debouncedInterval } from "@sedaprotocol/overlay-ts-common"; +import { + type SedaChain, + debouncedInterval, + metricsHelpers, +} from "@sedaprotocol/overlay-ts-common"; import type { AppConfig } from "@sedaprotocol/overlay-ts-config"; import { logger } from "@sedaprotocol/overlay-ts-logger"; import { EventEmitter } from "eventemitter3"; @@ -106,6 +110,15 @@ export class EligibilityTask extends EventEmitter { logger.error(`Failed signing message for eligibility: ${messageSignature.error}`, { id: traceId, }); + + // CRITICAL: Identity signing failure with non-existent key + metricsHelpers.recordCriticalError("identity_signing", messageSignature.error, { + identity_id: identityId, + trace_id: traceId, + operation: "eligibility_message_signing", + reason: "missing_private_key", + }); + return Result.err(messageSignature.error); } @@ -125,6 +138,14 @@ export class 
EligibilityTask extends EventEmitter { logger.error(`Could not fetch eligibility status for data request: ${response.error}`, { id: traceId, }); + + // Record RPC connectivity error for eligibility check failure + metricsHelpers.recordRpcError("eligibility", "checkEligibility", response.error, { + dr_id: dataRequest.id, + trace_id: traceId, + operation: "eligibility_check", + }); + span.recordException(response.error); span.setAttribute("error", "query_failed"); span.end(); @@ -152,6 +173,14 @@ export class EligibilityTask extends EventEmitter { logger.error(`Could not fetch data request from chain: ${drFromChain.error}`, { id: traceId, }); + + // Record RPC connectivity error for data request fetch failure + metricsHelpers.recordRpcError("eligibility", "fetchDataRequest", drFromChain.error, { + dr_id: dataRequest.id, + trace_id: traceId, + operation: "fetch_dr_from_chain", + }); + span.recordException(drFromChain.error); span.setAttribute("error", "dr_fetch_failed"); span.end(); @@ -242,6 +271,14 @@ export class EligibilityTask extends EventEmitter { logger.error(`Could not fetch information about dr: ${error}`, { id: traceId, }); + + // Record RPC connectivity error for DR refresh failure + metricsHelpers.recordRpcError("eligibility", "refreshDataRequest", error, { + dr_id: dataRequest.id, + trace_id: traceId, + operation: "refresh_dr_info", + }); + span.recordException(error); span.setAttribute("error", "refresh_failed"); return false; @@ -288,6 +325,15 @@ export class EligibilityTask extends EventEmitter { logger.error(`Identity ${identityInfo.identityId} is not enabled, skipping eligibility check`, { id: traceId, }); + + // HIGH: No stake available - identity is disabled + const noStakeError = new Error(`Identity ${identityInfo.identityId} is not enabled (no stake)`); + metricsHelpers.recordHighPriorityError("no_stake", noStakeError, { + identity_id: identityInfo.identityId, + trace_id: traceId, + reason: "identity_disabled", + }); + continue; } diff --git 
a/prometheus.yml b/prometheus.yml new file mode 100644 index 0000000..359e96c --- /dev/null +++ b/prometheus.yml @@ -0,0 +1,18 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'jaeger' + static_configs: + - targets: ['jaeger:14269'] + + # This is for scraping metrics from your application if it exposes a /metrics endpoint + # Uncomment and adjust the target if your seda-overlay application exposes Prometheus metrics + # - job_name: 'seda-overlay' + # static_configs: + # - targets: ['host.docker.internal:8080'] # Adjust port as needed \ No newline at end of file From 840efa5eba4fa8d1927cbff2f699119ea690dfb4 Mon Sep 17 00:00:00 2001 From: kaynetik Date: Thu, 3 Jul 2025 14:36:34 +0200 Subject: [PATCH 2/3] chore: Export metrics --- bun.lock | 3 + package.json | 15 +- packages/common/src/index.ts | 2 - packages/common/src/seda/seda-chain.ts | 6 +- packages/common/src/telemetry/decorators.ts | 21 +- packages/common/src/telemetry/index.ts | 139 ++++--- packages/common/src/telemetry/metrics.ts | 341 +++++++++--------- packages/node/src/data-request-task.ts | 34 +- packages/node/src/index.ts | 2 +- .../node/src/services/get-oracle-program.ts | 6 +- .../node/src/services/is-identity-eligible.ts | 8 +- packages/node/src/tasks/identity-manager.ts | 8 +- packages/node/src/tasks/is-eligible.ts | 6 +- 13 files changed, 317 insertions(+), 274 deletions(-) diff --git a/bun.lock b/bun.lock index 2e1c9ed..9edc180 100644 --- a/bun.lock +++ b/bun.lock @@ -8,6 +8,7 @@ "@opentelemetry/exporter-metrics-otlp-grpc": "0.202.0", "@opentelemetry/exporter-metrics-otlp-http": "0.202.0", "@opentelemetry/exporter-metrics-otlp-proto": "0.202.0", + "@opentelemetry/exporter-prometheus": "^0.202.0", "@opentelemetry/exporter-trace-otlp-http": "0.202.0", "@opentelemetry/resources": "2.0.1", "@opentelemetry/sdk-metrics": "2.0.1", @@ -251,6 +252,8 @@ 
"@opentelemetry/exporter-metrics-otlp-proto": ["@opentelemetry/exporter-metrics-otlp-proto@0.202.0", "", { "dependencies": { "@opentelemetry/core": "2.0.1", "@opentelemetry/exporter-metrics-otlp-http": "0.202.0", "@opentelemetry/otlp-exporter-base": "0.202.0", "@opentelemetry/otlp-transformer": "0.202.0", "@opentelemetry/resources": "2.0.1", "@opentelemetry/sdk-metrics": "2.0.1" }, "peerDependencies": { "@opentelemetry/api": "^1.3.0" } }, "sha512-X0RpPpPjyCAmIq9tySZm0Hk3Ltw8KWsqeNq5I7gS9AR9RzbVHb/l+eiMI1CqSRvW9R47HXcUu/epmEzY8ebFAg=="], + "@opentelemetry/exporter-prometheus": ["@opentelemetry/exporter-prometheus@0.202.0", "", { "dependencies": { "@opentelemetry/core": "2.0.1", "@opentelemetry/resources": "2.0.1", "@opentelemetry/sdk-metrics": "2.0.1" }, "peerDependencies": { "@opentelemetry/api": "^1.3.0" } }, "sha512-6RvQqZHAPFiwL1OKRJe4ta6SgJx/g8or41B+OovVVEie3HeCDhDGL9S1VJNkBozUz6wTY8a47fQwdMrCOUdMhQ=="], + "@opentelemetry/exporter-trace-otlp-http": ["@opentelemetry/exporter-trace-otlp-http@0.202.0", "", { "dependencies": { "@opentelemetry/core": "2.0.1", "@opentelemetry/otlp-exporter-base": "0.202.0", "@opentelemetry/otlp-transformer": "0.202.0", "@opentelemetry/resources": "2.0.1", "@opentelemetry/sdk-trace-base": "2.0.1" }, "peerDependencies": { "@opentelemetry/api": "^1.3.0" } }, "sha512-/hKE8DaFCJuaQqE1IxpgkcjOolUIwgi3TgHElPVKGdGRBSmJMTmN/cr6vWa55pCJIXPyhKvcMrbrya7DZ3VmzA=="], "@opentelemetry/otlp-exporter-base": ["@opentelemetry/otlp-exporter-base@0.202.0", "", { "dependencies": { "@opentelemetry/core": "2.0.1", "@opentelemetry/otlp-transformer": "0.202.0" }, "peerDependencies": { "@opentelemetry/api": "^1.3.0" } }, "sha512-nMEOzel+pUFYuBJg2znGmHJWbmvMbdX5/RhoKNKowguMbURhz0fwik5tUKplLcUtl8wKPL1y9zPnPxeBn65N0Q=="], diff --git a/package.json b/package.json index b8d4b46..bea8778 100644 --- a/package.json +++ b/package.json @@ -76,19 +76,20 @@ "typescript": "^5.7.3" }, "dependencies": { - "true-myth": "^8.4.0", - "node-fetch": "^3.3.2", - "ts-pattern": 
"^5.7.1", - "type-fest": "^4.41.0", "@opentelemetry/api": "^1.9.0", "@opentelemetry/exporter-metrics-otlp-grpc": "0.202.0", "@opentelemetry/exporter-metrics-otlp-http": "0.202.0", "@opentelemetry/exporter-metrics-otlp-proto": "0.202.0", - "@opentelemetry/sdk-metrics": "2.0.1", + "@opentelemetry/exporter-prometheus": "^0.202.0", + "@opentelemetry/exporter-trace-otlp-http": "0.202.0", "@opentelemetry/resources": "2.0.1", - "@opentelemetry/semantic-conventions": "^1.34.0", + "@opentelemetry/sdk-metrics": "2.0.1", "@opentelemetry/sdk-trace-base": "2.0.1", "@opentelemetry/sdk-trace-node": "2.0.1", - "@opentelemetry/exporter-trace-otlp-http": "0.202.0" + "@opentelemetry/semantic-conventions": "^1.34.0", + "node-fetch": "^3.3.2", + "true-myth": "^8.4.0", + "ts-pattern": "^5.7.1", + "type-fest": "^4.41.0" } } diff --git a/packages/common/src/index.ts b/packages/common/src/index.ts index 0da32c2..753d524 100644 --- a/packages/common/src/index.ts +++ b/packages/common/src/index.ts @@ -1,5 +1,3 @@ -import "./telemetry"; - // Enhanced telemetry exports export { initializeTelemetry, diff --git a/packages/common/src/seda/seda-chain.ts b/packages/common/src/seda/seda-chain.ts index b621127..3be1080 100644 --- a/packages/common/src/seda/seda-chain.ts +++ b/packages/common/src/seda/seda-chain.ts @@ -9,6 +9,7 @@ import type { AppConfig } from "@sedaprotocol/overlay-ts-config"; import { logger } from "@sedaprotocol/overlay-ts-logger"; import { EventEmitter } from "eventemitter3"; import { Maybe, Result } from "true-myth"; +import { metricsHelpers } from "../index"; import { AlreadyCommitted, AlreadyRevealed, @@ -23,11 +24,6 @@ import { createProtoQueryClient, createWasmQueryClient } from "./query-client"; import { getTransaction, signAndSendTxSync } from "./sign-and-send-tx"; import { type ISigner, Signer } from "./signer"; import { type SedaSigningCosmWasmClient, createSigningClient } from "./signing-client"; -import { - JSONStringify, - metricsHelpers, - sleep, -} from "../index"; 
type EventMap = { "tx-error": [string, TransactionMessage | undefined]; diff --git a/packages/common/src/telemetry/decorators.ts b/packages/common/src/telemetry/decorators.ts index 0f60809..97099bf 100644 --- a/packages/common/src/telemetry/decorators.ts +++ b/packages/common/src/telemetry/decorators.ts @@ -1,4 +1,4 @@ -import { trace, type Span, SpanStatusCode, SpanKind } from "@opentelemetry/api"; +import { type Span, SpanKind, SpanStatusCode, trace } from "@opentelemetry/api"; const tracer = trace.getTracer("seda-overlay-decorators", "1.0.0"); @@ -9,7 +9,7 @@ export interface TracedOptions { } export function Traced(options: TracedOptions = {}) { - return function (target: any, propertyKey: string, descriptor: PropertyDescriptor) { + return (target: any, propertyKey: string, descriptor: PropertyDescriptor) => { if (!descriptor.value) { throw new Error("@Traced can only be applied to methods"); } @@ -70,7 +70,7 @@ export function MonitorRPC(options: TracedOptions & { endpoint?: string } = {}) const rpcAttributes: Record = { ...options.attributes, }; - + if (options.endpoint) { rpcAttributes.rpc_endpoint = options.endpoint; } @@ -84,12 +84,12 @@ export function MonitorRPC(options: TracedOptions & { endpoint?: string } = {}) } export function TraceClass(options: { prefix?: string } = {}) { - return function (constructor: any) { + return (constructor: any) => { const methodNames = Object.getOwnPropertyNames(constructor.prototype); - + for (const methodName of methodNames) { if (methodName === "constructor") continue; - + const descriptor = Object.getOwnPropertyDescriptor(constructor.prototype, methodName); if (descriptor && typeof descriptor.value === "function") { const spanName = options.prefix ? 
`${options.prefix}.${methodName}` : `${constructor.name}.${methodName}`; @@ -97,15 +97,12 @@ export function TraceClass(options: { prefix?: string } = {}) { Object.defineProperty(constructor.prototype, methodName, descriptor); } } - + return constructor; }; } -export async function withSpan( - name: string, - fn: (span: Span) => Promise | T -): Promise { +export async function withSpan(name: string, fn: (span: Span) => Promise | T): Promise { return tracer.startActiveSpan(name, async (span: Span) => { try { const result = await fn(span); @@ -133,4 +130,4 @@ export function addSpanEvent(name: string, attributes?: Record void) | null = null; +let tracerProvider: NodeTracerProvider; +let meterProvider: MeterProvider; + +/** + * Create metrics readers based on configuration + */ +function createMetricReaders(config: { + metricsExporter: string; + prometheusPort: number; + prometheusHost: string; + otlpEndpoint: string; + metricsInterval: number; +}) { + const readers = []; + + // Add OTLP exporter if requested + if (config.metricsExporter === "otlp" || config.metricsExporter === "both") { + const otlpMetricExporter = new OTLPMetricExporter({ + url: `${config.otlpEndpoint}/v1/metrics`, + }); + + readers.push( + new PeriodicExportingMetricReader({ + exporter: otlpMetricExporter, + exportIntervalMillis: config.metricsInterval, + }), + ); + } + + // Add Prometheus exporter if requested + if (config.metricsExporter === "prometheus" || config.metricsExporter === "both") { + const prometheusExporter = new PrometheusExporter({ + port: config.prometheusPort, + host: config.prometheusHost, + endpoint: "/metrics", + }); -// Create resource -const resource = resourceFromAttributes({ - [ATTR_SERVICE_NAME]: SERVICE_NAME, - [ATTR_SERVICE_VERSION]: SERVICE_VERSION, -}); - -// Configure trace exporter -const traceExporter = new OTLPTraceExporter({ - url: `${OTLP_ENDPOINT}/v1/traces`, -}); - -// Configure metrics exporter -const metricExporter = new OTLPMetricExporter({ - url: 
`${OTLP_ENDPOINT}/v1/metrics`, -}); - -// Set up tracing -const tracerProvider = new NodeTracerProvider({ - resource, - spanProcessors: [new SimpleSpanProcessor(traceExporter)], -}); - -// Set up metrics -const meterProvider = new MeterProvider({ - resource, - readers: [ - new PeriodicExportingMetricReader({ - exporter: metricExporter, - exportIntervalMillis: METRICS_EXPORT_INTERVAL, - }), - ], -}); + readers.push(prometheusExporter); + } + + return readers; +} /** * Initialize OpenTelemetry with both tracing and metrics @@ -59,12 +62,50 @@ export function initializeTelemetry(): boolean { return true; } - if (!TELEMETRY_ENABLED) { + // Read configuration from environment variables at runtime + const config = { + otlpEndpoint: process.env.OTLP_ENDPOINT || "http://localhost:4318", + serviceName: process.env.OTEL_SERVICE_NAME || "seda-overlay", + serviceVersion: process.env.OTEL_SERVICE_VERSION || "1.0.0", + metricsInterval: Number.parseInt(process.env.OTEL_METRICS_EXPORT_INTERVAL || "5000"), + telemetryEnabled: process.env.OTEL_ENABLED !== "false", + prometheusPort: Number.parseInt(process.env.OTEL_EXPORTER_PROMETHEUS_PORT || "9464"), + prometheusHost: process.env.OTEL_EXPORTER_PROMETHEUS_HOST || "0.0.0.0", + metricsExporter: process.env.OTEL_METRICS_EXPORTER || "otlp", + }; + + if (!config.telemetryEnabled) { console.log("📡 Telemetry disabled by configuration (OTEL_ENABLED=false)"); return false; } try { + // Create resource + const resource = resourceFromAttributes({ + [ATTR_SERVICE_NAME]: config.serviceName, + [ATTR_SERVICE_VERSION]: config.serviceVersion, + }); + + // Configure trace exporter + const traceExporter = new OTLPTraceExporter({ + url: `${config.otlpEndpoint}/v1/traces`, + }); + + // Set up tracing + tracerProvider = new NodeTracerProvider({ + resource, + spanProcessors: [new SimpleSpanProcessor(traceExporter)], + }); + + // Create metric readers based on configuration + const metricReaders = createMetricReaders(config); + + // Set up metrics with 
appropriate readers + meterProvider = new MeterProvider({ + resource, + readers: metricReaders, + }); + // Register providers tracerProvider.register(); metrics.setGlobalMeterProvider(meterProvider); @@ -74,16 +115,22 @@ export function initializeTelemetry(): boolean { metricsCollectionCleanup = startSystemMetricsCollection(); telemetryInitialized = true; - + console.log("📡 OpenTelemetry initialized successfully"); - console.log(`📊 Service: ${SERVICE_NAME}@${SERVICE_VERSION}`); - console.log(`📈 Endpoint: ${OTLP_ENDPOINT}`); + console.log(`📊 Service: ${config.serviceName}@${config.serviceVersion}`); + + // Log export configuration + if (config.metricsExporter === "prometheus" || config.metricsExporter === "both") { + console.log(`📈 Prometheus metrics: http://${config.prometheusHost}:${config.prometheusPort}/metrics`); + } + if (config.metricsExporter === "otlp" || config.metricsExporter === "both") { + console.log(`📡 OTLP endpoint: ${config.otlpEndpoint}`); + } // Set up graceful shutdown setupGracefulShutdown(); return true; - } catch (error) { console.error("❌ Failed to initialize telemetry:", error); return false; diff --git a/packages/common/src/telemetry/metrics.ts b/packages/common/src/telemetry/metrics.ts index bdad9dd..38294fb 100644 --- a/packages/common/src/telemetry/metrics.ts +++ b/packages/common/src/telemetry/metrics.ts @@ -1,158 +1,171 @@ import { metrics } from "@opentelemetry/api"; -// Get meter for custom metrics -const meter = metrics.getMeter("seda-overlay-custom", "1.0.0"); +// Lazy-loaded meter - will use the configured provider when first accessed +function getMeter() { + return metrics.getMeter("seda-overlay-custom", "1.0.0"); +} + +// Lazy-loaded metrics - created on first access to ensure proper meter provider +let _sedaMetrics: any = null; /** * Custom metrics for SEDA Overlay observability * Based on error categorization analysis from todos_to_actionable_errors */ -export const sedaMetrics = { - // 
================================================================= - // CRITICAL ERRORS - Immediate alerting required - // ================================================================= - - // CRITICAL-001: Node Boot Failures - nodeBootFailures: meter.createCounter("overlay_node_boot_failures_total", { - description: "Total number of node boot failures", - unit: "1", - }), - - // CRITICAL-002: State Invariant Violations - stateInvariantViolations: meter.createCounter("overlay_state_invariant_violations_total", { - description: "Data request task state invariant violations", - unit: "1", - }), - - // CRITICAL-003: Duplicate Node Detection - duplicateNodeErrors: meter.createCounter("overlay_duplicate_node_errors_total", { - description: "Duplicate node detection errors (reveal hash mismatch)", - unit: "1", - }), - - // CRITICAL-004: Staker Removal - stakerRemovedErrors: meter.createCounter("overlay_staker_removed_errors_total", { - description: "Unexpected staker removal events", - unit: "1", - }), - - // CRITICAL-005: Identity Signing Failure - identitySigningFailures: meter.createCounter("overlay_identity_signing_failures_total", { - description: "Identity signing failures with missing keys", - unit: "1", - }), - - // ================================================================= - // HIGH-PRIORITY RPC ERRORS - Alert after 3 consecutive in 30min - // ================================================================= - - // HIGH-RPC-001: General RPC Connection Issues - rpcConnectionErrors: meter.createCounter("overlay_rpc_connection_errors_total", { - description: "RPC connection failures across the system", - unit: "1", - }), - - // HIGH-RPC-002: Data Request RPC Failures - dataRequestRpcErrors: meter.createCounter("overlay_data_request_rpc_errors_total", { - description: "Data request specific RPC failures", - unit: "1", - }), - - // HIGH-RPC-003: Eligibility Check RPC Failures - eligibilityRpcErrors: meter.createCounter("overlay_eligibility_rpc_errors_total", 
{ - description: "Eligibility check RPC failures", - unit: "1", - }), - - // HIGH-RPC-004: Fetch Task RPC Failures - fetchRpcErrors: meter.createCounter("overlay_fetch_rpc_errors_total", { - description: "Fetch task specific RPC failures", - unit: "1", - }), - - // ================================================================= - // HIGH-PRIORITY OTHER ERRORS - Immediate alerting - // ================================================================= - - // HIGH-001: Callback Message Issues - callbackLookupFailures: meter.createCounter("overlay_callback_lookup_failures_total", { - description: "Callback message lookup failures - fishy behavior detected", - unit: "1", - }), - - // HIGH-002: Execution Result Missing - executionResultMissing: meter.createCounter("overlay_execution_result_missing_total", { - description: "Missing execution results - should not be possible", - unit: "1", - }), - - // HIGH-003: Disk Write Failures - diskWriteFailures: meter.createCounter("overlay_disk_write_failures_total", { - description: "Disk write failures for WASM cache", - unit: "1", - }), - - // HIGH-004: SEDA Transfer Failures - sedaTransferFailures: meter.createCounter("overlay_seda_transfer_failures_total", { - description: "SEDA transfer failures (RPC or insufficient balance)", - unit: "1", - }), - - // HIGH-005: No Stake Available - noStakeErrors: meter.createCounter("overlay_no_stake_errors_total", { - description: "No stake available for operations", - unit: "1", - }), - - // ================================================================= - // OPERATIONAL HEALTH METRICS - // ================================================================= - - // General application health - errorTotal: meter.createCounter("overlay_errors_total", { - description: "Total application errors by type and severity", - unit: "1", - }), - - requestsTotal: meter.createCounter("overlay_requests_total", { - description: "Total application requests processed", - unit: "1", - }), - - 
dataRequestsProcessed: meter.createCounter("overlay_data_requests_processed_total", { - description: "Total data requests processed successfully", - unit: "1", - }), - - // Performance metrics - operationDuration: meter.createHistogram("overlay_operation_duration_ms", { - description: "Duration of various operations in milliseconds", - unit: "ms", - }), - - // Resource utilization - memoryUsage: meter.createGauge("overlay_memory_usage_bytes", { - description: "Memory usage in bytes", - unit: "bytes", - }), - - // RPC health tracking - rpcRequestDuration: meter.createHistogram("overlay_rpc_request_duration_ms", { - description: "RPC request duration in milliseconds", - unit: "ms", - }), - - rpcRequestsTotal: meter.createCounter("overlay_rpc_requests_total", { - description: "Total RPC requests by endpoint and status", - unit: "1", - }), - - // Connection metrics - activeConnections: meter.createUpDownCounter("overlay_active_connections", { - description: "Number of active connections by type", - unit: "1", - }), -}; +export const sedaMetrics = new Proxy({} as any, { + get(target, prop) { + if (!_sedaMetrics) { + const meter = getMeter(); + _sedaMetrics = { + // ================================================================= + // CRITICAL ERRORS - Immediate alerting required + // ================================================================= + + // CRITICAL-001: Node Boot Failures + nodeBootFailures: meter.createCounter("overlay_node_boot_failures_total", { + description: "Total number of node boot failures", + unit: "1", + }), + + // CRITICAL-002: State Invariant Violations + stateInvariantViolations: meter.createCounter("overlay_state_invariant_violations_total", { + description: "Data request task state invariant violations", + unit: "1", + }), + + // CRITICAL-003: Duplicate Node Detection + duplicateNodeErrors: meter.createCounter("overlay_duplicate_node_errors_total", { + description: "Duplicate node detection errors (reveal hash mismatch)", + unit: "1", + 
}), + + // CRITICAL-004: Staker Removal + stakerRemovedErrors: meter.createCounter("overlay_staker_removed_errors_total", { + description: "Unexpected staker removal events", + unit: "1", + }), + + // CRITICAL-005: Identity Signing Failure + identitySigningFailures: meter.createCounter("overlay_identity_signing_failures_total", { + description: "Identity signing failures with missing keys", + unit: "1", + }), + + // ================================================================= + // HIGH-PRIORITY RPC ERRORS - Alert after 3 consecutive in 30min + // ================================================================= + + // HIGH-RPC-001: General RPC Connection Issues + rpcConnectionErrors: meter.createCounter("overlay_rpc_connection_errors_total", { + description: "RPC connection failures across the system", + unit: "1", + }), + + // HIGH-RPC-002: Data Request RPC Failures + dataRequestRpcErrors: meter.createCounter("overlay_data_request_rpc_errors_total", { + description: "Data request specific RPC failures", + unit: "1", + }), + + // HIGH-RPC-003: Eligibility Check RPC Failures + eligibilityRpcErrors: meter.createCounter("overlay_eligibility_rpc_errors_total", { + description: "Eligibility check RPC failures", + unit: "1", + }), + + // HIGH-RPC-004: Fetch Task RPC Failures + fetchRpcErrors: meter.createCounter("overlay_fetch_rpc_errors_total", { + description: "Fetch task specific RPC failures", + unit: "1", + }), + + // ================================================================= + // HIGH-PRIORITY OTHER ERRORS - Immediate alerting + // ================================================================= + + // HIGH-001: Callback Message Issues + callbackLookupFailures: meter.createCounter("overlay_callback_lookup_failures_total", { + description: "Callback message lookup failures - fishy behavior detected", + unit: "1", + }), + + // HIGH-002: Execution Result Missing + executionResultMissing: meter.createCounter("overlay_execution_result_missing_total", { + 
description: "Missing execution results - should not be possible", + unit: "1", + }), + + // HIGH-003: Disk Write Failures + diskWriteFailures: meter.createCounter("overlay_disk_write_failures_total", { + description: "Disk write failures for WASM cache", + unit: "1", + }), + + // HIGH-004: SEDA Transfer Failures + sedaTransferFailures: meter.createCounter("overlay_seda_transfer_failures_total", { + description: "SEDA transfer failures (RPC or insufficient balance)", + unit: "1", + }), + + // HIGH-005: No Stake Available + noStakeErrors: meter.createCounter("overlay_no_stake_errors_total", { + description: "No stake available for operations", + unit: "1", + }), + + // ================================================================= + // OPERATIONAL HEALTH METRICS + // ================================================================= + + // General application health + errorTotal: meter.createCounter("overlay_errors_total", { + description: "Total application errors by type and severity", + unit: "1", + }), + + requestsTotal: meter.createCounter("overlay_requests_total", { + description: "Total application requests processed", + unit: "1", + }), + + dataRequestsProcessed: meter.createCounter("overlay_data_requests_processed_total", { + description: "Total data requests processed successfully", + unit: "1", + }), + + // Performance metrics + operationDuration: meter.createHistogram("overlay_operation_duration_ms", { + description: "Duration of various operations in milliseconds", + unit: "ms", + }), + + // Resource utilization + memoryUsage: meter.createGauge("overlay_memory_usage_bytes", { + description: "Memory usage in bytes", + unit: "bytes", + }), + + // RPC health tracking + rpcRequestDuration: meter.createHistogram("overlay_rpc_request_duration_ms", { + description: "RPC request duration in milliseconds", + unit: "ms", + }), + + rpcRequestsTotal: meter.createCounter("overlay_rpc_requests_total", { + description: "Total RPC requests by endpoint and status", + 
unit: "1", + }), + + // Connection metrics + activeConnections: meter.createUpDownCounter("overlay_active_connections", { + description: "Number of active connections by type", + unit: "1", + }), + }; + } + return _sedaMetrics[prop]; + }, +}); /** * Common attributes to be used with all metrics for consistent labeling @@ -177,7 +190,7 @@ export const metricsHelpers = { recordCriticalError( type: "node_boot" | "state_invariant" | "duplicate_node" | "staker_removed" | "identity_signing", error: Error, - context?: Record + context?: Record, ) { const attributes = { ...getCommonAttributes(), @@ -214,7 +227,7 @@ export const metricsHelpers = { recordHighPriorityError( type: "callback_lookup" | "execution_result_missing" | "disk_write" | "seda_transfer" | "no_stake", error: Error, - context?: Record + context?: Record, ) { const attributes = { ...getCommonAttributes(), @@ -252,7 +265,7 @@ export const metricsHelpers = { type: "general" | "data_request" | "eligibility" | "fetch", endpoint: string, error: Error, - context?: Record + context?: Record, ) { const attributes = { ...getCommonAttributes(), @@ -289,7 +302,7 @@ export const metricsHelpers = { duration: number, success: boolean, error?: Error, - context?: Record + context?: Record, ) { const attributes = { ...getCommonAttributes(), @@ -300,7 +313,7 @@ export const metricsHelpers = { // Record duration sedaMetrics.rpcRequestDuration.record(duration, attributes); - + // Record request count sedaMetrics.rpcRequestsTotal.add(1, attributes); @@ -321,7 +334,7 @@ export const metricsHelpers = { drId: string, stage: "execute" | "commit" | "reveal" | "completed" | "failed", duration?: number, - context?: Record + context?: Record, ) { const attributes = { ...getCommonAttributes(), @@ -344,12 +357,7 @@ export const metricsHelpers = { /** * Record general operation with timing */ - recordOperation( - operationType: string, - duration: number, - success: boolean, - context?: Record - ) { + recordOperation(operationType: 
string, duration: number, success: boolean, context?: Record) { const attributes = { ...getCommonAttributes(), operation_type: operationType, @@ -368,7 +376,7 @@ export const metricsHelpers = { if (typeof process !== "undefined" && process.memoryUsage) { const memUsage = process.memoryUsage(); const attributes = getCommonAttributes(); - + sedaMetrics.memoryUsage.record(memUsage.heapUsed, { ...attributes, memory_type: "heap_used" }); sedaMetrics.memoryUsage.record(memUsage.heapTotal, { ...attributes, memory_type: "heap_total" }); sedaMetrics.memoryUsage.record(memUsage.rss, { ...attributes, memory_type: "rss" }); @@ -392,7 +400,7 @@ export const metricsHelpers = { /** * Start periodic collection of system metrics */ -export function startSystemMetricsCollection(intervalMs: number = 30000) { +export function startSystemMetricsCollection(intervalMs = 30000) { const interval = setInterval(() => { metricsHelpers.updateResourceMetrics(); }, intervalMs); @@ -402,10 +410,5 @@ export function startSystemMetricsCollection(intervalMs: number = 30000) { } // Export individual metrics for specific use cases -export const { - dataRequestsProcessed, - operationDuration, - rpcRequestDuration, - errorTotal, - activeConnections, -} = sedaMetrics; \ No newline at end of file +export const { dataRequestsProcessed, operationDuration, rpcRequestDuration, errorTotal, activeConnections } = + sedaMetrics; diff --git a/packages/node/src/data-request-task.ts b/packages/node/src/data-request-task.ts index a324249..f3b6cd5 100644 --- a/packages/node/src/data-request-task.ts +++ b/packages/node/src/data-request-task.ts @@ -7,8 +7,8 @@ import { JSONStringify, RevealMismatch, RevealStarted, - metricsHelpers, debouncedInterval, + metricsHelpers, sleep, } from "@sedaprotocol/overlay-ts-common"; import type { SedaChain, WorkerPool } from "@sedaprotocol/overlay-ts-common"; @@ -163,7 +163,7 @@ export class DataRequestTask extends EventEmitter { logger.error("Exceeded maximum retry attempts, marking 
data request as failed", { id: this.name, }); - + // Record high-priority RPC connectivity error const retryError = new Error(`Exceeded maximum retry attempts: ${this.retries}`); metricsHelpers.recordRpcError("data_request", "max_retries_exceeded", retryError, { @@ -171,7 +171,7 @@ export class DataRequestTask extends EventEmitter { identity_id: this.identityId, retries: this.retries.toString(), }); - + this.status = IdentityDataRequestStatus.Failed; span.setAttribute("final_status", "failed"); span.setAttribute("failure_reason", "max_retries_exceeded"); @@ -209,7 +209,7 @@ export class DataRequestTask extends EventEmitter { logger.error(`Error while processing data request: ${error}`, { id: this.name, }); - + // Record high-priority RPC connectivity error metricsHelpers.recordRpcError("data_request", "uncaught_exception", error as Error, { dr_id: this.drId, @@ -217,7 +217,7 @@ export class DataRequestTask extends EventEmitter { status: this.status, retries: this.retries.toString(), }); - + span.recordException(error as Error); span.setAttribute("final_status", "error"); span.setAttribute("error_reason", "uncaught_exception"); @@ -247,14 +247,14 @@ export class DataRequestTask extends EventEmitter { logger.error(`Error while fetching status of data request: ${statusResult.error}`, { id: this.drId, }); - + // Record high-priority RPC connectivity error metricsHelpers.recordRpcError("data_request", "status_fetch", statusResult.error, { dr_id: this.drId, identity_id: this.identityId, retries: this.retries.toString(), }); - + span.recordException(statusResult.error); span.setAttribute("error", "fetch_failed"); @@ -302,7 +302,7 @@ export class DataRequestTask extends EventEmitter { logger.error("Invariant found, data request task uses a data request that does not exist", { id: this.name, }); - + // CRITICAL: State Invariant Violation - Missing Data Request const stateError = new Error("Data request task references non-existent data request"); 
metricsHelpers.recordCriticalError("state_invariant", stateError, { @@ -310,7 +310,7 @@ export class DataRequestTask extends EventEmitter { dr_id: this.drId, identity_id: this.identityId, }); - + span.setAttribute("error", "data_request_not_found"); span.end(); this.stop(); @@ -321,7 +321,7 @@ export class DataRequestTask extends EventEmitter { logger.error("Invariant found, data request task uses an identity that does not exist", { id: this.name, }); - + // CRITICAL: State Invariant Violation - Missing Identity const stateError = new Error("Data request task references non-existent identity"); metricsHelpers.recordCriticalError("state_invariant", stateError, { @@ -329,7 +329,7 @@ export class DataRequestTask extends EventEmitter { dr_id: this.drId, identity_id: this.identityId, }); - + span.setAttribute("error", "identity_not_found"); span.end(); this.stop(); @@ -423,7 +423,7 @@ export class DataRequestTask extends EventEmitter { if (this.executionResult.isNothing) { logger.error("No execution result available while trying to commit, switching status back to initial"); - + // HIGH: Execution result missing - should not be possible const missingResultError = new Error("Execution result missing during commit phase"); metricsHelpers.recordHighPriorityError("execution_result_missing", missingResultError, { @@ -431,7 +431,7 @@ export class DataRequestTask extends EventEmitter { dr_id: this.drId, identity_id: this.identityId, }); - + span.setAttribute("error", "no_execution_result"); span.end(); this.transitionStatus(IdentityDataRequestStatus.EligibleForExecution); @@ -564,7 +564,7 @@ export class DataRequestTask extends EventEmitter { if (this.executionResult.isNothing) { logger.error("No execution result available while trying to reveal, switching status back to initial"); - + // HIGH: Execution result missing - should not be possible const missingResultError = new Error("Execution result missing during reveal phase"); 
metricsHelpers.recordHighPriorityError("execution_result_missing", missingResultError, { @@ -572,7 +572,7 @@ export class DataRequestTask extends EventEmitter { dr_id: this.drId, identity_id: this.identityId, }); - + span.setAttribute("error", "no_execution_result"); span.end(); this.transitionStatus(IdentityDataRequestStatus.EligibleForExecution); @@ -604,7 +604,7 @@ export class DataRequestTask extends EventEmitter { logger.error( `Chain responded with an already revealed. Data might be corrupted: ${this.commitHash.toString("hex")} vs ${result.error.commitmentHash.toString("hex")}`, ); - + // CRITICAL: Duplicate Node Detection - Reveal hash mismatch indicates duplicate nodes const duplicateError = new Error("Reveal hash mismatch - possible duplicate nodes"); metricsHelpers.recordCriticalError("duplicate_node", duplicateError, { @@ -614,7 +614,7 @@ export class DataRequestTask extends EventEmitter { our_commit_hash: this.commitHash.toString("hex"), chain_commit_hash: result.error.commitmentHash.toString("hex"), }); - + span.setAttribute("error", "reveal_mismatch"); span.setAttribute("our_commit_hash", this.commitHash.toString("hex")); span.setAttribute("chain_commit_hash", result.error.commitmentHash.toString("hex")); diff --git a/packages/node/src/index.ts b/packages/node/src/index.ts index cc7f4bd..30f8453 100644 --- a/packages/node/src/index.ts +++ b/packages/node/src/index.ts @@ -1,6 +1,6 @@ +import { SedaChain, initializeTelemetry, metricsHelpers, telemetryInitialized } from "@sedaprotocol/overlay-ts-common"; import type { AppConfig } from "@sedaprotocol/overlay-ts-config"; import { logger } from "@sedaprotocol/overlay-ts-logger"; -import { SedaChain, initializeTelemetry, telemetryInitialized, metricsHelpers } from "@sedaprotocol/overlay-ts-common"; import { MainTask } from "./tasks/main"; export interface RunOptions { diff --git a/packages/node/src/services/get-oracle-program.ts b/packages/node/src/services/get-oracle-program.ts index 66428a4..6faf517 100644 
--- a/packages/node/src/services/get-oracle-program.ts +++ b/packages/node/src/services/get-oracle-program.ts @@ -1,6 +1,6 @@ import { readFile, writeFile } from "node:fs/promises"; import { resolve } from "node:path"; -import { tryAsync, metricsHelpers } from "@sedaprotocol/overlay-ts-common"; +import { metricsHelpers, tryAsync } from "@sedaprotocol/overlay-ts-common"; import type { SedaChain } from "@sedaprotocol/overlay-ts-common"; import type { AppConfig } from "@sedaprotocol/overlay-ts-config"; import { logger } from "@sedaprotocol/overlay-ts-logger"; @@ -32,7 +32,9 @@ export async function getOracleProgram( return Result.err(new Error(binary.error.toString())); } - const binaryBuffer = Maybe.of((binary.value as any)?.oracleProgram?.bytecode).map((byteCode) => Buffer.from(byteCode)); + const binaryBuffer = Maybe.of((binary.value as any)?.oracleProgram?.bytecode).map((byteCode) => + Buffer.from(byteCode), + ); if (binaryBuffer.isNothing) { return Result.ok(Maybe.nothing()); diff --git a/packages/node/src/services/is-identity-eligible.ts b/packages/node/src/services/is-identity-eligible.ts index d99c9d3..09d9448 100644 --- a/packages/node/src/services/is-identity-eligible.ts +++ b/packages/node/src/services/is-identity-eligible.ts @@ -1,15 +1,9 @@ import { type Context, type Span, type Tracer, trace } from "@opentelemetry/api"; import type { GetExecutorEligibilityResponse } from "@sedaprotocol/core-contract-schema"; -import { - type SedaChain, - getCurrentBlockHeight, - keccak256, - metricsHelpers, -} from "@sedaprotocol/overlay-ts-common"; +import { type SedaChain, getCurrentBlockHeight, keccak256, metricsHelpers } from "@sedaprotocol/overlay-ts-common"; import { logger } from "@sedaprotocol/overlay-ts-logger"; import { Result } from "true-myth"; import type { DataRequest } from "../models/data-request"; -import type { IdentityPool } from "../models/identitiest-pool"; import { getDrConfig } from "./dr-config"; import { type Staker, getStakers } from 
"./get-staker"; import { getStakingConfig } from "./get-staking-config"; diff --git a/packages/node/src/tasks/identity-manager.ts b/packages/node/src/tasks/identity-manager.ts index 4f6284a..6fe514f 100644 --- a/packages/node/src/tasks/identity-manager.ts +++ b/packages/node/src/tasks/identity-manager.ts @@ -1,4 +1,10 @@ -import { TransactionPriority, debouncedInterval, formatTokenUnits, type SedaChain, metricsHelpers } from "@sedaprotocol/overlay-ts-common"; +import { + type SedaChain, + TransactionPriority, + debouncedInterval, + formatTokenUnits, + metricsHelpers, +} from "@sedaprotocol/overlay-ts-common"; import type { AppConfig } from "@sedaprotocol/overlay-ts-config"; import { logger } from "@sedaprotocol/overlay-ts-logger"; import { Result, type Unit } from "true-myth"; diff --git a/packages/node/src/tasks/is-eligible.ts b/packages/node/src/tasks/is-eligible.ts index 11690d4..bc5b28b 100644 --- a/packages/node/src/tasks/is-eligible.ts +++ b/packages/node/src/tasks/is-eligible.ts @@ -4,11 +4,7 @@ import { createEligibilityHash, createEligibilityMessageData, } from "@sedaprotocol/core-contract-schema"; -import { - type SedaChain, - debouncedInterval, - metricsHelpers, -} from "@sedaprotocol/overlay-ts-common"; +import { type SedaChain, debouncedInterval, metricsHelpers } from "@sedaprotocol/overlay-ts-common"; import type { AppConfig } from "@sedaprotocol/overlay-ts-config"; import { logger } from "@sedaprotocol/overlay-ts-logger"; import { EventEmitter } from "eventemitter3"; From 92bbf134c98d817dfa2d390010d94e1aea76e158 Mon Sep 17 00:00:00 2001 From: kaynetik Date: Fri, 4 Jul 2025 08:17:18 +0200 Subject: [PATCH 3/3] chore: Cleanup comments --- docker-compose.jaeger.yml | 5 +-- docker-compose.yml | 44 -------------------------- packages/common/src/telemetry/index.ts | 11 ------- prometheus.yml | 6 ---- 4 files changed, 1 insertion(+), 65 deletions(-) delete mode 100644 docker-compose.yml diff --git a/docker-compose.jaeger.yml b/docker-compose.jaeger.yml index 
1024fe6..f71912a 100644 --- a/docker-compose.jaeger.yml +++ b/docker-compose.jaeger.yml @@ -1,7 +1,5 @@ -version: '3.8' - +# Temporary - just for testing traces/metrics before switching to mimir collector services: - # Jaeger all-in-one for development/testing jaeger: image: jaegertracing/all-in-one:latest ports: @@ -23,7 +21,6 @@ services: command: - "--memory.max-traces=10000" - # Optional: Prometheus for metrics (can be removed if only testing traces) prometheus: image: prom/prometheus:latest ports: diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index 730ea9a..0000000 --- a/docker-compose.yml +++ /dev/null @@ -1,44 +0,0 @@ -services: - # Jaeger all-in-one for development/testing - jaeger: - image: jaegertracing/all-in-one:latest - ports: - # Jaeger UI - - "16686:16686" - # OTLP gRPC receiver - - "4317:4317" - # OTLP HTTP receiver - - "4318:4318" - # Jaeger thrift - - "14250:14250" - # Jaeger thrift HTTP - - "14268:14268" - # Zipkin compatible endpoint - - "9411:9411" - environment: - - COLLECTOR_OTLP_ENABLED=true - - LOG_LEVEL=debug - command: - - "--memory.max-traces=10000" - - "--query.base-path=/jaeger/ui" - - "--prometheus.server-url=http://prometheus:9090" - - "--prometheus.query.support-spanmetrics-connector=true" - - # Optional: Prometheus for metrics (can be removed if only testing traces) - prometheus: - image: prom/prometheus:latest - ports: - - "9090:9090" - volumes: - - ./prometheus.yml:/etc/prometheus/prometheus.yml - command: - - '--config.file=/etc/prometheus/prometheus.yml' - - '--storage.tsdb.path=/prometheus' - - '--web.console.libraries=/etc/prometheus/console_libraries' - - '--web.console.templates=/etc/prometheus/consoles' - - '--storage.tsdb.retention.time=200h' - - '--web.enable-lifecycle' - -networks: - default: - name: seda-otel-network \ No newline at end of file diff --git a/packages/common/src/telemetry/index.ts b/packages/common/src/telemetry/index.ts index 5e2d911..9133248 100644 --- 
a/packages/common/src/telemetry/index.ts +++ b/packages/common/src/telemetry/index.ts @@ -7,15 +7,11 @@ import { MeterProvider, PeriodicExportingMetricReader } from "@opentelemetry/sdk import { NodeTracerProvider, SimpleSpanProcessor } from "@opentelemetry/sdk-trace-node"; import { ATTR_SERVICE_NAME, ATTR_SERVICE_VERSION } from "@opentelemetry/semantic-conventions"; -// Global state let telemetryInitialized = false; let metricsCollectionCleanup: (() => void) | null = null; let tracerProvider: NodeTracerProvider; let meterProvider: MeterProvider; -/** - * Create metrics readers based on configuration - */ function createMetricReaders(config: { metricsExporter: string; prometheusPort: number; @@ -62,7 +58,6 @@ export function initializeTelemetry(): boolean { return true; } - // Read configuration from environment variables at runtime const config = { otlpEndpoint: process.env.OTLP_ENDPOINT || "http://localhost:4318", serviceName: process.env.OTEL_SERVICE_NAME || "seda-overlay", @@ -137,9 +132,6 @@ export function initializeTelemetry(): boolean { } } -/** - * Setup graceful shutdown handlers - */ function setupGracefulShutdown(): void { const shutdown = async () => { if (!telemetryInitialized) { @@ -170,9 +162,6 @@ function setupGracefulShutdown(): void { process.on("SIGINT", shutdown); } -/** - * Gracefully shutdown telemetry - */ export async function shutdownTelemetry() { if (!telemetryInitialized) { return; diff --git a/prometheus.yml b/prometheus.yml index 359e96c..393ddae 100644 --- a/prometheus.yml +++ b/prometheus.yml @@ -10,9 +10,3 @@ scrape_configs: - job_name: 'jaeger' static_configs: - targets: ['jaeger:14269'] - - # This is for scraping metrics from your application if it exposes a /metrics endpoint - # Uncomment and adjust the target if your seda-overlay application exposes Prometheus metrics - # - job_name: 'seda-overlay' - # static_configs: - # - targets: ['host.docker.internal:8080'] # Adjust port as needed \ No newline at end of file