diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 395a4d6..b8c7538 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -27,10 +27,10 @@ jobs: runs-on: ubuntu-latest steps: - name: git checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: go cache - uses: actions/cache@v1 + uses: actions/cache@v4 with: path: /home/runner/work/go key: lndmon-${{ runner.os }}-go-${{ env.GO_VERSION }}-${{ github.job }}-${{ hashFiles('**/go.sum') }} @@ -41,7 +41,7 @@ jobs: lndmon-${{ runner.os }}-go- - name: setup go ${{ env.GO_VERSION }} - uses: actions/setup-go@v2 + uses: actions/setup-go@v5 with: go-version: '${{ env.GO_VERSION }}' diff --git a/collectors/log.go b/collectors/log.go index 6c5977d..25914cb 100755 --- a/collectors/log.go +++ b/collectors/log.go @@ -20,6 +20,9 @@ var ( // htlcLogger is a logger for lndmon's htlc collector. htlcLogger = build.NewSubLogger("HTLC", backendLog.Logger) + + // paymentLogger is a logger for lndmon's payments monitor. + paymentLogger = build.NewSubLogger("PMNT", backendLog.Logger) ) // initLogRotator initializes the logging rotator to write logs to logFile and diff --git a/collectors/payments_collector.go b/collectors/payments_collector.go new file mode 100644 index 0000000..e1b4f3e --- /dev/null +++ b/collectors/payments_collector.go @@ -0,0 +1,175 @@ +package collectors + +import ( + "context" + "fmt" + "sync" + + "github.com/lightninglabs/lndclient" + "github.com/lightningnetwork/lnd/lnrpc" + "github.com/lightningnetwork/lnd/lnrpc/routerrpc" + "github.com/prometheus/client_golang/prometheus" +) + +var ( + // totalPayments tracks the total number of payments initiated, labeled + // by final payment status. This permits computation of both throughput + // and success/failure rates. + totalPayments = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "lnd_total_payments", + Help: "Total number of payments initiated, labeled by final status", + }, + []string{"status"}, + ) + + // totalHTLCAttempts is a simple counter which, in combination with the + // payment counter, permits tracking the number of attempts per payment. + totalHTLCAttempts = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "lnd_total_htlc_attempts", + Help: "Total number of HTLC attempts across all payments", + }, + ) + + // paymentAttempts is a histogram for visualizing what portion of + // payments complete within a given number of attempts. + paymentAttempts = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Name: "lnd_payment_attempts_per_payment", + Help: "Histogram tracking the number of attempts per payment", + Buckets: prometheus.ExponentialBuckets(1, 2, 10), + }, + ) +) + +// paymentsMonitor listens for payments and updates Prometheus metrics. +type paymentsMonitor struct { + client routerrpc.RouterClient + + lnd *lndclient.LndServices + + errChan chan error + + // quit is closed to signal that we need to shutdown. + quit chan struct{} + + wg sync.WaitGroup +} + +// newPaymentsMonitor creates a new payments monitor and ensures the context +// includes macaroon authentication. +func newPaymentsMonitor(lnd *lndclient.LndServices, + errChan chan error) *paymentsMonitor { + + return &paymentsMonitor{ + client: routerrpc.NewRouterClient(lnd.ClientConn), + lnd: lnd, + errChan: errChan, + quit: make(chan struct{}), + } +} + +// start subscribes to `TrackPayments` and updates Prometheus metrics. +func (p *paymentsMonitor) start() error { + paymentLogger.Info("Starting payments monitor...") + + // Attach macaroon authentication for the router service. + ctx, cancel := context.WithCancel(context.Background()) + ctx, err := p.lnd.WithMacaroonAuthForService( + ctx, lndclient.RouterServiceMac, + ) + if err != nil { + cancel() + + return fmt.Errorf("failed to get macaroon-authenticated "+ + "context: %w", err) + } + + stream, err := p.client.TrackPayments( + ctx, &routerrpc.TrackPaymentsRequest{ + // NOTE: We only need to know the final result of the + // payment and all attempts. + NoInflightUpdates: true, + }, + ) + if err != nil { + paymentLogger.Errorf("Failed to subscribe to TrackPayments: %v", + err) + + cancel() + + return err + } + + p.wg.Add(1) + go func() { + defer func() { + cancel() + p.wg.Done() + }() + + for { + select { + case <-p.quit: + return + + default: + payment, err := stream.Recv() + if err != nil { + paymentLogger.Errorf("Error receiving "+ + "payment update: %v", err) + + p.errChan <- err + return + } + processPaymentUpdate(payment) + } + } + }() + + return nil +} + +// stop cancels the payments monitor subscription. +func (p *paymentsMonitor) stop() { + paymentLogger.Info("Stopping payments monitor...") + + close(p.quit) + p.wg.Wait() +} + +// collectors returns all of the collectors that the htlc monitor uses. +func (p *paymentsMonitor) collectors() []prometheus.Collector { + return []prometheus.Collector{ + totalPayments, totalHTLCAttempts, paymentAttempts, + } +} + +// processPaymentUpdate updates Prometheus metrics based on received payments. +// +// NOTE: It is expected that this receive the *final* payment update with the +// complete list of all htlc attempts made for this payment. +func processPaymentUpdate(payment *lnrpc.Payment) { + var status string + + switch payment.Status { + case lnrpc.Payment_SUCCEEDED: + status = "succeeded" + case lnrpc.Payment_FAILED: + status = "failed" + default: + // We don't expect this given that this should be a terminal + // payment update. + status = "unknown" + } + + totalPayments.WithLabelValues(status).Inc() + attemptCount := len(payment.Htlcs) + + totalHTLCAttempts.Add(float64(attemptCount)) + paymentAttempts.Observe(float64(attemptCount)) + + paymentLogger.Debugf("Payment %s updated: status=%s, %d attempts", + payment.PaymentHash, status, attemptCount) +} diff --git a/collectors/prometheus.go b/collectors/prometheus.go index c4b61de..e470a21 100644 --- a/collectors/prometheus.go +++ b/collectors/prometheus.go @@ -32,7 +32,8 @@ type PrometheusExporter struct { monitoringCfg *MonitoringConfig - htlcMonitor *htlcMonitor + htlcMonitor *htlcMonitor + paymentsMonitor *paymentsMonitor // collectors is the exporter's active set of collectors. collectors []prometheus.Collector @@ -72,6 +73,9 @@ type MonitoringConfig struct { // DisableHtlc disables collection of HTLCs metrics DisableHtlc bool + // DisablePayments disables collection of payment metrics + DisablePayments bool + // ProgramStartTime stores a best-effort estimate of when lnd/lndmon was // started. ProgramStartTime time.Time @@ -100,6 +104,9 @@ func NewPrometheusExporter(cfg *PrometheusConfig, lnd *lndclient.LndServices, htlcMonitor := newHtlcMonitor(lnd.Router, errChan) + // Create payments monitor. + paymentsMonitor := newPaymentsMonitor(lnd, errChan) + chanCollector := NewChannelsCollector( lnd.Client, errChan, quitChan, monitoringCfg, ) @@ -117,6 +124,12 @@ func NewPrometheusExporter(cfg *PrometheusConfig, lnd *lndclient.LndServices, collectors = append(collectors, htlcMonitor.collectors()...) } + if !monitoringCfg.DisablePayments { + collectors = append( + collectors, paymentsMonitor.collectors()..., + ) + } + if !monitoringCfg.DisableGraph { collectors = append( collectors, NewGraphCollector(lnd.Client, errChan), @@ -124,12 +137,13 @@ func NewPrometheusExporter(cfg *PrometheusConfig, lnd *lndclient.LndServices, } return &PrometheusExporter{ - cfg: cfg, - lnd: lnd, - monitoringCfg: monitoringCfg, - collectors: collectors, - htlcMonitor: htlcMonitor, - errChan: errChan, + cfg: cfg, + lnd: lnd, + monitoringCfg: monitoringCfg, + collectors: collectors, + htlcMonitor: htlcMonitor, + paymentsMonitor: paymentsMonitor, + errChan: errChan, } } @@ -165,6 +179,15 @@ func (p *PrometheusExporter) Start() error { } } + // Start the payment monitor goroutine. This will subscribe to receive + // update for all payments made by lnd and update our payments related + // metrics. + if !p.monitoringCfg.DisablePayments { + if err := p.paymentsMonitor.start(); err != nil { + return err + } + } + // Finally, we'll launch the HTTP server that Prometheus will use to // scape our metrics. go func() { @@ -199,6 +222,10 @@ func (p *PrometheusExporter) Stop() { if !p.monitoringCfg.DisableHtlc { p.htlcMonitor.stop() } + + if !p.monitoringCfg.DisablePayments { + p.paymentsMonitor.stop() + } } // Errors returns an error channel that any failures experienced by its diff --git a/config.go b/config.go index 1e21cb1..a6fb1da 100755 --- a/config.go +++ b/config.go @@ -54,6 +54,9 @@ type config struct { // DisableHtlc disables the collection of HTLCs metrics. DisableHtlc bool `long:"disablehtlc" description:"Do not collect HTLCs metrics"` + + // DisablePayments disables the collection of payments metrics. + DisablePayments bool `long:"disablepayments" description:"Do not collect payments metrics"` } var defaultConfig = config{ diff --git a/lndmon.go b/lndmon.go index 6dd362c..44e5bae 100755 --- a/lndmon.go +++ b/lndmon.go @@ -63,8 +63,9 @@ func start() error { defer lnd.Close() monitoringCfg := collectors.MonitoringConfig{ - DisableGraph: cfg.DisableGraph, - DisableHtlc: cfg.DisableHtlc, + DisableGraph: cfg.DisableGraph, + DisableHtlc: cfg.DisableHtlc, + DisablePayments: cfg.DisablePayments, } if cfg.PrimaryNode != "" { primaryNode, err := route.NewVertexFromStr(cfg.PrimaryNode)