Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: git checkout
uses: actions/checkout@v2
uses: actions/checkout@v4

- name: go cache
uses: actions/cache@v1
uses: actions/cache@v4
with:
path: /home/runner/work/go
key: lndmon-${{ runner.os }}-go-${{ env.GO_VERSION }}-${{ github.job }}-${{ hashFiles('**/go.sum') }}
Expand All @@ -41,7 +41,7 @@ jobs:
lndmon-${{ runner.os }}-go-

- name: setup go ${{ env.GO_VERSION }}
uses: actions/setup-go@v2
uses: actions/setup-go@v5
with:
go-version: '${{ env.GO_VERSION }}'

Expand Down
3 changes: 3 additions & 0 deletions collectors/log.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ var (

// htlcLogger is a logger for lndmon's htlc collector.
htlcLogger = build.NewSubLogger("HTLC", backendLog.Logger)

// paymentLogger is a logger for lndmon's payments monitor.
paymentLogger = build.NewSubLogger("PMNT", backendLog.Logger)
)

// initLogRotator initializes the logging rotator to write logs to logFile and
Expand Down
175 changes: 175 additions & 0 deletions collectors/payments_collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
package collectors

import (
"context"
"fmt"
"sync"

"github.com/lightninglabs/lndclient"
"github.com/lightningnetwork/lnd/lnrpc"
"github.com/lightningnetwork/lnd/lnrpc/routerrpc"
"github.com/prometheus/client_golang/prometheus"
)

var (
// totalPayments tracks the total number of payments initiated, labeled
// by final payment status. This permits computation of both throughput
// and success/failure rates.
totalPayments = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "lnd_total_payments",
Help: "Total number of payments initiated, labeled by final status",
},
[]string{"status"},
)

// totalHTLCAttempts is a simple counter which, in combination with the
// payment counter, permits tracking the number of attempts per payment.
totalHTLCAttempts = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "lnd_total_htlc_attempts",
Help: "Total number of HTLC attempts across all payments",
},
)

// paymentAttempts is a histogram for visualizing what portion of
// payments complete within a given number of attempts.
paymentAttempts = prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "lnd_payment_attempts_per_payment",
Help: "Histogram tracking the number of attempts per payment",
Buckets: prometheus.ExponentialBuckets(1, 2, 10),
},
)
)

// paymentsMonitor listens for payments and updates Prometheus metrics.
type paymentsMonitor struct {
client routerrpc.RouterClient

lnd *lndclient.LndServices

errChan chan error

// quit is closed to signal that we need to shutdown.
quit chan struct{}

wg sync.WaitGroup
}

// newPaymentsMonitor creates a new payments monitor and ensures the context
// includes macaroon authentication.
func newPaymentsMonitor(lnd *lndclient.LndServices,
errChan chan error) *paymentsMonitor {

return &paymentsMonitor{
client: routerrpc.NewRouterClient(lnd.ClientConn),
lnd: lnd,
errChan: errChan,
quit: make(chan struct{}),
}
}

// start subscribes to `TrackPayments` and updates Prometheus metrics.
func (p *paymentsMonitor) start() error {
paymentLogger.Info("Starting payments monitor...")

// Attach macaroon authentication for the router service.
ctx, cancel := context.WithCancel(context.Background())
ctx, err := p.lnd.WithMacaroonAuthForService(
ctx, lndclient.RouterServiceMac,
)
if err != nil {
cancel()

return fmt.Errorf("failed to get macaroon-authenticated "+
"context: %w", err)
}

stream, err := p.client.TrackPayments(
ctx, &routerrpc.TrackPaymentsRequest{
// NOTE: We only need to know the final result of the
// payment and all attempts.
NoInflightUpdates: true,
},
)
if err != nil {
paymentLogger.Errorf("Failed to subscribe to TrackPayments: %v",
err)

cancel()

return err
}

p.wg.Add(1)
go func() {
defer func() {
cancel()
p.wg.Done()
}()

for {
select {
case <-p.quit:
return

default:
payment, err := stream.Recv()
if err != nil {
paymentLogger.Errorf("Error receiving "+
"payment update: %v", err)

p.errChan <- err
return
}
processPaymentUpdate(payment)
}
}
}()

return nil
}

// stop cancels the payments monitor subscription.
func (p *paymentsMonitor) stop() {
paymentLogger.Info("Stopping payments monitor...")

close(p.quit)
p.wg.Wait()
}

// collectors returns all of the collectors that the htlc monitor uses.
func (p *paymentsMonitor) collectors() []prometheus.Collector {
return []prometheus.Collector{
totalPayments, totalHTLCAttempts, paymentAttempts,
}
}

// processPaymentUpdate updates Prometheus metrics based on received payments.
//
// NOTE: It is expected that this receive the *final* payment update with the
// complete list of all htlc attempts made for this payment.
func processPaymentUpdate(payment *lnrpc.Payment) {
var status string

switch payment.Status {
case lnrpc.Payment_SUCCEEDED:
status = "succeeded"
case lnrpc.Payment_FAILED:
status = "failed"
default:
// We don't expect this given that this should be a terminal
// payment update.
status = "unknown"
}

totalPayments.WithLabelValues(status).Inc()
attemptCount := len(payment.Htlcs)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I don't think this can quite be used as a proxy for attempts. For that we'd need to watch a payment over time, and increment this counter with each attempt.

We may also want to introspect into the payment state itself: https://lightning.engineering/api-docs/api/lnd/router/track-payment-v2/#lnrpcpaymentpaymentstatus

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Double checked, and I think I'm actually wrong about this. We get a new element here for each new attempt, as it isn't just the set of final HTLCs that were settled.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Prior to learning about this TrackPayments RPC, I had created a draft PR to add these metrics as real time counters in lnd directly. Implemented there, the ChannelRouter can increment counters as each attempt is registered. My hope here was that supplying the NoInflightUpdates directive to the TrackPayments call will give us only the final payment update (settle or fail) from which we can make an accurate determination of how many total attempts were made.


totalHTLCAttempts.Add(float64(attemptCount))
paymentAttempts.Observe(float64(attemptCount))

paymentLogger.Debugf("Payment %s updated: status=%s, %d attempts",
payment.PaymentHash, status, attemptCount)
}
41 changes: 34 additions & 7 deletions collectors/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ type PrometheusExporter struct {

monitoringCfg *MonitoringConfig

htlcMonitor *htlcMonitor
htlcMonitor *htlcMonitor
paymentsMonitor *paymentsMonitor

// collectors is the exporter's active set of collectors.
collectors []prometheus.Collector
Expand Down Expand Up @@ -72,6 +73,9 @@ type MonitoringConfig struct {
// DisableHtlc disables collection of HTLCs metrics
DisableHtlc bool

// DisablePayments disables collection of payment metrics
DisablePayments bool

// ProgramStartTime stores a best-effort estimate of when lnd/lndmon was
// started.
ProgramStartTime time.Time
Expand Down Expand Up @@ -100,6 +104,9 @@ func NewPrometheusExporter(cfg *PrometheusConfig, lnd *lndclient.LndServices,

htlcMonitor := newHtlcMonitor(lnd.Router, errChan)

// Create payments monitor.
paymentsMonitor := newPaymentsMonitor(lnd, errChan)

chanCollector := NewChannelsCollector(
lnd.Client, errChan, quitChan, monitoringCfg,
)
Expand All @@ -117,19 +124,26 @@ func NewPrometheusExporter(cfg *PrometheusConfig, lnd *lndclient.LndServices,
collectors = append(collectors, htlcMonitor.collectors()...)
}

if !monitoringCfg.DisablePayments {
collectors = append(
collectors, paymentsMonitor.collectors()...,
)
}

if !monitoringCfg.DisableGraph {
collectors = append(
collectors, NewGraphCollector(lnd.Client, errChan),
)
}

return &PrometheusExporter{
cfg: cfg,
lnd: lnd,
monitoringCfg: monitoringCfg,
collectors: collectors,
htlcMonitor: htlcMonitor,
errChan: errChan,
cfg: cfg,
lnd: lnd,
monitoringCfg: monitoringCfg,
collectors: collectors,
htlcMonitor: htlcMonitor,
paymentsMonitor: paymentsMonitor,
errChan: errChan,
}
}

Expand Down Expand Up @@ -165,6 +179,15 @@ func (p *PrometheusExporter) Start() error {
}
}

// Start the payment monitor goroutine. This will subscribe to receive
// update for all payments made by lnd and update our payments related
// metrics.
if !p.monitoringCfg.DisablePayments {
if err := p.paymentsMonitor.start(); err != nil {
return err
}
}

// Finally, we'll launch the HTTP server that Prometheus will use to
// scape our metrics.
go func() {
Expand Down Expand Up @@ -199,6 +222,10 @@ func (p *PrometheusExporter) Stop() {
if !p.monitoringCfg.DisableHtlc {
p.htlcMonitor.stop()
}

if !p.monitoringCfg.DisablePayments {
p.paymentsMonitor.stop()
}
}

// Errors returns an error channel that any failures experienced by its
Expand Down
3 changes: 3 additions & 0 deletions config.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ type config struct {

// DisableHtlc disables the collection of HTLCs metrics.
DisableHtlc bool `long:"disablehtlc" description:"Do not collect HTLCs metrics"`

// DisablePayments disables the collection of payments metrics.
DisablePayments bool `long:"disablepayments" description:"Do not collect payments metrics"`
}

var defaultConfig = config{
Expand Down
5 changes: 3 additions & 2 deletions lndmon.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,9 @@ func start() error {
defer lnd.Close()

monitoringCfg := collectors.MonitoringConfig{
DisableGraph: cfg.DisableGraph,
DisableHtlc: cfg.DisableHtlc,
DisableGraph: cfg.DisableGraph,
DisableHtlc: cfg.DisableHtlc,
DisablePayments: cfg.DisablePayments,
}
if cfg.PrimaryNode != "" {
primaryNode, err := route.NewVertexFromStr(cfg.PrimaryNode)
Expand Down