Commit d7a7af0

🐛 fix: do not count context errors as failure to renew a lock (#214)
* do not count context errors as failure to renew a lock
* add tests for newly exposed funcs
1 parent 280b617 commit d7a7af0
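
The fix hinges on how the error returned by RenewMessageLock is classified: a renewal that aborts because the handler's context is canceled or past its deadline is a normal shutdown, not a renewal failure. The following standalone sketch (not part of this commit; the function name is illustrative) shows why errors.Is is the right check, since it also matches context errors wrapped by the SDK or by fmt.Errorf:

package main

import (
	"context"
	"errors"
	"fmt"
)

// isContextErr mirrors the check added in v2/lockrenewer.go: context errors
// signal that renewal should stop, not that it failed.
func isContextErr(err error) bool {
	return errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded)
}

func main() {
	wrapped := fmt.Errorf("renew message lock: %w", context.Canceled)
	fmt.Println(isContextErr(wrapped))                    // true: stop renewal, skip the failure metric
	fmt.Println(isContextErr(errors.New("amqp timeout"))) // false: count the failure and retry on the next tick
}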

File tree: 4 files changed, +75 -20 lines

Diff for: v2/lockrenewer.go (+20 -4)

@@ -9,9 +9,10 @@ import (
 
 	"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
 	"github.com/Azure/azure-sdk-for-go/sdk/messaging/azservicebus"
-	"github.com/Azure/go-shuttle/v2/metrics/processor"
 	"go.opentelemetry.io/otel/attribute"
 	"go.opentelemetry.io/otel/trace"
+
+	"github.com/Azure/go-shuttle/v2/metrics/processor"
 )
 
 // LockRenewer abstracts the servicebus receiver client to only expose lock renewal
@@ -26,25 +27,33 @@ type LockRenewalOptions struct {
 	// CancelMessageContextOnStop will cancel the downstream message context when the renewal handler is stopped.
 	// Defaults to true.
 	CancelMessageContextOnStop *bool
+	// MetricRecorder allows to pass a custom metric recorder for the LockRenewer.
+	// Defaults to processor.Metric instance.
+	MetricRecorder processor.Recorder
 }
 
 // NewLockRenewalHandler returns a middleware handler that will renew the lock on the message at the specified interval.
 func NewLockRenewalHandler(lockRenewer LockRenewer, options *LockRenewalOptions, handler Handler) HandlerFunc {
 	interval := 10 * time.Second
 	cancelMessageContextOnStop := true
+	metricRecorder := processor.Metric
 	if options != nil {
 		if options.Interval != nil {
 			interval = *options.Interval
 		}
 		if options.CancelMessageContextOnStop != nil {
 			cancelMessageContextOnStop = *options.CancelMessageContextOnStop
 		}
+		if options.MetricRecorder != nil {
+			metricRecorder = options.MetricRecorder
+		}
 	}
 	return func(ctx context.Context, settler MessageSettler, message *azservicebus.ReceivedMessage) {
 		plr := &peekLockRenewer{
 			next:                   handler,
 			lockRenewer:            lockRenewer,
 			renewalInterval:        &interval,
+			metrics:                metricRecorder,
 			cancelMessageCtxOnStop: cancelMessageContextOnStop,
 			stopped:                make(chan struct{}, 1), // buffered channel to ensure we are not blocking
 		}
@@ -74,6 +83,7 @@ type peekLockRenewer struct {
 	next                   Handler
 	lockRenewer            LockRenewer
 	renewalInterval        *time.Duration
+	metrics                processor.Recorder
 	alive                  atomic.Bool
 	cancelMessageCtxOnStop bool
 	cancelMessageCtx       func()
@@ -124,7 +134,13 @@ func (plr *peekLockRenewer) startPeriodicRenewal(ctx context.Context, message *a
 			err := plr.lockRenewer.RenewMessageLock(ctx, message, nil)
 			if err != nil {
 				log(ctx, fmt.Sprintf("failed to renew lock: %s", err))
-				processor.Metric.IncMessageLockRenewedFailure(message)
+				if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+					// if the error is a context error
+					// we stop and let the next loop iteration handle the exit.
+					plr.stop(ctx)
+					continue
+				}
+				plr.metrics.IncMessageLockRenewedFailure(message)
 				// The context is canceled when the message handler returns from the processor.
 				// This can happen if we already entered the interval case when the message processing completes.
 				// The best we can do is log and retry on the next tick. The sdk already retries operations on recoverable network errors.
@@ -140,14 +156,14 @@ func (plr *peekLockRenewer) startPeriodicRenewal(ctx context.Context, message *a
 				continue
 			}
 			span.AddEvent("message lock renewed", trace.WithAttributes(attribute.Int("count", count)))
-			processor.Metric.IncMessageLockRenewedSuccess(message)
+			plr.metrics.IncMessageLockRenewedSuccess(message)
 		case <-ctx.Done():
 			log(ctx, "context done: stopping periodic renewal")
 			span.AddEvent("context done: stopping message lock renewal")
 			err := ctx.Err()
 			if errors.Is(err, context.DeadlineExceeded) {
 				span.RecordError(err)
-				processor.Metric.IncMessageDeadlineReachedCount(message)
+				plr.metrics.IncMessageDeadlineReachedCount(message)
 			}
 			plr.stop(ctx)
 		case <-plr.stopped:

Diff for: v2/lockrenewer_test.go (+36 -10)

@@ -11,8 +11,10 @@ import (
 	"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
 	"github.com/Azure/azure-sdk-for-go/sdk/messaging/azservicebus"
 	. "github.com/onsi/gomega"
+	"github.com/prometheus/client_golang/prometheus"
 
 	"github.com/Azure/go-shuttle/v2"
+	"github.com/Azure/go-shuttle/v2/metrics/processor"
 )
 
 type fakeSBLockRenewer struct {
@@ -150,24 +152,40 @@ func Test_RenewPeriodically_Error(t *testing.T) {
 		isRenewerCanceled bool
 		cancelCtxOnStop   *bool
 		gotMessageCtx     context.Context
-		verify            func(g Gomega, tc *testCase)
+		verify            func(g Gomega, tc *testCase, metrics *processor.Informer)
 	}
 	testCases := []testCase{
 		{
 			name:    "continue periodic renewal on unknown error",
 			renewer: &fakeSBLockRenewer{Err: fmt.Errorf("unknown error")},
-			verify: func(g Gomega, tc *testCase) {
+			verify: func(g Gomega, tc *testCase, metrics *processor.Informer) {
 				g.Eventually(
 					func(g Gomega) { g.Expect(tc.renewer.RenewCount.Load()).To(Equal(int32(2))) },
 					130*time.Millisecond,
 					20*time.Millisecond).Should(Succeed())
 			},
 		},
+		{
+			name:              "stop periodic renewal on context canceled",
+			isRenewerCanceled: false,
+			renewer:           &fakeSBLockRenewer{Err: context.Canceled},
+			verify: func(g Gomega, tc *testCase, metrics *processor.Informer) {
+				g.Consistently(
+					func(g Gomega) {
+						g.Expect(tc.renewer.RenewCount.Load()).To(Equal(int32(1)),
+							"should not attempt to renew")
+						g.Expect(metrics.GetMessageLockRenewedFailureCount()).To(Equal(float64(0)),
+							"should not record failure metric")
+					},
+					130*time.Millisecond,
+					20*time.Millisecond).Should(Succeed())
+			},
+		},
 		{
 			name:              "stop periodic renewal on context canceled",
 			isRenewerCanceled: true,
 			renewer:           &fakeSBLockRenewer{Err: context.Canceled},
-			verify: func(g Gomega, tc *testCase) {
+			verify: func(g Gomega, tc *testCase, metrics *processor.Informer) {
 				g.Consistently(
 					func(g Gomega) { g.Expect(tc.renewer.RenewCount.Load()).To(Equal(int32(0))) },
 					130*time.Millisecond,
@@ -177,7 +195,7 @@ func Test_RenewPeriodically_Error(t *testing.T) {
 		{
 			name:    "stop periodic renewal on permanent error (lockLost)",
 			renewer: &fakeSBLockRenewer{Err: &azservicebus.Error{Code: azservicebus.CodeLockLost}},
-			verify: func(g Gomega, tc *testCase) {
+			verify: func(g Gomega, tc *testCase, metrics *processor.Informer) {
 				g.Consistently(
 					func(g Gomega) { g.Expect(tc.renewer.RenewCount.Load()).To(Equal(int32(1))) },
 					130*time.Millisecond,
@@ -187,7 +205,7 @@ func Test_RenewPeriodically_Error(t *testing.T) {
 		{
 			name:    "cancel message context on stop by default",
 			renewer: &fakeSBLockRenewer{Err: &azservicebus.Error{Code: azservicebus.CodeLockLost}},
-			verify: func(g Gomega, tc *testCase) {
+			verify: func(g Gomega, tc *testCase, metrics *processor.Informer) {
 				g.Consistently(
 					func(g Gomega) { g.Expect(tc.renewer.RenewCount.Load()).To(Equal(int32(1))) },
 					130*time.Millisecond,
@@ -199,7 +217,7 @@ func Test_RenewPeriodically_Error(t *testing.T) {
 			name:            "does not cancel message context on stop if disabled",
 			renewer:         &fakeSBLockRenewer{Err: &azservicebus.Error{Code: azservicebus.CodeLockLost}},
 			cancelCtxOnStop: to.Ptr(false),
-			verify: func(g Gomega, tc *testCase) {
+			verify: func(g Gomega, tc *testCase, metrics *processor.Informer) {
 				g.Consistently(
 					func(g Gomega) {
 						g.Expect(tc.renewer.RenewCount.Load()).To(Equal(int32(1)))
@@ -212,7 +230,7 @@ func Test_RenewPeriodically_Error(t *testing.T) {
 		{
 			name:    "continue periodic renewal on transient error (timeout)",
 			renewer: &fakeSBLockRenewer{Err: &azservicebus.Error{Code: azservicebus.CodeTimeout}},
-			verify: func(g Gomega, tc *testCase) {
+			verify: func(g Gomega, tc *testCase, metrics *processor.Informer) {
 				g.Eventually(
 					func(g Gomega) { g.Expect(tc.renewer.RenewCount.Load()).To(Equal(int32(2))) },
 					140*time.Millisecond,
@@ -225,7 +243,15 @@ func Test_RenewPeriodically_Error(t *testing.T) {
 		t.Run(tc.name, func(t *testing.T) {
 			t.Parallel()
 			interval := 50 * time.Millisecond
-			lr := shuttle.NewLockRenewalHandler(tc.renewer, &shuttle.LockRenewalOptions{Interval: &interval, CancelMessageContextOnStop: tc.cancelCtxOnStop},
+			reg := processor.NewRegistry()
+			reg.Init(prometheus.NewRegistry())
+			informer := processor.NewInformerFor(reg)
+			lr := shuttle.NewLockRenewalHandler(tc.renewer,
+				&shuttle.LockRenewalOptions{
+					Interval:                   &interval,
+					CancelMessageContextOnStop: tc.cancelCtxOnStop,
+					MetricRecorder:             reg,
+				},
 				shuttle.HandlerFunc(func(ctx context.Context, settler shuttle.MessageSettler,
 					message *azservicebus.ReceivedMessage) {
 					tc.gotMessageCtx = ctx
@@ -237,13 +263,13 @@ func Test_RenewPeriodically_Error(t *testing.T) {
 				}
 			}))
 			msg := &azservicebus.ReceivedMessage{}
-			ctx, cancel := context.WithTimeout(context.TODO(), 200*time.Millisecond)
+			ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
 			if tc.isRenewerCanceled {
 				cancel()
 			}
 			defer cancel()
 			lr.Handle(ctx, &fakeSettler{}, msg)
-			tc.verify(NewWithT(t), &tc)
+			tc.verify(NewWithT(t), &tc, informer)
 		})
 	}
 }

Diff for: v2/metrics/processor/types.go (+11 -3)

@@ -17,12 +17,13 @@ const (
 )
 
 var (
-	metricsRegistry = newRegistry()
+	metricsRegistry = NewRegistry()
 	// Metric exposes a Recorder interface to manipulate the Processor metrics.
 	Metric Recorder = metricsRegistry
 )
 
-func newRegistry() *Registry {
+// NewRegistry creates a new Registry with initialized prometheus counter definitions
+func NewRegistry() *Registry {
 	return &Registry{
 		MessageReceivedCount: prom.NewCounterVec(prom.CounterOpts{
 			Name: "message_received_total",
@@ -59,6 +60,7 @@ func getMessageTypeLabel(msg *azservicebus.ReceivedMessage) prom.Labels {
 	}
 }
 
+// Init registers the counters from the Registry on the prometheus.Registerer
 func (m *Registry) Init(reg prom.Registerer) {
 	reg.MustRegister(
 		m.MessageReceivedCount,
@@ -68,6 +70,7 @@ func (m *Registry) Init(reg prom.Registerer) {
 		m.ConcurrentMessageCount)
 }
 
+// Registry provides the prometheus metrics for the message processor
 type Registry struct {
 	MessageReceivedCount *prom.CounterVec
 	MessageHandledCount  *prom.CounterVec
@@ -137,7 +140,12 @@ type Informer struct {
 
 // NewInformer creates an Informer for the current registry
 func NewInformer() *Informer {
-	return &Informer{registry: metricsRegistry}
+	return NewInformerFor(metricsRegistry)
+}
+
+// NewInformerFor creates an Informer for the current registry
+func NewInformerFor(r *Registry) *Informer {
+	return &Informer{registry: r}
 }
 
 // GetMessageLockRenewedFailureCount retrieves the current value of the MessageLockRenewedFailureCount metric
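
Exporting NewRegistry and NewInformerFor makes it possible to keep metrics isolated per component or per test and to read values back without touching the package-level Metric recorder. A small sketch of that pattern, mirroring the wiring the updated lockrenewer_test.go uses:

package main

import (
	"fmt"

	"github.com/Azure/azure-sdk-for-go/sdk/messaging/azservicebus"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/Azure/go-shuttle/v2/metrics/processor"
)

func main() {
	// A registry of its own: counters are registered on a fresh prometheus
	// registry, so values do not leak across tests or components.
	reg := processor.NewRegistry()
	reg.Init(prometheus.NewRegistry())

	// Record a renewal failure against that registry...
	msg := &azservicebus.ReceivedMessage{}
	reg.IncMessageLockRenewedFailure(msg)

	// ...and read it back through an Informer bound to the same registry.
	informer := processor.NewInformerFor(reg)
	count, err := informer.GetMessageLockRenewedFailureCount()
	fmt.Println(count, err) // expected: 1 <nil>
}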

Diff for: v2/metrics/processor/types_test.go (+8 -3)

@@ -26,13 +26,18 @@ func (f *fakeRegistry) Unregister(c prometheus.Collector) bool {
 
 func TestRegistry_Init(t *testing.T) {
 	g := NewWithT(t)
-	r := newRegistry()
+	r := NewRegistry()
 	fRegistry := &fakeRegistry{}
 	g.Expect(func() { r.Init(prometheus.NewRegistry()) }).ToNot(Panic())
 	g.Expect(func() { r.Init(fRegistry) }).ToNot(Panic())
 	g.Expect(fRegistry.collectors).To(HaveLen(5))
 	Metric.IncMessageReceived(10)
+}
 
+func TestNewInformerDefault(t *testing.T) {
+	i := NewInformer()
+	g := NewWithT(t)
+	g.Expect(i.registry).To(Equal(Metric))
 }
 
 func TestMetrics(t *testing.T) {
@@ -55,9 +60,9 @@ func TestMetrics(t *testing.T) {
 		},
 	} {
 		g := NewWithT(t)
-		r := newRegistry()
+		r := NewRegistry()
 		registerer := prometheus.NewRegistry()
-		informer := &Informer{registry: r}
+		informer := NewInformerFor(r)
 
 		// before init
 		count, err := informer.GetMessageLockRenewedFailureCount()
