-
Notifications
You must be signed in to change notification settings - Fork 12
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improves/Fixes TableBackedProgressManager #10
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
package main | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"log" | ||
"os" | ||
"time" | ||
|
||
"github.com/gocql/gocql" | ||
scyllacdc "github.com/scylladb/scylla-cdc-go" | ||
) | ||
|
||
// Make sure you create the following table before you run this example: | ||
// CREATE TABLE ks.tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck)) WITH cdc = {'enabled': 'true'}; | ||
|
||
func main() { | ||
cluster := gocql.NewCluster("127.0.0.1") | ||
cluster.PoolConfig.HostSelectionPolicy = gocql.TokenAwareHostPolicy(gocql.RoundRobinHostPolicy()) | ||
session, err := cluster.CreateSession() | ||
if err != nil { | ||
log.Fatal(err) | ||
return | ||
} | ||
defer session.Close() | ||
|
||
var progressManager scyllacdc.ProgressManager | ||
progressManager, err = scyllacdc.NewTableBackedProgressManager(session, "ks.cdc_progress", "cdc-replicator") | ||
if err != nil { | ||
log.Fatal(err) | ||
return | ||
} | ||
|
||
logger := log.New(os.Stderr, "", log.Ldate|log.Lmicroseconds|log.Lshortfile) | ||
|
||
cfg := &scyllacdc.ReaderConfig{ | ||
Session: session, | ||
TableNames: []string{"ks.tbl"}, | ||
ChangeConsumerFactory: &myFactory{ | ||
logger: logger, | ||
progressReporterInterval: 30 * time.Second, | ||
}, | ||
Logger: logger, | ||
Advanced: scyllacdc.AdvancedReaderConfig{ | ||
ChangeAgeLimit: 15 * time.Minute, | ||
ConfidenceWindowSize: 10 * time.Second, | ||
PostQueryDelay: 5 * time.Second, | ||
PostFailedQueryDelay: 5 * time.Second, | ||
QueryTimeWindowSize: 5 * 60 * time.Second, | ||
}, | ||
ProgressManager: progressManager, | ||
} | ||
|
||
reader, err := scyllacdc.NewReader(context.Background(), cfg) | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
|
||
if err := reader.Run(context.Background()); err != nil { | ||
log.Fatal(err) | ||
} | ||
} | ||
|
||
func consumeChange(ctx context.Context, tableName string, c scyllacdc.Change) error { | ||
for _, changeRow := range c.Delta { | ||
pkRaw, _ := changeRow.GetValue("pk") | ||
ckRaw, _ := changeRow.GetValue("ck") | ||
v := changeRow.GetAtomicChange("v") | ||
|
||
pk := pkRaw.(*int) | ||
ck := ckRaw.(*int) | ||
|
||
fmt.Printf("[%s] Operation: %s, pk: %s, ck: %s\n", tableName, changeRow.GetOperation(), | ||
nullableIntToStr(pk), nullableIntToStr(ck)) | ||
|
||
if v.IsDeleted { | ||
fmt.Printf(" Column v was set to null/deleted\n") | ||
} else { | ||
vInt := v.Value.(*int) | ||
if vInt != nil { | ||
fmt.Printf(" Column v was set to %d\n", *vInt) | ||
} else { | ||
fmt.Print(" Column v was not changed\n") | ||
} | ||
} | ||
} | ||
|
||
return nil | ||
} | ||
|
||
func nullableIntToStr(i *int) string { | ||
if i == nil { | ||
return "null" | ||
} | ||
return fmt.Sprintf("%d", *i) | ||
} | ||
|
||
type myConsumer struct { | ||
// PeriodicProgressReporter is a wrapper around ProgressReporter | ||
// which rate-limits saving the progress | ||
reporter *scyllacdc.PeriodicProgressReporter | ||
logger scyllacdc.Logger | ||
f scyllacdc.ChangeConsumerFunc | ||
tableName string | ||
} | ||
|
||
func (mc *myConsumer) Consume(ctx context.Context, change scyllacdc.Change) error { | ||
// ... do work ... | ||
mc.logger.Printf("myConsumer.Consume...\n") | ||
err := mc.f(ctx, mc.tableName, change) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
mc.reporter.Update(change.Time) | ||
return nil | ||
} | ||
|
||
func (mc *myConsumer) Empty(ctx context.Context, newTime gocql.UUID) error { | ||
mc.reporter.Update(newTime) | ||
return nil | ||
} | ||
|
||
func (mc *myConsumer) End() error { | ||
_ = mc.reporter.SaveAndStop(context.Background()) | ||
return nil | ||
} | ||
|
||
type myFactory struct { | ||
logger scyllacdc.Logger | ||
progressReporterInterval time.Duration | ||
} | ||
|
||
func (f *myFactory) CreateChangeConsumer(ctx context.Context, input scyllacdc.CreateChangeConsumerInput) (scyllacdc.ChangeConsumer, error) { | ||
f.logger.Printf("myFactory.CreateChangeConsumer %s, %s\n", input.TableName, input.StreamID) | ||
reporter := scyllacdc.NewPeriodicProgressReporter(f.logger, f.progressReporterInterval, input.ProgressReporter) | ||
reporter.Start(ctx) | ||
return &myConsumer{reporter, f.logger, consumeChange, input.TableName}, nil | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -236,6 +236,7 @@ func (tbpm *TableBackedProgressManager) SaveProgress(ctx context.Context, gen ti | |
tbpm.concurrentQueryLimiter.Acquire(ctx, 1) | ||
defer tbpm.concurrentQueryLimiter.Release(1) | ||
|
||
//log.Printf("SaveProgress for %s = %s\n", streamID, progress.LastProcessedRecordTime) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FYI: here's one more unused log |
||
return tbpm.session.Query( | ||
fmt.Sprintf("INSERT INTO %s (generation, application_name, table_name, stream_id, last_timestamp) VALUES (?, ?, ?, ?, ?) USING TTL ?", tbpm.progressTableName), | ||
gen, tbpm.applicationName, tableName, streamID, progress.LastProcessedRecordTime, tbpm.ttl, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -85,23 +85,13 @@ type AdvancedReaderConfig struct { | |
|
||
// The library uses select statements to fetch changes from CDC Log tables. | ||
// Each select fetches changes from a single table and fetches only changes | ||
// from a limited set of CDC streams. If such select returns one or more | ||
// changes then next select to this table and set of CDC streams will be | ||
// issued after a delay. This parameter specifies the length of the delay. | ||
// | ||
// If the parameter is left as 0, the library will automatically adjust | ||
// the length of the delay. | ||
PostNonEmptyQueryDelay time.Duration | ||
|
||
// The library uses select statements to fetch changes from CDC Log tables. | ||
// Each select fetches changes from a single table and fetches only changes | ||
// from a limited set of CDC streams. If such select returns no changes then | ||
// next select to this table and set of CDC streams will be issued after | ||
// from a limited set of CDC streams. The subsequent select after query | ||
// execution to this table and set of CDC streams will be issued after | ||
// a delay. This parameter specifies the length of the delay. | ||
// | ||
// If the parameter is left as 0, the library will automatically adjust | ||
// the length of the delay. | ||
PostEmptyQueryDelay time.Duration | ||
PostQueryDelay time.Duration | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately, this is a backwards-incompatible change and will require releasing a v2. I'd rather postpone it until more ideas for breaking changes accumulate and releasing v2 makes more sense. Could you explain what is your motivation for unifying both parameters? The idea for having separate delays was that an empty query result was a signal that the library is polling too fast and wastes time on empty queries. Waiting longer for the next query should increase chances that the next query will have a non-empty result. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TBH I can't remember. I think I couldn't see any valid use case. I agree it's not a good idea to introduce a breaking change for this. But I think it also had something to do with the problem mentioned in regards to |
||
|
||
// If the library tries to read from the CDC log and the read operation | ||
// fails, it will wait some time before attempting to read again. This | ||
|
@@ -143,8 +133,7 @@ func (arc *AdvancedReaderConfig) setDefaults() { | |
} | ||
setIfZero(&arc.ConfidenceWindowSize, 30*time.Second) | ||
|
||
setIfZero(&arc.PostNonEmptyQueryDelay, 10*time.Second) | ||
setIfZero(&arc.PostEmptyQueryDelay, 30*time.Second) | ||
setIfZero(&arc.PostQueryDelay, 10*time.Second) | ||
setIfZero(&arc.PostFailedQueryDelay, 1*time.Second) | ||
|
||
setIfZero(&arc.QueryTimeWindowSize, 30*time.Second) | ||
|
@@ -268,7 +257,7 @@ func (r *Reader) Run(ctx context.Context) error { | |
} | ||
} | ||
|
||
sleepAmount := r.config.Advanced.PostNonEmptyQueryDelay / time.Duration(len(readers)) | ||
sleepAmount := r.config.Advanced.PostQueryDelay / time.Duration(len(readers)) | ||
for i := range readers { | ||
reader := readers[i] | ||
select { | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -100,6 +100,8 @@ outer: | |||||||||||||||||||||
|
||||||||||||||||||||||
if compareTimeuuid(wnd.begin, wnd.end) < 0 { | ||||||||||||||||||||||
var iter *changeRowIterator | ||||||||||||||||||||||
//sbr.config.Logger.Printf("queryRange: %s.%s :: %s (%s) [%s ... %s]", | ||||||||||||||||||||||
// crq.keyspaceName, crq.tableName, crq.pkCondition, crq.bindArgs[0], wnd.begin.Time(), wnd.end.Time()) | ||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd rather not leave commented out code in the repository. Perhaps the Logger interface could be extended so that it understand various verbosity level and this message could have low verbosity (debug/trace)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fine for me either way - I mostly wanted to help understand how I've been debugging this - so I committed this along with for review purposes. If you actually would like to ultimately merge this PR, let me know if I should drop.. |
||||||||||||||||||||||
iter, err = crq.queryRange(wnd.begin, wnd.end) | ||||||||||||||||||||||
if err != nil { | ||||||||||||||||||||||
sbr.config.Logger.Printf("error while sending a query (will retry): %s", err) | ||||||||||||||||||||||
|
@@ -112,6 +114,12 @@ outer: | |||||||||||||||||||||
return consumerErr | ||||||||||||||||||||||
} | ||||||||||||||||||||||
hadRows = rowCount > 0 | ||||||||||||||||||||||
|
||||||||||||||||||||||
if !hadRows { | ||||||||||||||||||||||
for _, c := range sbr.consumers { | ||||||||||||||||||||||
err = c.Empty(ctx, wnd.end) | ||||||||||||||||||||||
} | ||||||||||||||||||||||
} | ||||||||||||||||||||||
} | ||||||||||||||||||||||
|
||||||||||||||||||||||
if err == nil { | ||||||||||||||||||||||
|
@@ -126,10 +134,8 @@ outer: | |||||||||||||||||||||
var delay time.Duration | ||||||||||||||||||||||
if err != nil { | ||||||||||||||||||||||
delay = sbr.config.Advanced.PostFailedQueryDelay | ||||||||||||||||||||||
} else if hadRows { | ||||||||||||||||||||||
delay = sbr.config.Advanced.PostNonEmptyQueryDelay | ||||||||||||||||||||||
} else { | ||||||||||||||||||||||
delay = sbr.config.Advanced.PostEmptyQueryDelay | ||||||||||||||||||||||
delay = sbr.config.Advanced.PostQueryDelay | ||||||||||||||||||||||
} | ||||||||||||||||||||||
|
||||||||||||||||||||||
delayUntil := windowProcessingStartTime.Add(delay) | ||||||||||||||||||||||
|
@@ -203,14 +209,14 @@ func (sbr *streamBatchReader) getPollWindow() pollWindow { | |||||||||||||||||||||
if queryWindowRightEnd.Before(confidenceWindowStart) { | ||||||||||||||||||||||
return pollWindow{ | ||||||||||||||||||||||
begin: windowStart, | ||||||||||||||||||||||
end: gocql.MinTimeUUID(queryWindowRightEnd), | ||||||||||||||||||||||
end: gocql.MinTimeUUID(queryWindowRightEnd.Add(sbr.config.Advanced.PostQueryDelay)), | ||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This doesn't seem correct. Due to the distributed nature of Scylla and CDC, rows with recent Do you remember why you introduced that change? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. .. after reading my code multiple times now - I think it was because the window is calculated first, then the delay is resolved and executed. I think this was very weird to understand and maybe some wider refactoring should be done? see ref: Lines 132 to 141 in e0b7b88
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this was one of those details I only was able to discover while debugging with the added detailed logs. I remember this was confusing and time consuming. ^^ With this in place I was able to achieve and actually see the exact 15s expected latency, as I mentioned in my 4th comment. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Right, now I see that the delay is wrong. The window is calculated first, then there is a sleep, then the loop goes to the next iteration and uses the query window calculated before the sleep. This is clearly wrong, the window should be used in a query immediately after being computed. The proper fix would be to move There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or - even better - There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I also agree that this code could use some refactoring :) Perhaps the logic responsible for tracking per-stream progress and calculating windows could be moved to an abstraction separate from the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, current state of the PR could be called a workaround at best 😅, glad I remembered why added this at least. Thanks for pointing out 🧐 |
||||||||||||||||||||||
|
||||||||||||||||||||||
touchesConfidenceWindow: false, | ||||||||||||||||||||||
} | ||||||||||||||||||||||
} | ||||||||||||||||||||||
return pollWindow{ | ||||||||||||||||||||||
begin: windowStart, | ||||||||||||||||||||||
end: gocql.MinTimeUUID(confidenceWindowStart), | ||||||||||||||||||||||
end: gocql.MinTimeUUID(confidenceWindowStart.Add(sbr.config.Advanced.PostQueryDelay)), | ||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The same issue as above - here, the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (see above) |
||||||||||||||||||||||
|
||||||||||||||||||||||
touchesConfidenceWindow: true, | ||||||||||||||||||||||
} | ||||||||||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,6 +71,7 @@ func (ppr *PeriodicProgressReporter) Start(ctx context.Context) { | |
ppr.mu.Unlock() | ||
|
||
// TODO: Log errors? | ||
//ppr.logger.Printf("MarkProgress for %s: %s", ppr.reporter.streamID, timeToReport.Time()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same issue with logging here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also, same as above |
||
err := ppr.reporter.MarkProgress(ctx, Progress{timeToReport}) | ||
if err != nil { | ||
ppr.logger.Printf("failed to save progress for %s: %s", ppr.reporter.streamID, err) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure what are the differences between simple-printer-stateful and complicated-consumer. Is there a good reason why you kept them separate? Perhaps the complicated-consumer could be improved in some way?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What do you mean by 'complicated-consumer'?
I think this simply was the example I've put together and was working with while debugging and attempting to fix the behaviour. So I've added it for reference.
Please feel free to keep, improve or drop. :-)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Never mind, "complicated-consumer" (which is actually called "complicated-printer") was an example that I apparently forgot to push. It's fine then, we can take your example instead.