default log store backend to WAL and allow disabling verification #21700

Open
wants to merge 11 commits into main
4 changes: 4 additions & 0 deletions .changelog/21700.txt
@@ -0,0 +1,4 @@
```release-note:enhancement
raft: use raft-wal as the default raft log store.
```

2 changes: 1 addition & 1 deletion agent/config/builder.go
@@ -2840,7 +2840,7 @@ func (b *builder) raftLogStoreConfigVal(raw *RaftLogStoreRaw) consul.RaftLogStor
cfg.Backend = stringValWithDefault(raw.Backend, consul.LogStoreBackendDefault)
cfg.DisableLogCache = boolVal(raw.DisableLogCache)

cfg.Verification.Enabled = boolVal(raw.Verification.Enabled)
cfg.Verification.Enabled = boolValWithDefault(raw.Verification.Enabled, true)
cfg.Verification.Interval = b.durationVal("raft_logstore.verification.interval", raw.Verification.Interval)

cfg.BoltDB.NoFreelistSync = boolVal(raw.BoltDBConfig.NoFreelistSync)
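Note: the switch from `boolVal` to `boolValWithDefault(raw.Verification.Enabled, true)` means log verification is now on unless an operator explicitly turns it off. A minimal opt-out sketch, using the `raft_logstore` option names documented later in this PR (illustrative only, not part of the diff):

```hcl
# Sketch: opt out of the new verification default.
# Omitting the verification block entirely now implies enabled = true.
raft_logstore {
  verification {
    enabled = false
  }
}
```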
37 changes: 19 additions & 18 deletions agent/consul/server.go
@@ -1055,26 +1055,24 @@ func (s *Server) setupRaft() error {
stable = wal
return nil
}
// Only use WAL if there is no existing raft.db, even if it's enabled.
if s.config.LogStoreConfig.Backend == LogStoreBackendDefault && !boltFileExists {

// Only use WAL when no boltdb file exists
useWal := (s.config.LogStoreConfig.Backend == LogStoreBackendWAL || s.config.LogStoreConfig.Backend == LogStoreBackendDefault) && !boltFileExists

if !useWal && (s.config.LogStoreConfig.Backend == LogStoreBackendWAL || s.config.LogStoreConfig.Backend == LogStoreBackendDefault) {
// User configured the new storage, but still has old raft.db. Warn
// them!
s.logger.Warn("BoltDB file raft.db found, IGNORING raft_logstore.backend which is set to 'wal'")
}

// Default to WAL. Only use WAL if there is no existing raft.db, even if it's enabled. Log a warning otherwise
if useWal {
s.config.LogStoreConfig.Backend = LogStoreBackendWAL
if !s.config.LogStoreConfig.Verification.Enabled {
s.config.LogStoreConfig.Verification.Enabled = true
s.config.LogStoreConfig.Verification.Interval = 1 * time.Minute
}
if err = initWAL(); err != nil {
return err
}
} else if s.config.LogStoreConfig.Backend == LogStoreBackendWAL && !boltFileExists {
if err = initWAL(); err != nil {
return err
}
} else {
if s.config.LogStoreConfig.Backend == LogStoreBackendWAL {
// User configured the new storage, but still has old raft.db. Warn
// them!
s.logger.Warn("BoltDB file raft.db found, IGNORING raft_logstore.backend which is set to 'wal'")
}

s.config.LogStoreConfig.Backend = LogStoreBackendBoltDB
// Create the backend raft store for logs and stable storage.
store, err := raftboltdb.New(raftboltdb.Options{
@@ -1096,11 +1094,14 @@ func (s *Server) setupRaft() error {

// See if log verification is enabled
if s.config.LogStoreConfig.Verification.Enabled {
if s.config.LogStoreConfig.Verification.Interval == 0 {
s.config.LogStoreConfig.Verification.Interval = 1 * time.Minute
}
mc := walmetrics.NewGoMetricsCollector([]string{"raft", "logstore", "verifier"}, nil, nil)
reportFn := makeLogVerifyReportFn(s.logger.Named("raft.logstore.verifier"))
verifier := verifier.NewLogStore(log, isLogVerifyCheckpoint, reportFn, mc)
s.raftStore = verifier
log = verifier
v := verifier.NewLogStore(log, isLogVerifyCheckpoint, reportFn, mc)
s.raftStore = v
log = v
}

// Wrap the store in a LogCache to improve performance.
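Note: the second hunk also gives the verification interval a fallback. When verification is enabled but no interval is configured, `setupRaft` applies a one-minute interval. A minimal sketch of a configuration that relies on that fallback (illustrative only; the commented value mirrors the default applied in the hunk above):

```hcl
# Sketch: verification enabled with no interval set; setupRaft falls back to 1 minute.
raft_logstore {
  verification {
    enabled = true
    # interval = "1m"  # equivalent to the fallback applied in setupRaft
  }
}
```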
114 changes: 114 additions & 0 deletions agent/consul/server_test.go
@@ -30,6 +30,9 @@ import (
"github.com/hashicorp/go-uuid"
"github.com/hashicorp/memberlist"
"github.com/hashicorp/raft"
raftboltdb "github.com/hashicorp/raft-boltdb/v2"
raftwal "github.com/hashicorp/raft-wal"
"github.com/hashicorp/raft-wal/verifier"

"github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/agent/consul/multilimiter"
@@ -388,6 +391,117 @@ func TestServer_StartStop(t *testing.T) {
}
}

func TestServer_RaftBackend_Default(t *testing.T) {
t.Parallel()
// Start up a server and then stop it.
_, s1 := testServerWithConfig(t, func(config *Config) {
config.LogStoreConfig.Backend = LogStoreBackendDefault
config.LogStoreConfig.Verification.Enabled = false
})
_, ok := s1.raftStore.(*raftwal.WAL)
defer func() {
if err := s1.Shutdown(); err != nil {
t.Fatalf("err: %v", err)
}
}()
require.True(t, ok)

}

func TestServer_RaftBackend_Verifier_WAL(t *testing.T) {
t.Parallel()
// Start up a server and then stop it.
_, s1 := testServerWithConfig(t, func(config *Config) {
config.LogStoreConfig.Backend = LogStoreBackendDefault
config.LogStoreConfig.Verification.Enabled = true
})
_, ok := s1.raftStore.(*verifier.LogStore)
defer func() {
if err := s1.Shutdown(); err != nil {
t.Fatalf("err: %v", err)
}
}()
require.True(t, ok)

}

func TestServer_RaftBackend_WAL_WithExistingBoltDB(t *testing.T) {
t.Parallel()

dir := testutil.TempDir(t, "consul")
require.NoError(t, os.MkdirAll(dir+"/"+raftState, os.ModePerm))
dbFile, err := os.Create(dir + "/" + raftState + "raft.db")
require.NoError(t, err)

require.NoError(t, dbFile.Close())

// Start up a server and then stop it.
_, s1 := testServerWithConfig(t, func(config *Config) {
config.LogStoreConfig.Backend = LogStoreBackendWAL
config.LogStoreConfig.Verification.Enabled = false
config.DataDir = dir
})
_, ok := s1.raftStore.(*raftboltdb.BoltStore)
defer func() {
if err := s1.Shutdown(); err != nil {
t.Fatalf("err: %v", err)
}
}()
require.True(t, ok)

}

func TestServer_RaftBackend_WAL(t *testing.T) {
t.Parallel()
// Start up a server and then stop it.
_, s1 := testServerWithConfig(t, func(config *Config) {
config.LogStoreConfig.Backend = LogStoreBackendWAL
config.LogStoreConfig.Verification.Enabled = false
})
_, ok := s1.raftStore.(*raftwal.WAL)
defer func() {
if err := s1.Shutdown(); err != nil {
t.Fatalf("err: %v", err)
}
}()
require.True(t, ok)

}

func TestServer_RaftBackend_Verifier_BoltDB(t *testing.T) {
t.Parallel()
// Start up a server and then stop it.
_, s1 := testServerWithConfig(t, func(config *Config) {
config.LogStoreConfig.Backend = LogStoreBackendBoltDB
config.LogStoreConfig.Verification.Enabled = true
})
_, ok := s1.raftStore.(*verifier.LogStore)
defer func() {
if err := s1.Shutdown(); err != nil {
t.Fatalf("err: %v", err)
}
}()
require.True(t, ok)

}

func TestServer_RaftBackend_BoltDB(t *testing.T) {
t.Parallel()
// Start up a server and then stop it.
_, s1 := testServerWithConfig(t, func(config *Config) {
config.LogStoreConfig.Backend = LogStoreBackendBoltDB
config.LogStoreConfig.Verification.Enabled = false
})
_, ok := s1.raftStore.(*raftboltdb.BoltStore)
defer func() {
if err := s1.Shutdown(); err != nil {
t.Fatalf("err: %v", err)
}
}()
require.True(t, ok)

}

func TestServer_fixupACLDatacenter(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
7 changes: 3 additions & 4 deletions website/content/docs/agent/config/config-files.mdx
@@ -1652,10 +1652,9 @@ subsystem that provides Consul's service mesh capabilities.
v1.15.0.

- `backend` ((#raft_logstore_backend)) Specifies which storage
engine to use to persist logs. Valid options are `boltdb` or `wal`. Default
is `boltdb`. The `wal` option specifies an experimental backend that
should be used with caution. Refer to
[Experimental WAL LogStore backend](/consul/docs/agent/wal-logstore)
engine to use to persist logs. Valid options are `wal` or `boltdb`. Default
is `wal`. Refer to
[WAL LogStore backend](/consul/docs/agent/wal-logstore)
for more information.

- `disable_log_cache` ((#raft_logstore_disable_log_cache)) Disables the in-memory cache for recent logs. We recommend using it for performance testing purposes, as no significant improvement has been measured when the cache is disabled. While the in-memory log cache theoretically prevents disk reads for recent logs, recent logs are also stored in the OS page cache, which does not slow either the `boltdb` or `wal` backend's ability to read them.
137 changes: 2 additions & 135 deletions website/content/docs/agent/wal-logstore/enable.mdx
@@ -5,32 +5,9 @@ description: >-
Learn how to safely configure and test the experimental WAL backend in your Consul deployment.
---

# Enable the experimental WAL LogStore backend
# Enable the WAL LogStore backend

This topic describes how to safely configure and test the WAL backend in your Consul deployment.

The overall process for enabling the WAL LogStore backend for one server consists of the following steps. In production environments, we recommend starting by enabling the backend on a single server. If you eventually choose to expand the test to further servers, you must repeat these steps for each one.

1. Enable log verification.
1. Select target server to enable WAL.
1. Stop target server gracefully.
1. Remove data directory from target server.
1. Update target server's configuration.
1. Start the target server.
1. Monitor target server raft metrics and logs.

!> **Experimental feature:** The WAL LogStore backend is experimental and may contain bugs that could cause data loss. Follow this guide to manage risk during testing.

## Requirements

- Consul v1.15 or later is required for all servers in the datacenter. Refer to the [standard upgrade procedure](/consul/docs/upgrading/instructions/general-process) and the [1.15 upgrade notes](/consul/docs/upgrading/upgrade-specific#consul-1-15-x) for additional information.
- A Consul cluster with at least three nodes is required to safely test the WAL backend without downtime.

We recommend taking the following additional measures:

- Take a snapshot prior to testing.
- Monitor Consul server metrics and logs, and set an alert on specific log events that occur when WAL is enabled. Refer to [Monitor Raft metrics and logs for WAL](/consul/docs/agent/wal-logstore/monitoring) for more information.
- Enable WAL in a pre-production environment and run it for several days before enabling it in production.
The WAL LogStore backend is now enabled by default unless an existing BoltDB `raft.db` file is found in the server's data directory.
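Operators who prefer to stay on BoltDB on a fresh server can pin the backend explicitly. A minimal sketch using the documented `raft_logstore` options (illustrative; `boltdb` remains a valid backend value):

```hcl
# Sketch: keep the BoltDB log store instead of the new WAL default.
raft_logstore {
  backend = "boltdb"
}
```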

## Known issues

@@ -39,113 +16,3 @@ The following issues exist in Consul 1.15.0 and 1.15.1.
* A follower that is disconnected may be unable to catch up if it is using the WAL backend.
* Restoring user snapshots can break replication to WAL-enabled followers.
* Restoring user snapshots can cause a WAL-enabled leader to panic.

## Risks

While their likelihood remains low to very low, be aware of the following risks before implementing the WAL backend:

- If WAL corrupts data on a Consul server agent, the server data cannot be recovered. Restart the server with an empty data directory and reload its state from the leader to resolve the issue.
- WAL may corrupt data or contain a defect that causes the server to panic and crash. WAL may not restart if the defect recurs when WAL reads from the logs on startup. Restart the server with an empty data directory and reload its state from the leader to resolve the issue.
- If WAL corrupts data, clients may read corrupted data from the Consul server, such as invalid IP addresses or unmatched tokens. This outcome is unlikely even if a recurring defect causes WAL to corrupt data because replication uses objects cached in memory instead of reads from disk. Restart the server with an empty data directory and reload its state from the leader to resolve the issue.
- If you enable a Consul CE server to use WAL or enable WAL on a voting server with Consul Enterprise, the server may corrupt its state, become the leader, and replicate the corrupted state to all other servers. In this case, restoring from backup is required to recover a completely uncorrupted state. Test WAL on a non-voting server in Enterprise to prevent this outcome. You can add a new non-voting server to the cluster to test with if there are no existing ones.

## Enable log verification

You must enable log verification on all voting servers in Enterprise and all servers in CE because the leader writes verification checkpoints.

1. On each voting server, add the following to the server's configuration file:

```hcl
raft_logstore {
verification {
enabled = true
interval = "60s"
}
}
```

1. Restart the server to apply the changes. The `consul reload` command is not sufficient to apply `raft_logstore` configuration changes.
1. Run the `consul operator raft list-peers` command to wait for each server to become a healthy voter before moving on to the next. This may take a few minutes for large snapshots.

When complete, the server's logs should contain verifier reports that appear like the following example:

```log hideClipboard
2023-01-31T14:44:31.174Z [INFO] agent.server.raft.logstore.verifier: verification checksum OK: elapsed=488.463268ms leaderChecksum=f15db83976f2328c rangeEnd=357802 rangeStart=298132 readChecksum=f15db83976f2328c
```
Comment on lines -52 to -74

Member:

Isn't this section on log verification still relevant info even when WAL is defaulted on?

Collaborator Author:

Log verification is enabled by default now as part of WAL. The reasoning is that it has minimal impact but great benefits in case of bugs.

I will double-check that it's documented properly as part of the logstore config.


## Select target server to enable WAL

If you are using Consul CE, or Consul Enterprise without non-voting servers, select a follower server to enable WAL. As noted in [Risks](#risks), Consul Enterprise users with non-voting servers should first select a non-voting server, or consider adding another server as a non-voter to test on.

Retrieve the current state of the servers by running the following command:

```shell-session
$ consul operator raft list-peers
```

## Stop target server

Stop the target server gracefully. For example, if you are using `systemd`,
run the following command:

```shell-session
$ systemctl stop consul
```

If your environment uses configuration management automation that might interfere with this process, such as Chef or Puppet, you must disable them until you have completely enabled WAL as a storage backend.

## Remove data directory from target server

Temporarily moving the data directory to a different location is less destructive than deleting it. We recommend moving it in cases where you unsuccessfully enable WAL. Do not use the old data directory (`/data-dir/raft.bak`) for recovery after restarting the server. We recommend eventually deleting the old directory.

The following example assumes the `data_dir` in the server's configuration is `/data-dir` and renames its `raft` subdirectory to `/data-dir/raft.bak`.

```shell-session
$ mv /data-dir/raft /data-dir/raft.bak
```

When switching backends, you must always remove _the entire raft directory_, not just the `raft.db` file or `wal` directory. The log must always be consistent with the snapshots to avoid undefined behavior or data loss.

## Update target server configuration

Add the following to the target server's configuration file:

```hcl
raft_logstore {
backend = "wal"
verification {
enabled = true
interval = "60s"
}
}
```

## Start target server

Start the target server. For example, if you are using `systemd`, run the following command:

```shell-session
$ systemctl start consul
```

Watch for the server to become a healthy voter again.

```shell-session
$ consul operator raft list-peers
```

## Monitor target server Raft metrics and logs

Refer to [Monitor Raft metrics and logs for WAL](/consul/docs/agent/wal-logstore/monitoring) for details.

We recommend leaving the cluster in the test configuration for several days or weeks, as long as you observe no errors. An extended test provides more confidence that WAL operates correctly under varied workloads and during routine server restarts. If you observe any errors, end the test immediately and report them.

If you disabled configuration management automation, consider reenabling it during the testing phase to pick up other updates for the host. You must ensure that it does not revert the Consul configuration file and remove the altered backend configuration. One way to do this is to add the `raft_logstore` block to a separate file that is not managed by your automation. This file can either be placed in the directory specified by [`-config-dir`](/consul/docs/agent/config/cli-flags#_config_dir) or passed as an additional [`-config-file`](/consul/docs/agent/config/cli-flags#_config_file) argument.

## Next steps

- If you observe any verification errors, performance anomalies, or other suspicious behavior from the target server during the test, you should immediately follow [the procedure to revert back to BoltDB](/consul/docs/agent/wal-logstore/revert-to-boltdb). Report failures through GitHub.

- If you do not see errors and would like to expand the test further, you can repeat the above procedure on another target server. We suggest waiting after each test expansion and slowly rolling WAL out to other parts of your environment. Once the majority of your servers use WAL, any bugs not yet discovered may result in cluster unavailability.

- If you wish to permanently enable WAL on all servers, repeat the steps described in this topic for each server. Even if `backend = "wal"` is set in a server's configuration, the server continues to use BoltDB if it finds an existing raft.db file in the data directory.
10 changes: 3 additions & 7 deletions website/content/docs/agent/wal-logstore/index.mdx
@@ -2,20 +2,16 @@
layout: docs
page_title: WAL LogStore Backend Overview
description: >-
The experimental WAL (write-ahead log) LogStore backend shipped in Consul 1.15 is intended to replace the BoltDB backend, improving performance and log storage issues.
The write-ahead log (WAL) LogStore is the default backend for Consul. Learn about the WAL backend and how it is different from the BoltDB backend.
---

# Experimental WAL LogStore backend overview
# Write-ahead Log (WAL) LogStore backend overview

This topic provides an overview of the WAL (write-ahead log) LogStore backend.
The WAL backend is an experimental feature. Refer to
The WAL backend is now the default Consul LogStore backend when no existing BoltDB database is present. Refer to
[Requirements](/consul/docs/agent/wal-logstore/enable#requirements) for
supported environments and known issues.

We do not recommend enabling the WAL backend in production without following
[our guide for safe
testing](/consul/docs/agent/wal-logstore/enable).

## WAL versus BoltDB

WAL implements a traditional log with rotating, append-only log files. WAL resolves many issues with the existing `LogStore` provided by the BoltDB backend. The BoltDB `LogStore` is a copy-on-write BTree, which is not optimized for append-only, write-heavy workloads.