Skip to content

Commit

Permalink
Fix ClearNonces index out of range (#227)
Browse files Browse the repository at this point in the history
* Fix clearnonces index out of range

* Auto remove lockfile after unclean shutdown

* fix cmd
  • Loading branch information
agouin authored Nov 30, 2023
1 parent 976815f commit 6a974bb
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 45 deletions.
9 changes: 4 additions & 5 deletions cmd/horcrux/cmd/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,18 @@ func startCmd() *cobra.Command {
Args: cobra.NoArgs,
SilenceUsage: true,
RunE: func(cmd *cobra.Command, args []string) error {
err := signer.RequireNotRunning(config.PidFile)
out := cmd.OutOrStdout()
logger := cometlog.NewTMLogger(cometlog.NewSyncWriter(out))

err := signer.RequireNotRunning(logger, config.PidFile)
if err != nil {
return err
}

out := cmd.OutOrStdout()

if _, err := legacyConfig(); err == nil {
return fmt.Errorf("this is a legacy config. run `horcrux config migrate` to migrate to the latest format")
}

logger := cometlog.NewTMLogger(cometlog.NewSyncWriter(out))

// create all directories up to the state directory
if err = os.MkdirAll(config.StateDir, 0700); err != nil {
return err
Expand Down
21 changes: 14 additions & 7 deletions cmd/horcrux/cmd/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/strangelove-ventures/horcrux/signer"

cometjson "github.com/cometbft/cometbft/libs/json"
cometlog "github.com/cometbft/cometbft/libs/log"
)

// Snippet Taken from https://raw.githubusercontent.com/cometbft/cometbft/main/privval/file.go
Expand Down Expand Up @@ -82,14 +83,17 @@ func setStateCmd() *cobra.Command {
RunE: func(cmd *cobra.Command, args []string) error {
chainID := args[0]

out := cmd.OutOrStdout()
logger := cometlog.NewTMLogger(cometlog.NewSyncWriter(out))

if _, err := os.Stat(config.HomeDir); os.IsNotExist(err) {
cmd.SilenceUsage = false
return fmt.Errorf("%s does not exist, initialize config with horcrux config init and try again", config.HomeDir)
}

// Resetting the priv_validator_state.json should only be allowed if the
// signer is not running.
if err := signer.RequireNotRunning(config.PidFile); err != nil {
if err := signer.RequireNotRunning(logger, config.PidFile); err != nil {
return err
}

Expand All @@ -109,7 +113,7 @@ func setStateCmd() *cobra.Command {
return err
}

fmt.Fprintf(cmd.OutOrStdout(), "Setting height %d\n", height)
fmt.Fprintf(out, "Setting height %d\n", height)

pv.NoncePublic, cs.NoncePublic = nil, nil
signState := signer.SignStateConsensus{
Expand Down Expand Up @@ -150,9 +154,12 @@ func importStateCmd() *cobra.Command {
return fmt.Errorf("%s does not exist, initialize config with horcrux config init and try again", config.HomeDir)
}

out := cmd.OutOrStdout()
logger := cometlog.NewTMLogger(cometlog.NewSyncWriter(out))

// Resetting the priv_validator_state.json should only be allowed if the
// signer is not running.
if err := signer.RequireNotRunning(config.PidFile); err != nil {
if err := signer.RequireNotRunning(logger, config.PidFile); err != nil {
return err
}

Expand All @@ -170,11 +177,11 @@ func importStateCmd() *cobra.Command {

// Allow user to paste in priv_validator_state.json

fmt.Println("IMPORTANT: Your validator should already be STOPPED. You must copy the latest state..")
fmt.Fprintln(out, "IMPORTANT: Your validator should already be STOPPED. You must copy the latest state..")
<-time.After(2 * time.Second)
fmt.Println("")
fmt.Println("Paste your old priv_validator_state.json. Input a blank line after the pasted JSON to continue.")
fmt.Println("")
fmt.Fprintln(out, "")
fmt.Fprintln(out, "Paste your old priv_validator_state.json. Input a blank line after the pasted JSON to continue.")
fmt.Fprintln(out, "")

var textBuffer strings.Builder

Expand Down
9 changes: 6 additions & 3 deletions signer/cosigner_nonce_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -353,19 +353,22 @@ func (cnc *CosignerNonceCache) PruneNonces() int {
func (cnc *CosignerNonceCache) ClearNonces(cosigner Cosigner) {
cnc.cache.mu.Lock()
defer cnc.cache.mu.Unlock()
for i, cn := range cnc.cache.cache {
for i := 0; i < len(cnc.cache.cache); i++ {
cn := cnc.cache.cache[i]

deleteID := -1
for i, n := range cn.Nonces {
for j, n := range cn.Nonces {
if n.Cosigner.GetID() == cosigner.GetID() {
// remove cosigner from this nonce.
deleteID = i
deleteID = j
break
}
}
if deleteID >= 0 {
if len(cn.Nonces)-1 < int(cnc.threshold) {
// If cosigners on this nonce drops below threshold, delete it as it's no longer usable
cnc.cache.Delete(i)
i--
} else {
cn.Nonces = append(cn.Nonces[:deleteID], cn.Nonces[deleteID+1:]...)
}
Expand Down
46 changes: 46 additions & 0 deletions signer/cosigner_nonce_cache_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,52 @@ func TestNonceCache(_ *testing.T) {
nc.Delete(0)
}

func TestClearNonces(t *testing.T) {
lcs, _ := getTestLocalCosigners(t, 2, 3)
cosigners := make([]Cosigner, len(lcs))
for i, lc := range lcs {
cosigners[i] = lc
}

cnc := CosignerNonceCache{
threshold: 2,
}

for i := 0; i < 10; i++ {
// When deleting nonce for cosigner 1 ([0]),
// these nonce will drop below threshold and be deleted.
cnc.cache.Add(&CachedNonce{
UUID: uuid.New(),
Expiration: time.Now().Add(1 * time.Second),
Nonces: []CosignerNoncesRel{
{Cosigner: cosigners[0]},
{Cosigner: cosigners[1]},
},
})
// When deleting nonce for cosigner 1 ([0]), these nonces will still be above threshold,
// so they will remain without cosigner 1.
cnc.cache.Add(&CachedNonce{
UUID: uuid.New(),
Expiration: time.Now().Add(1 * time.Second),
Nonces: []CosignerNoncesRel{
{Cosigner: cosigners[0]},
{Cosigner: cosigners[1]},
{Cosigner: cosigners[2]},
},
})
}

require.Equal(t, 20, cnc.cache.Size())

cnc.ClearNonces(cosigners[0])

require.Equal(t, 10, cnc.cache.Size())

for _, n := range cnc.cache.cache {
require.Len(t, n.Nonces, 2)
}
}

type mockPruner struct {
cnc *CosignerNonceCache
count int
Expand Down
17 changes: 12 additions & 5 deletions signer/services.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import (
cometservice "github.com/cometbft/cometbft/libs/service"
)

func RequireNotRunning(pidFilePath string) error {
func RequireNotRunning(log cometlog.Logger, pidFilePath string) error {
if _, err := os.Stat(pidFilePath); err != nil {
if os.IsNotExist(err) {
// lock file does not exist, can continue starting daemon
Expand Down Expand Up @@ -41,17 +41,24 @@ func RequireNotRunning(pidFilePath string) error {

process, err := os.FindProcess(int(pid))
if err != nil {
return fmt.Errorf(`unclean shutdown detected. PID file exists at %s but PID %d can not be found.
manual deletion of PID file required. %w`, pidFilePath, pid, err)
return fmt.Errorf("error checking pid %d: %w", pid, err)
}

err = process.Signal(syscall.Signal(0))
if err == nil {
return fmt.Errorf("horcrux is already running on PID: %d", pid)
}
if errors.Is(err, os.ErrProcessDone) {
return fmt.Errorf(`unclean shutdown detected. PID file exists at %s but PID %d is not running.
manual deletion of PID file required`, pidFilePath, pid)
log.Error(
"Unclean shutdown detected. PID file exists at but process with that ID cannot be found. Removing lock file",
"pid", pid,
"pid_file", pidFilePath,
"error", err,
)
if err := os.Remove(pidFilePath); err != nil {
return fmt.Errorf("failed to delete pid file %s: %w", pidFilePath, err)
}
return nil
}

errno, ok := err.(syscall.Errno)
Expand Down
45 changes: 20 additions & 25 deletions signer/services_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"fmt"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
Expand Down Expand Up @@ -72,7 +71,7 @@ func TestIsRunning(t *testing.T) {
pidBz, err := os.ReadFile(pidFilePath)
require.NoError(t, err)

err = signer.RequireNotRunning(pidFilePath)
err = signer.RequireNotRunning(cometlog.NewNopLogger(), pidFilePath)
expectedErrorMsg := fmt.Sprintf("horcrux is already running on PID: %s", strings.TrimSpace(string(pidBz)))
require.EqualError(t, err, expectedErrorMsg)
}
Expand All @@ -81,20 +80,26 @@ func TestIsNotRunning(t *testing.T) {
homeDir := t.TempDir()
pidFilePath := filepath.Join(homeDir, "horcrux.pid")

err := signer.RequireNotRunning(pidFilePath)
err := signer.RequireNotRunning(cometlog.NewNopLogger(), pidFilePath)
require.NoError(t, err)
}

func getNonExistentPid() (int, error) {
func maxPid() int {
const defaultMaxPid = 100000
maxPidBytes, err := os.ReadFile("/proc/sys/kernel/pid_max")
if err != nil {
return -1, err
return defaultMaxPid
}
maxPid, err := strconv.ParseUint(strings.TrimSpace(string(maxPidBytes)), 10, 64)
maxPid, err := strconv.ParseInt(strings.TrimSpace(string(maxPidBytes)), 10, 32)
if err != nil {
return -1, err
return defaultMaxPid
}
for pid := 1; pid <= int(maxPid); pid++ {
return int(maxPid)
}

func getUnusedPid() (int, error) {
max := maxPid()
for pid := 1; pid <= max; pid++ {
process, err := os.FindProcess(pid)
if err != nil {
continue
Expand All @@ -106,26 +111,15 @@ func getNonExistentPid() (int, error) {
if errors.Is(err, os.ErrProcessDone) {
return pid, nil
}
errno, ok := err.(syscall.Errno)
if !ok {
continue
}
if errno == syscall.ESRCH {
return pid, nil
}
}
return -1, errors.New("could not find unused PID")
}

func TestIsRunningNonExistentPid(t *testing.T) {
if runtime.GOOS != "linux" {
t.Skip("test only valid on Linux")
}

homeDir := t.TempDir()
pidFilePath := filepath.Join(homeDir, "horcrux.pid")

pid, err := getNonExistentPid()
pid, err := getUnusedPid()
require.NoError(t, err)

err = os.WriteFile(
Expand All @@ -135,10 +129,11 @@ func TestIsRunningNonExistentPid(t *testing.T) {
)
require.NoError(t, err, "error writing pid file")

err = signer.RequireNotRunning(pidFilePath)
expectedErrorMsg := fmt.Sprintf(`unclean shutdown detected. PID file exists at %s but PID %d is not running.
manual deletion of PID file required`, pidFilePath, pid)
require.EqualError(t, err, expectedErrorMsg)
err = signer.RequireNotRunning(cometlog.NewNopLogger(), pidFilePath)
require.Nil(t, err)

_, err = os.Stat(pidFilePath)
require.ErrorIs(t, err, os.ErrNotExist)
}

func TestConcurrentStart(t *testing.T) {
Expand Down Expand Up @@ -216,7 +211,7 @@ func TestIsRunningAndWaitForService(t *testing.T) {
}
panicFunction := func() {
defer recoverFromPanic()
err = signer.RequireNotRunning(pidFilePath)
err = signer.RequireNotRunning(cometlog.NewNopLogger(), pidFilePath)
}
go panicFunction()
wg.Wait()
Expand Down

0 comments on commit 6a974bb

Please sign in to comment.