Skip to content

Commit af1b98a

Browse files
committed
Restart when SystemCertPool should change
This fixes a downstream bug There was a problem downstream where the OpenShift servivce-ca was not yet available, and due to the way the manifests were set up, the service-ca was considered to be part of the SystemCertPool. The problem is that the SystemCertPool, once initialized, will never reload itself. We can get into this situation when we use SSL_CERT_DIR and SSL_CERT_FILE to provide OpenShift CAs to be used by containers/image for pulling. These environment variables change the source of the SystemCertPool. The CertPoolWatcher then watches these locations, and tries to update the pool it provides to the HTTPS client connecting to catalogd. But the SystemCertPool is never updated. I tried to fix this downstream by removing SSL_CERT_DIR, and specifying the `--pull-cas-dir` option. This means that containers/image would directly use certificates that we specify, rather than the default location. But this breaks the use of custom CAs for local image registries. The containers/image package does not provide a way to manipulate the certificate locations beyond a simple directory setting, and we need to leave that directory setting as the default in downstream because it (i.e. /etc/docker/certs.d) is a host- mounted directory that contains certificates for local image registries. And it is possible to configure a custom CA for a local image registry, so that directory must be included, ALONG with the OpenShift provided CAs and service-ca, which is defined by SSL_CERT_DIR. But because of the use of SSL_CERT_DIR to include the OpenShift service-ca, if the service-ca was not available at startup, but became available later, it was not possible to reload the SystemCertPool. Which could cause problems in operator-controller when it tried to connect to catalogd. The fundamental problem is that there's no way to refresh the SystemCertPool, which will become more and more of an issue as certificate lifetimes decrease. Using SSL_CERT_DIR allows us to use the CertPoolWatcher to notice changes to the SystemCertPool. This will allow us to restart the process when certificates change (e.g. OpenShift service-ca becomes available). Changes: * Update CertPoolWatcher to restart on changes to SSL_CERT_DIR and SSL_CERT_FILE * Update CertPoolWatcher to use a Runnable interface, so that it can be added to the manager, and started later, which may improve the changes that the service-ca is ready. * Add CertPoolWatcher to catalogd With this, my downstream manifest change should be reverted. Signed-off-by: Todd Short <[email protected]>
1 parent 97fabe5 commit af1b98a

File tree

4 files changed

+210
-58
lines changed

4 files changed

+210
-58
lines changed

cmd/catalogd/main.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ import (
6060
"github.com/operator-framework/operator-controller/internal/catalogd/webhook"
6161
sharedcontrollers "github.com/operator-framework/operator-controller/internal/shared/controllers"
6262
fsutil "github.com/operator-framework/operator-controller/internal/shared/util/fs"
63+
httputil "github.com/operator-framework/operator-controller/internal/shared/util/http"
6364
imageutil "github.com/operator-framework/operator-controller/internal/shared/util/image"
6465
"github.com/operator-framework/operator-controller/internal/shared/util/pullsecretcache"
6566
sautil "github.com/operator-framework/operator-controller/internal/shared/util/sa"
@@ -291,6 +292,17 @@ func run(ctx context.Context) error {
291292
return err
292293
}
293294

295+
// This watches the pullCasDir and the SSL_CERT_DIR, and SSL_CERT_FILE for changes
296+
cpw, err := httputil.NewCertPoolWatcher(cfg.pullCasDir, ctrl.Log.WithName("cert-pool"))
297+
if err != nil {
298+
setupLog.Error(err, "unable to create CA certificate pool watcher")
299+
return err
300+
}
301+
if err = mgr.Add(cpw); err != nil {
302+
setupLog.Error(err, "unable to add certificate pool watcher to manager")
303+
return err
304+
}
305+
294306
if cfg.systemNamespace == "" {
295307
cfg.systemNamespace = podNamespace()
296308
}

cmd/operator-controller/main.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,11 @@ func run() error {
338338

339339
certPoolWatcher, err := httputil.NewCertPoolWatcher(cfg.catalogdCasDir, ctrl.Log.WithName("cert-pool"))
340340
if err != nil {
341-
setupLog.Error(err, "unable to create CA certificate pool")
341+
setupLog.Error(err, "unable to create CA certificate pool watcher")
342+
return err
343+
}
344+
if err = mgr.Add(certPoolWatcher); err != nil {
345+
setupLog.Error(err, "unable to add certificate pool watcher to manager")
342346
return err
343347
}
344348

internal/shared/util/http/certpoolwatcher.go

Lines changed: 84 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package http
22

33
import (
4+
"context"
45
"crypto/x509"
56
"fmt"
67
"os"
@@ -14,13 +15,15 @@ import (
1415
)
1516

1617
type CertPoolWatcher struct {
17-
generation int
18-
dir string
19-
mx sync.RWMutex
20-
pool *x509.CertPool
21-
log logr.Logger
22-
watcher *fsnotify.Watcher
23-
done chan bool
18+
generation int
19+
dir string
20+
sslCertPaths []string
21+
mx sync.RWMutex
22+
pool *x509.CertPool
23+
log logr.Logger
24+
watcher *fsnotify.Watcher
25+
done chan bool
26+
restart func(int)
2427
}
2528

2629
// Returns the current CertPool and the generation number
@@ -33,75 +36,100 @@ func (cpw *CertPoolWatcher) Get() (*x509.CertPool, int, error) {
3336
return cpw.pool.Clone(), cpw.generation, nil
3437
}
3538

39+
// Change the restart behavior
40+
// The default is to os.Exit(0) when a change to SSL_CERT_DIR or SSL_CERT_FILE
41+
// is detected, this allows a regeneration of the SystemCertPool.
42+
func (cpw *CertPoolWatcher) Restart(f func(int)) {
43+
cpw.restart = f
44+
}
45+
46+
// Indicate that you're done with the CertPoolWatcher so it can terminate
47+
// the watcher go func
3648
func (cpw *CertPoolWatcher) Done() {
3749
cpw.done <- true
3850
}
3951

40-
func NewCertPoolWatcher(caDir string, log logr.Logger) (*CertPoolWatcher, error) {
41-
pool, err := NewCertPool(caDir, log)
52+
func (cpw *CertPoolWatcher) Start(ctx context.Context) error {
53+
var err error
54+
cpw.pool, err = NewCertPool(cpw.dir, cpw.log)
4255
if err != nil {
43-
return nil, err
56+
return err
4457
}
45-
watcher, err := fsnotify.NewWatcher()
58+
cpw.watcher, err = fsnotify.NewWatcher()
4659
if err != nil {
47-
return nil, err
60+
return err
4861
}
4962

50-
// If the SSL_CERT_DIR or SSL_CERT_FILE environment variables are
51-
// specified, this means that we have some control over the system root
52-
// location, thus they may change, thus we should watch those locations.
53-
sslCertDir := os.Getenv("SSL_CERT_DIR")
54-
sslCertFile := os.Getenv("SSL_CERT_FILE")
55-
log.V(defaultLogLevel).Info("SSL environment", "SSL_CERT_DIR", sslCertDir, "SSL_CERT_FILE", sslCertFile)
56-
57-
watchPaths := strings.Split(sslCertDir, ":")
58-
watchPaths = append(watchPaths, caDir, sslCertFile)
59-
watchPaths = slices.DeleteFunc(watchPaths, func(p string) bool {
60-
if p == "" {
61-
return true
62-
}
63-
if _, err := os.Stat(p); err != nil {
64-
return true
65-
}
66-
return false
67-
})
63+
watchPaths := append(cpw.sslCertPaths, cpw.dir)
64+
watchPaths = slices.DeleteFunc(watchPaths, deleteEmptyStrings)
6865

6966
for _, p := range watchPaths {
70-
if err := watcher.Add(p); err != nil {
71-
return nil, err
67+
if err := cpw.watcher.Add(p); err != nil {
68+
return err
7269
}
73-
logPath(p, "watching certificate", log)
70+
logPath(p, "watching certificate", cpw.log)
7471
}
7572

76-
cpw := &CertPoolWatcher{
77-
generation: 1,
78-
dir: caDir,
79-
pool: pool,
80-
log: log,
81-
watcher: watcher,
82-
done: make(chan bool),
83-
}
8473
go func() {
8574
for {
8675
select {
87-
case <-watcher.Events:
76+
case e := <-cpw.watcher.Events:
77+
cpw.checkForRestart(e.Name)
8878
cpw.drainEvents()
8979
cpw.update()
90-
case err := <-watcher.Errors:
91-
log.Error(err, "error watching certificate dir")
80+
case err := <-cpw.watcher.Errors:
81+
cpw.log.Error(err, "error watching certificate dir")
9282
os.Exit(1)
83+
case <-ctx.Done():
84+
cpw.Done()
9385
case <-cpw.done:
94-
err := watcher.Close()
86+
err := cpw.watcher.Close()
9587
if err != nil {
96-
log.Error(err, "error closing watcher")
88+
cpw.log.Error(err, "error closing watcher")
9789
}
9890
return
9991
}
10092
}
10193
}()
94+
return nil
95+
}
96+
97+
func NewCertPoolWatcher(caDir string, log logr.Logger) (*CertPoolWatcher, error) {
98+
// If the SSL_CERT_DIR or SSL_CERT_FILE environment variables are
99+
// specified, this means that we have some control over the system root
100+
// location, thus they may change, thus we should watch those locations.
101+
//
102+
// BECAUSE THE SYSTEM POOL WILL NOT UPDATE, WE HAVE TO RESTART IF THERE
103+
// CHANGES TO EITHER OF THESE LOCATIONS: SSL_CERT_DIR, SSL_CERT_FILE
104+
//
105+
sslCertDir := os.Getenv("SSL_CERT_DIR")
106+
sslCertFile := os.Getenv("SSL_CERT_FILE")
107+
log.V(defaultLogLevel).Info("SSL environment", "SSL_CERT_DIR", sslCertDir, "SSL_CERT_FILE", sslCertFile)
108+
109+
sslCertPaths := append(strings.Split(sslCertDir, ":"), sslCertFile)
110+
sslCertPaths = slices.DeleteFunc(sslCertPaths, deleteEmptyStrings)
111+
112+
cpw := &CertPoolWatcher{
113+
generation: 1,
114+
dir: caDir,
115+
sslCertPaths: sslCertPaths,
116+
log: log,
117+
done: make(chan bool),
118+
restart: os.Exit,
119+
}
102120
return cpw, nil
103121
}
104122

123+
func deleteEmptyStrings(p string) bool {
124+
if p == "" {
125+
return true
126+
}
127+
if _, err := os.Stat(p); err != nil {
128+
return true
129+
}
130+
return false
131+
}
132+
105133
func (cpw *CertPoolWatcher) update() {
106134
cpw.log.Info("updating certificate pool")
107135
pool, err := NewCertPool(cpw.dir, cpw.log)
@@ -115,6 +143,14 @@ func (cpw *CertPoolWatcher) update() {
115143
cpw.generation++
116144
}
117145

146+
func (cpw *CertPoolWatcher) checkForRestart(name string) {
147+
for _, p := range cpw.sslCertPaths {
148+
if strings.Contains(name, p) {
149+
cpw.restart(0)
150+
}
151+
}
152+
}
153+
118154
// Drain as many events as possible before doing anything
119155
// Otherwise, we will be hit with an event for _every_ entry in the
120156
// directory, and end up doing an update for each one
@@ -124,7 +160,8 @@ func (cpw *CertPoolWatcher) drainEvents() {
124160
select {
125161
case <-drainTimer.C:
126162
return
127-
case <-cpw.watcher.Events:
163+
case e := <-cpw.watcher.Events:
164+
cpw.checkForRestart(e.Name)
128165
}
129166
if !drainTimer.Stop() {
130167
<-drainTimer.C

internal/shared/util/http/certpoolwatcher_test.go

Lines changed: 109 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"math/big"
1212
"os"
1313
"path/filepath"
14+
"sync/atomic"
1415
"testing"
1516
"time"
1617

@@ -61,7 +62,94 @@ func createCert(t *testing.T, name string) {
6162
// ignore the key
6263
}
6364

64-
func TestCertPoolWatcher(t *testing.T) {
65+
func createTempCaDir(t *testing.T) string {
66+
tmpCaDir, err := os.MkdirTemp("", "ca-dir")
67+
require.NoError(t, err)
68+
createCert(t, filepath.Join(tmpCaDir, "test1.pem"))
69+
return tmpCaDir
70+
}
71+
72+
func TestCertPoolWatcherCaDir(t *testing.T) {
73+
// create a temporary CA directory
74+
tmpCaDir := createTempCaDir(t)
75+
defer os.RemoveAll(tmpCaDir)
76+
77+
// Create the cert pool watcher
78+
cpw, err := httputil.NewCertPoolWatcher(tmpCaDir, log.FromContext(context.Background()))
79+
require.NoError(t, err)
80+
restarted := &atomic.Bool{}
81+
restarted.Store(false)
82+
cpw.Restart(func(int) { restarted.Store(true) })
83+
err = cpw.Start(context.Background())
84+
require.NoError(t, err)
85+
defer cpw.Done()
86+
87+
// Get the original pool
88+
firstPool, firstGen, err := cpw.Get()
89+
require.NoError(t, err)
90+
require.NotNil(t, firstPool)
91+
92+
// Create a second cert in the CA directory
93+
certName := filepath.Join(tmpCaDir, "test2.pem")
94+
t.Logf("Create cert file at %q\n", certName)
95+
createCert(t, certName)
96+
97+
require.Eventually(t, func() bool {
98+
secondPool, secondGen, err := cpw.Get()
99+
if err != nil {
100+
return false
101+
}
102+
// Should NOT restart, because this is not SSL_CERT_DIR nor SSL_CERT_FILE
103+
return secondGen != firstGen && !firstPool.Equal(secondPool) && !restarted.Load()
104+
}, 10*time.Second, time.Second)
105+
}
106+
107+
func TestCertPoolWatcherSslCertDir(t *testing.T) {
108+
// create a temporary CA directory for SSL_CERT_DIR
109+
tmpSslDir := createTempCaDir(t)
110+
defer os.RemoveAll(tmpSslDir)
111+
112+
// Update environment variables for the watcher - some of these should not exist
113+
os.Setenv("SSL_CERT_DIR", tmpSslDir+":/tmp/does-not-exist.dir")
114+
115+
// Create a different CaDir
116+
tmpCaDir := createTempCaDir(t)
117+
defer os.RemoveAll(tmpCaDir)
118+
119+
// Create the cert pool watcher
120+
cpw, err := httputil.NewCertPoolWatcher(tmpCaDir, log.FromContext(context.Background()))
121+
require.NoError(t, err)
122+
restarted := &atomic.Bool{}
123+
restarted.Store(false)
124+
cpw.Restart(func(int) { restarted.Store(true) })
125+
err = cpw.Start(context.Background())
126+
require.NoError(t, err)
127+
defer cpw.Done()
128+
129+
// Get the original pool
130+
firstPool, firstGen, err := cpw.Get()
131+
require.NoError(t, err)
132+
require.NotNil(t, firstPool)
133+
134+
// Create a second cert in SSL_CIR_DIR
135+
certName := filepath.Join(tmpSslDir, "test2.pem")
136+
t.Logf("Create cert file at %q\n", certName)
137+
createCert(t, certName)
138+
139+
require.Eventually(t, func() bool {
140+
_, secondGen, err := cpw.Get()
141+
if err != nil {
142+
return false
143+
}
144+
// Because SSL_CERT_DIR is part of the SysemCertPool:
145+
// 1. CPW only watches: it doesn't actually load it, that's the SystemCertPool's responsibility
146+
// 2. Because the SystemCertPool never changes, we can't directly compare the pools
147+
// 3. If SSL_CERT_DIR changes, we should expect a restart
148+
return secondGen != firstGen && restarted.Load()
149+
}, 10*time.Second, time.Second)
150+
}
151+
152+
func TestCertPoolWatcherSslCertFile(t *testing.T) {
65153
// create a temporary directory
66154
tmpDir, err := os.MkdirTemp("", "cert-pool")
67155
require.NoError(t, err)
@@ -72,12 +160,20 @@ func TestCertPoolWatcher(t *testing.T) {
72160
t.Logf("Create cert file at %q\n", certName)
73161
createCert(t, certName)
74162

75-
// Update environment variables for the watcher - some of these should not exist
76-
os.Setenv("SSL_CERT_DIR", tmpDir+":/tmp/does-not-exist.dir")
77-
os.Setenv("SSL_CERT_FILE", "/tmp/does-not-exist.file")
163+
// Update environment variables for the watcher
164+
os.Setenv("SSL_CERT_FILE", certName)
165+
166+
// Create a different CaDir
167+
tmpCaDir := createTempCaDir(t)
168+
defer os.RemoveAll(tmpCaDir)
78169

79170
// Create the cert pool watcher
80-
cpw, err := httputil.NewCertPoolWatcher(tmpDir, log.FromContext(context.Background()))
171+
cpw, err := httputil.NewCertPoolWatcher(tmpCaDir, log.FromContext(context.Background()))
172+
require.NoError(t, err)
173+
restarted := &atomic.Bool{}
174+
restarted.Store(false)
175+
cpw.Restart(func(int) { restarted.Store(true) })
176+
err = cpw.Start(context.Background())
81177
require.NoError(t, err)
82178
defer cpw.Done()
83179

@@ -86,16 +182,19 @@ func TestCertPoolWatcher(t *testing.T) {
86182
require.NoError(t, err)
87183
require.NotNil(t, firstPool)
88184

89-
// Create a second cert
90-
certName = filepath.Join(tmpDir, "test2.pem")
185+
// Update the SSL_CERT_FILE
91186
t.Logf("Create cert file at %q\n", certName)
92187
createCert(t, certName)
93188

94189
require.Eventually(t, func() bool {
95-
secondPool, secondGen, err := cpw.Get()
190+
_, secondGen, err := cpw.Get()
96191
if err != nil {
97192
return false
98193
}
99-
return secondGen != firstGen && !firstPool.Equal(secondPool)
100-
}, 30*time.Second, time.Second)
194+
// Because SSL_CERT_FILE is part of the SysemCertPool:
195+
// 1. CPW only watches: it doesn't actually load it, that's the SystemCertPool's responsibility
196+
// 2. Because the SystemCertPool never changes, we can't directly compare the pools
197+
// 3. If SSL_CERT_FILE changes, we should expect a restart
198+
return secondGen != firstGen && restarted.Load()
199+
}, 10*time.Second, time.Second)
101200
}

0 commit comments

Comments
 (0)