Skip to content

Commit ff8e932

Browse files
committed
Restart when SystemCertPool should change
This fixes a downstream bug There was a problem downstream where the OpenShift servivce-ca was not yet available, and due to the way the manifests were set up, the service-ca was considered to be part of the SystemCertPool. The problem is that the SystemCertPool, once initialized, will never reload itself. We can get into this situation when we use SSL_CERT_DIR and SSL_CERT_FILE to provide OpenShift CAs to be used by containers/image for pulling. These environment variables change the source of the SystemCertPool. The CertPoolWatcher then watches these locations, and tries to update the pool it provides to the HTTPS client connecting to catalogd. But the SystemCertPool is never updated. (It did not help that there was no explicit CertPoolWatcher for the pull CAs.) I tried to fix this downstream by removing SSL_CERT_DIR, and specifying the `--pull-cas-dir` option. This means that containers/image would directly use certificates that we specify, rather than the default location. But this breaks the use of custom CAs for local image registries. The containers/image package does not provide a way to manipulate the certificate locations beyond a simple directory setting, and we need to leave that directory setting as the default in downstream because it (i.e. /etc/docker/certs.d) is a host- mounted directory that contains certificates for local image registries. And it is possible to configure a custom CA for a local image registry, so that directory must be included, ALONG with the OpenShift provided CAs and service-ca, which is defined by SSL_CERT_DIR. But because of the use of SSL_CERT_DIR to include the OpenShift service-ca, if the service-ca was not available at startup, but became available later, it was not possible to reload the SystemCertPool. Which could cause problems in operator-controller when it tried to connect to catalogd. The fundamental problem is that there's no way to refresh the SystemCertPool, which will become more and more of an issue as certificate lifetimes decrease. Using SSL_CERT_DIR allows us to use the CertPoolWatcher to notice changes to the SystemCertPool. This will allow us to restart the process when certificates change (e.g. OpenShift service-ca becomes available). Changes: * Update CertPoolWatcher to restart on changes to SSL_CERT_DIR and SSL_CERT_FILE * Update CertPoolWatcher to use a Runnable interface, so that it can be added to the manager, and started later, which may improve the changes that the service-ca is ready. * Update CertPoolWatcher to not be created when there's nothing to watch. * Add CertPoolWatcher to catalogd for pull CAs * Add CertPoolWatcher to operator-controller for pull CAs * Improve logging With this, my downstream manifest change should be reverted. Signed-off-by: Todd Short <[email protected]> Assisted-by: Claude Code (alternatives)
1 parent 182a0a6 commit ff8e932

File tree

6 files changed

+298
-68
lines changed

6 files changed

+298
-68
lines changed

cmd/catalogd/main.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ import (
6060
"github.com/operator-framework/operator-controller/internal/catalogd/webhook"
6161
sharedcontrollers "github.com/operator-framework/operator-controller/internal/shared/controllers"
6262
fsutil "github.com/operator-framework/operator-controller/internal/shared/util/fs"
63+
httputil "github.com/operator-framework/operator-controller/internal/shared/util/http"
6364
imageutil "github.com/operator-framework/operator-controller/internal/shared/util/image"
6465
"github.com/operator-framework/operator-controller/internal/shared/util/pullsecretcache"
6566
sautil "github.com/operator-framework/operator-controller/internal/shared/util/sa"
@@ -291,6 +292,18 @@ func run(ctx context.Context) error {
291292
return err
292293
}
293294

295+
// This watches the pullCasDir and the SSL_CERT_DIR, and SSL_CERT_FILE for changes
296+
cpwPull, err := httputil.NewCertPoolWatcher(cfg.pullCasDir, ctrl.Log.WithName("pull-ca-pool"))
297+
if err != nil {
298+
setupLog.Error(err, "unable to create pull-ca-pool watcher")
299+
return err
300+
}
301+
cpwPull.Restart(os.Exit)
302+
if err = mgr.Add(cpwPull); err != nil {
303+
setupLog.Error(err, "unable to add pull-ca-pool watcher to manager")
304+
return err
305+
}
306+
294307
if cfg.systemNamespace == "" {
295308
cfg.systemNamespace = podNamespace()
296309
}

cmd/operator-controller/main.go

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -336,9 +336,26 @@ func run() error {
336336
return err
337337
}
338338

339-
certPoolWatcher, err := httputil.NewCertPoolWatcher(cfg.catalogdCasDir, ctrl.Log.WithName("cert-pool"))
339+
cpwCatalogd, err := httputil.NewCertPoolWatcher(cfg.catalogdCasDir, ctrl.Log.WithName("catalogd-ca-pool"))
340340
if err != nil {
341-
setupLog.Error(err, "unable to create CA certificate pool")
341+
setupLog.Error(err, "unable to create catalogd-ca-pool watcher")
342+
return err
343+
}
344+
cpwCatalogd.Restart(os.Exit)
345+
if err = mgr.Add(cpwCatalogd); err != nil {
346+
setupLog.Error(err, "unable to add catalogd-ca-pool watcher to manager")
347+
return err
348+
}
349+
350+
// This watches the pullCasDir and the SSL_CERT_DIR, and SSL_CERT_FILE for changes
351+
cpwPull, err := httputil.NewCertPoolWatcher(cfg.pullCasDir, ctrl.Log.WithName("pull-ca-pool"))
352+
if err != nil {
353+
setupLog.Error(err, "unable to create pull-ca-pool watcher")
354+
return err
355+
}
356+
cpwPull.Restart(os.Exit)
357+
if err = mgr.Add(cpwPull); err != nil {
358+
setupLog.Error(err, "unable to add pull-ca-pool watcher to manager")
342359
return err
343360
}
344361

@@ -392,7 +409,7 @@ func run() error {
392409
}
393410
catalogClientBackend := cache.NewFilesystemCache(catalogsCachePath)
394411
catalogClient := catalogclient.New(catalogClientBackend, func() (*http.Client, error) {
395-
return httputil.BuildHTTPClient(certPoolWatcher)
412+
return httputil.BuildHTTPClient(cpwCatalogd)
396413
})
397414

398415
resolver := &resolve.CatalogResolver{

internal/shared/util/http/certlog.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,16 @@ func logFile(filename, path, action string, log logr.Logger) {
122122
log.Error(err, "error in os.ReadFile()", "file", filepath)
123123
return
124124
}
125-
logPem(data, filename, path, action, log)
125+
if len(data) > 0 {
126+
logPem(data, filename, path, action, log)
127+
return
128+
}
129+
// Indicate that the file is empty
130+
args := []any{"file", filename, "empty", "true"}
131+
if path != "" {
132+
args = append(args, "directory", path)
133+
}
134+
log.V(defaultLogLevel).Info(action, args...)
126135
}
127136

128137
func logPem(data []byte, filename, path, action string, log logr.Logger) {

internal/shared/util/http/certpoolwatcher.go

Lines changed: 101 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package http
22

33
import (
4+
"context"
45
"crypto/x509"
56
"fmt"
67
"os"
@@ -14,13 +15,15 @@ import (
1415
)
1516

1617
type CertPoolWatcher struct {
17-
generation int
18-
dir string
19-
mx sync.RWMutex
20-
pool *x509.CertPool
21-
log logr.Logger
22-
watcher *fsnotify.Watcher
23-
done chan bool
18+
generation int
19+
dir string
20+
sslCertPaths []string
21+
mx sync.RWMutex
22+
pool *x509.CertPool
23+
log logr.Logger
24+
watcher *fsnotify.Watcher
25+
done chan bool
26+
restart func(int)
2427
}
2528

2629
// Returns the current CertPool and the generation number
@@ -33,77 +36,111 @@ func (cpw *CertPoolWatcher) Get() (*x509.CertPool, int, error) {
3336
return cpw.pool.Clone(), cpw.generation, nil
3437
}
3538

36-
func (cpw *CertPoolWatcher) Done() {
37-
cpw.done <- true
39+
// Change the restart behavior
40+
func (cpw *CertPoolWatcher) Restart(f func(int)) {
41+
cpw.restart = f
3842
}
3943

40-
func NewCertPoolWatcher(caDir string, log logr.Logger) (*CertPoolWatcher, error) {
41-
pool, err := NewCertPool(caDir, log)
42-
if err != nil {
43-
return nil, err
44+
// Indicate that you're done with the CertPoolWatcher so it can terminate
45+
// the watcher go func
46+
func (cpw *CertPoolWatcher) Done() {
47+
if cpw.watcher != nil {
48+
cpw.done <- true
4449
}
45-
watcher, err := fsnotify.NewWatcher()
50+
}
51+
52+
func (cpw *CertPoolWatcher) Start(ctx context.Context) error {
53+
var err error
54+
cpw.pool, err = NewCertPool(cpw.dir, cpw.log)
4655
if err != nil {
47-
return nil, err
56+
return err
4857
}
4958

50-
// If the SSL_CERT_DIR or SSL_CERT_FILE environment variables are
51-
// specified, this means that we have some control over the system root
52-
// location, thus they may change, thus we should watch those locations.
53-
sslCertDir := os.Getenv("SSL_CERT_DIR")
54-
sslCertFile := os.Getenv("SSL_CERT_FILE")
55-
log.V(defaultLogLevel).Info("SSL environment", "SSL_CERT_DIR", sslCertDir, "SSL_CERT_FILE", sslCertFile)
59+
watchPaths := append(cpw.sslCertPaths, cpw.dir)
60+
watchPaths = slices.DeleteFunc(watchPaths, deleteEmptyStrings)
5661

57-
watchPaths := strings.Split(sslCertDir, ":")
58-
watchPaths = append(watchPaths, caDir, sslCertFile)
59-
watchPaths = slices.DeleteFunc(watchPaths, func(p string) bool {
60-
if p == "" {
61-
return true
62-
}
63-
if _, err := os.Stat(p); err != nil {
64-
return true
65-
}
66-
return false
67-
})
62+
// Nothing was configured to be watched, which means this is
63+
// using the SystemCertPool, so we still need to no error out
64+
if len(watchPaths) == 0 {
65+
cpw.log.Info("No paths to watch")
66+
return nil
67+
}
68+
69+
cpw.watcher, err = fsnotify.NewWatcher()
70+
if err != nil {
71+
return err
72+
}
6873

6974
for _, p := range watchPaths {
70-
if err := watcher.Add(p); err != nil {
71-
return nil, err
75+
if err := cpw.watcher.Add(p); err != nil {
76+
cpw.watcher.Close()
77+
cpw.watcher = nil
78+
return err
7279
}
73-
logPath(p, "watching certificate", log)
80+
logPath(p, "watching certificate", cpw.log)
7481
}
7582

76-
cpw := &CertPoolWatcher{
77-
generation: 1,
78-
dir: caDir,
79-
pool: pool,
80-
log: log,
81-
watcher: watcher,
82-
done: make(chan bool),
83-
}
8483
go func() {
8584
for {
8685
select {
87-
case <-watcher.Events:
86+
case e := <-cpw.watcher.Events:
87+
cpw.checkForRestart(e.Name)
8888
cpw.drainEvents()
89-
cpw.update()
90-
case err := <-watcher.Errors:
91-
log.Error(err, "error watching certificate dir")
89+
cpw.update(e.Name)
90+
case err := <-cpw.watcher.Errors:
91+
cpw.log.Error(err, "error watching certificate dir")
9292
os.Exit(1)
93+
case <-ctx.Done():
94+
cpw.Done()
9395
case <-cpw.done:
94-
err := watcher.Close()
96+
err := cpw.watcher.Close()
9597
if err != nil {
96-
log.Error(err, "error closing watcher")
98+
cpw.log.Error(err, "error closing watcher")
9799
}
98100
return
99101
}
100102
}
101103
}()
104+
return nil
105+
}
106+
107+
func NewCertPoolWatcher(caDir string, log logr.Logger) (*CertPoolWatcher, error) {
108+
// If the SSL_CERT_DIR or SSL_CERT_FILE environment variables are
109+
// specified, this means that we have some control over the system root
110+
// location, thus they may change, thus we should watch those locations.
111+
//
112+
// BECAUSE THE SYSTEM POOL WILL NOT UPDATE, WE HAVE TO RESTART IF THERE
113+
// CHANGES TO EITHER OF THESE LOCATIONS: SSL_CERT_DIR, SSL_CERT_FILE
114+
//
115+
sslCertDir := os.Getenv("SSL_CERT_DIR")
116+
sslCertFile := os.Getenv("SSL_CERT_FILE")
117+
log.V(defaultLogLevel).Info("SSL environment", "SSL_CERT_DIR", sslCertDir, "SSL_CERT_FILE", sslCertFile)
118+
119+
sslCertPaths := append(strings.Split(sslCertDir, ":"), sslCertFile)
120+
sslCertPaths = slices.DeleteFunc(sslCertPaths, deleteEmptyStrings)
121+
122+
cpw := &CertPoolWatcher{
123+
generation: 1,
124+
dir: caDir,
125+
sslCertPaths: sslCertPaths,
126+
log: log,
127+
done: make(chan bool),
128+
}
102129
return cpw, nil
103130
}
104131

105-
func (cpw *CertPoolWatcher) update() {
106-
cpw.log.Info("updating certificate pool")
132+
func deleteEmptyStrings(p string) bool {
133+
if p == "" {
134+
return true
135+
}
136+
if _, err := os.Stat(p); err != nil {
137+
return true
138+
}
139+
return false
140+
}
141+
142+
func (cpw *CertPoolWatcher) update(name string) {
143+
cpw.log.Info("updating certificate pool", "file", name)
107144
pool, err := NewCertPool(cpw.dir, cpw.log)
108145
if err != nil {
109146
cpw.log.Error(err, "error updating certificate pool")
@@ -115,6 +152,17 @@ func (cpw *CertPoolWatcher) update() {
115152
cpw.generation++
116153
}
117154

155+
func (cpw *CertPoolWatcher) checkForRestart(name string) {
156+
for _, p := range cpw.sslCertPaths {
157+
if strings.Contains(name, p) {
158+
cpw.log.Info("restarting due to file change", "file", name)
159+
if cpw.restart != nil {
160+
cpw.restart(0)
161+
}
162+
}
163+
}
164+
}
165+
118166
// Drain as many events as possible before doing anything
119167
// Otherwise, we will be hit with an event for _every_ entry in the
120168
// directory, and end up doing an update for each one
@@ -124,7 +172,8 @@ func (cpw *CertPoolWatcher) drainEvents() {
124172
select {
125173
case <-drainTimer.C:
126174
return
127-
case <-cpw.watcher.Events:
175+
case e := <-cpw.watcher.Events:
176+
cpw.checkForRestart(e.Name)
128177
}
129178
if !drainTimer.Stop() {
130179
<-drainTimer.C

0 commit comments

Comments
 (0)