From d4c44a56836a02f20bd913ff2cdac22583629bb4 Mon Sep 17 00:00:00 2001
From: Timo Beckers <timo@isovalent.com>
Date: Wed, 19 Feb 2025 16:22:43 +0100
Subject: [PATCH 1/3] prog: factor retry decision into separate testable
 function

This commit special-cases LogDisabled into a clause without a retry loop since
it doesn't need a buffer allocated. It also simplifies the logic somewhat.

A follow-up commit will add another separate clause for a new LogSizeOverride
CollectionOption to factor out even more complexity related to buffer sizing.

The main purpose of this commit is to extract the decision-making logic around
retries. Instead of directly manipulating loop control flow, a function is
introduced that issues a retry verdict, and it can be tested indendently of
the program load action.

Signed-off-by: Timo Beckers <timo@isovalent.com>
---
 prog.go      | 100 +++++++++++++++++++++++++++++----------------------
 prog_test.go |  40 +++++++++++++++++++++
 2 files changed, 97 insertions(+), 43 deletions(-)

diff --git a/prog.go b/prog.go
index 8fcae6a3f..85adf3b0f 100644
--- a/prog.go
+++ b/prog.go
@@ -409,59 +409,39 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, er
 		}
 	}
 
-	// The caller requested a specific verifier log level. Set up the log buffer
-	// so that there is a chance of loading the program in a single shot.
-	logSize := internal.Between(opts.LogSizeStart, minVerifierLogSize, maxVerifierLogSize)
 	var logBuf []byte
-	if !opts.LogDisabled && opts.LogLevel != 0 {
-		logBuf = make([]byte, logSize)
-		attr.LogLevel = opts.LogLevel
-		attr.LogSize = uint32(len(logBuf))
-		attr.LogBuf = sys.NewSlicePointer(logBuf)
-	}
+	var fd *sys.FD
 
-	for {
-		var fd *sys.FD
+	if opts.LogDisabled {
+		// Loading with logging disabled should never retry.
 		fd, err = sys.ProgLoad(attr)
 		if err == nil {
-			return &Program{unix.ByteSliceToString(logBuf), fd, spec.Name, "", spec.Type}, nil
-		}
-
-		if opts.LogDisabled {
-			break
+			return &Program{"", fd, spec.Name, "", spec.Type}, nil
 		}
-
-		if attr.LogTrueSize != 0 && attr.LogSize >= attr.LogTrueSize {
-			// The log buffer already has the correct size.
-			break
-		}
-
-		if attr.LogSize != 0 && !errors.Is(err, unix.ENOSPC) {
-			// Logging is enabled and the error is not ENOSPC, so we can infer
-			// that the log buffer is large enough.
-			break
+	} else {
+		// Only specify log size if log level is also specified. Setting size
+		// without level results in EINVAL. Level will be bumped to LogLevelBranch
+		// if the first load fails.
+		if opts.LogLevel != 0 {
+			attr.LogLevel = opts.LogLevel
+			attr.LogSize = internal.Between(opts.LogSizeStart, minVerifierLogSize, maxVerifierLogSize)
 		}
 
-		if attr.LogLevel == 0 {
-			// Logging is not enabled but loading the program failed. Enable
-			// basic logging.
-			attr.LogLevel = LogLevelBranch
-		}
+		for {
+			if attr.LogLevel != 0 {
+				logBuf = make([]byte, attr.LogSize)
+				attr.LogBuf = sys.NewSlicePointer(logBuf)
+			}
 
-		// Make an educated guess how large the buffer should be by multiplying.
-		// Ensure the size doesn't overflow.
-		const factor = 2
-		logSize = internal.Between(logSize, minVerifierLogSize, maxVerifierLogSize/factor)
-		logSize *= factor
+			fd, err = sys.ProgLoad(attr)
+			if err == nil {
+				return &Program{unix.ByteSliceToString(logBuf), fd, spec.Name, "", spec.Type}, nil
+			}
 
-		if attr.LogTrueSize != 0 {
-			// The kernel has given us a hint how large the log buffer has to be.
-			logSize = attr.LogTrueSize
+			if !retryLogAttrs(attr, opts.LogSizeStart, err) {
+				break
+			}
 		}
-
-		logBuf = make([]byte, logSize)
-		attr.LogSize = logSize
-		attr.LogBuf = sys.NewSlicePointer(logBuf)
 	}
 
 	end := bytes.IndexByte(logBuf, 0)
@@ -511,6 +491,40 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, er
 	return nil, internal.ErrorWithLog("load program", err, logBuf)
 }
 
+func retryLogAttrs(attr *sys.ProgLoadAttr, startSize uint32, err error) bool {
+	// ENOSPC means the log was enabled on the previous iteration, so we only
+	// need to grow the buffer.
+	if errors.Is(err, unix.ENOSPC) {
+		if attr.LogTrueSize != 0 {
+			// Kernel supports LogTrueSize and previous iteration undershot the buffer
+			// size. Try again with the given true size.
+			attr.LogSize = attr.LogTrueSize
+			return true
+		}
+
+		// ENOSPC means we've loaded the program before and the log buffer was too
+		// small. Make an educated guess how large the buffer should be by
+		// multiplying. Ensure the size doesn't overflow.
+		const factor = 2
+		attr.LogSize = internal.Between(attr.LogSize, minVerifierLogSize, maxVerifierLogSize/factor)
+		attr.LogSize *= factor
+
+		return true
+	}
+
+	if attr.LogLevel == 0 {
+		// Loading the program failed, it wasn't a buffer-related error, and the log
+		// was disabled the previous iteration. Enable basic logging and retry.
+		attr.LogLevel = LogLevelBranch
+		attr.LogSize = internal.Between(startSize, minVerifierLogSize, maxVerifierLogSize)
+		return true
+	}
+
+	// Loading the program failed for a reason other than buffer size and the log
+	// was already enabled the previous iteration. Don't retry.
+	return false
+}
+
 // NewProgramFromFD creates a program from a raw fd.
 //
 // You should not use fd after calling this function.
diff --git a/prog_test.go b/prog_test.go
index 7781a0ca6..e1877b8a1 100644
--- a/prog_test.go
+++ b/prog_test.go
@@ -544,6 +544,46 @@ func TestProgramVerifierLog(t *testing.T) {
 	prog.Close()
 }
 
+func TestProgramVerifierLogRetry(t *testing.T) {
+	someError := errors.New("not a buffer error")
+
+	t.Run("retry with oversized buffer", func(t *testing.T) {
+		// First load failure, without logging enabled. Retry with logging enabled.
+		attr := &sys.ProgLoadAttr{LogLevel: 0, LogSize: 0}
+		qt.Assert(t, qt.IsTrue(retryLogAttrs(attr, 0, someError)))
+		qt.Assert(t, qt.Equals(attr.LogLevel, LogLevelBranch))
+		qt.Assert(t, qt.Equals(attr.LogSize, minVerifierLogSize))
+
+		// Second failure with logging enabled. No buffer error, don't retry.
+		qt.Assert(t, qt.IsFalse(retryLogAttrs(attr, 0, someError)))
+		qt.Assert(t, qt.Equals(attr.LogLevel, LogLevelBranch))
+		qt.Assert(t, qt.Equals(attr.LogSize, minVerifierLogSize))
+	})
+
+	t.Run("retry with undersized buffer, no log_true_size", func(t *testing.T) {
+		// First load failure, without logging enabled. Retry with larger buffer.
+		attr := &sys.ProgLoadAttr{LogLevel: 0, LogSize: 0}
+		qt.Assert(t, qt.IsTrue(retryLogAttrs(attr, 0, someError)))
+
+		// Second failure, this time the kernel signals an undersized buffer. Retry
+		// with double the size.
+		qt.Assert(t, qt.IsTrue(retryLogAttrs(attr, 0, unix.ENOSPC)))
+		qt.Assert(t, qt.Equals(attr.LogSize, minVerifierLogSize*2))
+	})
+
+	t.Run("retry with undersized buffer, with log_true_size", func(t *testing.T) {
+		// First load failure, without logging enabled. Retry with larger buffer.
+		attr := &sys.ProgLoadAttr{LogLevel: 0, LogSize: 0}
+		qt.Assert(t, qt.IsTrue(retryLogAttrs(attr, 0, someError)))
+
+		// Second failure, the kernel signals undersized buffer and also sets
+		// log_true_size. Retry with the exact size required.
+		attr.LogTrueSize = 123
+		qt.Assert(t, qt.IsTrue(retryLogAttrs(attr, 0, unix.ENOSPC)))
+		qt.Assert(t, qt.Equals(attr.LogSize, 123))
+	})
+}
+
 func TestProgramWithUnsatisfiedMap(t *testing.T) {
 	coll, err := LoadCollectionSpec("testdata/loader-el.elf")
 	if err != nil {

From 9e5357d5fb222906d24afeb560dc9b36d8ad4011 Mon Sep 17 00:00:00 2001
From: Timo Beckers <timo@isovalent.com>
Date: Wed, 19 Feb 2025 16:55:36 +0100
Subject: [PATCH 2/3] prog: give up when max verifier log size is reached

Previously, loading a program would loop forever if the log didn't fit
in the kernel's max log size.

Signed-off-by: Timo Beckers <timo@isovalent.com>
---
 prog.go      | 16 ++++++++++++++--
 prog_test.go | 24 ++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/prog.go b/prog.go
index 85adf3b0f..0e8f2d41b 100644
--- a/prog.go
+++ b/prog.go
@@ -503,9 +503,21 @@ func retryLogAttrs(attr *sys.ProgLoadAttr, startSize uint32, err error) bool {
 		}
 
 		// ENOSPC means we've loaded the program before and the log buffer was too
-		// small. Make an educated guess how large the buffer should be by
-		// multiplying. Ensure the size doesn't overflow.
+		// small.
+		if attr.LogSize == maxVerifierLogSize {
+			// Maximum buffer size reached, don't grow or retry.
+			return false
+		}
+
+		// Ensure the size doesn't overflow.
 		const factor = 2
+		if attr.LogSize >= maxVerifierLogSize/factor {
+			attr.LogSize = maxVerifierLogSize
+			return true
+		}
+
+		// Make an educated guess how large the buffer should be by multiplying. Due
+		// to int division, this rounds down odd sizes.
 		attr.LogSize = internal.Between(attr.LogSize, minVerifierLogSize, maxVerifierLogSize/factor)
 		attr.LogSize *= factor
 
diff --git a/prog_test.go b/prog_test.go
index e1877b8a1..8b2b3bfaf 100644
--- a/prog_test.go
+++ b/prog_test.go
@@ -582,6 +582,30 @@ func TestProgramVerifierLogRetry(t *testing.T) {
 		qt.Assert(t, qt.IsTrue(retryLogAttrs(attr, 0, unix.ENOSPC)))
 		qt.Assert(t, qt.Equals(attr.LogSize, 123))
 	})
+
+	t.Run("grow to maximum buffer size", func(t *testing.T) {
+		// Previous loads pushed the log size to (or above) half of the maximum,
+		// which would make it overflow on the next retry. Make sure the log size
+		// actually hits the maximum so we can bail out.
+		attr := &sys.ProgLoadAttr{LogLevel: LogLevelBranch, LogSize: maxVerifierLogSize / 2}
+		qt.Assert(t, qt.IsTrue(retryLogAttrs(attr, 0, unix.ENOSPC)))
+		qt.Assert(t, qt.Equals(attr.LogSize, maxVerifierLogSize))
+
+		// Don't retry if the buffer is already at the maximum size.
+		qt.Assert(t, qt.IsFalse(retryLogAttrs(attr, 0, unix.ENOSPC)))
+	})
+
+	t.Run("start at maximum buffer size", func(t *testing.T) {
+		// The user requested a log buffer exceeding the maximum size, but no log
+		// level. Retry with the maximum size and default log level.
+		attr := &sys.ProgLoadAttr{LogLevel: 0, LogSize: 0}
+		qt.Assert(t, qt.IsTrue(retryLogAttrs(attr, math.MaxUint32, unix.EINVAL)))
+		qt.Assert(t, qt.Equals(attr.LogLevel, LogLevelBranch))
+		qt.Assert(t, qt.Equals(attr.LogSize, maxVerifierLogSize))
+
+		// Log still doesn't fit maximum-size buffer. Don't retry.
+		qt.Assert(t, qt.IsFalse(retryLogAttrs(attr, 0, unix.ENOSPC)))
+	})
 }
 
 func TestProgramWithUnsatisfiedMap(t *testing.T) {

From fbe9df5b62869e8f83d1fe81a07d4b6a1beb56ea Mon Sep 17 00:00:00 2001
From: Timo Beckers <timo@isovalent.com>
Date: Wed, 19 Feb 2025 17:11:04 +0100
Subject: [PATCH 3/3] prog: prevent prog_load loop of death

To limit the damage of any future regressions, limit the attempts we'll take
to load a program.

Signed-off-by: Timo Beckers <timo@isovalent.com>
---
 prog.go      | 12 ++++++++++++
 prog_test.go | 11 +++++++++++
 2 files changed, 23 insertions(+)

diff --git a/prog.go b/prog.go
index 0e8f2d41b..91ce7c90c 100644
--- a/prog.go
+++ b/prog.go
@@ -56,6 +56,12 @@ const minVerifierLogSize = 64 * 1024
 // future, but avoid the unnecessary EINVAL for now.
 const maxVerifierLogSize = math.MaxUint32 >> 2
 
+// maxVerifierAttempts is the maximum number of times the verifier will retry
+// loading a program with a growing log buffer before giving up. Since we double
+// the log size on every attempt, this is the absolute maximum number of
+// attempts before the buffer reaches [maxVerifierLogSize].
+const maxVerifierAttempts = 30
+
 // ProgramOptions control loading a program into the kernel.
 type ProgramOptions struct {
 	// Bitmap controlling the detail emitted by the kernel's eBPF verifier log.
@@ -427,6 +433,7 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, er
 			attr.LogSize = internal.Between(opts.LogSizeStart, minVerifierLogSize, maxVerifierLogSize)
 		}
 
+		attempts := 1
 		for {
 			if attr.LogLevel != 0 {
 				logBuf = make([]byte, attr.LogSize)
@@ -441,6 +448,11 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, er
 			if !retryLogAttrs(attr, opts.LogSizeStart, err) {
 				break
 			}
+
+			if attempts >= maxVerifierAttempts {
+				return nil, fmt.Errorf("load program: %w (bug: hit %d verifier attempts)", err, maxVerifierAttempts)
+			}
+			attempts++
 		}
 	}
 
diff --git a/prog_test.go b/prog_test.go
index 8b2b3bfaf..9ce7e83fe 100644
--- a/prog_test.go
+++ b/prog_test.go
@@ -606,6 +606,17 @@ func TestProgramVerifierLogRetry(t *testing.T) {
 		// Log still doesn't fit maximum-size buffer. Don't retry.
 		qt.Assert(t, qt.IsFalse(retryLogAttrs(attr, 0, unix.ENOSPC)))
 	})
+
+	t.Run("ensure growth terminates within max attempts", func(t *testing.T) {
+		attr := &sys.ProgLoadAttr{LogLevel: 0, LogSize: 0}
+		var terminated bool
+		for i := 1; i <= maxVerifierAttempts; i++ {
+			if !retryLogAttrs(attr, 0, syscall.ENOSPC) {
+				terminated = true
+			}
+		}
+		qt.Assert(t, qt.IsTrue(terminated))
+	})
 }
 
 func TestProgramWithUnsatisfiedMap(t *testing.T) {