Skip to content

Commit

Permalink
Implement delayed termination to achieve zero downtime upgrades (#1159)
Browse files Browse the repository at this point in the history
Problem:
During an upgrade of NGF, external clients can experience downtime.

Solution:
- Introduce configurable delayed termination.
  - Add sleep subcommand to gateway binary
  - Add lifecycle paramaters to helm to both nginx-gateway and nginx
    containers.
  - Add terminationGracePeriodSeconds parameter to helm.
  - Add affinity parameter to helm (primary needed for testing to
    prevent pods running on the same node).
- Rerun zero downtime non-functional tests.

Testing:
- Manual testing

SOLVES #1155

Co-authored-by: Saylor Berman <[email protected]>
  • Loading branch information
pleshakov and sjberman authored Oct 20, 2023
1 parent ee0c491 commit 29b45e3
Show file tree
Hide file tree
Showing 16 changed files with 1,622 additions and 35 deletions.
110 changes: 81 additions & 29 deletions cmd/gateway/commands.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"errors"
"fmt"
"os"
"time"

"github.com/spf13/cobra"
"go.uber.org/zap"
Expand All @@ -22,23 +23,11 @@ const (
gatewayClassFlag = "gatewayclass"
gatewayClassNameUsage = `The name of the GatewayClass resource. ` +
`Every NGINX Gateway Fabric must have a unique corresponding GatewayClass resource.`
gatewayCtrlNameFlag = "gateway-ctlr-name"
gatewayCtrlNameUsageFmt = `The name of the Gateway controller. ` +
gatewayCtlrNameFlag = "gateway-ctlr-name"
gatewayCtlrNameUsageFmt = `The name of the Gateway controller. ` +
`The controller name must be of the form: DOMAIN/PATH. The controller's domain is '%s'`
)

var (
// Backing values for common cli flags shared among all subcommands
// The values are managed by the Root command.
gatewayCtlrName = stringValidatingValue{
validator: validateGatewayControllerName,
}

gatewayClassName = stringValidatingValue{
validator: validateResourceName,
}
)

func createRootCommand() *cobra.Command {
rootCmd := &cobra.Command{
Use: "gateway",
Expand All @@ -47,20 +36,6 @@ func createRootCommand() *cobra.Command {
},
}

rootCmd.PersistentFlags().Var(
&gatewayCtlrName,
gatewayCtrlNameFlag,
fmt.Sprintf(gatewayCtrlNameUsageFmt, domain),
)
utilruntime.Must(rootCmd.MarkPersistentFlagRequired(gatewayCtrlNameFlag))

rootCmd.PersistentFlags().Var(
&gatewayClassName,
gatewayClassFlag,
gatewayClassNameUsage,
)
utilruntime.Must(rootCmd.MarkPersistentFlagRequired(gatewayClassFlag))

return rootCmd
}

Expand All @@ -82,6 +57,14 @@ func createStaticModeCommand() *cobra.Command {

// flag values
var (
gatewayCtlrName = stringValidatingValue{
validator: validateGatewayControllerName,
}

gatewayClassName = stringValidatingValue{
validator: validateResourceName,
}

updateGCStatus bool
gateway = namespacedNameValue{}
configName = stringValidatingValue{
Expand Down Expand Up @@ -185,6 +168,20 @@ func createStaticModeCommand() *cobra.Command {
},
}

cmd.Flags().Var(
&gatewayCtlrName,
gatewayCtlrNameFlag,
fmt.Sprintf(gatewayCtlrNameUsageFmt, domain),
)
utilruntime.Must(cmd.MarkFlagRequired(gatewayCtlrNameFlag))

cmd.Flags().Var(
&gatewayClassName,
gatewayClassFlag,
gatewayClassNameUsage,
)
utilruntime.Must(cmd.MarkFlagRequired(gatewayClassFlag))

cmd.Flags().Var(
&gateway,
gatewayFlag,
Expand Down Expand Up @@ -271,7 +268,16 @@ func createStaticModeCommand() *cobra.Command {
}

func createProvisionerModeCommand() *cobra.Command {
return &cobra.Command{
var (
gatewayCtlrName = stringValidatingValue{
validator: validateGatewayControllerName,
}
gatewayClassName = stringValidatingValue{
validator: validateResourceName,
}
)

cmd := &cobra.Command{
Use: "provisioner-mode",
Short: "Provision a static-mode NGINX Gateway Fabric Deployment per Gateway resource",
Hidden: true,
Expand All @@ -291,4 +297,50 @@ func createProvisionerModeCommand() *cobra.Command {
})
},
}

cmd.Flags().Var(
&gatewayCtlrName,
gatewayCtlrNameFlag,
fmt.Sprintf(gatewayCtlrNameUsageFmt, domain),
)
utilruntime.Must(cmd.MarkFlagRequired(gatewayCtlrNameFlag))

cmd.Flags().Var(
&gatewayClassName,
gatewayClassFlag,
gatewayClassNameUsage,
)
utilruntime.Must(cmd.MarkFlagRequired(gatewayClassFlag))

return cmd
}

// FIXME(pleshakov): Remove this command once NGF min supported Kubernetes version supports sleep action in
// preStop hook.
// nolint:lll
// See https://github.com/kubernetes/enhancements/tree/4ec371d92dcd4f56a2ab18c8ba20bb85d8d20efe/keps/sig-node/3960-pod-lifecycle-sleep-action
func createSleepCommand() *cobra.Command {
// flag names
const durationFlag = "duration"
// flag values
var duration time.Duration

cmd := &cobra.Command{
Use: "sleep",
Short: "Sleep for specified duration and exit",
Run: func(cmd *cobra.Command, args []string) {
// It is expected that this command is run from lifecycle hook.
// Because logs from hooks are not visible in the container logs, we don't log here at all.
time.Sleep(duration)
},
}

cmd.Flags().DurationVar(
&duration,
durationFlag,
30*time.Second,
"Set the duration of sleep. Must be parsable by https://pkg.go.dev/time#ParseDuration",
)

return cmd
}
86 changes: 80 additions & 6 deletions cmd/gateway/commands_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,17 @@ func testFlag(t *testing.T, cmd *cobra.Command, test flagTestCase) {
}
}

func TestRootCmdFlagValidation(t *testing.T) {
func TestRootCmd(t *testing.T) {
testCase := flagTestCase{
name: "no flags",
args: nil,
wantErr: false,
}

testFlag(t, createRootCommand(), testCase)
}

func TestCommonFlagsValidation(t *testing.T) {
tests := []flagTestCase{
{
name: "valid flags",
Expand Down Expand Up @@ -103,9 +113,11 @@ func TestRootCmdFlagValidation(t *testing.T) {
}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
rootCmd := createRootCommand()
testFlag(t, rootCmd, test)
t.Run(test.name+"_static_mode", func(t *testing.T) {
testFlag(t, createStaticModeCommand(), test)
})
t.Run(test.name+"_provisioner_mode", func(t *testing.T) {
testFlag(t, createProvisionerModeCommand(), test)
})
}
}
Expand All @@ -115,6 +127,8 @@ func TestStaticModeCmdFlagValidation(t *testing.T) {
{
name: "valid flags",
args: []string{
"--gateway-ctlr-name=gateway.nginx.org/nginx-gateway", // common and required flag
"--gatewayclass=nginx", // common and required flag
"--gateway=nginx-gateway/nginx",
"--config=nginx-gateway-config",
"--service=nginx-gateway",
Expand All @@ -130,8 +144,11 @@ func TestStaticModeCmdFlagValidation(t *testing.T) {
wantErr: false,
},
{
name: "valid flags, not set",
args: nil,
name: "valid flags, non-required not set",
args: []string{
"--gateway-ctlr-name=gateway.nginx.org/nginx-gateway", // common and required flag
"--gatewayclass=nginx", // common and required flag,
},
wantErr: false,
},
{
Expand Down Expand Up @@ -280,10 +297,67 @@ func TestStaticModeCmdFlagValidation(t *testing.T) {
},
}

// common flags validation is tested separately

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
cmd := createStaticModeCommand()
testFlag(t, cmd, test)
})
}
}

func TestProvisionerModeCmdFlagValidation(t *testing.T) {
testCase := flagTestCase{
name: "valid flags",
args: []string{
"--gateway-ctlr-name=gateway.nginx.org/nginx-gateway", // common and required flag
"--gatewayclass=nginx", // common and required flag
},
wantErr: false,
}

// common flags validation is tested separately

testFlag(t, createProvisionerModeCommand(), testCase)
}

func TestSleepCmdFlagValidation(t *testing.T) {
tests := []flagTestCase{
{
name: "valid flags",
args: []string{
"--duration=1s",
},
wantErr: false,
},
{
name: "omitted flags",
args: nil,
wantErr: false,
},
{
name: "duration is set to empty string",
args: []string{
"--duration=",
},
wantErr: true,
expectedErrPrefix: `invalid argument "" for "--duration" flag: time: invalid duration ""`,
},
{
name: "duration is invalid",
args: []string{
"--duration=invalid",
},
wantErr: true,
expectedErrPrefix: `invalid argument "invalid" for "--duration" flag: time: invalid duration "invalid"`,
},
}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
cmd := createSleepCommand()
testFlag(t, cmd, test)
})
}
}
1 change: 1 addition & 0 deletions cmd/gateway/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ func main() {
rootCmd.AddCommand(
createStaticModeCommand(),
createProvisionerModeCommand(),
createSleepCommand(),
)

if err := rootCmd.Execute(); err != nil {
Expand Down
1 change: 1 addition & 0 deletions conformance/provisioner/static-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ spec:
mountPath: /var/cache/nginx
- name: nginx-lib
mountPath: /var/lib/nginx
terminationGracePeriodSeconds: 30
serviceAccountName: nginx-gateway
shareProcessNamespace: true
securityContext:
Expand Down
4 changes: 4 additions & 0 deletions deploy/helm-chart/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ The following tables lists the configurable parameters of the NGINX Gateway Fabr
| `nginxGateway.image.repository` | The repository for the NGINX Gateway Fabric image. | ghcr.io/nginxinc/nginx-gateway-fabric |
| `nginxGateway.image.tag` | The tag for the NGINX Gateway Fabric image. | edge |
| `nginxGateway.image.pullPolicy` | The `imagePullPolicy` for the NGINX Gateway Fabric image. | Always |
| `nginxGateway.lifecycle` | The `lifecycle` of the nginx-gateway container. | {} |
| `nginxGateway.gatewayClassName` | The name of the GatewayClass for the NGINX Gateway Fabric deployment. | nginx |
| `nginxGateway.gatewayControllerName` | The name of the Gateway controller. The controller name must be of the form: DOMAIN/PATH. The controller's domain is gateway.nginx.org. | gateway.nginx.org/nginx-gateway-controller |
| `nginxGateway.kind` | The kind of the NGINX Gateway Fabric installation - currently, only Deployment is supported. | deployment |
Expand All @@ -151,6 +152,9 @@ The following tables lists the configurable parameters of the NGINX Gateway Fabr
| `nginx.image.repository` | The repository for the NGINX image. | ghcr.io/nginxinc/nginx-gateway-fabric/nginx |
| `nginx.image.tag` | The tag for the NGINX image. | edge |
| `nginx.image.pullPolicy` | The `imagePullPolicy` for the NGINX image. | Always |
| `nginx.lifecycle` | The `lifecycle` of the nginx container. | {} |
| `terminationGracePeriodSeconds` | The termination grace period of the NGINX Gateway Fabric pod. | 30 |
| `affinity` | The `affinity` of the NGINX Gateway Fabric pod. | {} |
| `serviceAccount.annotations` | The `annotations` for the ServiceAccount used by the NGINX Gateway Fabric deployment. | {} |
| `serviceAccount.name` | Name of the ServiceAccount used by the NGINX Gateway Fabric deployment. | Autogenerated |
| `service.create` | Creates a service to expose the NGINX Gateway Fabric pods. | true |
Expand Down
13 changes: 13 additions & 0 deletions deploy/helm-chart/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ spec:
image: {{ .Values.nginxGateway.image.repository }}:{{ .Values.nginxGateway.image.tag | default .Chart.AppVersion }}
imagePullPolicy: {{ .Values.nginxGateway.image.pullPolicy }}
name: nginx-gateway
{{- if .Values.nginxGateway.lifecycle }}
lifecycle:
{{- toYaml .Values.nginxGateway.lifecycle | nindent 10 }}
{{- end }}
ports:
{{- if .Values.metrics.enable }}
- name: metrics
Expand Down Expand Up @@ -100,6 +104,10 @@ spec:
- image: {{ .Values.nginx.image.repository }}:{{ .Values.nginx.image.tag | default .Chart.AppVersion }}
imagePullPolicy: {{ .Values.nginx.image.pullPolicy }}
name: nginx
{{- if .Values.nginx.lifecycle }}
lifecycle:
{{- toYaml .Values.nginx.lifecycle | nindent 10 }}
{{- end }}
ports:
- containerPort: 80
name: http
Expand All @@ -125,6 +133,11 @@ spec:
mountPath: /var/cache/nginx
- name: nginx-lib
mountPath: /var/lib/nginx
terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }}
{{- if .Values.affinity }}
affinity:
{{- toYaml .Values.affinity | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "nginx-gateway.serviceAccountName" . }}
shareProcessNamespace: true
securityContext:
Expand Down
12 changes: 12 additions & 0 deletions deploy/helm-chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,25 @@ nginxGateway:
## Some environments may need this set to true in order for the control plane to successfully reload NGINX.
allowPrivilegeEscalation: false

## The lifecycle of the nginx-gateway container.
lifecycle: {}

nginx:
## The NGINX image to use
image:
repository: ghcr.io/nginxinc/nginx-gateway-fabric/nginx
tag: edge
pullPolicy: Always

## The lifecycle of the nginx container.
lifecycle: {}

## The termination grace period of the NGINX Gateway Fabric pod.
terminationGracePeriodSeconds: 30

## The affinity of the NGINX Gateway Fabric pod.
affinity: {}

serviceAccount:
annotations: {}
## The name of the service account of the NGINX Gateway Fabric pods. Used for RBAC.
Expand Down
1 change: 1 addition & 0 deletions deploy/manifests/nginx-gateway.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ spec:
mountPath: /var/cache/nginx
- name: nginx-lib
mountPath: /var/lib/nginx
terminationGracePeriodSeconds: 30
serviceAccountName: nginx-gateway
shareProcessNamespace: true
securityContext:
Expand Down
Loading

0 comments on commit 29b45e3

Please sign in to comment.