Skip to content

Commit 37658bc

Browse files
authored
fix: capi CI flakes and reconciliation bugs (#172)
* fix: use namespace from reference for external.Get
* fix: requeue instead of returning when ControlPlaneEndpoint is not set. Requeueing instead of returning ensures that intermittent reconcile skips do not happen, such as the one seen in the `Workload cluster scaling` tests when scaling up.
* fix: add custom haproxy config for docker. The default haproxy config of CAPD does not have a hard timeout, meaning abrupt config changes or apiservers disappearing can sometimes leave hanging connections. This recovers capi-manager from being stuck in a loop with `Client.Timeout exceeded while awaiting headers` when performing operations such as a node cordon.
1 parent 161c0e9 commit 37658bc

File tree

7 files changed

+318
-10
lines changed

7 files changed

+318
-10
lines changed

controlplane/controllers/ck8scontrolplane_controller.go

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -443,7 +443,7 @@ func (r *CK8sControlPlaneReconciler) reconcile(ctx context.Context, cluster *clu
443443
logger.Info("Reconcile CK8sControlPlane")
444444

445445
// Make sure to reconcile the external infrastructure reference.
446-
if err := r.reconcileExternalReference(ctx, cluster, kcp.Spec.MachineTemplate.InfrastructureRef); err != nil {
446+
if err := r.reconcileExternalReference(ctx, cluster, &kcp.Spec.MachineTemplate.InfrastructureRef); err != nil {
447447
return reconcile.Result{}, err
448448
}
449449

@@ -462,10 +462,12 @@ func (r *CK8sControlPlaneReconciler) reconcile(ctx context.Context, cluster *clu
462462
}
463463
conditions.MarkTrue(kcp, controlplanev1.TokenAvailableCondition)
464464

465-
// If ControlPlaneEndpoint is not set, return early
465+
// If ControlPlaneEndpoint is not set, requeue to wait for it to be set.
466+
// (berkayoz): This change to requeue instead of returning is to ensure
467+
// intermittent reconcile skips do not happen, such as the one that happens in `Workload cluster scaling` tests when scaling up
466468
if !cluster.Spec.ControlPlaneEndpoint.IsValid() {
467469
logger.Info("Cluster does not yet have a ControlPlaneEndpoint defined")
468-
return reconcile.Result{}, nil
470+
return reconcile.Result{RequeueAfter: 3 * time.Second}, nil
469471
}
470472

471473
// Generate Cluster Kubeconfig if needed
@@ -561,12 +563,22 @@ func (r *CK8sControlPlaneReconciler) reconcile(ctx context.Context, cluster *clu
561563
return reconcile.Result{}, nil
562564
}
563565

564-
func (r *CK8sControlPlaneReconciler) reconcileExternalReference(ctx context.Context, cluster *clusterv1.Cluster, ref corev1.ObjectReference) error {
566+
func (r *CK8sControlPlaneReconciler) reconcileExternalReference(ctx context.Context, cluster *clusterv1.Cluster, ref *corev1.ObjectReference) error {
565567
if !strings.HasSuffix(ref.Kind, clusterv1.TemplateSuffix) {
566568
return nil
567569
}
568570

569-
obj, err := external.Get(ctx, r.Client, &ref)
571+
logger := r.Log.WithValues("namespace", ref.Namespace, "CK8sControlPlane", ref.Name, "cluster", cluster.Name)
572+
logger.Info("Reconciling external template reference", "ref", ref)
573+
574+
// Ensure the ref namespace is populated for objects not yet defaulted by webhook
575+
// https://github.com/kubernetes-sigs/cluster-api/pull/11361
576+
if ref.Namespace == "" {
577+
ref = ref.DeepCopy()
578+
ref.Namespace = cluster.Namespace
579+
}
580+
581+
obj, err := external.Get(ctx, r.Client, ref)
570582
if err != nil {
571583
return err
572584
}

controlplane/controllers/scale.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,12 @@ func (r *CK8sControlPlaneReconciler) cloneConfigsAndGenerateMachine(ctx context.
245245
UID: kcp.UID,
246246
}
247247

248+
// Ensure the ref namespace is populated for objects not yet defaulted by webhook
249+
if kcp.Spec.MachineTemplate.InfrastructureRef.Namespace == "" {
250+
kcp.Spec.MachineTemplate.InfrastructureRef = *kcp.Spec.MachineTemplate.InfrastructureRef.DeepCopy()
251+
kcp.Spec.MachineTemplate.InfrastructureRef.Namespace = cluster.Namespace
252+
}
253+
248254
// Clone the infrastructure template
249255
infraRef, err := external.CreateFromTemplate(ctx, &external.CreateFromTemplateInput{
250256
Client: r.Client,

test/e2e/data/infrastructure-docker/cluster-template-kcp-remediation.yaml

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,69 @@ spec:
2121
kind: DockerCluster
2222
name: ${CLUSTER_NAME}
2323
---
24+
# Derived from https://github.com/kubernetes-sigs/cluster-api/blob/3a4074421425869d65bf040a3f4730f2d4492cf9/test/infrastructure/docker/internal/loadbalancer/config.go#L42-L85
25+
apiVersion: v1
26+
kind: ConfigMap
27+
metadata:
28+
name: haproxy-config
29+
data:
30+
value: |
31+
# generated by kind
32+
global
33+
log /dev/log local0
34+
log /dev/log local1 notice
35+
daemon
36+
# limit memory usage to approximately 18 MB
37+
# (see https://github.com/kubernetes-sigs/kind/pull/3115)
38+
maxconn 100000
39+
# (berkayoz): Below is our addition to the original file
40+
# This is to ensure that connections do not hang open when a control plane is removed or configuration is changed abruptly
41+
# This fixes the errors from capi itself like "Client.Timeout exceeded while awaiting headers"
42+
hard-stop-after 5000
43+
44+
resolvers docker
45+
nameserver dns 127.0.0.11:53
46+
47+
defaults
48+
log global
49+
mode tcp
50+
option dontlognull
51+
# TODO: tune these
52+
timeout connect 5000
53+
timeout client 50000
54+
timeout server 50000
55+
# allow to boot despite dns don't resolve backends
56+
default-server init-addr none
57+
58+
frontend stats
59+
mode http
60+
bind *:8404
61+
stats enable
62+
stats uri /stats
63+
stats refresh 1s
64+
stats admin if TRUE
65+
66+
frontend control-plane
67+
bind *:{{ .FrontendControlPlanePort }}
68+
{{ if .IPv6 -}}
69+
bind :::{{ .FrontendControlPlanePort }};
70+
{{- end }}
71+
default_backend kube-apiservers
72+
73+
backend kube-apiservers
74+
option httpchk GET /healthz
75+
{{range $server, $backend := .BackendServers}}
76+
server {{ $server }} {{ JoinHostPort $backend.Address $.BackendControlPlanePort }} weight {{ $backend.Weight }} check check-ssl verify none resolvers docker resolve-prefer {{ if $.IPv6 -}} ipv6 {{- else -}} ipv4 {{- end }}
77+
{{- end}}
78+
---
2479
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
2580
kind: DockerCluster
2681
metadata:
2782
name: ${CLUSTER_NAME}
28-
spec: {}
83+
spec:
84+
loadBalancer:
85+
customHAProxyConfigTemplateRef:
86+
name: haproxy-config
2987
---
3088
apiVersion: controlplane.cluster.x-k8s.io/v1beta2
3189
kind: CK8sControlPlane

test/e2e/data/infrastructure-docker/cluster-template-md-remediation.yaml

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,69 @@ spec:
2424
kind: DockerCluster
2525
name: ${CLUSTER_NAME}
2626
---
27+
# Derived from https://github.com/kubernetes-sigs/cluster-api/blob/3a4074421425869d65bf040a3f4730f2d4492cf9/test/infrastructure/docker/internal/loadbalancer/config.go#L42-L85
28+
apiVersion: v1
29+
kind: ConfigMap
30+
metadata:
31+
name: haproxy-config
32+
data:
33+
value: |
34+
# generated by kind
35+
global
36+
log /dev/log local0
37+
log /dev/log local1 notice
38+
daemon
39+
# limit memory usage to approximately 18 MB
40+
# (see https://github.com/kubernetes-sigs/kind/pull/3115)
41+
maxconn 100000
42+
# (berkayoz): Below is our addition to the original file
43+
# This is to ensure that connections do not hang open when a control plane is removed or configuration is changed abruptly
44+
# This fixes the errors from capi itself like "Client.Timeout exceeded while awaiting headers"
45+
hard-stop-after 5000
46+
47+
resolvers docker
48+
nameserver dns 127.0.0.11:53
49+
50+
defaults
51+
log global
52+
mode tcp
53+
option dontlognull
54+
# TODO: tune these
55+
timeout connect 5000
56+
timeout client 50000
57+
timeout server 50000
58+
# allow to boot despite dns don't resolve backends
59+
default-server init-addr none
60+
61+
frontend stats
62+
mode http
63+
bind *:8404
64+
stats enable
65+
stats uri /stats
66+
stats refresh 1s
67+
stats admin if TRUE
68+
69+
frontend control-plane
70+
bind *:{{ .FrontendControlPlanePort }}
71+
{{ if .IPv6 -}}
72+
bind :::{{ .FrontendControlPlanePort }};
73+
{{- end }}
74+
default_backend kube-apiservers
75+
76+
backend kube-apiservers
77+
option httpchk GET /healthz
78+
{{range $server, $backend := .BackendServers}}
79+
server {{ $server }} {{ JoinHostPort $backend.Address $.BackendControlPlanePort }} weight {{ $backend.Weight }} check check-ssl verify none resolvers docker resolve-prefer {{ if $.IPv6 -}} ipv6 {{- else -}} ipv4 {{- end }}
80+
{{- end}}
81+
---
2782
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
2883
kind: DockerCluster
2984
metadata:
3085
name: ${CLUSTER_NAME}
31-
spec: {}
86+
spec:
87+
loadBalancer:
88+
customHAProxyConfigTemplateRef:
89+
name: haproxy-config
3290
---
3391
apiVersion: controlplane.cluster.x-k8s.io/v1beta2
3492
kind: CK8sControlPlane

test/e2e/data/infrastructure-docker/cluster-template-upgrades-max-surge-0.yaml

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,69 @@ spec:
2424
kind: DockerCluster
2525
name: ${CLUSTER_NAME}
2626
---
27+
# Derived from https://github.com/kubernetes-sigs/cluster-api/blob/3a4074421425869d65bf040a3f4730f2d4492cf9/test/infrastructure/docker/internal/loadbalancer/config.go#L42-L85
28+
apiVersion: v1
29+
kind: ConfigMap
30+
metadata:
31+
name: haproxy-config
32+
data:
33+
value: |
34+
# generated by kind
35+
global
36+
log /dev/log local0
37+
log /dev/log local1 notice
38+
daemon
39+
# limit memory usage to approximately 18 MB
40+
# (see https://github.com/kubernetes-sigs/kind/pull/3115)
41+
maxconn 100000
42+
# (berkayoz): Below is our addition to the original file
43+
# This is to ensure that connections do not hang open when a control plane is removed or configuration is changed abruptly
44+
# This fixes the errors from capi itself like "Client.Timeout exceeded while awaiting headers"
45+
hard-stop-after 5000
46+
47+
resolvers docker
48+
nameserver dns 127.0.0.11:53
49+
50+
defaults
51+
log global
52+
mode tcp
53+
option dontlognull
54+
# TODO: tune these
55+
timeout connect 5000
56+
timeout client 50000
57+
timeout server 50000
58+
# allow to boot despite dns don't resolve backends
59+
default-server init-addr none
60+
61+
frontend stats
62+
mode http
63+
bind *:8404
64+
stats enable
65+
stats uri /stats
66+
stats refresh 1s
67+
stats admin if TRUE
68+
69+
frontend control-plane
70+
bind *:{{ .FrontendControlPlanePort }}
71+
{{ if .IPv6 -}}
72+
bind :::{{ .FrontendControlPlanePort }};
73+
{{- end }}
74+
default_backend kube-apiservers
75+
76+
backend kube-apiservers
77+
option httpchk GET /healthz
78+
{{range $server, $backend := .BackendServers}}
79+
server {{ $server }} {{ JoinHostPort $backend.Address $.BackendControlPlanePort }} weight {{ $backend.Weight }} check check-ssl verify none resolvers docker resolve-prefer {{ if $.IPv6 -}} ipv6 {{- else -}} ipv4 {{- end }}
80+
{{- end}}
81+
---
2782
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
2883
kind: DockerCluster
2984
metadata:
3085
name: ${CLUSTER_NAME}
31-
spec: {}
86+
spec:
87+
loadBalancer:
88+
customHAProxyConfigTemplateRef:
89+
name: haproxy-config
3290
---
3391
apiVersion: controlplane.cluster.x-k8s.io/v1beta2
3492
kind: CK8sControlPlane

test/e2e/data/infrastructure-docker/cluster-template-upgrades.yaml

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,69 @@ spec:
2424
kind: DockerCluster
2525
name: ${CLUSTER_NAME}
2626
---
27+
# Derived from https://github.com/kubernetes-sigs/cluster-api/blob/3a4074421425869d65bf040a3f4730f2d4492cf9/test/infrastructure/docker/internal/loadbalancer/config.go#L42-L85
28+
apiVersion: v1
29+
kind: ConfigMap
30+
metadata:
31+
name: haproxy-config
32+
data:
33+
value: |
34+
# generated by kind
35+
global
36+
log /dev/log local0
37+
log /dev/log local1 notice
38+
daemon
39+
# limit memory usage to approximately 18 MB
40+
# (see https://github.com/kubernetes-sigs/kind/pull/3115)
41+
maxconn 100000
42+
# (berkayoz): Below is our addition to the original file
43+
# This is to ensure that connections do not hang open when a control plane is removed or configuration is changed abruptly
44+
# This fixes the errors from capi itself like "Client.Timeout exceeded while awaiting headers"
45+
hard-stop-after 5000
46+
47+
resolvers docker
48+
nameserver dns 127.0.0.11:53
49+
50+
defaults
51+
log global
52+
mode tcp
53+
option dontlognull
54+
# TODO: tune these
55+
timeout connect 5000
56+
timeout client 50000
57+
timeout server 50000
58+
# allow to boot despite dns don't resolve backends
59+
default-server init-addr none
60+
61+
frontend stats
62+
mode http
63+
bind *:8404
64+
stats enable
65+
stats uri /stats
66+
stats refresh 1s
67+
stats admin if TRUE
68+
69+
frontend control-plane
70+
bind *:{{ .FrontendControlPlanePort }}
71+
{{ if .IPv6 -}}
72+
bind :::{{ .FrontendControlPlanePort }};
73+
{{- end }}
74+
default_backend kube-apiservers
75+
76+
backend kube-apiservers
77+
option httpchk GET /healthz
78+
{{range $server, $backend := .BackendServers}}
79+
server {{ $server }} {{ JoinHostPort $backend.Address $.BackendControlPlanePort }} weight {{ $backend.Weight }} check check-ssl verify none resolvers docker resolve-prefer {{ if $.IPv6 -}} ipv6 {{- else -}} ipv4 {{- end }}
80+
{{- end}}
81+
---
2782
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
2883
kind: DockerCluster
2984
metadata:
3085
name: ${CLUSTER_NAME}
31-
spec: {}
86+
spec:
87+
loadBalancer:
88+
customHAProxyConfigTemplateRef:
89+
name: haproxy-config
3290
---
3391
apiVersion: controlplane.cluster.x-k8s.io/v1beta2
3492
kind: CK8sControlPlane

0 commit comments

Comments
 (0)