Skip to content

Commit e7c6703

Browse files
committed
GR null route-map: run many times test to catch the bug
- debug in watchfrr - capture coredump on bgp crashes - increase prefix scale on ipv4 Signed-off-by: karampok <[email protected]>
1 parent 2bb3506 commit e7c6703

File tree

9 files changed

+353
-35
lines changed

9 files changed

+353
-35
lines changed

Makefile

+4-1
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ $(APIDOCSGEN): $(LOCALBIN)
238238

239239
.PHONY: e2etests
240240
e2etests: ginkgo kubectl
241-
$(GINKGO) -v $(GINKGO_ARGS) --timeout=3h ./e2etests -- --kubectl=$(KUBECTL) $(TEST_ARGS)
241+
$(GINKGO) -v $(GINKGO_ARGS) --timeout=3h --repeat=3 ./e2etests -- --kubectl=$(KUBECTL) $(TEST_ARGS)
242242

243243

244244
.PHONY: kind-cluster
@@ -272,6 +272,9 @@ KIND_EXPORT_LOGS ?=/tmp/kind_logs
272272
.PHONY: kind-export-logs
273273
kind-export-logs:
274274
$(LOCALBIN)/kind export logs --name ${KIND_CLUSTER_NAME} ${KIND_EXPORT_LOGS}
275+
for node in $(shell docker ps -q); do \
276+
docker cp "$$node:/home" "$(KIND_EXPORT_LOGS)/home-$$node"; \
277+
done
275278

276279
.PHONY: generate-all-in-one
277280
generate-all-in-one: manifests kustomize ## Create manifests

config/all-in-one/frr-k8s-prometheus.yaml

+23-3
Original file line numberDiff line numberDiff line change
@@ -1166,10 +1166,25 @@ spec:
11661166
- mountPath: /etc/frr_reloader
11671167
name: reloader
11681168
- command:
1169-
- /bin/sh
1170-
- -c
1169+
- /bin/bash
1170+
- -cx
11711171
- |
1172-
/sbin/tini -- /usr/lib/frr/docker-start &
1172+
if [ -r "/lib/lsb/init-functions" ]; then
1173+
. /lib/lsb/init-functions
1174+
else
1175+
log_success_msg() {
1176+
echo "$@"
1177+
}
1178+
log_warning_msg() {
1179+
echo "$@" >&2
1180+
}
1181+
log_failure_msg() {
1182+
echo "$@" >&2
1183+
}
1184+
fi
1185+
1186+
source /usr/lib/frr/frrcommon.sh
1187+
/usr/lib/frr/watchfrr -l 7 $(daemon_list) &
11731188
attempts=0
11741189
until [[ -f /etc/frr/frr.log || $attempts -eq 60 ]]; do
11751190
sleep 1
@@ -1203,6 +1218,8 @@ spec:
12031218
port: 7573
12041219
periodSeconds: 5
12051220
volumeMounts:
1221+
- mountPath: /tmp/cores
1222+
name: core-path
12061223
- mountPath: /var/run/frr
12071224
name: frr-sockets
12081225
- mountPath: /etc/frr
@@ -1280,6 +1297,9 @@ spec:
12801297
key: node-role.kubernetes.io/control-plane
12811298
operator: Exists
12821299
volumes:
1300+
- hostPath:
1301+
path: /home/core-dump
1302+
name: core-path
12831303
- emptyDir: {}
12841304
name: frr-sockets
12851305
- configMap:

config/all-in-one/frr-k8s.yaml

+23-3
Original file line numberDiff line numberDiff line change
@@ -1135,10 +1135,25 @@ spec:
11351135
- mountPath: /etc/frr_reloader
11361136
name: reloader
11371137
- command:
1138-
- /bin/sh
1139-
- -c
1138+
- /bin/bash
1139+
- -cx
11401140
- |
1141-
/sbin/tini -- /usr/lib/frr/docker-start &
1141+
if [ -r "/lib/lsb/init-functions" ]; then
1142+
. /lib/lsb/init-functions
1143+
else
1144+
log_success_msg() {
1145+
echo "$@"
1146+
}
1147+
log_warning_msg() {
1148+
echo "$@" >&2
1149+
}
1150+
log_failure_msg() {
1151+
echo "$@" >&2
1152+
}
1153+
fi
1154+
1155+
source /usr/lib/frr/frrcommon.sh
1156+
/usr/lib/frr/watchfrr -l 7 $(daemon_list) &
11421157
attempts=0
11431158
until [[ -f /etc/frr/frr.log || $attempts -eq 60 ]]; do
11441159
sleep 1
@@ -1172,6 +1187,8 @@ spec:
11721187
port: 7573
11731188
periodSeconds: 5
11741189
volumeMounts:
1190+
- mountPath: /tmp/cores
1191+
name: core-path
11751192
- mountPath: /var/run/frr
11761193
name: frr-sockets
11771194
- mountPath: /etc/frr
@@ -1249,6 +1266,9 @@ spec:
12491266
key: node-role.kubernetes.io/control-plane
12501267
operator: Exists
12511268
volumes:
1269+
- hostPath:
1270+
path: /home/core-dump
1271+
name: core-path
12521272
- emptyDir: {}
12531273
name: frr-sockets
12541274
- configMap:

config/frr-k8s/frr-k8s.yaml

+23-3
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ spec:
103103
- name: TINI_SUBREAPER
104104
value: "true"
105105
volumeMounts:
106+
- mountPath: /tmp/cores
107+
name: core-path
106108
- name: frr-sockets
107109
mountPath: /var/run/frr
108110
- name: frr-conf
@@ -111,10 +113,25 @@ spec:
111113
# If the log file isn't created in 60 seconds the tail fails and the container is restarted.
112114
# This workaround is needed to have the frr logs as part of kubectl logs -c frr < k8s-frr-podname >.
113115
command:
114-
- /bin/sh
115-
- -c
116+
- /bin/bash
117+
- -cx
116118
- |
117-
/sbin/tini -- /usr/lib/frr/docker-start &
119+
if [ -r "/lib/lsb/init-functions" ]; then
120+
. /lib/lsb/init-functions
121+
else
122+
log_success_msg() {
123+
echo "$@"
124+
}
125+
log_warning_msg() {
126+
echo "$@" >&2
127+
}
128+
log_failure_msg() {
129+
echo "$@" >&2
130+
}
131+
fi
132+
133+
source /usr/lib/frr/frrcommon.sh
134+
/usr/lib/frr/watchfrr -l 7 $(daemon_list) &
118135
attempts=0
119136
until [[ -f /etc/frr/frr.log || $attempts -eq 60 ]]; do
120137
sleep 1
@@ -169,6 +186,9 @@ spec:
169186
key: node-role.kubernetes.io/control-plane
170187
operator: Exists
171188
volumes:
189+
- name: core-path
190+
hostPath:
191+
path: /home/core-dump
172192
- name: frr-sockets
173193
emptyDir: {}
174194
- name: frr-startup

e2etests/go.work.sum

+1-2
Original file line numberDiff line numberDiff line change
@@ -818,6 +818,7 @@ github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0Gq
818818
github.com/moby/spdystream v0.4.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI=
819819
github.com/moby/sys/mountinfo v0.5.0 h1:2Ks8/r6lopsxWi9m58nlwjaeSzUX9iiL1vj5qB/9ObI=
820820
github.com/moby/sys/mountinfo v0.6.2/go.mod h1:IJb6JQeOklcdMU9F5xQ8ZALD+CUr5VlGpwtX+VE0rpI=
821+
github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0=
821822
github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y=
822823
github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
823824
github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
@@ -1517,8 +1518,6 @@ k8s.io/code-generator v0.29.3/go.mod h1:x47ofBhN4gxYFcxeKA1PYXeaPreAGaDN85Y/lNUs
15171518
k8s.io/code-generator v0.30.2/go.mod h1:RQP5L67QxqgkVquk704CyvWFIq0e6RCMmLTXxjE8dVA=
15181519
k8s.io/code-generator v0.31.0/go.mod h1:84y4w3es8rOJOUUP1rLsIiGlO1JuEaPFXQPA9e/K6U0=
15191520
k8s.io/component-base v0.29.0/go.mod h1:sADonFTQ9Zc9yFLghpDpmNXEdHyQmFIGbiuZbqAXQ1M=
1520-
k8s.io/component-base v0.31.0 h1:/KIzGM5EvPNQcYgwq5NwoQBaOlVFrghoVGr8lG6vNRs=
1521-
k8s.io/component-base v0.31.0/go.mod h1:TYVuzI1QmN4L5ItVdMSXKvH7/DtvIuas5/mm8YT3rTo=
15221521
k8s.io/component-helpers v0.28.3 h1:te9ieTGzcztVktUs92X53P6BamAoP73MK0qQP0WmDqc=
15231522
k8s.io/component-helpers v0.28.3/go.mod h1:oJR7I9ist5UAQ3y/CTdbw6CXxdMZ1Lw2Ua/EZEwnVLs=
15241523
k8s.io/controller-manager v0.26.0/go.mod h1:GxUYtQDBE/RHh7AnZSZqwi2xBPIXlOaWsnDLflKGYrE=

e2etests/tests/graceful_restart.go

+129-20
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,17 @@
33
package tests
44

55
import (
6+
"errors"
67
"fmt"
78
"time"
89

910
"github.com/onsi/ginkgo/v2"
1011
"github.com/openshift-kni/k8sreporter"
1112
"go.universe.tf/e2etest/pkg/frr/container"
13+
frrcontainer "go.universe.tf/e2etest/pkg/frr/container"
14+
15+
"go.universe.tf/e2etest/pkg/executor"
16+
"go.universe.tf/e2etest/pkg/frr"
1217

1318
frrk8sv1beta1 "github.com/metallb/frr-k8s/api/v1beta1"
1419
"github.com/metallb/frrk8stests/pkg/config"
@@ -28,10 +33,11 @@ import (
2833

2934
var _ = ginkgo.Describe("Establish BGP session with EnableGracefulRestart", func() {
3035
var (
31-
cs clientset.Interface
32-
updater *config.Updater
33-
reporter *k8sreporter.KubernetesReporter
34-
nodes []corev1.Node
36+
cs clientset.Interface
37+
updater *config.Updater
38+
reporter *k8sreporter.KubernetesReporter
39+
nodes []corev1.Node
40+
prefixesV4 = scaleUP(200)
3541
)
3642

3743
cleanup := func(u *config.Updater) error {
@@ -63,17 +69,23 @@ var _ = ginkgo.Describe("Establish BGP session with EnableGracefulRestart", func
6369
})
6470

6571
ginkgo.AfterEach(func() {
72+
73+
seed := ginkgo.GinkgoRandomSeed()
74+
testName := fmt.Sprintf("%s-%d", ginkgo.CurrentSpecReport().LeafNodeText, seed)
6675
if ginkgo.CurrentSpecReport().Failed() {
67-
testName := ginkgo.CurrentSpecReport().LeafNodeText
68-
dump.K8sInfo(testName, reporter)
69-
dump.BGPInfo(testName, infra.FRRContainers, cs)
76+
testName += "-failed"
7077
}
78+
ginkgo.By(testName)
79+
dump.K8sInfo(testName, reporter)
80+
dump.BGPInfo(testName, infra.FRRContainers, cs)
7181
})
7282

7383
ginkgo.Context("When restarting the frrk8s deamon pods", func() {
7484

75-
ginkgo.DescribeTable("external BGP peer maintains routes", func(ipFam ipfamily.Family, prefix string) {
85+
ginkgo.DescribeTable("external BGP peer maintains routes", func(ipFam ipfamily.Family, prefix []string) {
7686
frrs := config.ContainersForVRF(infra.FRRContainers, "")
87+
// cnt, err := config.ContainerByName(infra.FRRContainers, "ebgp-multi-hop")
88+
// frrs := []*frrcontainer.FRR{cnt}
7789
for _, c := range frrs {
7890
err := container.PairWithNodes(cs, c, ipFam)
7991
Expect(err).NotTo(HaveOccurred(), "set frr config in infra containers failed")
@@ -88,48 +100,145 @@ var _ = ginkgo.Describe("Establish BGP session with EnableGracefulRestart", func
88100
Namespace: k8s.FRRK8sNamespace,
89101
},
90102
Spec: frrk8sv1beta1.FRRConfigurationSpec{
103+
// NodeSelector: metav1.LabelSelector{
104+
// MatchLabels: map[string]string{
105+
// "kubernetes.io/hostname": nodes[0].GetLabels()["kubernetes.io/hostname"],
106+
// },
107+
// },
91108
BGP: frrk8sv1beta1.BGPConfig{
92109
Routers: []frrk8sv1beta1.Router{
93110
{
94111
ASN: infra.FRRK8sASN,
95112
Neighbors: config.NeighborsFromPeers(peersConfig.PeersV4, peersConfig.PeersV6),
96-
Prefixes: []string{prefix},
113+
Prefixes: prefix,
97114
},
98115
},
99116
},
100117
},
101118
}
119+
ginkgo.By("Before GR test")
102120

103121
err := updater.Update(peersConfig.Secrets, frrConfigCR)
104122
Expect(err).NotTo(HaveOccurred(), "apply the CR in k8s api failed")
105-
106-
check := func() error {
123+
Eventually(func() error {
107124
for _, p := range peersConfig.Peers() {
108-
err := routes.CheckNeighborHasPrefix(p.FRR, p.FRR.RouterConfig.VRF, prefix, nodes)
125+
ValidateFRRPeeredWithNodes(nodes, &p.FRR, ipFam)
126+
neighbors, err := frr.NeighborsInfo(p.FRR)
127+
Expect(err).NotTo(HaveOccurred())
128+
for _, n := range neighbors {
129+
Expect(n.GRInfo.RemoteGrMode).Should(Equal("Restart"))
130+
}
131+
132+
err = routes.CheckNeighborHasPrefix(p.FRR, p.FRR.RouterConfig.VRF, prefix[0], nodes)
109133
if err != nil {
110-
return fmt.Errorf("Neigh %s does not have prefix %s: %w", p.FRR.Name, prefix, err)
134+
return fmt.Errorf("Neigh %s does not have prefixes: %w", p.FRR.Name, err)
111135
}
112136
}
113137
return nil
114-
}
115-
116-
Eventually(check, time.Minute, time.Second).ShouldNot(HaveOccurred(),
117-
"route should exist before we restart frr-k8s")
138+
}, time.Minute, time.Second).ShouldNot(HaveOccurred(), "route should exist before we restart frr-k8s")
118139

140+
ginkgo.By("Start GR test")
119141
c := make(chan struct{})
120142
go func() { // go restart frr-k8s while Consistently check that route exists
121143
defer ginkgo.GinkgoRecover()
122144
err := k8s.RestartFRRK8sPods(cs)
123145
Expect(err).NotTo(HaveOccurred(), "frr-k8s pods failed to restart")
146+
for _, p := range peersConfig.Peers() {
147+
ValidateFRRPeeredWithNodes(nodes, &p.FRR, ipFam)
148+
}
149+
ginkgo.By("FRRK8s pod restarted and BGP established")
124150
close(c)
125151
}()
126152

153+
check := func() error {
154+
var returnError error
155+
156+
for _, p := range peersConfig.Peers() {
157+
err := checkRoutes(p.FRR, prefix)
158+
if err != nil {
159+
returnError = errors.Join(returnError, fmt.Errorf("Neigh %s : %w", p.FRR.Name, err))
160+
for i := 0; i < 20; i++ {
161+
if err := checkRoutes(p.FRR, prefix); err != nil {
162+
ginkgo.By(fmt.Sprintf("%d Neigh %s does NOT have prefix %v", i, p.FRR.Name, err))
163+
} else {
164+
ginkgo.By(fmt.Sprintf("%d Neigh %s does have prefix", i, p.FRR.Name))
165+
}
166+
time.Sleep(time.Second)
167+
}
168+
}
169+
}
170+
return returnError
171+
}
172+
127173
// 2*time.Minute is important because that is the Graceful Restart timer.
128-
Consistently(check, 2*time.Minute, time.Second).ShouldNot(HaveOccurred())
174+
Consistently(check, 30*time.Second, time.Second).ShouldNot(HaveOccurred())
129175
Eventually(c, time.Minute, time.Second).Should(BeClosed(), "restart FRRK8s pods are not yet ready")
130176
},
131-
ginkgo.Entry("IPV4", ipfamily.IPv4, "192.168.2.0/24"),
132-
ginkgo.Entry("IPV6", ipfamily.IPv6, "fc00:f853:ccd:e799::/64"),
177+
ginkgo.Entry("IPV4", ipfamily.IPv4, prefixesV4),
178+
// ginkgo.Entry("IPV6", ipfamily.IPv6, []string{"2001:db8:5555::5/128"}),
133179
)
134180
})
135181
})
182+
183+
func checkRoutes(cnt frrcontainer.FRR, want []string) error {
184+
m := sliceToMap(want)
185+
v4, _, err := frr.Routes(cnt)
186+
if err != nil {
187+
// ignore the docker exec errors
188+
return nil
189+
}
190+
if len(m) == 0 {
191+
return fmt.Errorf("nil map m")
192+
}
193+
if len(v4) == 0 {
194+
IPRoutes(cnt)
195+
return fmt.Errorf("nil map v4")
196+
}
197+
for _, r := range v4 {
198+
// if r.Stale {
199+
// fmt.Printf("S")
200+
// }
201+
delete(m, r.Destination.String())
202+
}
203+
if len(m) > 0 {
204+
return fmt.Errorf("%d routes %+v not found ", len(m), getKeys(m))
205+
}
206+
return nil
207+
}
208+
209+
func scaleUP(size int) []string {
210+
if size > 255 {
211+
panic("255 is max")
212+
}
213+
214+
ret := []string{}
215+
for i := 0; i < size; i++ {
216+
ret = append(ret, fmt.Sprintf("5.5.5.%d/32", i))
217+
}
218+
219+
return ret
220+
}
221+
222+
func sliceToMap(slice []string) map[string]bool {
223+
m := make(map[string]bool)
224+
for _, v := range slice {
225+
m[v] = true
226+
}
227+
return m
228+
}
229+
func getKeys(m map[string]bool) []string {
230+
keys := make([]string, 0, len(m)) // Initialize slice with the capacity of the map length
231+
for key := range m {
232+
keys = append(keys, key)
233+
}
234+
return keys
235+
}
236+
func IPRoutes(exec executor.Executor) error {
237+
cmd := "show ip route bgp"
238+
res, err := exec.Exec("vtysh", "-c", cmd)
239+
if err != nil {
240+
return errors.Join(err, errors.New("Failed to query routes"))
241+
}
242+
fmt.Println("res", res)
243+
return nil
244+
}

0 commit comments

Comments
 (0)