diff --git a/.dockerignore b/.dockerignore index d298dcaad3..ea89279094 100644 --- a/.dockerignore +++ b/.dockerignore @@ -28,6 +28,9 @@ bin /site/public /test +# Allow upgrade test directory +!/test/upgrade + # Created by .ignore support plugin (hsz.mobi) ### Go template # Binaries for programs and plugins diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 9f8003014b..29f08f5452 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -233,9 +233,18 @@ steps: # End to end tests # - # wait for us to be the oldest ongoing build before we run e2es - - name: gcr.io/cloud-builders/gcloud - id: e2e-wait-to-become-leader + # Build and Push upgrade test + - name: make-docker + id: push-upgrade-test + dir: test/upgrade + env: ['REGISTRY=${_REGISTRY}'] + args: [push] + waitFor: + - push-images + + # Wait for us to be the oldest ongoing build before we run upgrade and e2e tests + - name: gcr.io/google.com/cloudsdktool/cloud-sdk + id: wait-to-become-leader waitFor: [push-images] script: | #!/usr/bin/env bash @@ -258,10 +267,157 @@ steps: - BUILD_ID=$BUILD_ID - TRIGGER_NAME=$TRIGGER_NAME + # Run the upgrade tests in parallel; fail this step if any of the tests fail + - name: gcr.io/google.com/cloudsdktool/cloud-sdk + id: submit-upgrade-test-cloud-build + dir: test/upgrade + entrypoint: bash + args: + - -c + - | + #!/usr/bin/env bash + set -e + set -o pipefail + export KUBECONFIG="/root/.kube/config" + mkdir -p /go/src/agones.dev/ /root/.kube/ + ln -s /workspace /go/src/agones.dev/agones + cd /go/src/agones.dev/agones/test/upgrade + + pids=() + typeset -A waitPids # Associative array for mapping `kubectl wait job` pid -> `kubectl wait job` output log name + tmpdir=$(mktemp -d) + trap 'rm -rf -- "$tmpdir"' EXIT SIGTERM + + # Update image tags to include the current build version.
+ DevVersion="${_BASE_VERSION}-dev-$(git rev-parse --short=7 HEAD)" + export DevVersion + sed "s/\${DevVersion}/${DevVersion}/" upgradeTest.yaml > "${tmpdir}"/upgradeTest.yaml + sed "s/\${DevVersion}/${DevVersion}/" versionMap.yaml > "${tmpdir}"/versionMap.yaml + + # Kill any child processes that are still running and remove the temp dir on exit or SIGTERM. This trap replaces the earlier EXIT/SIGTERM trap, so the tmpdir cleanup has to be repeated here. + trap 'echo Cleaning up any remaining running pids: $(jobs -p) ; kill $(jobs -p) 2> /dev/null || : ; rm -rf -- "$tmpdir"' EXIT SIGTERM + + cloudProducts=("generic" "gke-autopilot") + declare -A versionsAndRegions=( [1.31]=us-east1 [1.30]=us-central1 [1.29]=us-west1 ) + + for cloudProduct in "${cloudProducts[@]}" + do + for version in "${!versionsAndRegions[@]}" + do + region=${versionsAndRegions[$version]} + if [ "$cloudProduct" = generic ] + then + testCluster="standard-upgrade-test-cluster-${version//./-}" + else + testCluster="gke-autopilot-upgrade-test-cluster-${version//./-}" + fi + testClusterLocation="${region}" + + gcloud container clusters get-credentials "$testCluster" --region="$testClusterLocation" --project="$PROJECT_ID" + + if [ "$cloudProduct" = gke-autopilot ] ; then + # For autopilot clusters use evictable "balloon" pods to keep a buffer in node pool autoscaling. + kubectl apply -f evictablePods.yaml + fi + + # Clean up any existing job / namespace / apiservice from previous run + echo Checking if resources from a previous build of upgrade-test-runner exist and need to be cleaned up on cluster "${testCluster}". + if kubectl get jobs | grep upgrade-test-runner ; then + echo Deleting job from previous run of upgrade-test-runner on cluster "${testCluster}". + kubectl delete job upgrade-test-runner + kubectl wait --for=delete pod -l job-name=upgrade-test-runner --timeout=5m + fi + + # Check if there are any dangling game servers. + if kubectl get gs | grep ".*"; then + # Remove any finalizers so that dangling game servers can be manually deleted.
+ kubectl get gs -o=custom-columns=:.metadata.name --no-headers | xargs kubectl patch gs -p '{"metadata":{"finalizers":[]}}' --type=merge + sleep 5 + echo Deleting game servers from previous run of upgrade-test-runner on cluster "${testCluster}". + kubectl delete gs -l app=sdk-client-test + fi + + if kubectl get po -l app=sdk-client-test | grep ".*"; then + echo Deleting pods from previous run of upgrade-test-runner on cluster "${testCluster}". + kubectl delete po -l app=sdk-client-test + kubectl wait --for=delete pod -l app=sdk-client-test --timeout=5m + fi + + # The v1.allocation.agones.dev apiservice does not get removed automatically and will prevent the namespace from terminating. + if kubectl get apiservice | grep v1.allocation.agones.dev ; then + echo Deleting v1.allocation.agones.dev from previous run of upgrade-test-runner on cluster "${testCluster}". + kubectl delete apiservice v1.allocation.agones.dev + fi + + if kubectl get namespace | grep agones-system ; then + echo Deleting agones-system namespace from previous run of upgrade-test-runner on cluster "${testCluster}". + kubectl delete namespace agones-system + kubectl wait --for=delete ns agones-system --timeout=5m + fi + + if kubectl get crds | grep agones ; then + echo Deleting crds from previous run of upgrade-test-runner on cluster "${testCluster}". + kubectl get crds -o=custom-columns=:.metadata.name | grep agones | xargs kubectl delete crd + fi + + echo kubectl apply -f permissions.yaml on cluster "${testCluster}" + kubectl apply -f permissions.yaml + echo kubectl apply -f versionMap.yaml on cluster "${testCluster}" + kubectl apply -f "${tmpdir}"/versionMap.yaml + echo kubectl apply -f gameserverTemplate.yaml on cluster "${testCluster}" + kubectl apply -f gameserverTemplate.yaml + + echo kubectl apply -f upgradeTest.yaml on cluster "${testCluster}" + kubectl apply -f "${tmpdir}"/upgradeTest.yaml + + # We need to wait for job pod to be created and ready before we can wait on the job itself. 
+ # TODO: Once all test clusters are at Kubernetes Version >= 1.31 use `kubectl wait --for=create` instead of sleep. + # kubectl wait --for=create pod -l job-name=upgrade-test-runner --timeout=1m + sleep 10s + kubectl wait --for=condition=ready pod -l job-name=upgrade-test-runner --timeout=5m + + echo Wait for job upgrade-test-runner to complete or fail on cluster "${testCluster}" + kubectl wait job/upgrade-test-runner --timeout=20m --for jsonpath='{.status.conditions[*].status}'=True -o jsonpath='{.status.conditions[*].type}' | tee "${tmpdir}"/"${testCluster}".log & + waitPid=$! + pids+=( "$waitPid" ) + waitPids[$waitPid]="${tmpdir}"/"${testCluster}".log + done + done + + for pid in "${pids[@]}"; do + # This block executes when the process exits and pid status==0 + if wait "$pid"; then + outputLog="${waitPids[$pid]}" + # Poll briefly for the log to be non-empty, but give up after ~30s so an empty log fails the step instead of hanging it + for _ in {1..30}; do [ -s "$outputLog" ] && break; sleep 1; done + output=$(<"${outputLog}") + echo "${outputLog}": "${output}" + + # "Complete" is successful job run. + # Version 1.31 has "SuccessCriteriaMet" as the first completion status returned, or "FailureTarget" in case of failure. + if [ "$output" == "Complete" ] || [ "$output" == "SuccessCriteriaMet" ] ; then + continue + else + exit 1 + fi + # This block executes when the process exits and pid status!=0 + else + status=$? + outputLog="${waitPids[$pid]}" + echo "One of the upgrade tests pid $pid from cluster log $outputLog exited with a non-zero status ${status}."
+ exit $status + fi + done + echo "End of Upgrade Tests" + + waitFor: + - wait-to-become-leader + - push-upgrade-test + # cancel all the orphan e2e test cloud builds, fail to cancel any of the build will fail this whole build - name: gcr.io/cloud-builders/gcloud id: cancel-orphan-e2e-tests - waitFor: [e2e-wait-to-become-leader] + waitFor: [wait-to-become-leader] script: | #!/usr/bin/env bash until gcloud builds list --ongoing --filter "tags:'e2e-test'" --format="value(id)" | xargs --no-run-if-empty gcloud builds cancel @@ -386,7 +542,7 @@ steps: # - name: gcr.io/cloud-builders/gcloud id: cleanup-services - waitFor: [e2e-wait-to-become-leader] + waitFor: [wait-to-become-leader] allowFailure: true entrypoint: bash args: @@ -400,6 +556,7 @@ steps: done substitutions: + _BASE_VERSION: 1.46.0 _CACHE_BUCKET: agones-build-cache _HTMLTEST_CACHE_KEY: htmltest-0.10.1 _CPP_SDK_BUILD_CACHE_KEY: cpp-sdk-build @@ -407,7 +564,7 @@ substitutions: _RUST_SDK_BUILD_CACHE_KEY: rust-sdk-build _REGISTRY: us-docker.pkg.dev/${PROJECT_ID}/ci tags: [ci, 'commit-${COMMIT_SHA}'] -timeout: 18000s # 5h: 3h (e2e-wait-to-become-leader) + 1.5h (e2e timeout) + 0.5h (everything else) +timeout: 18000s # 5h: 3h (wait-to-become-leader) + 1.5h (e2e timeout) + 0.5h (everything else) queueTtl: 259200s # 72h images: - ${_REGISTRY}/agones-controller diff --git a/docs/governance/templates/release_issue.md b/docs/governance/templates/release_issue.md index 1c8aeacdb6..ca1c86b855 100644 --- a/docs/governance/templates/release_issue.md +++ b/docs/governance/templates/release_issue.md @@ -51,7 +51,7 @@ and copy it into a release issue. Fill in relevant values, found inside {} - [ ] Run `make post-build-release` to build the artifacts in GCS(These files will be attached in the release notes) and to push the latest images in the release repository and push chart on agones-chart. 
- [ ] Run `make shell` and run `gcloud config configurations activate ` to switch Agones development tooling off of the `agones-images` project. -- [ ] Smoke Test: run `make install-release` to view helm releases, uninstall agones-system namesapce, fetch the latest version of Agones, verify the new version, installing agones-system namespace, and list all the pods of agones-system. +- [ ] Smoke Test: run `make install-release` to view helm releases, uninstall agones-system namespace, fetch the latest version of Agones, verify the new version, installing agones-system namespace, and list all the pods of agones-system. - [ ] Attach all assets found in the cloud storage with {version} to the draft GitHub Release. - [ ] Copy any review changes from the release blog post into the draft GitHub release. - [ ] Publish the draft GitHub Release. diff --git a/examples/allocation-endpoint/README.md b/examples/allocation-endpoint/README.md index 214bd07c0b..bc1a065558 100644 --- a/examples/allocation-endpoint/README.md +++ b/examples/allocation-endpoint/README.md @@ -44,7 +44,7 @@ helm upgrade my-release --install --namespace agones-system --create-namespace a --set agones.allocator.service.http.enabled=false ``` -After installing Agones, deploy [ESP](https://cloud.google.com/endpoints/docs/grpc/specify-esp-v2-startup-options) which is an envoy based proxy, deployed as a sidecar along side `agones-alloator` container. Run the following to patch the service deployement, change the service port to ESP and add annotation to `agones-allocator` service account to impersonate GCP service account. +After installing Agones, deploy [ESP](https://cloud.google.com/endpoints/docs/grpc/specify-esp-v2-startup-options) which is an envoy based proxy, deployed as a sidecar alongside the `agones-allocator` container. Run the following to patch the service deployment, change the service port to ESP and add annotation to `agones-allocator` service account to impersonate GCP service account.
Replace [GKE-PROJECT-ID] in `patch-agones-allocator.yaml` with your project ID before running the scripts. diff --git a/examples/allocator-client-csharp/Program.cs b/examples/allocator-client-csharp/Program.cs index 2c665f344a..3302a550df 100644 --- a/examples/allocator-client-csharp/Program.cs +++ b/examples/allocator-client-csharp/Program.cs @@ -13,7 +13,7 @@ class Program static async Task Main(string[] args) { if (args.Length < 6) { - throw new Exception("Arguments are missing. Expecting: "); + throw new Exception("Arguments are missing. Expecting: "); } string clientKey = File.ReadAllText(args[0]); diff --git a/examples/simple-game-server/README.md b/examples/simple-game-server/README.md index 556de4aa08..4ca50611c7 100644 --- a/examples/simple-game-server/README.md +++ b/examples/simple-game-server/README.md @@ -43,8 +43,8 @@ There are some text commands you can send the server to affect its behavior: | "LIST_CONTAINS" | Returns true if the given value is in the given List, false otherwise | | "GET_LIST_LENGTH" | Returns the length (number of values) of the given List as a string | | "GET_LIST_VALUES" | Return the values in the given List as a comma delineated string | -| "APPEND_LIST_VALUE" | Returns if the given value was successfuly added to the List (true) or not (false) | -| "DELETE_LIST_VALUE" | Rreturns if the given value was successfuly deleted from the List (true) or not (false) | +| "APPEND_LIST_VALUE" | Returns if the given value was successfully added to the List (true) or not (false) | +| "DELETE_LIST_VALUE" | Returns if the given value was successfully deleted from the List (true) or not (false) | ## Configuration diff --git a/examples/simple-game-server/handlers.go b/examples/simple-game-server/handlers.go index c1ccd20688..23e26f43d9 100644 --- a/examples/simple-game-server/handlers.go +++ b/examples/simple-game-server/handlers.go @@ -283,7 +283,7 @@ func handlePlayerConnected(s *sdk.SDK, parts []string, _ ...context.CancelFunc) return } -//
handleGetPlayers returns a comma delimeted list of connected players +// handleGetPlayers returns a comma delimited list of connected players func handleGetPlayers(s *sdk.SDK, parts []string, _ ...context.CancelFunc) (response string, addACK bool, responseError error) { log.Print("Retrieving connected player list") list, err := s.Alpha().GetConnectedPlayers() @@ -535,7 +535,7 @@ func handleGetListValues(s *sdk.SDK, parts []string, _ ...context.CancelFunc) (r return } -// handleAppendListValue returns if the given value was successfuly added to the List or not +// handleAppendListValue returns if the given value was successfully added to the List or not func handleAppendListValue(s *sdk.SDK, parts []string, _ ...context.CancelFunc) (response string, addACK bool, responseError error) { if len(parts) < 3 { response = "Invalid APPEND_LIST_VALUE, should have 2 arguments" @@ -553,7 +553,7 @@ func handleAppendListValue(s *sdk.SDK, parts []string, _ ...context.CancelFunc) return } -// handleDeleteListValue returns if the given value was successfuly deleted from the List or not +// handleDeleteListValue returns if the given value was successfully deleted from the List or not func handleDeleteListValue(s *sdk.SDK, parts []string, _ ...context.CancelFunc) (response string, addACK bool, responseError error) { if len(parts) < 3 { response = "Invalid DELETE_LIST_VALUE, should have 2 arguments" diff --git a/examples/simple-genai-server/main.go b/examples/simple-genai-server/main.go index bfc68690a6..a4607c9bec 100644 --- a/examples/simple-genai-server/main.go +++ b/examples/simple-genai-server/main.go @@ -224,7 +224,7 @@ type Message struct { func handleGenAIRequest(prompt string, clientConn *connection, chatHistory []Message) (string, error) { var jsonStr []byte var err error - // If the endpoint is the NPC API, use the json request format specifc to that API + // If the endpoint is the NPC API, use the json request format specific to that API if clientConn.npc { npcRequest := 
NPCRequest{ Msg: prompt, @@ -329,7 +329,7 @@ func autonomousChat(prompt string, conn1 *connection, conn2 *connection, numChat autonomousChat(response, conn2, conn1, numChats, stopPhase, chatHistory) } -// Manually interact via TCP with the GenAI endpont +// Manually interact via TCP with the GenAI endpoint func tcpListener(port string, genAiConn *connection) { log.Printf("Starting TCP server, listening on port %s", port) ln, err := net.Listen("tcp", ":"+port) diff --git a/install/helm/agones/templates/controller.yaml b/install/helm/agones/templates/controller.yaml index a56ad3aa3c..58f9389339 100644 --- a/install/helm/agones/templates/controller.yaml +++ b/install/helm/agones/templates/controller.yaml @@ -59,6 +59,9 @@ spec: app: {{ template "agones.name" . }} release: {{ .Release.Name }} heritage: {{ .Release.Service }} +{{- if .Values.agones.controller.labels }} +{{- toYaml .Values.agones.controller.labels | nindent 8 }} +{{- end }} spec: {{- if .Values.agones.controller.topologySpreadConstraints }} topologySpreadConstraints: diff --git a/install/helm/agones/templates/extensions-deployment.yaml b/install/helm/agones/templates/extensions-deployment.yaml index 3745af240b..f6350a96c0 100644 --- a/install/helm/agones/templates/extensions-deployment.yaml +++ b/install/helm/agones/templates/extensions-deployment.yaml @@ -52,6 +52,9 @@ spec: app: {{ template "agones.name" . 
}} release: {{ .Release.Name }} heritage: {{ .Release.Service }} +{{- if .Values.agones.extensions.labels }} +{{- toYaml .Values.agones.extensions.labels | nindent 8 }} +{{- end }} spec: {{- if .Values.agones.extensions.topologySpreadConstraints }} topologySpreadConstraints: diff --git a/install/helm/agones/templates/service/allocation.yaml b/install/helm/agones/templates/service/allocation.yaml index e360a125d8..8c538029f8 100644 --- a/install/helm/agones/templates/service/allocation.yaml +++ b/install/helm/agones/templates/service/allocation.yaml @@ -65,7 +65,7 @@ spec: targetPort: {{ .Values.agones.allocator.service.grpc.targetPort }} {{- if .Values.agones.allocator.service.grpc.appProtocol }} appProtocol: {{.Values.agones.allocator.service.grpc.appProtocol}} - {{- end}} + {{- end}} {{- if eq .Values.agones.allocator.service.serviceType "NodePort" }} nodePort: {{ .Values.agones.allocator.service.grpc.nodePort }} {{- end }} @@ -79,6 +79,7 @@ spec: loadBalancerIP: {{ .Values.agones.allocator.service.loadBalancerIP }} {{- end }} {{- if eq .Values.agones.allocator.service.serviceType "LoadBalancer" }} + externalTrafficPolicy: {{ .Values.agones.allocator.service.externalTrafficPolicy }} {{- if .Values.agones.allocator.service.loadBalancerSourceRanges }} loadBalancerSourceRanges: {{ toYaml .Values.agones.allocator.service.loadBalancerSourceRanges | indent 4 }} diff --git a/install/helm/agones/values.yaml b/install/helm/agones/values.yaml index b63431b440..85d1ea9ac2 100644 --- a/install/helm/agones/values.yaml +++ b/install/helm/agones/values.yaml @@ -54,6 +54,7 @@ agones: # cpu: 1 # memory: 256Mi nodeSelector: {} + labels: {} annotations: {} tolerations: - key: "agones.dev/agones-system" @@ -105,6 +106,7 @@ agones: # memory: 256Mi nodeSelector: {} annotations: {} + labels: {} # Determines if the Agones extensions should operate in hostNetwork mode. 
# # This setting is necessary for certain managed Kubernetes clusters (e.g., AWS EKS) that use custom @@ -231,6 +233,7 @@ agones: service: name: agones-allocator serviceType: LoadBalancer + externalTrafficPolicy: Cluster clusterIP: "" loadBalancerIP: "" loadBalancerSourceRanges: [] diff --git a/install/yaml/install.yaml b/install/yaml/install.yaml index 5ed11789a3..19d3c9596f 100644 --- a/install/yaml/install.yaml +++ b/install/yaml/install.yaml @@ -17822,6 +17822,7 @@ spec: targetPort: 8443 protocol: TCP type: LoadBalancer + externalTrafficPolicy: Cluster --- # Source: agones/templates/service/allocation.yaml apiVersion: v1 diff --git a/pkg/fleetautoscalers/fleetautoscalers_test.go b/pkg/fleetautoscalers/fleetautoscalers_test.go index eec82195c0..5a67717363 100644 --- a/pkg/fleetautoscalers/fleetautoscalers_test.go +++ b/pkg/fleetautoscalers/fleetautoscalers_test.go @@ -772,6 +772,31 @@ func TestApplyCounterPolicy(t *testing.T) { wantErr: true, }, }, + "Counter based fleet does not have any replicas": { + fleet: modifiedFleet(func(f *agonesv1.Fleet) { + f.Spec.Template.Spec.Counters = make(map[string]agonesv1.CounterStatus) + f.Spec.Template.Spec.Counters["gamers"] = agonesv1.CounterStatus{ + Count: 0, + Capacity: 7} + f.Status.Replicas = 0 + f.Status.ReadyReplicas = 0 + f.Status.AllocatedReplicas = 0 + f.Status.Counters = make(map[string]agonesv1.AggregatedCounterStatus) + f.Status.Counters["gamers"] = agonesv1.AggregatedCounterStatus{} + }), + featureFlags: string(utilruntime.FeatureCountsAndLists) + "=true", + cp: &autoscalingv1.CounterPolicy{ + Key: "gamers", + MaxCapacity: 100, + MinCapacity: 10, + BufferSize: intstr.FromInt(10), + }, + want: expected{ + replicas: 2, + limited: true, + wantErr: false, + }, + }, "fleet spec does not have counter": { fleet: modifiedFleet(func(f *agonesv1.Fleet) { f.Spec.Template.Spec.Counters = make(map[string]agonesv1.CounterStatus) @@ -1570,7 +1595,7 @@ func TestApplyListPolicy(t *testing.T) { wantErr: true, }, }, - "fleet 
does not have any replicas": { + "List based fleet does not have any replicas": { fleet: modifiedFleet(func(f *agonesv1.Fleet) { f.Spec.Template.Spec.Lists = make(map[string]agonesv1.ListStatus) f.Spec.Template.Spec.Lists["gamers"] = agonesv1.ListStatus{ diff --git a/pkg/gameserversets/controller.go b/pkg/gameserversets/controller.go index dc0b169a7b..0feef01729 100644 --- a/pkg/gameserversets/controller.go +++ b/pkg/gameserversets/controller.go @@ -639,6 +639,7 @@ func computeStatus(gsSet *agonesv1.GameServerSet, list []*agonesv1.GameServer) a // Initialize list status with empty lists from spec if runtime.FeatureEnabled(runtime.FeatureCountsAndLists) { status.Lists = createInitialListStatus(gsSet) + status.Counters = createInitialCounterStatus(gsSet) } for _, gs := range list { if gs.IsBeingDeleted() { @@ -700,6 +701,14 @@ func createInitialListStatus(gsSet *agonesv1.GameServerSet) map[string]agonesv1. return list } +func createInitialCounterStatus(gsSet *agonesv1.GameServerSet) map[string]agonesv1.AggregatedCounterStatus { + counters := make(map[string]agonesv1.AggregatedCounterStatus) + for name := range gsSet.Spec.Template.Spec.Counters { + counters[name] = agonesv1.AggregatedCounterStatus{} + } + return counters +} + // aggregateCounters adds the contents of a CounterStatus map to an AggregatedCounterStatus map. 
func aggregateCounters(aggCounterStatus map[string]agonesv1.AggregatedCounterStatus, counterStatus map[string]agonesv1.CounterStatus, diff --git a/pkg/gameserversets/controller_test.go b/pkg/gameserversets/controller_test.go index 511be442d9..d7fe7a2395 100644 --- a/pkg/gameserversets/controller_test.go +++ b/pkg/gameserversets/controller_test.go @@ -412,6 +412,44 @@ func TestComputeStatus(t *testing.T) { assert.Equal(t, expected, computeStatus(gsSet, list)) }) + t.Run("counters with no gameservers", func(t *testing.T) { + utilruntime.FeatureTestMutex.Lock() + defer utilruntime.FeatureTestMutex.Unlock() + + require.NoError(t, utilruntime.ParseFeatures(fmt.Sprintf("%s=true", utilruntime.FeatureCountsAndLists))) + + gsSet := defaultFixture() + gsSet.Spec.Template.Spec.Counters = map[string]agonesv1.CounterStatus{ + "firstCounter": {Capacity: 10, Count: 1}, + "secondCounter": {Capacity: 10, Count: 1}, + } + var list []*agonesv1.GameServer + + expected := agonesv1.GameServerSetStatus{ + Replicas: 0, + ReadyReplicas: 0, + ReservedReplicas: 0, + AllocatedReplicas: 0, + Lists: map[string]agonesv1.AggregatedListStatus{}, + Counters: map[string]agonesv1.AggregatedCounterStatus{ + "firstCounter": { + AllocatedCount: 0, + AllocatedCapacity: 0, + Capacity: 0, + Count: 0, + }, + "secondCounter": { + AllocatedCount: 0, + AllocatedCapacity: 0, + Capacity: 0, + Count: 0, + }, + }, + } + + assert.Equal(t, expected, computeStatus(gsSet, list)) + }) + t.Run("lists", func(t *testing.T) { utilruntime.FeatureTestMutex.Lock() defer utilruntime.FeatureTestMutex.Unlock() @@ -484,7 +522,7 @@ func TestComputeStatus(t *testing.T) { ReadyReplicas: 0, ReservedReplicas: 0, AllocatedReplicas: 0, - Counters: nil, + Counters: map[string]agonesv1.AggregatedCounterStatus{}, Lists: map[string]agonesv1.AggregatedListStatus{ "firstList": { AllocatedCount: 0, diff --git a/site/content/en/docs/Installation/Install Agones/helm.md b/site/content/en/docs/Installation/Install Agones/helm.md index 
497d7a382b..a0223e0559 100644 --- a/site/content/en/docs/Installation/Install Agones/helm.md +++ b/site/content/en/docs/Installation/Install Agones/helm.md @@ -150,6 +150,8 @@ The following tables lists the configurable parameters of the Agones chart and t ### Agones Controller +{{% feature expiryVersion="1.46.0" %}} + | Parameter | Description | Default | |----------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------| | `agones.controller.replicas` | The number of replicas to run in the `agones-controller` deployment. | `2` | @@ -190,7 +192,49 @@ The following tables lists the configurable parameters of the Agones chart and t | `agones.controller.maxGameServerDeletionsPerBatch` | Maximum number of GameServer deletion calls per batch | `64` | | `agones.controller.maxPodPendingCount` | Maximum number of pending pods per game server set | `5000` | - +{{% /feature %}} +{{% feature publishVersion="1.46.0" %}} +| Parameter | Description | Default | +|----------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------| +| `agones.controller.replicas` | The number of replicas to run in the `agones-controller` deployment. 
| `2` | +| `agones.controller.pdb.minAvailable` | Description of the number of pods from that set that must still be available after the eviction, even in the absence of the evicted pod. Can be either an absolute number or a percentage. Mutually Exclusive with `maxUnavailable` | `1` | +| `agones.controller.pdb.maxUnavailable` | Description of the number of pods from that set that can be unavailable after the eviction. It can be either an absolute number or a percentage Mutually Exclusive with `minAvailable` | \`\` | +| `agones.controller.http.port` | Port to use for liveness probe service and metrics | `8080` | +| `agones.controller.healthCheck.initialDelaySeconds` | Initial delay before performing the first probe (in seconds) | `3` | +| `agones.controller.healthCheck.periodSeconds` | Seconds between every liveness probe (in seconds) | `3` | +| `agones.controller.healthCheck.failureThreshold` | Number of times before giving up (in seconds) | `3` | +| `agones.controller.healthCheck.timeoutSeconds` | Number of seconds after which the probe times out (in seconds) | `1` | +| `agones.controller.resources` | Controller [resource requests/limit][resources] | `{}` | +| `agones.controller.generateTLS` | Set to true to generate TLS certificates or false to provide your own certificates | `true` | +| `agones.controller.tlsCert` | Custom TLS certificate provided as a string | \`\` | +| `agones.controller.tlsKey` | Custom TLS private key provided as a string | \`\` | +| `agones.controller.nodeSelector` | Controller [node labels][nodeSelector] for pod assignment | `{}` | +| `agones.controller.tolerations` | Controller [toleration][toleration] labels for pod assignment | `[]` | +| `agones.controller.affinity` | Controller [affinity][affinity] settings for pod assignment | `{}` | +| `agones.controller.labels` | [Labels][labels] added to the Agones controller pods | `{}` | +| `agones.controller.annotations` | [Annotations][annotations] added to the Agones controller pods | `{}` | 
+| `agones.controller.numWorkers` | Number of workers to spin per resource type | `100` | +| `agones.controller.apiServerQPS` | Maximum sustained queries per second that controller should be making against API Server | `400` | +| `agones.controller.apiServerQPSBurst` | Maximum burst queries per second that controller should be making against API Server | `500` | +| `agones.controller.logLevel` | Agones Controller Log level. Log only entries with that severity and above | `info` | +| `agones.controller.persistentLogs` | Store Agones controller logs in a temporary volume attached to a container for debugging | `true` | +| `agones.controller.persistentLogsSizeLimitMB` | Maximum total size of all Agones container logs in MB | `10000` | +| `agones.controller.disableSecret` | **Deprecated**. Use `agones.extensions.disableSecret` instead. Disables the creation of any allocator secrets. If true, you MUST provide the `{agones.releaseName}-cert` secrets before installation. | `false` | +| `agones.controller.customCertSecretPath` | Remap cert-manager path to server.crt and server.key | `{}` | +| `agones.controller.allocationApiService.annotations` | **Deprecated**. Use `agones.extensions.allocationApiService.annotations` instead. [Annotations][annotations] added to the Agones apiregistration | `{}` | +| `agones.controller.allocationApiService.disableCaBundle` | **Deprecated**. Use `agones.extensions.allocationApiService.disableCaBundle` instead. Disable ca-bundle so it can be injected by cert-manager. | `false` | +| `agones.controller.validatingWebhook.annotations` | **Deprecated**. Use `agones.extensions.validatingWebhook.annotations` instead. [Annotations][annotations] added to the Agones validating webhook | `{}` | +| `agones.controller.validatingWebhook.disableCaBundle` | **Deprecated**. Use `agones.extensions.validatingWebhook.disableCaBundle` instead. 
Disable ca-bundle so it can be injected by cert-manager | `false` | +| `agones.controller.mutatingWebhook.annotations` | **Deprecated**. Use `agones.extensions.mutatingWebhook.annotations` instead. [Annotations][annotations] added to the Agones mutating webhook | `{}` | +| `agones.controller.mutatingWebhook.disableCaBundle` | **Deprecated**. Use `agones.extensions.mutatingWebhook.disableCaBundle` instead. Disable ca-bundle so it can be injected by cert-manager | `false` | +| `agones.controller.allocationBatchWaitTime` | Wait time between each allocation batch when performing allocations in controller mode | `500ms` | +| `agones.controller.topologySpreadConstraints` | Ensures better resource utilization and high availability by evenly distributing Pods in the agones-system namespace | `{}` | +| `agones.controller.maxCreationParallelism` | Maximum number of parallelizing creation calls in GSS controller | `16` | +| `agones.controller.maxGameServerCreationsPerBatch` | Maximum number of GameServer creation calls per batch | `64` | +| `agones.controller.maxDeletionParallelism` | Maximum number of parallelizing deletion calls in GSS | `64` | +| `agones.controller.maxGameServerDeletionsPerBatch` | Maximum number of GameServer deletion calls per batch | `64` | +| `agones.controller.maxPodPendingCount` | Maximum number of pending pods per game server set | `5000` | +{{% /feature %}} ### Ping Service @@ -232,6 +276,7 @@ The following tables lists the configurable parameters of the Agones chart and t ### Allocator Service +{{% feature expiryVersion="1.46.0" %}} | Parameter | Description | Default | 
|----------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------| | `agones.allocator.apiServerQPS` | Maximum sustained queries per second that an allocator should be making against API Server | `400` | @@ -288,9 +333,109 @@ The following tables lists the configurable parameters of the Agones chart and t | `agones.allocator.pdb.maxUnavailable` | Description of the number of pods from that set that can be unavailable after the eviction. It can be either an absolute number or a percentage. Mutually Exclusive with `minAvailable` | \`\` | | `agones.allocator.topologySpreadConstraints` | Ensures better resource utilization and high availability by evenly distributing Pods in the agones-system namespace | `{}` | +{{% /feature %}} +{{% feature publishVersion="1.46.0" %}} +| Parameter | Description | Default | +| ----------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------- | +| `agones.allocator.apiServerQPS` | Maximum sustained queries per second that an allocator should be making against API Server | `400` | +| `agones.allocator.apiServerQPSBurst` | Maximum burst queries per second that an allocator should be making against API Server | `500` | +| `agones.allocator.remoteAllocationTimeout` | Remote allocation call timeout. | `10s` | +| `agones.allocator.totalRemoteAllocationTimeout` | Total remote allocation timeout including retries. 
| `30s` | +| `agones.allocator.logLevel` | Agones Allocator Log level. Log only entries with that severity and above | `info` | +| `agones.allocator.install` | Whether to install the [allocator service][allocator] | `true` | +| `agones.allocator.replicas` | The number of replicas to run in the deployment | `3` | +| `agones.allocator.service.name` | Service name for the allocator | `agones-allocator` | +| `agones.allocator.service.serviceType` | The [Service Type][service] of the HTTP Service | `LoadBalancer` | +| `agones.allocator.service.clusterIP` | The [Cluster IP][clusterIP] of the Agones allocator. If you want [Headless Service][headless-service] for Agones Allocator, you can set `None` to clusterIP. | \`\` | +| `agones.allocator.service.loadBalancerIP` | The [Load Balancer IP][loadBalancer] of the Agones allocator load balancer. Only works if the Kubernetes provider supports this option. | \`\` | +| `agones.allocator.service.loadBalancerSourceRanges` | The [Load Balancer SourceRanges][loadBalancer] of the Agones allocator load balancer. Only works if the Kubernetes provider supports this option. | `[]` | +| `agones.allocator.service.annotations` | [Annotations][annotations] added to the Agones allocator service | `{}` | +| `agones.allocator.service.http.enabled` | If true the [allocator service][allocator] will respond to [REST requests][rest-requests] | `true` | +| `agones.allocator.service.http.appProtocol` | The `appProtocol` to set on the Service for the http allocation port. If left blank, no value is set. | `` | +| `agones.allocator.service.http.port` | The port that is exposed externally by the [allocator service][allocator] for [REST requests][rest-requests] | `443` | +| `agones.allocator.service.http.portName` | The name of exposed port | `http` | +| `agones.allocator.service.http.targetPort` | The port that is used by the allocator pod to listen for [REST requests][rest-requests]. Note that the allocator server cannot bind to low numbered ports. 
| `8443` | +| `agones.allocator.service.http.nodePort` | If the ServiceType is set to "NodePort", this is the NodePort that the allocator http service is exposed on. | `30000-32767` | +| `agones.allocator.service.http.unallocatedStatusCode` | HTTP status code to return when no GameServer is available for allocation. This setting allows for custom responses when a game server allocation fails, offering flexibility in handling these situations. | `429` | +| `agones.allocator.service.grpc.enabled` | If true the [allocator service][allocator] will respond to [gRPC requests][grpc-requests] | `true` | +| `agones.allocator.service.grpc.port` | The port that is exposed externally by the [allocator service][allocator] for [gRPC requests][grpc-requests] | `443` | +| `agones.allocator.service.grpc.portName` | The name of exposed port | `` | +| `agones.allocator.service.grpc.appProtocol` | The `appProtocol` to set on the Service for the gRPC allocation port. If left blank, no value is set. | `` | +| `agones.allocator.service.grpc.nodePort` | If the ServiceType is set to "NodePort", this is the NodePort that the allocator gRPC service is exposed on. | `30000-32767` | +| `agones.allocator.service.grpc.targetPort` | The port that is used by the allocator pod to listen for [gRPC requests][grpc-requests]. Note that the allocator server cannot bind to low numbered ports. | `8443` | +| `agones.allocator.generateClientTLS` | Set to true to generate client TLS certificates or false to provide certificates in `certs/allocator/allocator-client.default/*` | `true` | +| `agones.allocator.generateTLS` | Set to true to generate TLS certificates or false to provide your own certificates | `true` | +| `agones.allocator.disableMTLS` | Turns off client cert authentication for incoming connections to the allocator. | `false` | +| `agones.allocator.disableTLS` | Turns off TLS security for incoming connections to the allocator. 
| `false` | +| `agones.allocator.disableSecretCreation` | Disables the creation of any allocator secrets. If true, you MUST provide the `allocator-tls`, `allocator-tls-ca`, and `allocator-client-ca` secrets before installation. | `false` | +| `agones.allocator.tlsCert` | Custom TLS certificate provided as a string | \`\` | +| `agones.allocator.tlsKey` | Custom TLS private key provided as a string | \`\` | +| `agones.allocator.clientCAs` | A map of secret key names to allowed client CA certificates provided as strings | `{}` | +| `agones.allocator.tolerations` | Allocator [toleration][toleration] labels for pod assignment | `[]` | +| `agones.allocator.affinity` | Allocator [affinity][affinity] settings for pod assignment | `{}` | +| `agones.allocator.annotations` | [Annotations][annotations] added to the Agones allocator pods | `{}` | +| `agones.allocator.resources` | Allocator pods [resource requests/limit][resources] | `{}` | +| `agones.allocator.labels` | [Labels][labels] Added to the Agones Allocator pods | `{}` | +| `agones.allocator.readiness.initialDelaySeconds` | Initial delay before performing the first probe (in seconds) | `3` | +| `agones.allocator.readiness.periodSeconds` | Seconds between every liveness probe (in seconds) | `3` | +| `agones.allocator.readiness.failureThreshold` | Number of times before giving up (in seconds) | `3` | +| `agones.allocator.nodeSelector` | Allocator [node labels][nodeSelector] for pod assignment | `{}` | +| `agones.allocator.serviceMetrics.name` | Second Service name for the allocator | `agones-allocator-metrics-service` | +| `agones.allocator.serviceMetrics.annotations` | [Annotations][annotations] added to the Agones allocator second Service | `{}` | +| `agones.allocator.serviceMetrics.http.port` | The port that is exposed within cluster by the [allocator service][allocator] for http requests | `8080` | +| `agones.allocator.serviceMetrics.http.portName` | The name of exposed port | `http` | +| 
`agones.allocator.allocationBatchWaitTime` | Wait time between each allocation batch when performing allocations in allocator mode | `500ms` | +| `agones.allocator.updateStrategy` | The [strategy](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy) to apply to the ping deployment | `{}` | +| `agones.allocator.pdb.enabled` | Set to `true` to enable the creation of a [PodDisruptionBudget](https://kubernetes.io/docs/tasks/run-application/configure-pdb/) for the allocator deployment | `false` | +| `agones.allocator.pdb.minAvailable` | Description of the number of pods from that set that must still be available after the eviction, even in the absence of the evicted pod. Can be either an absolute number or a percentage. Mutually Exclusive with `maxUnavailable` | `1` | +| `agones.allocator.pdb.maxUnavailable` | Description of the number of pods from that set that can be unavailable after the eviction. It can be either an absolute number or a percentage. Mutually Exclusive with `minAvailable` | \`\` | +| `agones.allocator.topologySpreadConstraints` | Ensures better resource utilization and high availability by evenly distributing Pods in the agones-system namespace | `{}` | +| `agones.allocator.externalTrafficPolicy` | The `externalTrafficPolicy` for the Agones allocator service | `Cluster` | + +{{% /feature %}} ### Extensions +{{% feature expiryVersion="1.46.0" %}} +| Parameter | Description | Default | +|----------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------| +|`agones.extensions.hostNetwork` | Determines if the Agones extensions should operate in hostNetwork mode. 
If running in hostNetwork mode, you should change `agones.extensions.http.port` and `agones.extensions.webhooks.port` to an available port. | `false` | +| `agones.extensions.http.port` | Port to use for liveness probe service and metrics | `8080` | +|`agones.extensions.webhooks.port` | Port to use for webhook service | `8081` | +| `agones.extensions.healthCheck.initialDelaySeconds` | Initial delay before performing the first probe (in seconds) | `3` | +| `agones.extensions.healthCheck.periodSeconds` | Seconds between every liveness probe (in seconds) | `3` | +| `agones.extensions.healthCheck.failureThreshold` | Number of times before giving up (in seconds) | `3` | +| `agones.extensions.healthCheck.timeoutSeconds` | Number of seconds after which the probe times out (in seconds) | `1` | +| `agones.extensions.resources` | Extensions [resource requests/limit][resources] | `{}` | +| `agones.extensions.generateTLS` | Set to true to generate TLS certificates or false to provide your own certificates | `true` | +| `agones.extensions.tlsCert` | Custom TLS certificate provided as a string | \`\` | +| `agones.extensions.tlsKey` | Custom TLS private key provided as a string | \`\` | +| `agones.extensions.nodeSelector` | Extensions [node labels][nodeSelector] for pod assignment | `{}` | +| `agones.extensions.tolerations` | Extensions [toleration][toleration] labels for pod assignment | `[]` | +| `agones.extensions.affinity` | Extensions [affinity][affinity] settings for pod assignment | `{}` | +| `agones.extensions.annotations` | [Annotations][annotations] added to the Agones extensions pods | `{}` | +| `agones.extensions.numWorkers` | Number of workers to spin per resource type | `100` | +| `agones.extensions.apiServerQPS` | Maximum sustained queries per second that extensions should be making against API Server | `400` | +| `agones.extensions.apiServerQPSBurst` | Maximum burst queries per second that extensions should be making against API Server | `500` | +| 
`agones.extensions.logLevel` | Agones Extensions Log level. Log only entries with that severity and above | `info` | +| `agones.extensions.persistentLogs` | Store Agones extensions logs in a temporary volume attached to a container for debugging | `true` | +| `agones.extensions.persistentLogsSizeLimitMB` | Maximum total size of all Agones container logs in MB | `10000` | +| `agones.extensions.disableSecret` | Disables the creation of any allocator secrets. You MUST provide the `{agones.releaseName}-cert` secrets before installation if this is set to `true`. | `false` | +| `agones.extensions.customCertSecretPath` | Remap cert-manager path to server.crt and server.key | `{}` | +| `agones.extensions.allocationApiService.annotations` | [Annotations][annotations] added to the Agones API registration. | `{}` | +| `agones.extensions.allocationApiService.disableCaBundle` | Disable ca-bundle so it can be injected by cert-manager. | `false` | +| `agones.extensions.validatingWebhook.annotations` | [Annotations][annotations] added to the Agones validating webhook. | `{}` | +| `agones.extensions.validatingWebhook.disableCaBundle` | Disable ca-bundle so it can be injected by cert-manager. | `false` | +| `agones.extensions.mutatingWebhook.annotations` | [Annotations][annotations] added to the Agones mutating webhook. | `{}` | +| `agones.extensions.mutatingWebhook.disableCaBundle` | Disable ca-bundle so it can be injected by cert-manager. | `false` | +| `agones.extensions.allocationBatchWaitTime` | Wait time between each allocation batch when performing allocations in controller mode | `500ms` | +| `agones.extensions.pdb.minAvailable` | Description of the number of pods from that set that must still be available after the eviction, even in the absence of the evicted pod. Can be either an absolute number or a percentage. 
Mutually Exclusive with maxUnavailable | `1` | +| `agones.extensions.pdb.maxUnavailable` | Description of the number of pods from that set that can be unavailable after the eviction. It can be either an absolute number or a percentage. Mutually Exclusive with `minAvailable` | \`\` | +| `agones.extensions.replicas` | The number of replicas to run in the deployment | `2` | +| `agones.extensions.topologySpreadConstraints` | Ensures better resource utilization and high availability by evenly distributing Pods in the agones-system namespace | `{}` | + +{{% /feature %}} +{{% feature publishVersion="1.46.0" %}} | Parameter | Description | Default | |----------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------| |`agones.extensions.hostNetwork` | Determines if the Agones extensions should operate in hostNetwork mode. If running in hostNetwork mode, you should change `agones.extensions.http.port` and `agones.extensions.webhooks.port` to an available port. 
| `false` | @@ -307,6 +452,7 @@ The following tables lists the configurable parameters of the Agones chart and t | `agones.extensions.nodeSelector` | Extensions [node labels][nodeSelector] for pod assignment | `{}` | | `agones.extensions.tolerations` | Extensions [toleration][toleration] labels for pod assignment | `[]` | | `agones.extensions.affinity` | Extensions [affinity][affinity] settings for pod assignment | `{}` | +| `agones.extensions.labels` | [Labels][labels] added to the Agones extensions pods | `{}` | | `agones.extensions.annotations` | [Annotations][annotations] added to the Agones extensions pods | `{}` | | `agones.extensions.numWorkers` | Number of workers to spin per resource type | `100` | | `agones.extensions.apiServerQPS` | Maximum sustained queries per second that extensions should be making against API Server | `400` | @@ -328,6 +474,9 @@ The following tables lists the configurable parameters of the Agones chart and t | `agones.extensions.replicas` | The number of replicas to run in the deployment | `2` | | `agones.extensions.topologySpreadConstraints` | Ensures better resource utilization and high availability by evenly distributing Pods in the agones-system namespace | `{}` | +{{% /feature %}} + + ### GameServers | Parameter | Description | Default | diff --git a/test/e2e/fleetautoscaler_test.go b/test/e2e/fleetautoscaler_test.go index 78a164ea12..b8c2e98827 100644 --- a/test/e2e/fleetautoscaler_test.go +++ b/test/e2e/fleetautoscaler_test.go @@ -944,6 +944,88 @@ func TestCounterAutoscaler(t *testing.T) { } } +// nolint:dupl // Linter errors on lines are duplicate of TestListAutoscalerWithNoReplicas +func TestCounterAutoscalerWithNoReplicas(t *testing.T) { + if !runtime.FeatureEnabled(runtime.FeatureCountsAndLists) { + t.SkipNow() + } + t.Parallel() + + ctx := context.Background() + client := framework.AgonesClient.AgonesV1() + log := e2e.TestLogger(t) + + flt := defaultEmptyFleet(framework.Namespace) + flt.Spec.Template.Spec.Counters = 
map[string]agonesv1.CounterStatus{ + "games": { + Capacity: 5, + }, + } + + flt, err := client.Fleets(framework.Namespace).Create(ctx, flt.DeepCopy(), metav1.CreateOptions{}) + require.NoError(t, err) + defer client.Fleets(framework.Namespace).Delete(ctx, flt.ObjectMeta.Name, metav1.DeleteOptions{}) // nolint:errcheck + framework.AssertFleetCondition(t, flt, e2e.FleetReadyCount(flt.Spec.Replicas)) + + fleetautoscalers := framework.AgonesClient.AutoscalingV1().FleetAutoscalers(framework.Namespace) + + counterFas := func(f func(fap *autoscalingv1.FleetAutoscalerPolicy)) *autoscalingv1.FleetAutoscaler { + fas := autoscalingv1.FleetAutoscaler{ + ObjectMeta: metav1.ObjectMeta{Name: flt.ObjectMeta.Name + "-counter-autoscaler", Namespace: framework.Namespace}, + Spec: autoscalingv1.FleetAutoscalerSpec{ + FleetName: flt.ObjectMeta.Name, + Policy: autoscalingv1.FleetAutoscalerPolicy{ + Type: autoscalingv1.CounterPolicyType, + }, + Sync: &autoscalingv1.FleetAutoscalerSync{ + Type: autoscalingv1.FixedIntervalSyncType, + FixedInterval: autoscalingv1.FixedIntervalSync{ + Seconds: 1, + }, + }, + }, + } + f(&fas.Spec.Policy) + return &fas + } + testCases := map[string]struct { + fas *autoscalingv1.FleetAutoscaler + wantFasErr bool + wantReplicas int32 + }{ + "Scale Up to MinCapacity": { + fas: counterFas(func(fap *autoscalingv1.FleetAutoscalerPolicy) { + fap.Counter = &autoscalingv1.CounterPolicy{ + Key: "games", + BufferSize: intstr.FromInt(3), + MinCapacity: 16, + MaxCapacity: 100, + } + }), + wantFasErr: false, + wantReplicas: 4, // Capacity:20 + }, + } + for name, testCase := range testCases { + t.Run(name, func(t *testing.T) { + + fas, err := fleetautoscalers.Create(ctx, testCase.fas, metav1.CreateOptions{}) + if testCase.wantFasErr { + assert.Error(t, err) + return + } + assert.NoError(t, err) + + framework.AssertFleetCondition(t, flt, e2e.FleetReadyCount(testCase.wantReplicas)) + fleetautoscalers.Delete(ctx, fas.ObjectMeta.Name, metav1.DeleteOptions{}) // nolint:errcheck + 
+ // Return to starting 0 replicas + framework.ScaleFleet(t, log, flt, 0) + framework.AssertFleetCondition(t, flt, e2e.FleetReadyCount(0)) + }) + } +} + func TestCounterAutoscalerAllocated(t *testing.T) { if !runtime.FeatureEnabled(runtime.FeatureCountsAndLists) { t.SkipNow() @@ -1209,6 +1291,7 @@ func TestListAutoscaler(t *testing.T) { } } +// nolint:dupl // Linter errors on lines are duplicate of TestCounterAutoscalerWithNoReplicas func TestListAutoscalerWithNoReplicas(t *testing.T) { if !runtime.FeatureEnabled(runtime.FeatureCountsAndLists) { t.SkipNow() @@ -1538,9 +1621,8 @@ func TestScheduleAutoscaler(t *testing.T) { stable := framework.AgonesClient.AgonesV1() fleets := stable.Fleets(framework.Namespace) flt, err := fleets.Create(ctx, defaultFleet(framework.Namespace), metav1.CreateOptions{}) - if assert.NoError(t, err) { - defer fleets.Delete(context.Background(), flt.ObjectMeta.Name, metav1.DeleteOptions{}) // nolint:errcheck - } + require.NoError(t, err) + defer fleets.Delete(context.Background(), flt.ObjectMeta.Name, metav1.DeleteOptions{}) // nolint:errcheck framework.AssertFleetCondition(t, flt, e2e.FleetReadyCount(flt.Spec.Replicas)) @@ -1550,7 +1632,7 @@ func TestScheduleAutoscaler(t *testing.T) { scheduleAutoscaler := defaultAutoscalerSchedule(t, flt) scheduleAutoscaler.Spec.Policy.Schedule.ActivePeriod.StartCron = nextCronMinute(time.Now()) fas, err := fleetautoscalers.Create(ctx, scheduleAutoscaler, metav1.CreateOptions{}) - assert.NoError(t, err) + require.NoError(t, err) framework.AssertFleetCondition(t, flt, e2e.FleetReadyCount(5)) fleetautoscalers.Delete(ctx, fas.ObjectMeta.Name, metav1.DeleteOptions{}) // nolint:errcheck @@ -1563,7 +1645,7 @@ func TestScheduleAutoscaler(t *testing.T) { scheduleAutoscaler = defaultAutoscalerSchedule(t, flt) scheduleAutoscaler.Spec.Policy.Schedule.ActivePeriod.StartCron = nextCronMinuteBetween(time.Now()) fas, err = fleetautoscalers.Create(ctx, scheduleAutoscaler, metav1.CreateOptions{}) - assert.NoError(t, err) 
+ require.NoError(t, err) framework.AssertFleetCondition(t, flt, e2e.FleetReadyCount(5)) fleetautoscalers.Delete(ctx, fas.ObjectMeta.Name, metav1.DeleteOptions{}) // nolint:errcheck @@ -1759,8 +1841,13 @@ func nextCronMinute(currentTime time.Time) string { // nextCronMinuteBetween returns the minute between the very next minute // e.g. if the current time is 12:00, this method will return "1-2 * * * *" // meaning between 12:01 - 12:02 +// if the next minute is "59", then since 59-0 is invalid, we'll return "0-1 * * * *" and wait for a bit longer on e2e tests. func nextCronMinuteBetween(currentTime time.Time) string { nextMinute := currentTime.Add(time.Minute).Minute() + if nextMinute == 59 { + return "0-1 * * * *" + } + secondMinute := currentTime.Add(2 * time.Minute).Minute() return fmt.Sprintf("%d-%d * * * *", nextMinute, secondMinute) } diff --git a/test/sdk/go/Makefile b/test/sdk/go/Makefile index 45b8d7726a..86ba5dba7b 100644 --- a/test/sdk/go/Makefile +++ b/test/sdk/go/Makefile @@ -29,7 +29,7 @@ project_path := $(dir $(mkfile_path)) root_path = $(realpath $(project_path)/) # Because go mod init in the Dockerfile installs the most recently released version of Agones, this # will need to be built and pushed post-release. During DEV it will be built at DEV - 1. -release_version = 1.44.0 +release_version = 1.45.0 server_tag := $(REGISTRY)/sdk-client-test:$(release_version) # _____ _ diff --git a/test/upgrade/Dockerfile b/test/upgrade/Dockerfile index 52aa6a5e3f..68583bfae4 100644 --- a/test/upgrade/Dockerfile +++ b/test/upgrade/Dockerfile @@ -12,22 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License.
-FROM gcr.io/cloud-builders/gcloud AS builder +FROM golang:1.22.9-alpine AS builder -RUN apt-get update && \ - apt-get install -y curl && \ - apt-get clean +# install curl +RUN apk update && \ + apk upgrade && \ + apk --no-cache add curl WORKDIR /usr/local # install kubectl -ENV KUBECTL_VER=1.29.7 +ENV KUBECTL_VER=1.30.4 RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v${KUBECTL_VER}/bin/linux/amd64/kubectl && \ chmod go+rx ./kubectl && \ mv ./kubectl /usr/local/bin/kubectl # install Helm package manager -ENV HELM_VER=3.14.3 +ENV HELM_VER=3.16.3 ENV HELM_URL=https://get.helm.sh/helm-v${HELM_VER}-linux-amd64.tar.gz RUN curl -L ${HELM_URL} > /tmp/helm.tar.gz \ && tar -zxvf /tmp/helm.tar.gz -C /tmp \ @@ -35,27 +36,20 @@ RUN curl -L ${HELM_URL} > /tmp/helm.tar.gz \ && chmod go+rx /usr/local/bin/helm \ && rm /tmp/helm.tar.gz && rm -rf /tmp/linux-amd64 -# Build the Go image from source -FROM golang:1.22.6 AS build-stage - +# Copy and build the Go application WORKDIR /agones.dev - -COPY *.go ./ - +COPY test/upgrade/main.go ./ RUN go mod init agones.dev/agones/test/upgrade/testContainer RUN go mod tidy RUN go mod download - RUN CGO_ENABLED=0 GOOS=linux go build -o /upgrade-test -# Copy the above binary into a lean image -FROM gcr.io/distroless/static-debian12:nonroot AS build-release-stage - +# Copy the dev build Agones Helm chart WORKDIR / -COPY --from=build-stage /upgrade-test /upgrade-test -COPY --from=builder /usr/local /usr/local - -USER nonroot:nonroot +# Use a non-root user for security best practices +RUN adduser -D -g '' adduser +USER adduser +COPY --chown=adduser install/helm/agones /install/helm ENTRYPOINT ["/upgrade-test"] diff --git a/test/upgrade/Makefile b/test/upgrade/Makefile index 4b6bfe5a0a..e7c014412f 100644 --- a/test/upgrade/Makefile +++ b/test/upgrade/Makefile @@ -24,12 +24,11 @@ # REGISTRY ?= -mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST))) -project_path := $(dir $(mkfile_path)) -root_path = $(realpath $(project_path)/) 
-dev_version = 1.44.0-dev -server_tag := $(REGISTRY)/upgrade-test-controller:$(dev_version) - +base_version = 1.46.0 +# Version defaults to the short hash of the latest commit +VERSION ?= $(base_version)-dev-$(shell git rev-parse --short=7 HEAD) +server_tag := $(REGISTRY)/upgrade-test-controller:$(VERSION) +cwd:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) # _____ _ # |_ _|_ _ _ __ __ _ ___| |_ ___ # | |/ _` | '__/ _` |/ _ \ __/ __| @@ -37,9 +36,12 @@ server_tag := $(REGISTRY)/upgrade-test-controller:$(dev_version) # |_|\__,_|_| \__, |\___|\__|___/ # |___/ +# Using .ONESHELL allows us to `cd` to the parent directory agones. This gives the Dockerfile the +# context of the agones directory, which allows it to COPY files from any child directory. +.ONESHELL: # Build a docker image for the server, and tag it build: - cd $(root_path) && docker build -f $(project_path)Dockerfile --tag=$(server_tag) . + cd "$(cwd)/../.." && DOCKER_BUILDKIT=1 docker build -f $(cwd)/Dockerfile --tag=$(server_tag) . push: build docker push $(server_tag) diff --git a/test/upgrade/evictablePods.yaml b/test/upgrade/evictablePods.yaml new file mode 100644 index 0000000000..59a6765f4b --- /dev/null +++ b/test/upgrade/evictablePods.yaml @@ -0,0 +1,67 @@ +# Copyright 2024 Google LLC All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Create evictable pods to prevent Autopilot clusters from completely scaling down. 
+# https://cloud.google.com/kubernetes-engine/docs/how-to/capacity-provisioning +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: low-priority +value: -10 +preemptionPolicy: Never +globalDefault: false +description: "Low priority workloads" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: evictable-pods-deployment +spec: + replicas: 200 + selector: + matchLabels: + app: evictable-pods + template: + metadata: + labels: + app: evictable-pods + # Label for use with packed game server pod affinity rules + agones.dev/role: gameserver + spec: + priorityClassName: low-priority + terminationGracePeriodSeconds: 0 + containers: + - name: ubuntu + image: ubuntu + imagePullPolicy: IfNotPresent + command: ["sleep"] + args: ["infinity"] + resources: + requests: + memory: 52Mi + cpu: 30m + limits: + memory: 52Mi + cpu: 30m + # Use same affinity as packed game server pods + affinity: + podAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + agones.dev/role: gameserver + topologyKey: kubernetes.io/hostname + weight: 100 diff --git a/test/upgrade/gameserverTemplate.yaml b/test/upgrade/gameserverTemplate.yaml index 407ed218aa..f93c6088bc 100644 --- a/test/upgrade/gameserverTemplate.yaml +++ b/test/upgrade/gameserverTemplate.yaml @@ -51,16 +51,23 @@ data: metadata: labels: agonesVersion: {{ .AgonesVersion }} + app: sdk-client-test spec: containers: - name: sdk-client-test image: "{{ .Registry }}:{{ .AgonesVersion }}" imagePullPolicy: Always + env: + - name: SHUTDOWN_DELAY_SECONDS + value: "10" + - name: GRACEFUL_TERMINATION_DELAY_SECONDS + value: "10" resources: requests: - memory: 64Mi + memory: 52Mi cpu: 20m limits: - memory: 64Mi + memory: 52Mi cpu: 20m serviceAccountName: agones-sa + restartPolicy: Never diff --git a/test/upgrade/go.mod b/test/upgrade/go.mod deleted file mode 100644 index f7615ef1da..0000000000 --- a/test/upgrade/go.mod +++ /dev/null @@ -1,51 +0,0 @@ -module 
agones.dev/agones/test/upgrade/testContainer - -go 1.22 - -toolchain go1.22.6 - -require ( - k8s.io/apimachinery v0.31.0 - k8s.io/client-go v0.31.0 -) - -require ( - github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/emicklei/go-restful/v3 v3.11.0 // indirect - github.com/fxamacker/cbor/v2 v2.7.0 // indirect - github.com/go-logr/logr v1.4.2 // indirect - github.com/go-openapi/jsonpointer v0.19.6 // indirect - github.com/go-openapi/jsonreference v0.20.2 // indirect - github.com/go-openapi/swag v0.22.4 // indirect - github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang/protobuf v1.5.4 // indirect - github.com/google/gnostic-models v0.6.8 // indirect - github.com/google/go-cmp v0.6.0 // indirect - github.com/google/gofuzz v1.2.0 // indirect - github.com/google/uuid v1.6.0 // indirect - github.com/josharian/intern v1.0.0 // indirect - github.com/json-iterator/go v1.1.12 // indirect - github.com/mailru/easyjson v0.7.7 // indirect - github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect - github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/onsi/gomega v1.33.1 // indirect - github.com/x448/float16 v0.8.4 // indirect - golang.org/x/net v0.26.0 // indirect - golang.org/x/oauth2 v0.21.0 // indirect - golang.org/x/sys v0.21.0 // indirect - golang.org/x/term v0.21.0 // indirect - golang.org/x/text v0.16.0 // indirect - golang.org/x/time v0.3.0 // indirect - google.golang.org/protobuf v1.34.2 // indirect - gopkg.in/inf.v0 v0.9.1 // indirect - gopkg.in/yaml.v2 v2.4.0 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/api v0.31.0 // indirect - k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect - k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 // indirect - sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect - sigs.k8s.io/structured-merge-diff/v4 
v4.4.1 // indirect - sigs.k8s.io/yaml v1.4.0 // indirect -) diff --git a/test/upgrade/main.go b/test/upgrade/main.go index 6ce62924ae..17104a1eff 100644 --- a/test/upgrade/main.go +++ b/test/upgrade/main.go @@ -28,8 +28,12 @@ import ( "strings" "time" + agonesv1 "agones.dev/agones/pkg/apis/agones/v1" + "agones.dev/agones/pkg/client/clientset/versioned" + "agones.dev/agones/pkg/client/informers/externalversions" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" @@ -48,17 +52,24 @@ const ( SidecarPullPolicy = "true" // LogLevel sets the Agones Helm configuration log level LogLevel = "debug" + // Timeout sets the amount of time to wait for resources to become ready. Should be more than the + // time for an Autopilot cluster to scale up. + Timeout = 10 * time.Minute // HelmChart is the helm chart for the public Agones releases HelmChart = "agones/agones" + // TestChart is the registry for Agones Helm chart development builds + TestChart = "./install/helm" // AgonesRegistry is the public registry for Agones releases AgonesRegistry = "us-docker.pkg.dev/agones-images/release" - // TestRegistry is the public registry for upgrade test container files - TestRegistry = "us-docker.pkg.dev/agones-images/ci/sdk-client-test" + // TestRegistry is the registry for Agones development builds + TestRegistry = "us-docker.pkg.dev/agones-images/ci" + // ContainerRegistry is the registry for upgrade test container files + ContainerRegistry = "us-docker.pkg.dev/agones-images/ci/sdk-client-test" ) var ( - // Dev is the current development version of Agones - Dev = os.Getenv("Dev") + // DevVersion is the current development version of Agones + DevVersion = os.Getenv("DevVersion") // ReleaseVersion is the latest released version of Agones (DEV - 1). 
ReleaseVersion = os.Getenv("ReleaseVersion") // PodName the name of the pod this container is running in @@ -81,8 +92,14 @@ func main() { log.Fatal("Could not create the kubernetes api clientset", err) } + agonesClient, err := versioned.NewForConfig(cfg) + if err != nil { + log.Fatal("Could not create the agones api clientset") + } + validConfigs := configTestSetup(ctx, kubeClient) - go watchGameServerPods(kubeClient, make(chan struct{}), make(map[string]podLog), len(validConfigs)*2) + go watchGameServers(agonesClient, len(validConfigs)*2) + go watchGameServerEvents(kubeClient) addAgonesRepo() runConfigWalker(ctx, validConfigs) cleanUpResources() @@ -111,9 +128,10 @@ type gameServerTemplate struct { CountsAndLists bool } -type podLog struct { +type gsLog struct { SdkVersion string GameServerVersion string + GameServerState string } type helmStatuses []struct { @@ -136,7 +154,7 @@ func configTestSetup(ctx context.Context, kubeClient *kubernetes.Clientset) []*c // Get the mappings of valid Kubernetes, Agones, and Feature Gate versions from the configmap. err := json.Unmarshal([]byte(VersionMappings), &versionMap) if err != nil { - log.Fatal("Could not Unmarshal", err) + log.Fatal("Could not Unmarshal ", err) } // Find valid Agones versions and feature gates for the current version of Kubernetes. @@ -148,7 +166,7 @@ func configTestSetup(ctx context.Context, kubeClient *kubernetes.Clientset) []*c countsAndLists := containsCountsAndLists(agonesVersion) ct.agonesVersion = agonesVersion if agonesVersion == "Dev" { - ct.agonesVersion = Dev + ct.agonesVersion = DevVersion // Game server container cannot be created at DEV version due to go.mod only able to access // published Agones versions. Use N-1 for DEV. 
ct.gameServerPath = createGameServerFile(ReleaseVersion, countsAndLists) @@ -285,19 +303,19 @@ func runConfigWalker(ctx context.Context, validConfigs []*configTest) { for _, config := range validConfigs { registry := AgonesRegistry chart := HelmChart - if config.agonesVersion == Dev { - // TODO: Update to templated value for registry and chart for Dev build - continue + if config.agonesVersion == DevVersion { + registry = TestRegistry + chart = TestChart } err := installAgonesRelease(config.agonesVersion, registry, config.featureGates, ImagePullPolicy, SidecarPullPolicy, LogLevel, chart) if err != nil { - log.Printf("installAgonesRelease err: %s", err) + log.Fatalf("installAgonesRelease err: %s", err) } // Wait for the helm release to install. Waits the same amount of time as the Helm timeout. var helmStatus string - err = wait.PollUntilContextTimeout(ctx, 10*time.Second, 10*time.Minute, true, func(ctx context.Context) (done bool, err error) { + err = wait.PollUntilContextTimeout(ctx, 10*time.Second, Timeout, true, func(_ context.Context) (done bool, err error) { helmStatus = checkHelmStatus(config.agonesVersion) if helmStatus == "deployed" { return true, nil @@ -309,7 +327,11 @@ func runConfigWalker(ctx context.Context, validConfigs []*configTest) { config.agonesVersion, helmStatus) } - go createGameServers(cancelCtx, config.gameServerPath) + gsReady := make(chan bool) + go createGameServers(cancelCtx, config.gameServerPath, gsReady) + // Wait for the first game server pod created to become ready + <-gsReady + close(gsReady) // Allow some soak time at the Agones version before next upgrade time.Sleep(1 * time.Minute) } @@ -332,6 +354,12 @@ func checkHelmStatus(agonesVersion string) string { log.Fatal("Could not Unmarshal", err) } + // Remove the commit sha from the DevVersion i.e. 
from 1.46.0-dev-7168dd3 to 1.46.0-dev + if agonesVersion == DevVersion { + r := regexp.MustCompile(`1\.\d+\.\d+-dev`) + agonesVersion = r.FindString(DevVersion) + } + for _, status := range helmStatus { if status.AppVersion == agonesVersion { return status.Status @@ -342,8 +370,9 @@ func checkHelmStatus(agonesVersion string) string { // Creates a gameserver yaml file from the mounted gameserver.yaml template. The name of the new // gameserver yaml is based on the Agones version, i.e. gs1440.yaml for Agones version 1.44.0 +// Note: This does not validate the created file. func createGameServerFile(agonesVersion string, countsAndLists bool) string { - gsTmpl := gameServerTemplate{Registry: TestRegistry, AgonesVersion: agonesVersion, CountsAndLists: countsAndLists} + gsTmpl := gameServerTemplate{Registry: ContainerRegistry, AgonesVersion: agonesVersion, CountsAndLists: countsAndLists} gsTemplate, err := template.ParseFiles("gameserver.yaml") if err != nil { @@ -377,12 +406,16 @@ func createGameServerFile(agonesVersion string, countsAndLists bool) string { } // Create a game server every five seconds until the context is cancelled. The game server container -// be the same binary version as the game server file. The SDK version is always the same as the +// is the same binary version as the game server file. The SDK version is always the same as the // version of the Agones controller that created it. The Game Server shuts itself down after the // tests have run as part of the `sdk-client-test` logic. -func createGameServers(ctx context.Context, gsPath string) { +func createGameServers(ctx context.Context, gsPath string, gsReady chan bool) { args := []string{"create", "-f", gsPath} + checkFirstGameServerReady(ctx, gsReady, args...) + ticker := time.NewTicker(5 * time.Second) + retries := 8 + retry := 0 for { select { @@ -391,39 +424,82 @@ func createGameServers(ctx context.Context, gsPath string) { return case <-ticker.C: _, err := runExecCommand(KubectlCmd, args...) 
- // TODO: Do not ignore error if unable to create due to something other than cluster scale up + // Ignore failures for ~45s at a time to account for the brief (~30s) window during which the + // controller service is unavailable during upgrade. if err != nil { - log.Printf("Could not create Gameserver %s: %s", gsPath, err) + if retry > retries { + log.Fatalf("Could not create Gameserver %s: %s. Too many successive errors.", gsPath, err) + } + log.Printf("Could not create Gameserver %s: %s. Retries left: %d.", gsPath, err, retries-retry) + retry++ + } else { + retry = 0 } } } } -// watchGameServerPods watches all game server pods for CrashLoopBackOff. Errors if the number of -// CrashLoopBackOff backoff pods exceeds the number of acceptedFailures. -func watchGameServerPods(kubeClient *kubernetes.Clientset, stopCh chan struct{}, failedPods map[string]podLog, acceptedFailures int) { - // Filter by label agones.dev/role=gameserver to only game server pods - labelOptions := informers.WithTweakListOptions(func(opts *metav1.ListOptions) { - opts.LabelSelector = "agones.dev/role=gameserver" +// checkFirstGameServerReady waits for the Game Server Pod to be running. This may take several +// minutes in Autopilot. +func checkFirstGameServerReady(ctx context.Context, gsReady chan bool, args ...string) { + // Sample output: gameserver.agones.dev/sdk-client-test-5zjdn created + output, err := runExecCommand(KubectlCmd, args...) + if err != nil { + log.Fatalf("Could not create Gameserver: %s", err) + } + r := regexp.MustCompile(`sdk-client-test-\S+`) + gsName := r.FindString(string(output)) + // Game Server has too many states, so using the pod instead as there are only two healthy states. + // Includes the gs name to make output logs easier to read. + getPodStatus := []string{"get", "pod", gsName, "-o=custom-columns=:.status.phase,:.metadata.name", "--no-headers"} + + // Pod is created after Game Server, wait briefly before erroring out on unable to get pod.
+ retries := 0 + err = wait.PollUntilContextTimeout(ctx, 2*time.Second, Timeout, true, func(_ context.Context) (done bool, err error) { + out, err := runExecCommand(KubectlCmd, getPodStatus...) + if err != nil && retries > 2 { + log.Fatalf("Could not get Gameserver %s state: %s", gsName, err) + } + if err != nil { + retries++ + return false, nil + } + // Sample output: Running sdk-client-test-bbvx9 + podStatus := strings.Split(string(out), " ") + if podStatus[0] == "Running" || podStatus[0] == "Succeeded" { + gsReady <- true + return true, nil + } + return false, nil }) - kubeInformerFactory := informers.NewSharedInformerFactoryWithOptions(kubeClient, 5*time.Second, - informers.WithNamespace("default"), labelOptions) - podInformer := kubeInformerFactory.Core().V1().Pods().Informer() + if err != nil { + log.Fatalf("PollUntilContextTimeout timed out while wait for first gameserver %s to be Ready", gsName) + } +} - _, err := podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ +// watchGameServers watches all game servers for errors. Errors if the number of failed game servers +// exceeds the number of acceptedFailures. 
+func watchGameServers(agonesClient *versioned.Clientset, acceptedFailures int) { + stopCh := make(chan struct{}) + failedGs := make(map[string]gsLog) + + agonesInformerFactory := externalversions.NewSharedInformerFactory(agonesClient, 5*time.Second) + gsInformer := agonesInformerFactory.Agones().V1().GameServers().Informer() + + _, err := gsInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ UpdateFunc: func(_, newObj interface{}) { - newPod := newObj.(*v1.Pod) - for _, cs := range newPod.Status.ContainerStatuses { - if cs.Name != "sdk-client-test" || cs.State.Waiting == nil || cs.State.Waiting.Reason != "CrashLoopBackOff" { - continue - } - gsVersion := newPod.Labels["agonesVersion"] - sdkVersion := newPod.Annotations["agones.dev/sdk-version"] - log.Printf("%s for pod: %s with game server binary version %s, and SDK version %s", cs.State.Waiting.Reason, newPod.Name, gsVersion, sdkVersion) - // Put failed pods into the map until it reaches capacity. - failedPods[newPod.Name] = podLog{GameServerVersion: gsVersion, SdkVersion: sdkVersion} - if len(failedPods) > acceptedFailures { - log.Fatalf("Too many Game Server pods in CrashLoopBackOff: %v", failedPods) + newGs := newObj.(*agonesv1.GameServer) + if newGs.Status.State == "Error" || newGs.Status.State == "Unhealthy" { + gsVersion := newGs.Labels["agonesVersion"] + sdkVersion := newGs.Annotations["agones.dev/sdk-version"] + log.Printf("Game server %s with binary version %s, and SDK version %s in %s state\n", + newGs.Name, gsVersion, sdkVersion, newGs.Status.State) + + // Put failed game servers into the map until it reaches capacity. 
+ failedGs[newGs.Name] = gsLog{GameServerVersion: gsVersion, SdkVersion: sdkVersion, + GameServerState: string(newGs.Status.State)} + if len(failedGs) > acceptedFailures { + log.Fatalf("Too many Game Servers in Error or Unhealthy states: %v", failedGs) } } }, @@ -432,9 +508,51 @@ func watchGameServerPods(kubeClient *kubernetes.Clientset, stopCh chan struct{}, log.Fatal("Not able to create AddEventHandler", err) } - go podInformer.Run(stopCh) - if !cache.WaitForCacheSync(stopCh, podInformer.HasSynced) { - log.Fatal("Timed out waiting for caches to sync") + go gsInformer.Run(stopCh) + if !cache.WaitForCacheSync(stopCh, gsInformer.HasSynced) { + log.Fatal("Timed out waiting for game server informer cache to sync") + } +} + +// watchGameServerEvents watches all events on `sdk-client-test` containers for BackOff errors. The +// purpose is to catch ImagePullBackOff errors. +func watchGameServerEvents(kubeClient *kubernetes.Clientset) { + stopCh := make(chan struct{}) + + // Filter by Game Server `sdk-client-test` containers + containerName := "sdk-client-test" + containerPath := "spec.containers{sdk-client-test}" + fieldSelector := fields.OneTermEqualSelector("involvedObject.fieldPath", containerPath).String() + // First delete previous `sdk-client-test` events, otherwise there will be events from previous runs. + _, err := runExecCommand(KubectlCmd, []string{"delete", "events", "--field-selector", fieldSelector}...) 
+ if err != nil { + log.Fatal("Could not delete `sdk-client-test` events", err) + } + + eventOptions := informers.WithTweakListOptions(func(opts *metav1.ListOptions) { + opts.FieldSelector = fieldSelector + }) + kubeInformerFactory := informers.NewSharedInformerFactoryWithOptions(kubeClient, 5*time.Second, + informers.WithNamespace("default"), eventOptions) + eventInformer := kubeInformerFactory.Core().V1().Events().Informer() + + _, err = eventInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + newEvent := obj.(*v1.Event) + gsPodName := newEvent.InvolvedObject.Name + if newEvent.Reason == "Failed" { + log.Fatalf("%s on %s %s has failed. Latest event: message %s", containerName, newEvent.Kind, + gsPodName, newEvent.Message) + } + }, + }) + if err != nil { + log.Fatal("Not able to create AddEventHandler", err) + } + + go eventInformer.Run(stopCh) + if !cache.WaitForCacheSync(stopCh, eventInformer.HasSynced) { + log.Fatal("Timed out waiting for eventInformer cache to sync") } } @@ -455,7 +573,7 @@ func cleanUpResources() { // Apiservice v1.allocation.agones.dev, which is part of Service agones-system/agones-controller-service, // does not always get cleaned up on Helm uninstall, and needs to be deleted (if it exists) before // the agones-system namespace can be removed. - // Ignore the error, because an "error" means Helm already uninstall the apiservice. + // Ignore the error, because an "error" means Helm already uninstalled the apiservice. args = []string{"delete", "apiservice", "v1.allocation.agones.dev"} out, err := runExecCommand(KubectlCmd, args...) 
if err == nil { diff --git a/test/upgrade/permissions.yaml b/test/upgrade/permissions.yaml index 1f4a96005b..54e0d57215 100644 --- a/test/upgrade/permissions.yaml +++ b/test/upgrade/permissions.yaml @@ -24,18 +24,18 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: namespace: default - name: pod-reader + name: pod-manager rules: - apiGroups: [""] # "" indicates the core API group - resources: ["pods"] - verbs: ["get", "watch", "list"] + resources: ["pods", "events"] + verbs: ["get", "delete", "list", "watch"] --- apiVersion: rbac.authorization.k8s.io/v1 # This role binding allows default service account to read all pods in the "default" namespace. # You need to already have a Role named "pod-reader" in that namespace. kind: RoleBinding metadata: - name: read-pods + name: manage-pods namespace: default subjects: - kind: ServiceAccount @@ -44,7 +44,7 @@ subjects: roleRef: # "roleRef" specifies the binding to a Role / ClusterRole kind: Role # this must be Role or ClusterRole - name: pod-reader # this must match the name of the Role or ClusterRole you wish to bind to + name: pod-manager # this must match the name of the Role or ClusterRole you wish to bind to apiGroup: rbac.authorization.k8s.io --- kind: ClusterRole @@ -202,23 +202,23 @@ roleRef: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: apiservices-creator + name: apiservices-manager rules: - apiGroups: ["apiregistration.k8s.io"] resources: ["apiservices"] - verbs: ["get", "watch", "list", "create", "patch"] + verbs: ["create", "delete", "get", "list", "patch", "watch"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: create-apiservices + name: manage-apiservices subjects: - kind: ServiceAccount name: agones-sa namespace: default roleRef: kind: ClusterRole - name: apiservices-creator + name: apiservices-manager apiGroup: rbac.authorization.k8s.io --- # Agones needs to be able to create Agones CustomResourceDefinitions @@ -249,23 
+249,23 @@ roleRef: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: clusterrole-creator + name: clusterrole-manager rules: - apiGroups: ["rbac.authorization.k8s.io"] resources: ["clusterroles", "clusterrolebindings", "rolebindings"] - verbs: ["get", "watch", "list", "create", "patch"] + verbs: ["create", "delete", "get", "list", "patch", "watch"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: create-clusterroles + name: manager-clusterroles subjects: - kind: ServiceAccount name: agones-sa namespace: default roleRef: kind: ClusterRole - name: clusterrole-creator + name: clusterrole-manager apiGroup: rbac.authorization.k8s.io --- # Agones needs to be able to create deployments @@ -498,3 +498,41 @@ roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: sdk +--- +# Source: agones/templates/hooks/sa.yaml +# Permissions to grant to helm on helm uninstall +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + namespace: agones-system + name: helm-cleanup + labels: + app: agones +rules: + - apiGroups: ["agones.dev", "multicluster.agones.dev", "autoscaling.agones.dev"] + resources: ["fleets", "fleetautoscalers", "gameservers", "gameserversets", "gameserverallocationpolicies"] + verbs: ["delete", "get", "list"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["create", "delete", "get", "list"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["create", "delete", "get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: helm-cleanup-access + labels: + app: agones +subjects: + - kind: ServiceAccount + name: agones-sa + namespace: default +roleRef: + kind: ClusterRole + name: helm-cleanup + apiGroup: rbac.authorization.k8s.io diff --git a/test/upgrade/upgradeTest.yaml b/test/upgrade/upgradeTest.yaml index ebb301953c..4b549500aa 100644 --- 
a/test/upgrade/upgradeTest.yaml +++ b/test/upgrade/upgradeTest.yaml @@ -26,8 +26,7 @@ spec: spec: containers: - name: upgrade-test-controller - # TODO: Update image name to use a templated value for current Dev version - image: us-docker.pkg.dev/agones-images/ci/upgrade-test-controller:1.44.0-dev + image: us-docker.pkg.dev/agones-images/ci/upgrade-test-controller:${DevVersion} imagePullPolicy: Always env: - name: PodName diff --git a/test/upgrade/versionMap.yaml b/test/upgrade/versionMap.yaml index b0a7499de0..7c8a1724cd 100644 --- a/test/upgrade/versionMap.yaml +++ b/test/upgrade/versionMap.yaml @@ -18,82 +18,33 @@ kind: ConfigMap metadata: name: version-map data: - Dev: "1.44.0-dev" - ReleaseVersion: "1.43.0" + DevVersion: ${DevVersion} + ReleaseVersion: "1.45.0" version-mappings.json: | { "k8sToAgonesVersions": { - "1.25": [ - "1.34.0", - "1.35.0" - ], - "1.26": [ - "1.34.0", - "1.35.0", - "1.36.0", - "1.37.0", - "1.38.0", - "1.39.0" - ], - "1.27": [ - "1.34.0", - "1.35.0", - "1.36.0", - "1.37.0", - "1.38.0", - "1.39.0", - "1.40.0", - "1.41.0", - "1.42.0" - ], - "1.28": [ - "1.36.0", - "1.37.0", - "1.38.0", - "1.39.0", - "1.40.0", - "1.41.0", - "1.42.0", - "1.43.0", - "Dev" - ], "1.29": [ "1.40.0", "1.41.0", "1.42.0", "1.43.0", + "1.44.0", + "1.45.0", "Dev" ], "1.30": [ "1.43.0", + "1.44.0", + "1.45.0", + "Dev" + ], + "1.31": [ + "1.44.0", + "1.45.0", "Dev" ] }, "agonesVersionFeatureGates": { - "1.34.0": { - "alphaGates": ["PlayerAllocationFilter", "PlayerTracking"], - "betaGates": [] - }, - "1.35.0": { - "alphaGates": ["PlayerAllocationFilter", "PlayerTracking"], - "betaGates": [] - }, - "1.36.0": { - "alphaGates": ["PlayerAllocationFilter", "PlayerTracking"], - "betaGates": [] - }, - "1.37.0": { - "alphaGates": ["CountsAndLists", "DisableResyncOnSDKServer", "GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking"], - "betaGates": [] - }, - "1.38.0": { - "alphaGates": ["CountsAndLists", "DisableResyncOnSDKServer", 
"GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking"], - "betaGates": [] - }, - "1.39.0": { - "alphaGates": ["CountsAndLists", "DisableResyncOnSDKServer", "GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking"], - "betaGates": [] - }, "1.40.0": { "alphaGates": ["CountsAndLists", "GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking"], "betaGates": ["DisableResyncOnSDKServer"] @@ -110,9 +61,17 @@ data: "alphaGates": ["GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking", "PortPolicyNone", "PortRanges", "RollingUpdateFix"], "betaGates": ["AutopilotPassthroughPort", "CountsAndLists", "DisableResyncOnSDKServer"] }, + "1.44.0": { + "alphaGates": ["PlayerAllocationFilter", "PlayerTracking", "PortPolicyNone", "PortRanges", "RollingUpdateFix", "ScheduledAutoscaler"], + "betaGates": ["AutopilotPassthroughPort", "CountsAndLists", "DisableResyncOnSDKServer", "GKEAutopilotExtendedDurationPods"] + }, + "1.45.0": { + "alphaGates": ["PlayerAllocationFilter", "PlayerTracking", "PortPolicyNone", "PortRanges", "RollingUpdateFix", "ScheduledAutoscaler"], + "betaGates": ["AutopilotPassthroughPort", "CountsAndLists", "DisableResyncOnSDKServer", "GKEAutopilotExtendedDurationPods"] + }, "Dev": { - "alphaGates": ["GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking", "PortPolicyNone", "PortRanges", "RollingUpdateFix", "ScheduledAutoscaler"], - "betaGates": ["AutopilotPassthroughPort", "CountsAndLists", "DisableResyncOnSDKServer"] + "alphaGates": ["PlayerAllocationFilter", "PlayerTracking", "PortPolicyNone", "PortRanges", "RollingUpdateFix", "ScheduledAutoscaler"], + "betaGates": ["AutopilotPassthroughPort", "CountsAndLists", "DisableResyncOnSDKServer", "GKEAutopilotExtendedDurationPods"] } } }