Skip to content
Draft
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 15 additions & 15 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,13 @@ test-yelpy: .paasta/bin/activate
test-not-yelpy: .paasta/bin/activate
.paasta/bin/tox -e tests

quick-test: .tox/py38-linux
TZ=UTC .tox/py38-linux/bin/py.test --failed-first -x --disable-warnings -- tests
quick-test: .tox/py310-linux
TZ=UTC .tox/py310-linux/bin/py.test --failed-first -x --disable-warnings -- tests

.tox/py38-linux: .paasta/bin/activate
.tox/py310-linux: .paasta/bin/activate
.paasta/bin/tox

dev-api: .tox/py38-linux
dev-api: .tox/py310-linux
.paasta/bin/tox -e dev-api

.paasta/bin/activate: requirements.txt requirements-dev.txt
Expand Down Expand Up @@ -107,7 +107,7 @@ k8s_itests: .paasta/bin/activate
make -C k8s_itests all

.PHONY: k8s_fake_cluster
k8s_fake_cluster: .tox/py38-linux
k8s_fake_cluster: .tox/py310-linux
make -C k8s_itests .fake_cluster

.PHONY: k8s_clean
Expand Down Expand Up @@ -138,44 +138,44 @@ swagger-validate:
-i paasta_tools/api/api_docs/swagger.json

.PHONY: vscode_settings
vscode_settings: .paasta/bin/activate .tox/py38-linux
vscode_settings: .paasta/bin/activate .tox/py310-linux
.paasta/bin/python paasta_tools/contrib/ide_helper.py

etc_paasta_playground soa_config_playground: .paasta/bin/activate .tox/py38-linux
.tox/py38-linux/bin/python paasta_tools/contrib/create_paasta_playground.py
etc_paasta_playground soa_config_playground: .paasta/bin/activate .tox/py310-linux
.tox/py310-linux/bin/python paasta_tools/contrib/create_paasta_playground.py

.PHONY: generate_deployments_for_service
generate_deployments_for_service: | soa_config_playground .tox/py38-linux
generate_deployments_for_service: | soa_config_playground .tox/py310-linux
export KUBECONFIG=./k8s_itests/kubeconfig;\
export PAASTA_SYSTEM_CONFIG_DIR=./etc_paasta_playground/;\
export PAASTA_TEST_CLUSTER=kind-${USER}-k8s-test;\
.tox/py38-linux/bin/python -m paasta_tools.cli.cli list -a -y ./soa_config_playground | shuf | xargs -n 1 --no-run-if-empty \
.tox/py38-linux/bin/python -m paasta_tools.generate_deployments_for_service -d ./soa_config_playground -v -s
.tox/py310-linux/bin/python -m paasta_tools.cli.cli list -a -y ./soa_config_playground | shuf | xargs -n 1 --no-run-if-empty \
.tox/py310-linux/bin/python -m paasta_tools.generate_deployments_for_service -d ./soa_config_playground -v -s

.PHONY: playground-api
playground-api: .tox/py38-linux | soa_config_playground
playground-api: .tox/py310-linux | soa_config_playground
.paasta/bin/tox -e playground-api

.PHONY: setup-kubernetes-job
setup-kubernetes-job: k8s_fake_cluster generate_deployments_for_service
export KUBECONFIG=./k8s_itests/kubeconfig;\
export PAASTA_SYSTEM_CONFIG_DIR=./etc_paasta_playground/;\
export PAASTA_TEST_CLUSTER=kind-${USER}-k8s-test;\
.tox/py38-linux/bin/python -m paasta_tools.list_kubernetes_service_instances -d ./soa_config_playground --shuffle --group-lines 1 | xargs --no-run-if-empty .tox/py38-linux/bin/python -m paasta_tools.setup_kubernetes_job -d ./soa_config_playground -c kind-${USER}-k8s-test
.tox/py310-linux/bin/python -m paasta_tools.list_kubernetes_service_instances -d ./soa_config_playground --shuffle --group-lines 1 | xargs --no-run-if-empty .tox/py310-linux/bin/python -m paasta_tools.setup_kubernetes_job -d ./soa_config_playground -c kind-${USER}-k8s-test

.PHONY: cleanup-kubernetes-jobs
cleanup-kubernetes-jobs:
export KUBECONFIG=./k8s_itests/kubeconfig;\
export PAASTA_SYSTEM_CONFIG_DIR=./etc_paasta_playground/;\
export PAASTA_TEST_CLUSTER=kind-${USER}-k8s-test;\
.tox/py38-linux/bin/python -m paasta_tools.cleanup_kubernetes_jobs -d ./soa_config_playground -c kind-${USER}-k8s-test --force
.tox/py310-linux/bin/python -m paasta_tools.cleanup_kubernetes_jobs -d ./soa_config_playground -c kind-${USER}-k8s-test --force

.PHONY: paasta-secrets-sync
paasta-secrets-sync: setup-kubernetes-job .vault-token
export KUBECONFIG=./k8s_itests/kubeconfig;\
export PAASTA_SYSTEM_CONFIG_DIR=./etc_paasta_playground/;\
export PAASTA_TEST_CLUSTER=kind-${USER}-k8s-test;\
{ .tox/py38-linux/bin/python -m paasta_tools.list_kubernetes_service_instances -d ./soa_config_playground ; echo -n \ _shared; } | cut -f1 -d"." | uniq | shuf | xargs .tox/py38-linux/bin/python -m paasta_tools.kubernetes.bin.paasta_secrets_sync -v -d ./soa_config_playground -t ./.vault-token
{ .tox/py310-linux/bin/python -m paasta_tools.list_kubernetes_service_instances -d ./soa_config_playground ; echo -n \ _shared; } | cut -f1 -d"." | uniq | shuf | xargs .tox/py310-linux/bin/python -m paasta_tools.kubernetes.bin.paasta_secrets_sync -v -d ./soa_config_playground -t ./.vault-token

define ANNOUNCE_CRONS_BODY
The following PaaSTA cron jobs will run on an infinite loop using the PaaSTA Playground k8s cluster:
Expand Down
11 changes: 11 additions & 0 deletions docs/source/yelpsoa_configs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1066,6 +1066,17 @@ Here is a list of options that PaaSTA will pass through:
This alert sends an email to ``notification_email`` and posts notifications
to ``irc_channels``. It does not page. Defaults to **true**.

* ``check_overrides``: Check-specific overrides. This is a dictionary that maps
a check name to the overrides for that check, where each set of overrides is a
dictionary that accepts the same keys as monitoring.yaml.

For example: ::

# monitoring.yaml
team: myteam # For most alerts, myteam will be notified.
check_overrides:
check_autoscaler_max_instances:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

might be nice to have a central location with all the possible checks somewhere

team: otherteam # For the check_autoscaler_max_instances alert, otherteam will be notified.

Monitoring Examples
^^^^^^^^^^^^^^^^^^^
Expand Down
2 changes: 1 addition & 1 deletion k8s_itests/scripts/set-paasta-registry-credentials.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ CLUSTER=$1
for node in $(./kind get nodes --name "${CLUSTER}"); do
echo "Moving credentials to kind node: $node ..."
docker cp "${node}:/etc/containerd/config.toml" "./.tmp/${node}-containerd.toml"
../.tox/py38-linux/bin/python ./scripts/containerd_registry_setup.py "./.tmp/${node}-containerd.toml"
../.tox/py310-linux/bin/python ./scripts/containerd_registry_setup.py "./.tmp/${node}-containerd.toml"
docker cp "./.tmp/${node}-containerd.toml" "${node}:/etc/containerd/config.toml"
rm ./.tmp/${node}-containerd.toml
# restart kubelet and containerd to pick up the updated config
Expand Down
59 changes: 45 additions & 14 deletions paasta_tools/check_autoscaler_max_instances.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,11 @@ async def check_max_instances(
instance_type_class: Type[KubernetesDeploymentConfig],
system_paasta_config: SystemPaastaConfig,
dry_run: bool = False,
):
) -> None:
page_default = (
system_paasta_config.get_check_autoscaler_max_instances_page_default()
)

kube_client = KubeClient()
for service in list_services(soa_dir=soa_dir):
service_config = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir)
Expand Down Expand Up @@ -86,6 +90,10 @@ async def check_max_instances(
)
continue

max_instances_suggestion = ""
alert_threshold_suggestion = ""
disable_paging_suggestion = ""
Comment on lines +93 to +95
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it'd be nice to refactor things to not need to ""-init these, but it's not a big deal imo


if (
autoscaling_status["min_instances"]
== autoscaling_status["max_instances"]
Expand All @@ -99,6 +107,11 @@ async def check_max_instances(
autoscaling_status["desired_replicas"]
>= autoscaling_status["max_instances"]
):
max_instances_suggestion = f"\nYou may want to bump max_instances to at least {autoscaling_status['desired_replicas']}."
alert_threshold_suggestion = (
"\nTo make this alert quieter, adjust"
" autoscaling.metrics_providers[n].max_instances_alert_threshold in yelpsoa-configs."
)

metrics_provider_configs = job_config.get_autoscaling_params()[
"metrics_providers"
Expand Down Expand Up @@ -164,20 +177,38 @@ async def check_max_instances(
output = f"{service}.{instance} is below max_instances."

monitoring_overrides = job_config.get_monitoring()
monitoring_overrides.update(
{
"page": False, # TODO: remove this line once this alert has been deployed for a little while.
"runbook": "y/check-autoscaler-max-instances",
"realert_every": 60, # The check runs once a minute, so this would realert every hour.
"tip": (
"The autoscaler wants to scale up to handle additional load"
" because your service is overloaded, but cannot scale any"
" higher because of max_instances. You may want to bump"
" max_instances. To make this alert quieter, adjust"
" autoscaling.metrics_providers[n].max_instances_alert_threshold in yelpsoa-configs."
),
}

# Build a copy-pasteable YAML snippet showing how to toggle paging for this
# specific check. Each YAML line must be its own list element: adjacent string
# literals without a comma are implicitly concatenated, which would fuse two
# YAML lines into one in the rendered tip.
# NOTE(review): the indentation inside the literals is chosen to form valid
# nested YAML; confirm it matches the intended rendering.
disable_paging_suggestion = "\n".join(
    [
        "",
        "To disable/enable paging, set",
        "",
        f"{instance}:",
        "  ...",
        "  monitoring:",
        "    check_overrides:",
        f"      check_autoscaler_max_instances.{service}.{instance}:",
        "        page: false  # or true to enable",
        "",
        f"in eks-{cluster}.yaml.",
    ]
)

monitoring_defaults = {
"page": page_default, # This will be re-overridden in send_event if the user has specified it in check_overrides.
"runbook": "y/check-autoscaler-max-instances",
"realert_every": 60, # The check runs once a minute, so this would realert every hour.
"tip": (
"The autoscaler wants to scale up to handle additional load"
" because your service is overloaded, but cannot scale any"
" higher because of max_instances."
f"{max_instances_suggestion}"
f"{alert_threshold_suggestion}"
f"{disable_paging_suggestion}"
),
}
monitoring_overrides = (
monitoring_defaults | monitoring_overrides
) # combine, with preference to overrides
send_event(
service,
check_name=f"check_autoscaler_max_instances.{service}.{instance}",
Expand Down
25 changes: 25 additions & 0 deletions paasta_tools/cli/schemas/kubernetes_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,31 @@
"minimum": 0
}
]
},
"check_overrides": {
    "type": "object",
    "$comment": "Maps a check name to the monitoring overrides that apply only to that check.",
    "additionalProperties": {
        "type": "object",
        "properties": {
            "team": {
                "type": "string"
            },
            "page": {
                "type": "boolean"
            },
            "alert_after": {
                "oneOf": [
                    {
                        "type": "string",
                        "pattern": "^[1-9]+[0-9]*[YMWDhms]$",
                        "$comment": "See https://pysensu-yelp.readthedocs.io/en/latest/#pysensu_yelp.human_to_seconds"
                    },
                    {
                        "type": "integer",
                        "minimum": 0
                    }
                ]
            }
        },
        "additionalProperties": true
    }
}
},
"additionalProperties": true
Expand Down
34 changes: 34 additions & 0 deletions paasta_tools/monitoring_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,37 @@ def _load_sensu_team_data():
return team_data


def get_check_specific_overrides(overrides: dict, check_name: str) -> dict:
    """Flatten check-specific monitoring overrides into one monitoring dict.

    Given a monitoring dict like:
    {
        "team": "myteam",
        "check_every": "1m",
        "check_overrides": {
            "check_autoscaler_max_instances": {
                "team": "otherteam",
            },
        },
    }

    and a check_name of "check_autoscaler_max_instances", this function will
    return

    {
        "team": "otherteam",
        "check_every": "1m",
    }

    This allows you to override settings for specific checks, rather than for
    all checks for a service.

    :param overrides: The monitoring dict, optionally containing a
        ``check_overrides`` mapping of check name to per-check settings
    :param check_name: The name of the check whose specific settings should
        take precedence over the top-level ones
    :returns: A new (shallow-copied) dict with the check-specific settings
        applied and the ``check_overrides`` key removed; ``overrides`` itself
        is not modified
    """
    # `or {}` (rather than a .get() default) also covers a key that is present
    # but null, e.g. an empty `check_overrides:` entry in YAML.
    check_overrides = overrides.get("check_overrides") or {}
    check_specific_overrides = check_overrides.get(check_name) or {}
    combined = overrides.copy()
    combined.update(check_specific_overrides)
    # Drop the nested mapping last so it never leaks into the result.
    combined.pop("check_overrides", None)
    return combined


def send_event(
service,
check_name,
Expand All @@ -228,6 +259,9 @@ def send_event(
:param system_paasta_config: A SystemPaastaConfig object representing the system
:param dry_run: Print the Sensu event instead of emitting it
"""

overrides = get_check_specific_overrides(overrides, check_name)

# This function assumes the input is a string like "mumble.main"
team = get_team(overrides, service, soa_dir)
if not team:
Expand Down
6 changes: 6 additions & 0 deletions paasta_tools/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1960,6 +1960,7 @@ class SystemPaastaConfigDict(TypedDict, total=False):
auto_config_instance_types_enabled: Dict[str, bool]
auto_config_instance_type_aliases: Dict[str, str]
auto_hostname_unique_size: int
check_autoscaler_max_instances_page_default: bool
cluster_fqdn_format: str
clusters: Sequence[str]
cluster: str
Expand Down Expand Up @@ -2847,6 +2848,11 @@ def get_ecosystem_for_cluster(self, cluster: str) -> Optional[str]:
# NOTE: this should never happen unless we've gotten bad data
return None

def get_check_autoscaler_max_instances_page_default(self) -> bool:
    """Return the configured default for whether check_autoscaler_max_instances pages.

    :returns: The ``check_autoscaler_max_instances_page_default`` value from
        the config dict, or ``False`` when the key is not set
    """
    try:
        return self.config_dict["check_autoscaler_max_instances_page_default"]
    except KeyError:
        return False


def _run(
command: Union[str, List[str]],
Expand Down
30 changes: 30 additions & 0 deletions tests/test_monitoring_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -1215,3 +1215,33 @@ def test_send_replication_event_if_under_replication_critical(instance_config):
instance_config.cluster,
)
) in send_event_kwargs["description"]


def test_send_event_respects_check_overrides():
    """A check-specific ``page`` setting must win over the top-level one."""
    fake_system_paasta_config = mock.Mock(
        get_cluster=mock.Mock(return_value="fake_cluster"),
        sensu_host=mock.Mock(return_value="fake_sensu_host"),
        sensu_port=mock.Mock(return_value=12345),
    )
    # Top-level says page=True; the entry for this exact check says page=False.
    monitoring_overrides = {
        "page": True,
        "team": "fake_team",
        "check_overrides": {
            "fake_check_name": {"page": False},
        },
    }
    send_event_patch = mock.patch(
        "paasta_tools.monitoring_tools.pysensu_yelp.send_event",
        autospec=True,
    )
    with send_event_patch as mock_pysensu_yelp_send_event:
        monitoring_tools.send_event(
            service="fake_service",
            check_name="fake_check_name",
            overrides=monitoring_overrides,
            status="42",
            output="The http port is not open",
            soa_dir="/fake/soa/dir",
            cluster="fake_cluster",
            system_paasta_config=fake_system_paasta_config,
            dry_run=False,
        )
        sent_kwargs = mock_pysensu_yelp_send_event.call_args[1]
        assert sent_kwargs["page"] is False
Loading