From 99d8794fe56c4db4a6d9cb9ddce1e9e02ad64a00 Mon Sep 17 00:00:00 2001 From: Evan Krall Date: Tue, 24 Jun 2025 14:05:06 -0700 Subject: [PATCH 01/15] Have RooCode delete a bunch of mesos-related code. --- .../steps/paasta_execute_docker_command.py | 80 -- paasta_tools/api/views/instance.py | 26 +- paasta_tools/api/views/resources.py | 42 +- paasta_tools/broadcast_log_to_services.py | 7 +- paasta_tools/check_spark_jobs.py | 24 +- paasta_tools/cli/cmds/local_run.py | 19 +- .../contrib/get_running_task_allocation.py | 81 +- paasta_tools/frameworks/adhoc_scheduler.py | 71 -- paasta_tools/frameworks/native_scheduler.py | 652 ---------- paasta_tools/frameworks/task_store.py | 245 ---- paasta_tools/mesos/__init__.py | 0 paasta_tools/mesos/cfg.py | 46 - paasta_tools/mesos/cluster.py | 60 - paasta_tools/mesos/exceptions.py | 59 - paasta_tools/mesos/framework.py | 77 -- paasta_tools/mesos/log.py | 48 - paasta_tools/mesos/master.py | 306 ----- paasta_tools/mesos/mesos_file.py | 169 --- paasta_tools/mesos/parallel.py | 52 - paasta_tools/mesos/slave.py | 115 -- paasta_tools/mesos/task.py | 94 -- paasta_tools/mesos/util.py | 69 -- paasta_tools/mesos/zookeeper.py | 37 - paasta_tools/mesos_maintenance.py | 848 ------------- paasta_tools/mesos_tools.py | 1051 ----------------- paasta_tools/metrics/metastatus_lib.py | 355 +----- paasta_tools/paasta_execute_docker_command.py | 123 -- paasta_tools/paasta_native_serviceinit.py | 21 - paasta_tools/smartstack_tools.py | 35 +- paasta_tools/tron_tools.py | 30 +- setup.py | 1 - tests/api/test_resources.py | 55 +- tests/frameworks/test_adhoc_scheduler.py | 249 ---- tests/frameworks/test_native_scheduler.py | 489 -------- tests/frameworks/test_task_store.py | 168 --- tests/mesos/test_cluster.py | 46 - tests/mesos/test_master.py | 127 -- tests/metrics/test_metastatus_lib.py | 340 +----- tests/test_check_spark_jobs.py | 74 +- tests/test_mesos_tools.py | 945 --------------- tests/test_paasta_execute_docker_command.py | 220 ---- 
tests/test_smartstack_tools.py | 43 +- tests/test_tron_tools.py | 6 +- 43 files changed, 89 insertions(+), 7516 deletions(-) delete mode 100644 general_itests/steps/paasta_execute_docker_command.py delete mode 100644 paasta_tools/frameworks/adhoc_scheduler.py delete mode 100644 paasta_tools/frameworks/native_scheduler.py delete mode 100644 paasta_tools/frameworks/task_store.py delete mode 100644 paasta_tools/mesos/__init__.py delete mode 100644 paasta_tools/mesos/cfg.py delete mode 100644 paasta_tools/mesos/cluster.py delete mode 100644 paasta_tools/mesos/exceptions.py delete mode 100644 paasta_tools/mesos/framework.py delete mode 100644 paasta_tools/mesos/log.py delete mode 100644 paasta_tools/mesos/master.py delete mode 100644 paasta_tools/mesos/mesos_file.py delete mode 100644 paasta_tools/mesos/parallel.py delete mode 100644 paasta_tools/mesos/slave.py delete mode 100644 paasta_tools/mesos/task.py delete mode 100644 paasta_tools/mesos/util.py delete mode 100644 paasta_tools/mesos/zookeeper.py delete mode 100755 paasta_tools/mesos_maintenance.py delete mode 100644 paasta_tools/mesos_tools.py delete mode 100755 paasta_tools/paasta_execute_docker_command.py delete mode 100644 paasta_tools/paasta_native_serviceinit.py delete mode 100644 tests/frameworks/test_adhoc_scheduler.py delete mode 100644 tests/frameworks/test_native_scheduler.py delete mode 100644 tests/frameworks/test_task_store.py delete mode 100644 tests/mesos/test_cluster.py delete mode 100644 tests/mesos/test_master.py delete mode 100644 tests/test_mesos_tools.py delete mode 100644 tests/test_paasta_execute_docker_command.py diff --git a/general_itests/steps/paasta_execute_docker_command.py b/general_itests/steps/paasta_execute_docker_command.py deleted file mode 100644 index 92fd6e4190..0000000000 --- a/general_itests/steps/paasta_execute_docker_command.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2015-2016 Yelp Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os - -from behave import given -from behave import then -from behave import when -from docker.errors import APIError - -from paasta_tools.utils import _run -from paasta_tools.utils import get_docker_client - - -@given("Docker is available") -def docker_is_available(context): - docker_client = get_docker_client() - assert docker_client.ping() - context.docker_client = docker_client - - -@given("a running docker container with task id {task_id} and image {image_name}") -def create_docker_container(context, task_id, image_name): - container_name = "paasta-itest-execute-in-containers" - image_name = os.getenv("DOCKER_REGISTRY", "docker-dev.yelpcorp.com/") + image_name - try: - context.docker_client.remove_container(container_name, force=True) - except APIError: - pass - context.docker_client.pull(image_name) - container = context.docker_client.create_container( - name=container_name, - image=image_name, - command="/bin/sleep infinity", - environment={"MESOS_TASK_ID": task_id}, - ) - context.docker_client.start(container=container.get("Id")) - context.running_container_id = container.get("Id") - - -@when( - "we paasta_execute_docker_command a command with exit code {code} in container with task id {task_id}" -) -def run_command_in_container(context, code, task_id): - cmd = f'../paasta_tools/paasta_execute_docker_command.py -i {task_id} -c "exit {code}"' - print("Running cmd %s" % cmd) - exit_code, output = _run(cmd) - 
print(f"Got exitcode {exit_code} with output:\n{output}") - context.return_code = exit_code - - -@then("the exit code is {code}") -def paasta_execute_docker_command_result(context, code): - assert int(code) == int(context.return_code) - - -@then("the docker container has at most {num} exec instances") -def check_container_exec_instances(context, num): - """Modern docker versions remove ExecIDs after they finished, but older - docker versions leave ExecIDs behind. This test is for asserting that - the ExecIDs are cleaned up one way or another""" - container_info = context.docker_client.inspect_container( - context.running_container_id - ) - if container_info["ExecIDs"] is None: - execs = [] - else: - execs = container_info["ExecIDs"] - print("Container info:\n%s" % container_info) - assert len(execs) <= int(num) diff --git a/paasta_tools/api/views/instance.py b/paasta_tools/api/views/instance.py index e40d58df8e..b742d53b13 100644 --- a/paasta_tools/api/views/instance.py +++ b/paasta_tools/api/views/instance.py @@ -17,7 +17,6 @@ """ import asyncio import logging -import re import traceback from typing import Any from typing import Dict @@ -29,13 +28,11 @@ from pyramid.response import Response from pyramid.view import view_config -import paasta_tools.mesos.exceptions as mesos_exceptions from paasta_tools import tron_tools from paasta_tools.api import settings from paasta_tools.api.views.exception import ApiFailure from paasta_tools.cli.cmds.status import get_actual_deployments from paasta_tools.instance import kubernetes as pik -from paasta_tools.mesos_tools import get_all_frameworks as get_all_mesos_frameworks from paasta_tools.utils import compose_job_id from paasta_tools.utils import DeploymentVersion from paasta_tools.utils import NoConfigurationForServiceError @@ -94,34 +91,17 @@ def tron_instance_status( return status -def legacy_remote_run_filter_frameworks(service, instance, frameworks=None): - if frameworks is None: - frameworks = 
get_all_mesos_frameworks(active_only=True) - - prefix = f"paasta-remote {service}.{instance}" - return [f for f in frameworks if f.name.startswith(prefix)] - - def adhoc_instance_status( instance_status: Mapping[str, Any], service: str, instance: str, verbose: int ) -> List[Dict[str, Any]]: - status = [] - filtered = legacy_remote_run_filter_frameworks(service, instance) - filtered.sort(key=lambda x: x.name) - for f in filtered: - launch_time, run_id = re.match( - r"paasta-remote [^\s]+ (\w+) (\w+)", f.name - ).groups() - status.append( - {"launch_time": launch_time, "run_id": run_id, "framework_id": f.id} - ) - return status + # Mesos support has been removed - adhoc instances no longer run on Mesos + return [] async def _task_result_or_error(future): try: return {"value": await future} - except (AttributeError, mesos_exceptions.SlaveDoesNotExist): + except AttributeError: return {"error_message": "None"} except TimeoutError: return {"error_message": "Timed Out"} diff --git a/paasta_tools/api/views/resources.py b/paasta_tools/api/views/resources.py index 80b6af53e6..1b7b0e2a8a 100644 --- a/paasta_tools/api/views/resources.py +++ b/paasta_tools/api/views/resources.py @@ -15,13 +15,9 @@ """ PaaSTA resource utilization, etc. 
""" -from a_sync import block from pyramid.response import Response from pyramid.view import view_config -from paasta_tools.mesos_tools import get_mesos_master -from paasta_tools.metrics import metastatus_lib - def parse_filters(filters): # The swagger config verifies that the data is in this format @@ -35,42 +31,6 @@ def parse_filters(filters): @view_config(route_name="resources.utilization", request_method="GET", renderer="json") def resources_utilization(request): - master = get_mesos_master() - mesos_state = block(master.state) - - groupings = request.swagger_data.get("groupings", ["superregion"]) - # swagger actually makes the key None if it's not set - if groupings is None: - groupings = ["superregion"] - grouping_function = metastatus_lib.key_func_for_attribute_multi(groupings) - sorting_function = metastatus_lib.sort_func_for_attributes(groupings) - - filters = request.swagger_data.get("filter", []) - filters = parse_filters(filters) - filter_funcs = [ - metastatus_lib.make_filter_slave_func(attr, vals) - for attr, vals in filters.items() - ] - - resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping( - grouping_func=grouping_function, - mesos_state=mesos_state, - filters=filter_funcs, - sort_func=sorting_function, - ) - + # Mesos support has been removed - resource utilization now only available via Kubernetes response_body = [] - for k, v in resource_info_dict.items(): - group = {"groupings": {}} - for grouping, value in k: - group["groupings"][grouping] = value - for resource, value in v["total"]._asdict().items(): - group[resource] = {"total": value} - for resource, value in v["free"]._asdict().items(): - group[resource]["free"] = value - for resource in v["free"]._fields: - group[resource]["used"] = group[resource]["total"] - group[resource]["free"] - - response_body.append(group) - return Response(json_body=response_body, status_code=200) diff --git a/paasta_tools/broadcast_log_to_services.py 
b/paasta_tools/broadcast_log_to_services.py index 13f8f7b964..08f3f594bc 100755 --- a/paasta_tools/broadcast_log_to_services.py +++ b/paasta_tools/broadcast_log_to_services.py @@ -15,7 +15,6 @@ import sys from paasta_tools.kubernetes_tools import get_all_kubernetes_services_running_here -from paasta_tools.mesos_tools import MesosSlaveConnectionError from paasta_tools.tron_tools import tron_jobs_running_here from paasta_tools.utils import _log from paasta_tools.utils import DEFAULT_SOA_DIR @@ -41,10 +40,8 @@ def broadcast_log_all_services_running_here(line: str, soa_dir=DEFAULT_SOA_DIR) def get_all_services_running_here(cluster, soa_dir): - try: - tron_services = tron_jobs_running_here() - except MesosSlaveConnectionError: - tron_services = [] + # Tron jobs no longer run via Mesos, so this will return an empty list + tron_services = tron_jobs_running_here() try: kubernetes_services = get_all_kubernetes_services_running_here() diff --git a/paasta_tools/check_spark_jobs.py b/paasta_tools/check_spark_jobs.py index 96d806004c..d865baeb13 100644 --- a/paasta_tools/check_spark_jobs.py +++ b/paasta_tools/check_spark_jobs.py @@ -11,7 +11,6 @@ import pysensu_yelp import requests -from paasta_tools import mesos_tools from paasta_tools.monitoring_tools import send_event from paasta_tools.utils import DEFAULT_SOA_DIR from paasta_tools.utils import list_services @@ -91,27 +90,8 @@ def guess_service(properties): def get_matching_framework_info(min_hours): - frameworks = mesos_tools.get_all_frameworks(active_only=True) - matching_info = [] - min_timedelta = datetime.timedelta(hours=min_hours) - for framework in frameworks: - if not framework.active: - continue - if framework.get("principal") != "spark": - continue - time_running = get_time_running(framework) - if time_running >= min_timedelta: - info = { - "id": framework.id, - "name": framework.name, - "webui_url": framework.get("webui_url"), - "service": guess_service(get_spark_properties(framework)), - "user": framework.user, - 
"time_running": str(time_running), - } - matching_info.append(info) - - return matching_info + # Mesos support has been removed - Spark frameworks no longer run on Mesos + return [] def format_framework(info): diff --git a/paasta_tools/cli/cmds/local_run.py b/paasta_tools/cli/cmds/local_run.py index 9e1c0da721..26b52fe468 100755 --- a/paasta_tools/cli/cmds/local_run.py +++ b/paasta_tools/cli/cmds/local_run.py @@ -49,7 +49,6 @@ from paasta_tools.kubernetes_tools import KUBE_CONFIG_USER_PATH from paasta_tools.kubernetes_tools import KubeClient from paasta_tools.long_running_service_tools import get_healthcheck_for_instance -from paasta_tools.paasta_execute_docker_command import execute_in_container from paasta_tools.secret_tools import decrypt_secret_environment_variables from paasta_tools.secret_tools import decrypt_secret_volumes from paasta_tools.tron_tools import parse_time_variables @@ -134,6 +133,24 @@ def perform_tcp_healthcheck(url, timeout): return (False, "%s (timeout %d seconds)" % (os.strerror(result), timeout)) +def execute_in_container(docker_client, container_id, command, timeout): + """Execute a command inside a Docker container + + :param docker_client: Docker client object + :param container_id: Docker container id + :param command: command to execute + :param timeout: timeout in seconds + :returns: tuple of (output, return_code) + """ + try: + exec_result = docker_client.exec_create(container_id, command) + exec_output = docker_client.exec_start(exec_result["Id"]) + exec_inspect = docker_client.exec_inspect(exec_result["Id"]) + return (exec_output.decode("utf-8"), exec_inspect["ExitCode"]) + except Exception as e: + return (str(e), 1) + + def perform_cmd_healthcheck(docker_client, container_id, command, timeout): """Returns true if return code of command is 0 when executed inside container, false otherwise diff --git a/paasta_tools/contrib/get_running_task_allocation.py b/paasta_tools/contrib/get_running_task_allocation.py index 
f776ae9e67..c87d3ce80b 100644 --- a/paasta_tools/contrib/get_running_task_allocation.py +++ b/paasta_tools/contrib/get_running_task_allocation.py @@ -11,16 +11,12 @@ from typing import Optional from typing import Tuple -import a_sync import simplejson as json from kubernetes.client import V1Pod from kubernetes.client import V1ResourceRequirements from paasta_tools import kubernetes_tools -from paasta_tools import mesos_tools from paasta_tools.kubernetes_tools import KubeClient -from paasta_tools.mesos.exceptions import SlaveDoesNotExist -from paasta_tools.mesos.task import Task from paasta_tools.utils import load_system_paasta_config @@ -44,77 +40,6 @@ class TaskAllocationInfo(NamedTuple): namespace: Optional[str] -def get_container_info_from_mesos_task( - task: Task, -) -> Tuple[Optional[str], Optional[float]]: - for status in task["statuses"]: - if status["state"] != "TASK_RUNNING": - continue - container_id = ( - status.get("container_status", {}).get("container_id", {}).get("value") - ) - time_start = status.get("timestamp") - return container_id, time_start - return None, None - - -def get_paasta_service_instance_from_mesos_task( - task: Task, -) -> Tuple[Optional[str], Optional[str]]: - try: - docker_params = task["container"].get("docker", {}).get("parameters", []) - except KeyError: - return None, None - service, instance = None, None - for param in docker_params: - if param["key"] == "label": - label = param["value"] - if label.startswith("paasta_service="): - service = label.split("=")[1] - if label.startswith("paasta_instance="): - instance = label.split("=")[1] - return service, instance - - -async def get_pool_from_mesos_task(task: Task) -> Optional[str]: - try: - attributes = (await task.slave())["attributes"] - return attributes.get("pool", "default") - except SlaveDoesNotExist: - return None - - -@a_sync.to_blocking -async def get_mesos_task_allocation_info() -> Iterable[TaskAllocationInfo]: - tasks = await 
mesos_tools.get_cached_list_of_running_tasks_from_frameworks() - info_list = [] - for task in tasks: - mesos_container_id, start_time = get_container_info_from_mesos_task(task) - paasta_service, paasta_instance = get_paasta_service_instance_from_mesos_task( - task - ) - paasta_pool = await get_pool_from_mesos_task(task) - info_list.append( - TaskAllocationInfo( - paasta_service=paasta_service, - paasta_instance=paasta_instance, - container_type=MAIN_CONTAINER_TYPE, - paasta_pool=paasta_pool, - resources=task["resources"], - start_time=start_time, - docker_id=None, - pod_name=None, - pod_ip=None, - host_ip=None, - git_sha=None, - config_sha=None, - mesos_container_id=mesos_container_id, - namespace=None, - ) - ) - return info_list - - def get_all_running_kubernetes_pods( kube_client: KubeClient, namespace: str ) -> Iterable[V1Pod]: @@ -261,9 +186,7 @@ def get_task_allocation_info( namespace: str, kube_client: Optional[KubeClient], ) -> Iterable[TaskAllocationInfo]: - if scheduler == "mesos": - return get_mesos_task_allocation_info() - elif scheduler == "kubernetes": + if scheduler == "kubernetes": return get_kubernetes_task_allocation_info(namespace, kube_client) else: return [] @@ -276,7 +199,7 @@ def parse_args() -> argparse.Namespace: help="Scheduler to get task info from", dest="scheduler", default="kubernetes", - choices=["mesos", "kubernetes"], + choices=["kubernetes"], ) parser.add_argument( "--additional-namespaces-exclude", diff --git a/paasta_tools/frameworks/adhoc_scheduler.py b/paasta_tools/frameworks/adhoc_scheduler.py deleted file mode 100644 index a70b1c2004..0000000000 --- a/paasta_tools/frameworks/adhoc_scheduler.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python -# Copyright 2015-2017 Yelp Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Dict -from typing import List -from typing import Tuple - -from pymesos import MesosSchedulerDriver - -from paasta_tools.frameworks.constraints import ConstraintState -from paasta_tools.frameworks.native_scheduler import LIVE_TASK_STATES -from paasta_tools.frameworks.native_scheduler import NativeScheduler -from paasta_tools.frameworks.native_service_config import TaskInfo -from paasta_tools.frameworks.native_service_config import UnknownNativeServiceError - - -class AdhocScheduler(NativeScheduler): - def __init__(self, *args, **kwargs): - self.dry_run = kwargs.pop("dry_run") - - if kwargs.get("service_config_overrides") is None: - kwargs["service_config_overrides"] = {} - kwargs["service_config_overrides"].setdefault("instances", 1) - self.finished_countdown = kwargs["service_config_overrides"]["instances"] - - super().__init__(*args, **kwargs) - - def need_to_stop(self): - # Is used to decide whether to stop the driver or try to start more tasks. 
- return self.finished_countdown == 0 - - def statusUpdate(self, driver: MesosSchedulerDriver, update: Dict): - super().statusUpdate(driver, update) - - if update["state"] not in LIVE_TASK_STATES: - self.finished_countdown -= 1 - - # Stop if task ran and finished - if self.need_to_stop(): - driver.stop() - - def tasks_and_state_for_offer( - self, driver: MesosSchedulerDriver, offer, state: ConstraintState - ) -> Tuple[List[TaskInfo], ConstraintState]: - # In dry run satisfy exit-conditions after we got the offer - if self.dry_run or self.need_to_stop(): - if self.dry_run: - tasks, _ = super().tasks_and_state_for_offer(driver, offer, state) - print("Would have launched: ", tasks) - driver.stop() - return [], state - - return super().tasks_and_state_for_offer(driver, offer, state) - - def kill_tasks_if_necessary(self, *args, **kwargs): - return - - def validate_config(self): - if self.service_config.get_cmd() is None: - raise UnknownNativeServiceError("missing cmd in service config") diff --git a/paasta_tools/frameworks/native_scheduler.py b/paasta_tools/frameworks/native_scheduler.py deleted file mode 100644 index e128cea800..0000000000 --- a/paasta_tools/frameworks/native_scheduler.py +++ /dev/null @@ -1,652 +0,0 @@ -#!/usr/bin/env python -import asyncio -import copy -import getpass -import logging -import random -import threading -import time -import uuid -from typing import Collection -from typing import Dict -from typing import List -from typing import Mapping -from typing import Optional -from typing import Tuple - -import a_sync -import service_configuration_lib -from pymesos import MesosSchedulerDriver -from pymesos.interface import Scheduler - -from paasta_tools import bounce_lib -from paasta_tools import drain_lib -from paasta_tools import mesos_tools -from paasta_tools.frameworks.constraints import check_offer_constraints -from paasta_tools.frameworks.constraints import ConstraintState -from paasta_tools.frameworks.constraints import 
update_constraint_state -from paasta_tools.frameworks.native_service_config import load_paasta_native_job_config -from paasta_tools.frameworks.native_service_config import NativeServiceConfig -from paasta_tools.frameworks.native_service_config import NativeServiceConfigDict -from paasta_tools.frameworks.native_service_config import TaskInfo -from paasta_tools.frameworks.task_store import MesosTaskParameters -from paasta_tools.frameworks.task_store import TaskStore -from paasta_tools.frameworks.task_store import ZKTaskStore -from paasta_tools.utils import _log -from paasta_tools.utils import DEFAULT_LOGLEVEL -from paasta_tools.utils import DEFAULT_SOA_DIR -from paasta_tools.utils import get_services_for_cluster -from paasta_tools.utils import SystemPaastaConfig - -log = logging.getLogger(__name__) - -MESOS_TASK_SPACER = "." - -# Bring these into local scope for shorter lines of code. -TASK_STAGING = "TASK_STAGING" -TASK_STARTING = "TASK_STARTING" -TASK_RUNNING = "TASK_RUNNING" - -TASK_KILLING = "TASK_KILLING" -TASK_FINISHED = "TASK_FINISHED" -TASK_FAILED = "TASK_FAILED" -TASK_KILLED = "TASK_KILLED" -TASK_LOST = "TASK_LOST" -TASK_ERROR = "TASK_ERROR" - -LIVE_TASK_STATES = (TASK_STAGING, TASK_STARTING, TASK_RUNNING) - - -class ConstraintFailAllTasksError(Exception): - pass - - -class NativeScheduler(Scheduler): - task_store: TaskStore - - def __init__( - self, - service_name: str, - instance_name: str, - cluster: str, - system_paasta_config: SystemPaastaConfig, - staging_timeout: float, - soa_dir: str = DEFAULT_SOA_DIR, - service_config: Optional[NativeServiceConfig] = None, - reconcile_backoff: float = 30, - instance_type: str = "paasta_native", - service_config_overrides: Optional[NativeServiceConfigDict] = None, - reconcile_start_time: float = float("inf"), - task_store_type=ZKTaskStore, - ) -> None: - self.service_name = service_name - self.instance_name = instance_name - self.instance_type = instance_type - self.cluster = cluster - self.system_paasta_config = 
system_paasta_config - self.soa_dir = soa_dir - - # This will be initialized in registered(). - self.task_store = None - self.task_store_type = task_store_type - - self.service_config_overrides = service_config_overrides or {} - self.constraint_state: ConstraintState = {} - self.constraint_state_lock = threading.Lock() - self.frozen = False - - # don't accept resources until we reconcile. - self.reconcile_start_time = reconcile_start_time - - # wait this long after starting a reconcile before accepting offers. - self.reconcile_backoff = reconcile_backoff - - # wait this long for a task to launch. - self.staging_timeout = staging_timeout - - # Gets set when registered() is called - self.framework_id = None - - # agent_id -> unix timestamp of when we blacklisted it - self.blacklisted_slaves: Dict[str, float] = {} - self.blacklist_timeout = 3600 - - if service_config is not None: - self.service_config = service_config - self.service_config.config_dict.update( # type: ignore - self.service_config_overrides - ) - self.recreate_drain_method() - self.reload_constraints() - self.validate_config() - else: - self.load_config() - - def log(self, line, level=DEFAULT_LOGLEVEL): - _log( - service=self.service_name, - instance=self.instance_name, - component="deploy", - line=line, - level=level, - ) - - def shutdown(self, driver: MesosSchedulerDriver): - # TODO: this is naive, as it does nothing to stop on-going calls - # to statusUpdate or resourceOffers. - self.log( - "Freezing the scheduler. Further status updates and resource offers are ignored." 
- ) - self.frozen = True - self.log("Killing any remaining live tasks.") - for task, parameters in self.task_store.get_all_tasks().items(): - if parameters.mesos_task_state in LIVE_TASK_STATES: - self.kill_task(driver, task) - self.task_store.close() - - def registered(self, driver: MesosSchedulerDriver, frameworkId, masterInfo): - self.framework_id = frameworkId["value"] - self.log("Registered with framework ID %s" % frameworkId["value"]) - - self.task_store = self.task_store_type( - service_name=self.service_name, - instance_name=self.instance_name, - framework_id=self.framework_id, - system_paasta_config=self.system_paasta_config, - ) - - self.reconcile_start_time = time.time() - driver.reconcileTasks([]) - - def reregistered(self, driver: MesosSchedulerDriver, masterInfo): - self.registered(driver, {"value": driver.framework_id}, masterInfo) - - def resourceOffers(self, driver: MesosSchedulerDriver, offers): - if self.frozen: - return - - if self.within_reconcile_backoff(): - self.log( - "Declining all offers since we started reconciliation too recently" - ) - for offer in offers: - driver.declineOffer(offer.id) - else: - for idx, offer in enumerate(offers): - if offer.agent_id.value in self.blacklisted_slaves: - log.critical( - "Ignoring offer %s from blacklisted slave %s" - % (offer.id.value, offer.agent_id.value) - ) - filters = {"refuse_seconds": self.blacklist_timeout} - driver.declineOffer(offer.id, filters) - del offers[idx] - - self.launch_tasks_for_offers(driver, offers) - - def launch_tasks_for_offers( - self, driver: MesosSchedulerDriver, offers - ) -> List[TaskInfo]: - """For each offer tries to launch all tasks that can fit in there. 
- Declines offer if no fitting tasks found.""" - launched_tasks: List[TaskInfo] = [] - - for offer in offers: - with self.constraint_state_lock: - try: - tasks, new_state = self.tasks_and_state_for_offer( - driver, offer, self.constraint_state - ) - - if tasks is not None and len(tasks) > 0: - driver.launchTasks([offer.id], tasks) - - for task in tasks: - self.task_store.add_task_if_doesnt_exist( - task["task_id"]["value"], - health=None, - mesos_task_state=TASK_STAGING, - offer=offer, - resources=task["resources"], - ) - launched_tasks.extend(tasks) - self.constraint_state = new_state - else: - driver.declineOffer(offer.id) - except ConstraintFailAllTasksError: - self.log("Offer failed constraints for every task, rejecting 60s") - filters = {"refuse_seconds": 60} - driver.declineOffer(offer.id, filters) - return launched_tasks - - def task_fits(self, offer): - """Checks whether the offer is big enough to fit the tasks""" - needed_resources = { - "cpus": self.service_config.get_cpus(), - "mem": self.service_config.get_mem(), - "disk": self.service_config.get_disk(), - } - for resource in offer.resources: - try: - if resource.scalar.value < needed_resources[resource.name]: - return False - except KeyError: - pass - - return True - - def need_more_tasks(self, name, existingTasks, scheduledTasks): - """Returns whether we need to start more tasks.""" - num_have = 0 - for task, parameters in existingTasks.items(): - if self.is_task_new(name, task) and ( - parameters.mesos_task_state in LIVE_TASK_STATES - ): - num_have += 1 - - for task in scheduledTasks: - if task["name"] == name: - num_have += 1 - - return num_have < self.service_config.get_desired_instances() - - def get_new_tasks(self, name, tasks_with_params: Dict[str, MesosTaskParameters]): - return { - tid: params - for tid, params in tasks_with_params.items() - if ( - self.is_task_new(name, tid) - and (params.mesos_task_state in LIVE_TASK_STATES) - ) - } - - def get_old_tasks(self, name, tasks_with_params: 
Dict[str, MesosTaskParameters]): - return { - tid: params - for tid, params in tasks_with_params.items() - if ( - (not self.is_task_new(name, tid)) - and (params.mesos_task_state in LIVE_TASK_STATES) - ) - } - - def is_task_new(self, name, tid): - return tid.startswith("%s." % name) - - def log_and_kill(self, driver: MesosSchedulerDriver, task_id): - log.critical( - "Task stuck launching for %ss, assuming to have failed. Killing task." - % self.staging_timeout - ) - self.blacklist_slave(self.task_store.get_task(task_id).offer.agent_id.value) - self.kill_task(driver, task_id) - - def tasks_and_state_for_offer( - self, driver: MesosSchedulerDriver, offer, state: ConstraintState - ) -> Tuple[List[TaskInfo], ConstraintState]: - """Returns collection of tasks that can fit inside an offer.""" - tasks: List[TaskInfo] = [] - offerCpus = 0.0 - offerMem = 0.0 - offerPorts: List[int] = [] - for resource in offer.resources: - if resource.name == "cpus": - offerCpus += resource.scalar.value - elif resource.name == "mem": - offerMem += resource.scalar.value - elif resource.name == "ports": - for rg in resource.ranges.range: - # I believe mesos protobuf ranges are inclusive, but range() is exclusive - offerPorts += range(rg.begin, rg.end + 1) - remainingCpus = offerCpus - remainingMem = offerMem - remainingPorts = set(offerPorts) - - base_task = self.service_config.base_task(self.system_paasta_config) - base_task["agent_id"]["value"] = offer["agent_id"]["value"] - - task_mem = self.service_config.get_mem() - task_cpus = self.service_config.get_cpus() - - # don't mutate existing state - new_constraint_state = copy.deepcopy(state) - total = 0 - failed_constraints = 0 - while self.need_more_tasks( - base_task["name"], self.task_store.get_all_tasks(), tasks - ): - total += 1 - - if not ( - remainingCpus >= task_cpus - and remainingMem >= task_mem - and self.offer_matches_pool(offer) - and len(remainingPorts) >= 1 - ): - break - - if not ( - check_offer_constraints(offer, 
self.constraints, new_constraint_state) - ): - failed_constraints += 1 - break - - task_port = random.choice(list(remainingPorts)) - - task = copy.deepcopy(base_task) - task["task_id"] = {"value": "{}.{}".format(task["name"], uuid.uuid4().hex)} - - task["container"]["docker"]["port_mappings"][0]["host_port"] = task_port - for resource in task["resources"]: - if resource["name"] == "ports": - resource["ranges"]["range"][0]["begin"] = task_port - resource["ranges"]["range"][0]["end"] = task_port - - tasks.append(task) - - remainingCpus -= task_cpus - remainingMem -= task_mem - remainingPorts -= {task_port} - - update_constraint_state(offer, self.constraints, new_constraint_state) - - # raise constraint error but only if no other tasks fit/fail the offer - if total > 0 and failed_constraints == total: - raise ConstraintFailAllTasksError - - return tasks, new_constraint_state - - def offer_matches_pool(self, offer): - for attribute in offer.attributes: - if attribute.name == "pool": - return attribute.text.value == self.service_config.get_pool() - # we didn't find a pool attribute on this slave, so assume it's not in our pool. - return False - - def within_reconcile_backoff(self): - return time.time() - self.reconcile_backoff < self.reconcile_start_time - - def periodic(self, driver: MesosSchedulerDriver): - if self.frozen: - return - - self.periodic_was_called = True # Used for testing. 
- if not self.within_reconcile_backoff(): - driver.reviveOffers() - - self.load_config() - self.kill_tasks_if_necessary(driver) - self.check_blacklisted_slaves_for_timeout() - - def statusUpdate(self, driver: MesosSchedulerDriver, update: Dict): - if self.frozen: - return - - # update tasks - task_id = update["task_id"]["value"] - self.log("Task {} is in state {}".format(task_id, update["state"])) - - task_params = self.task_store.update_task( - task_id, mesos_task_state=update["state"] - ) - - if task_params.mesos_task_state not in LIVE_TASK_STATES: - with self.constraint_state_lock: - update_constraint_state( - task_params.offer, self.constraints, self.constraint_state, step=-1 - ) - - driver.acknowledgeStatusUpdate(update) - self.kill_tasks_if_necessary(driver) - - def make_healthiness_sorter( - self, base_task_name: str, all_tasks_with_params: Dict[str, MesosTaskParameters] - ): - def healthiness_score(task_id): - """Return a tuple that can be used as a key for sorting, that expresses our desire to keep this task around. 
- Higher values (things that sort later) are more desirable.""" - params = all_tasks_with_params[task_id] - - state_score = { - TASK_KILLING: 0, - TASK_FINISHED: 0, - TASK_FAILED: 0, - TASK_KILLED: 0, - TASK_LOST: 0, - TASK_ERROR: 0, - TASK_STAGING: 1, - TASK_STARTING: 2, - TASK_RUNNING: 3, - }[params.mesos_task_state] - - # unhealthy tasks < healthy - # staging < starting < running - # old < new - return ( - params.is_healthy, - state_score, - self.is_task_new(base_task_name, task_id), - ) - - return healthiness_score - - def kill_tasks_if_necessary(self, driver: MesosSchedulerDriver): - base_task = self.service_config.base_task(self.system_paasta_config) - - all_tasks_with_params = self.task_store.get_all_tasks() - - new_tasks_with_params = self.get_new_tasks( - base_task["name"], all_tasks_with_params - ) - happy_new_tasks_with_params = self.get_happy_tasks(new_tasks_with_params) - - desired_instances = self.service_config.get_desired_instances() - # this puts the most-desired tasks first. I would have left them in order of bad->good and used - # new_tasks_by_desirability[:-desired_instances] instead, but list[:-0] is an empty list, rather than the full - # list. 
- new_task_ids_by_desirability = sorted( - list(new_tasks_with_params.keys()), - key=self.make_healthiness_sorter(base_task["name"], all_tasks_with_params), - reverse=True, - ) - new_task_ids_to_kill = new_task_ids_by_desirability[desired_instances:] - - old_tasks_with_params = self.get_old_tasks( - base_task["name"], all_tasks_with_params - ) - old_draining_tasks_with_params = self.get_draining_tasks(old_tasks_with_params) - old_non_draining_tasks = sorted( - list( - set(old_tasks_with_params.keys()) - set(old_draining_tasks_with_params) - ), - key=self.make_healthiness_sorter(base_task["name"], all_tasks_with_params), - reverse=True, - ) - - actions = bounce_lib.crossover_bounce( - new_config={"instances": desired_instances}, - new_app_running=True, - happy_new_tasks=happy_new_tasks_with_params.keys(), - old_non_draining_tasks=new_task_ids_to_kill + old_non_draining_tasks, - ) - - with a_sync.idle_event_loop(): - futures = [] - for task in set(new_tasks_with_params.keys()) - set( - actions["tasks_to_drain"] - ): - futures.append(asyncio.ensure_future(self.undrain_task(task))) - for task in actions["tasks_to_drain"]: - futures.append(asyncio.ensure_future(self.drain_task(task))) - - if futures: - a_sync.block(asyncio.wait, futures) - - async def kill_if_safe_to_kill(task_id: str): - if await self.drain_method.is_safe_to_kill( - self.make_drain_task(task_id) - ): - self.kill_task(driver, task_id) - - futures = [] - for task, parameters in all_tasks_with_params.items(): - if ( - parameters.is_draining - and parameters.mesos_task_state in LIVE_TASK_STATES - ): - futures.append(asyncio.ensure_future(kill_if_safe_to_kill(task))) - if futures: - a_sync.block(asyncio.wait, futures) - - def get_happy_tasks(self, tasks_with_params: Dict[str, MesosTaskParameters]): - """Filter a dictionary of tasks->params to those that are running and not draining.""" - happy_tasks = {} - for tid, params in tasks_with_params.items(): - if params.mesos_task_state == TASK_RUNNING and not 
params.is_draining: - happy_tasks[tid] = params - return happy_tasks - - def get_draining_tasks(self, tasks_with_params: Dict[str, MesosTaskParameters]): - """Filter a dictionary of tasks->params to those that are draining.""" - return {t: p for t, p in tasks_with_params.items() if p.is_draining} - - def make_drain_task(self, task_id: str): - """Return a DrainTask object, which is suitable for passing to drain methods.""" - - ports = [] - - params = self.task_store.get_task(task_id) - for resource in params.resources: - if resource["name"] == "ports": - for rg in resource["ranges"]["range"]: - for port in range(rg["begin"], rg["end"] + 1): - ports.append(port) - - return DrainTask( - id=task_id, host=params.offer["agent_id"]["value"], ports=ports - ) - - async def undrain_task(self, task_id: str): - self.log("Undraining task %s" % task_id) - await self.drain_method.stop_draining(self.make_drain_task(task_id)) - self.task_store.update_task(task_id, is_draining=False) - - async def drain_task(self, task_id: str): - self.log("Draining task %s" % task_id) - await self.drain_method.drain(self.make_drain_task(task_id)) - self.task_store.update_task(task_id, is_draining=True) - - def kill_task(self, driver: MesosSchedulerDriver, task_id: str): - self.log("Killing task %s" % task_id) - driver.killTask({"value": task_id}) - self.task_store.update_task(task_id, mesos_task_state=TASK_KILLING) - - def group_tasks_by_version( - self, task_ids: Collection[str] - ) -> Mapping[str, Collection[str]]: - d: Dict[str, List[str]] = {} - for task_id in task_ids: - version = task_id.rsplit(".", 1)[0] - d.setdefault(version, []).append(task_id) - return d - - def load_config(self) -> None: - service_configuration_lib._yaml_cache = {} - self.service_config = load_paasta_native_job_config( - service=self.service_name, - instance=self.instance_name, - instance_type=self.instance_type, - cluster=self.cluster, - soa_dir=self.soa_dir, - config_overrides=self.service_config_overrides, - ) - 
self.recreate_drain_method() - self.reload_constraints() - self.validate_config() - - def validate_config(self) -> None: - pass - - def recreate_drain_method(self) -> None: - """Re-instantiate self.drain_method. Should be called after self.service_config changes.""" - self.drain_method = drain_lib.get_drain_method( - name=self.service_config.get_drain_method( - self.service_config.service_namespace_config - ), - service=self.service_name, - instance=self.instance_name, - registrations=self.service_config.get_registrations(), - **self.service_config.get_drain_method_params( - self.service_config.service_namespace_config - ), - ) - - def reload_constraints(self): - self.constraints = self.service_config.get_constraints() or [] - - def blacklist_slave(self, agent_id: str): - log.debug("Blacklisting slave: %s" % agent_id) - self.blacklisted_slaves.setdefault(agent_id, time.time()) - - def unblacklist_slave(self, agent_id: str): - if agent_id not in self.blacklisted_slaves: - return - - log.debug("Unblacklisting slave: %s" % agent_id) - with self.blacklisted_slaves_lock: - del self.blacklisted_slaves[agent_id] - - def check_blacklisted_slaves_for_timeout(self): - for agent_id, blacklist_time in self.blacklisted_slaves.items(): - if (blacklist_time + self.blacklist_timeout) < time.time(): - self.unblacklist_slave(agent_id) - - -class DrainTask: - def __init__(self, id, host, ports): - self.id = id - self.host = host - self.ports = ports - - -def find_existing_id_if_exists_or_gen_new(name): - for framework in mesos_tools.get_all_frameworks(active_only=True): - if framework.name == name: - return framework.id - else: - return uuid.uuid4().hex - - -def create_driver(framework_name, scheduler, system_paasta_config, implicit_acks=False): - master_uri = "{}:{}".format( - mesos_tools.get_mesos_leader(), mesos_tools.MESOS_MASTER_PORT - ) - - framework = { - "user": getpass.getuser(), - "name": framework_name, - "failover_timeout": 604800, - "id": {"value": 
find_existing_id_if_exists_or_gen_new(framework_name)}, - "checkpoint": True, - "principal": system_paasta_config.get_paasta_native_config()["principal"], - } - - driver = MesosSchedulerDriver( - sched=scheduler, - framework=framework, - master_uri=master_uri, - use_addict=True, - implicit_acknowledgements=implicit_acks, - principal=system_paasta_config.get_paasta_native_config()["principal"], - secret=system_paasta_config.get_paasta_native_config()["secret"], - ) - return driver - - -def get_paasta_native_jobs_for_cluster(cluster=None, soa_dir=DEFAULT_SOA_DIR): - """A paasta_native-specific wrapper around utils.get_services_for_cluster - - :param cluster: The cluster to read the configuration for - :param soa_dir: The SOA config directory to read from - :returns: A list of tuples of (service, job_name)""" - return get_services_for_cluster(cluster, "paasta_native", soa_dir) diff --git a/paasta_tools/frameworks/task_store.py b/paasta_tools/frameworks/task_store.py deleted file mode 100644 index 745ae1f471..0000000000 --- a/paasta_tools/frameworks/task_store.py +++ /dev/null @@ -1,245 +0,0 @@ -import copy -import json -from typing import Any -from typing import Dict -from typing import Tuple -from typing import Type -from typing import TypeVar -from typing import Union - -from kazoo.client import KazooClient -from kazoo.exceptions import BadVersionError -from kazoo.exceptions import NodeExistsError -from kazoo.exceptions import NoNodeError -from kazoo.protocol.states import ZnodeStat - -from paasta_tools.utils import _log - - -class MesosTaskParametersIsImmutableError(Exception): - pass - - -_SelfT = TypeVar("_SelfT", bound="MesosTaskParameters") - - -class MesosTaskParameters: - health: Any - mesos_task_state: str - is_draining: bool - is_healthy: bool - offer: Any - resources: Any - - def __init__( - self, - health=None, - mesos_task_state=None, - is_draining=None, - is_healthy=None, - offer=None, - resources=None, - ): - self.__dict__["health"] = health - 
self.__dict__["mesos_task_state"] = mesos_task_state - self.__dict__["is_draining"] = is_draining - self.__dict__["is_healthy"] = is_healthy - self.__dict__["offer"] = offer - self.__dict__["resources"] = resources - - def __eq__(self, other): - return self.__dict__ == other.__dict__ - - def __repr__(self): - return "{}(\n {})".format( - type(self).__name__, - ",\n ".join(["%s=%r" % kv for kv in self.__dict__.items()]), - ) - - def __setattr__(self, name, value): - raise MesosTaskParametersIsImmutableError() - - def __delattr__(self, name): - raise MesosTaskParametersIsImmutableError() - - def merge(self: _SelfT, **kwargs) -> "MesosTaskParameters": - """Return a merged MesosTaskParameters object, where attributes in other take precedence over self.""" - - new_dict = copy.deepcopy(self.__dict__) - new_dict.update(kwargs) - - return MesosTaskParameters(**new_dict) - - @classmethod - def deserialize(cls: Type[_SelfT], serialized_params: Union[str, bytes]) -> _SelfT: - return cls(**json.loads(serialized_params)) - - def serialize(self): - return json.dumps(self.__dict__).encode("utf-8") - - -class TaskStore: - def __init__(self, service_name, instance_name, framework_id, system_paasta_config): - self.service_name = service_name - self.instance_name = instance_name - self.framework_id = framework_id - self.system_paasta_config = system_paasta_config - - def get_task(self, task_id: str) -> MesosTaskParameters: - """Get task data for task_id. If we don't know about task_id, return None""" - raise NotImplementedError() - - def get_all_tasks(self) -> Dict[str, MesosTaskParameters]: - """Returns a dictionary of task_id -> MesosTaskParameters for all known tasks.""" - raise NotImplementedError() - - def overwrite_task(self, task_id: str, params: MesosTaskParameters) -> None: - raise NotImplementedError() - - def add_task_if_doesnt_exist(self, task_id: str, **kwargs) -> None: - """Add a task if it does not already exist. 
If it already exists, do nothing.""" - if self.get_task(task_id) is not None: - return - else: - self.overwrite_task(task_id, MesosTaskParameters(**kwargs)) - - def update_task(self, task_id: str, **kwargs) -> MesosTaskParameters: - existing_task = self.get_task(task_id) - if existing_task: - merged_params = existing_task.merge(**kwargs) - else: - merged_params = MesosTaskParameters(**kwargs) - - self.overwrite_task(task_id, merged_params) - return merged_params - - def garbage_collect_old_tasks(self, max_dead_task_age: float) -> None: - # TODO: call me. - # TODO: implement in base class. - raise NotImplementedError() - - def close(self): - pass - - -class DictTaskStore(TaskStore): - def __init__(self, service_name, instance_name, framework_id, system_paasta_config): - self.tasks: Dict[str, MesosTaskParameters] = {} - super().__init__( - service_name, instance_name, framework_id, system_paasta_config - ) - - def get_task(self, task_id: str) -> MesosTaskParameters: - return self.tasks.get(task_id) - - def get_all_tasks(self) -> Dict[str, MesosTaskParameters]: - """Returns a dictionary of task_id -> MesosTaskParameters for all known tasks.""" - return dict(self.tasks) - - def overwrite_task(self, task_id: str, params: MesosTaskParameters) -> None: - # serialize/deserialize to make sure the returned values are the same format as ZKTaskStore. - self.tasks[task_id] = MesosTaskParameters.deserialize(params.serialize()) - - -class ZKTaskStore(TaskStore): - def __init__(self, service_name, instance_name, framework_id, system_paasta_config): - super().__init__( - service_name, instance_name, framework_id, system_paasta_config - ) - self.zk_hosts = system_paasta_config.get_zk_hosts() - - # For some reason, I could not get the code suggested by this SO post to work to ensure_path on the chroot. 
- # https://stackoverflow.com/a/32785625/25327 - # Plus, it just felt dirty to modify instance attributes of a running connection, especially given that - # KazooClient.set_hosts() doesn't allow you to change the chroot. Must be for a good reason. - - chroot = f"task_store/{service_name}/{instance_name}/{framework_id}" - - temp_zk_client = KazooClient(hosts=self.zk_hosts) - temp_zk_client.start() - temp_zk_client.ensure_path(chroot) - temp_zk_client.stop() - temp_zk_client.close() - - self.zk_client = KazooClient(hosts=f"{self.zk_hosts}/{chroot}") - self.zk_client.start() - self.zk_client.ensure_path("/") - - def close(self): - self.zk_client.stop() - self.zk_client.close() - - def get_task(self, task_id: str) -> MesosTaskParameters: - params, stat = self._get_task(task_id) - return params - - def _get_task(self, task_id: str) -> Tuple[MesosTaskParameters, ZnodeStat]: - """Like get_task, but also returns the ZnodeStat that self.zk_client.get() returns""" - try: - data, stat = self.zk_client.get("/%s" % task_id) - return MesosTaskParameters.deserialize(data), stat - except NoNodeError: - return None, None - except json.decoder.JSONDecodeError: - _log( - service=self.service_name, - instance=self.instance_name, - level="debug", - component="deploy", - line=f"Warning: found non-json-decodable value in zookeeper for task {task_id}: {data}", - ) - return None, None - - def get_all_tasks(self): - all_tasks = {} - - for child_path in self.zk_client.get_children("/"): - task_id = self._task_id_from_zk_path(child_path) - params = self.get_task(task_id) - # sometimes there are bogus child ZK nodes. Ignore them. 
- if params is not None: - all_tasks[task_id] = params - - return all_tasks - - def update_task(self, task_id: str, **kwargs): - retry = True - while retry: - retry = False - existing_task, stat = self._get_task(task_id) - - zk_path = self._zk_path_from_task_id(task_id) - if existing_task: - merged_params = existing_task.merge(**kwargs) - try: - self.zk_client.set( - zk_path, merged_params.serialize(), version=stat.version - ) - except BadVersionError: - retry = True - else: - merged_params = MesosTaskParameters(**kwargs) - try: - self.zk_client.create(zk_path, merged_params.serialize()) - except NodeExistsError: - retry = True - - return merged_params - - def overwrite_task( - self, task_id: str, params: MesosTaskParameters, version=-1 - ) -> None: - try: - self.zk_client.set( - self._zk_path_from_task_id(task_id), params.serialize(), version=version - ) - except NoNodeError: - self.zk_client.create( - self._zk_path_from_task_id(task_id), params.serialize() - ) - - def _zk_path_from_task_id(self, task_id: str) -> str: - return "/%s" % task_id - - def _task_id_from_zk_path(self, zk_path: str) -> str: - return zk_path.lstrip("/") diff --git a/paasta_tools/mesos/__init__.py b/paasta_tools/mesos/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/paasta_tools/mesos/cfg.py b/paasta_tools/mesos/cfg.py deleted file mode 100644 index 45d17d5e9a..0000000000 --- a/paasta_tools/mesos/cfg.py +++ /dev/null @@ -1,46 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import copy -import errno -import json - - -DEFAULTS = { - "debug": "false", - "log_file": None, - "log_level": "warning", - "master": "localhost:5050", - "max_workers": 5, - "scheme": "http", - "response_timeout": 5, -} - - -def load_mesos_config(config_path, profile="default"): - on_disk = {} - - try: - with open(config_path, "rt") as f: - on_disk = json.load(f)[profile] - except ValueError as e: - raise ValueError("Invalid JSON: {} in {}".format(str(e), config_path)) - except IOError as e: - if e.errno != errno.ENOENT: - raise - - config = copy.deepcopy(DEFAULTS) - config.update(on_disk) - return config diff --git a/paasta_tools/mesos/cluster.py b/paasta_tools/mesos/cluster.py deleted file mode 100644 index db2fec3c7e..0000000000 --- a/paasta_tools/mesos/cluster.py +++ /dev/null @@ -1,60 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import asyncio -import itertools - -from . import exceptions - - -async def get_files_for_tasks(task_list, file_list, max_workers): - no_files_found = True - - async def process(task_fname): - task, fname = task_fname - try: - fobj = await task.file(fname) - except exceptions.SlaveDoesNotExist: - if task is None: - print(f"(Unknown Task):{fname} (Slave no longer exists)") - else: - print(f"{task['id']}:{task_fname} (Slave no longer exists)") - raise exceptions.SkipResult - - if await fobj.exists(): - return fobj - - elements = itertools.chain( - *[[(task, fname) for fname in file_list] for task in task_list] - ) - - futures = [asyncio.ensure_future(process(element)) for element in elements] - - if futures: - for result in asyncio.as_completed(futures): - try: - result = await result - if result: - no_files_found = False - yield result - except exceptions.SkipResult: - pass - - if no_files_found: - raise exceptions.FileNotFoundForTaskException( - "None of the tasks in {} contain the files in list {}".format( - ",".join([task["id"] for task in task_list]), ",".join(file_list) - ) - ) diff --git a/paasta_tools/mesos/exceptions.py b/paasta_tools/mesos/exceptions.py deleted file mode 100644 index 92b4d7361a..0000000000 --- a/paasta_tools/mesos/exceptions.py +++ /dev/null @@ -1,59 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -class MasterNotAvailableException(Exception): - pass - - -class MasterTemporarilyNotAvailableException(Exception): - pass - - -class NoSlavesAvailableError(Exception): - pass - - -class MultipleSlavesForIDError(Exception): - pass - - -class TaskNotFoundException(Exception): - pass - - -class FileNotFoundForTaskException(Exception): - pass - - -class MultipleTasksForIDError(Exception): - pass - - -class FileDoesNotExist(Exception): - pass - - -class MissingExecutor(Exception): - pass - - -class SlaveDoesNotExist(Exception): - pass - - -class SkipResult(Exception): - pass diff --git a/paasta_tools/mesos/framework.py b/paasta_tools/mesos/framework.py deleted file mode 100644 index 26959de852..0000000000 --- a/paasta_tools/mesos/framework.py +++ /dev/null @@ -1,77 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -class Framework: - def __init__(self, items): - self.__items = items - - def __getitem__(self, name): - return self.__items[name] - - def __str__(self): - return f"{self.name}:{self.id}" - - def get(self, name, default=None): - try: - return self[name] - except KeyError: - return default - - @property - def id(self): - return self["id"] - - @property - def name(self): - return self["name"] - - @property - def hostname(self): - return self["hostname"] - - @property - def active(self): - return self["active"] - - @property - def task_count(self): - return len(self["tasks"]) - - @property - def user(self): - return self["user"] - - @property - def cpu_allocated(self): - return self._resource_allocated("cpus") - - @property - def mem_allocated(self): - return self._resource_allocated("mem") - - @property - def disk_allocated(self): - return self._resource_allocated("disk") - - def _resource_allocated(self, resource): - return self["resources"][resource] - - def __eq__(self, other): - return self.__items == other.__items - - def __ne__(self, other): - return not self.__eq__ diff --git a/paasta_tools/mesos/log.py b/paasta_tools/mesos/log.py deleted file mode 100644 index 0a81a015bd..0000000000 --- a/paasta_tools/mesos/log.py +++ /dev/null @@ -1,48 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import functools -import logging -import sys -import time - -debug = logging.debug - - -def fatal(msg, code=1): - print(msg + "\n") - logging.error(msg) - sys.exit(code) - - -def fn(f, *args, **kwargs): - logging.debug("{}: {} {}".format(repr(f), args, kwargs)) - return f(*args, **kwargs) - - -def duration(fn): - @functools.wraps(fn) - def timer(*args, **kwargs): - start = time.time() - try: - return fn(*args, **kwargs) - finally: - debug( - "duration: {}.{}: {:2.2f}s".format( - fn.__module__, fn.__name__, time.time() - start - ) - ) - - return timer diff --git a/paasta_tools/mesos/master.py b/paasta_tools/mesos/master.py deleted file mode 100644 index 890a4842f1..0000000000 --- a/paasta_tools/mesos/master.py +++ /dev/null @@ -1,306 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import fnmatch -import itertools -import json -import logging -import os -import re -from typing import List -from urllib.parse import urljoin -from urllib.parse import urlparse - -import aiohttp -from kazoo.handlers.threading import KazooTimeoutError -from kazoo.retry import KazooRetry -from mypy_extensions import TypedDict -from retry import retry - -from . import exceptions -from . import framework -from . import log -from . import mesos_file -from . import slave -from . import task -from . import util -from . import zookeeper -from paasta_tools.async_utils import async_ttl_cache -from paasta_tools.utils import get_user_agent - -ZOOKEEPER_TIMEOUT = 1 - -INVALID_PATH = "{0} does not have a valid path. Did you forget /mesos?" - -MISSING_MASTER = """unable to connect to a master at {0}. - -Try running `mesos config master zk://localhost:2181/mesos`. See the README for -more examples.""" - -MULTIPLE_SLAVES = "There are multiple slaves with that id. Please choose one: " - -logger = logging.getLogger(__name__) - - -class MesosState(TypedDict): - slaves: List - frameworks: List - orphan_tasks: List - - -MesosMetrics = TypedDict( - "MesosMetrics", - { - "master/cpus_total": int, - "master/cpus_used": int, - "master/disk_total": int, - "master/disk_used": int, - "master/gpus_total": int, - "master/gpus_used": int, - "master/mem_total": int, - "master/mem_used": int, - "master/tasks_running": int, - "master/tasks_staging": int, - "master/tasks_starting": int, - "master/slaves_active": int, - "master/slaves_inactive": int, - }, -) - - -class MesosMaster: - def __init__(self, config): - self.config = config - - def __str__(self): - return "".format(self.key()) - - def key(self): - return self.config["master"] - - @util.CachedProperty(ttl=5) - def host(self): - return "{}://{}".format( - self.config["scheme"], self.resolve(self.config["master"]) - ) - - @util.CachedProperty(ttl=5) - def cache_host(self): - host_url = urlparse(self.host) - replaced = 
host_url._replace(netloc=host_url.hostname + ":5055") - return replaced.geturl() - - async def _request( - self, url: str, method: str = "GET", cached: bool = False, **kwargs - ) -> aiohttp.ClientResponse: - headers = {"User-Agent": get_user_agent()} - - if cached and self.config.get("use_mesos_cache", False): - # TODO: fall back to original host if this fails? - host = self.cache_host - else: - host = self.host - - try: - async with aiohttp.ClientSession( - conn_timeout=self.config["response_timeout"], - read_timeout=self.config["response_timeout"], - ) as session: - async with session.request( - method=method, url=urljoin(host, url), headers=headers, **kwargs - ) as resp: - # if nobody awaits resp.text() or resp.json() before we exit the session context manager, then the - # http connection gets closed before we read the response; then later calls to resp.text/json will - # fail. - await resp.text() - return resp - - except aiohttp.client_exceptions.ClientConnectionError: - raise exceptions.MasterNotAvailableException(MISSING_MASTER.format(host)) - except aiohttp.client_exceptions.TooManyRedirects: - raise exceptions.MasterTemporarilyNotAvailableException( - ( - "Unable to connect to master at %s, likely due to " - "an ongoing leader election" - ) - % host - ) - - async def fetch(self, url, **kwargs): - return await self._request(url, **kwargs) - - async def post(self, url, **kwargs): - return await self._request(url, method="POST", **kwargs) - - def _file_resolver(self, cfg): - return self.resolve(open(cfg[6:], "r+").read().strip()) - - @retry(KazooTimeoutError, tries=5, delay=0.5, logger=logger) - def _zookeeper_resolver(self, cfg): - hosts, path = cfg[5:].split("/", 1) - path = "/" + path - - retry = KazooRetry(max_tries=10) - with zookeeper.client( - hosts=hosts, read_only=True, connection_retry=retry, command_retry=retry - ) as zk: - - def master_id(key): - return int(key.split("_")[-1]) - - def get_masters(): - return [x for x in zk.get_children(path) if 
re.search(r"\d+", x)] - - leader = sorted(get_masters(), key=lambda x: master_id(x)) - - if len(leader) == 0: - raise exceptions.MasterNotAvailableException( - f"cannot find any masters at {cfg}" - ) - data, stat = zk.get(os.path.join(path, leader[0])) - - if not data: - exceptions.MasterNotAvailableException( - "Cannot retrieve valid MasterInfo data from ZooKeeper" - ) - else: - data = data.decode("utf8") - - try: - parsed = json.loads(data) - if parsed and "address" in parsed: - ip = parsed["address"].get("ip") - port = parsed["address"].get("port") - if ip and port: - return f"{ip}:{port}" - except ValueError as parse_error: - log.debug( - "[WARN] No JSON content, probably connecting to older " - "Mesos version. Reason: {}".format(parse_error) - ) - raise exceptions.MasterNotAvailableException( - "Failed to parse mesos master ip from ZK" - ) - - @log.duration - def resolve(self, cfg): - """Resolve the URL to the mesos master. - - The value of cfg should be one of: - - host:port - - zk://host1:port1,host2:port2/path - - zk://username:password@host1:port1/path - - file:///path/to/file (where file contains one of the above) - """ - if cfg.startswith("zk:"): - return self._zookeeper_resolver(cfg) - elif cfg.startswith("file:"): - return self._file_resolver(cfg) - else: - return cfg - - @async_ttl_cache(ttl=15, cleanup_self=True) - async def state(self) -> MesosState: - return await (await self.fetch("/master/state.json", cached=True)).json() - - async def state_summary(self) -> MesosState: - return await (await self.fetch("/master/state-summary")).json() - - @async_ttl_cache(ttl=None, cleanup_self=True) - async def slave(self, fltr): - lst = await self.slaves(fltr) - - log.debug(f"master.slave({fltr})") - - if len(lst) == 0: - raise exceptions.SlaveDoesNotExist(f"Slave {fltr} no longer exists.") - - elif len(lst) > 1: - raise exceptions.MultipleSlavesForIDError( - "Multiple slaves matching filter {}. 
{}".format( - fltr, ",".join([slave.id for slave in lst]) - ) - ) - - return lst[0] - - async def slaves(self, fltr=""): - return [ - slave.MesosSlave(self.config, x) - for x in (await self.state())["slaves"] - if fltr == x["id"] - ] - - async def _task_list(self, active_only=False): - keys = ["tasks"] - if not active_only: - keys.append("completed_tasks") - return itertools.chain( - *[util.merge(x, *keys) for x in await self._framework_list(active_only)] - ) - - async def task(self, fltr): - lst = await self.tasks(fltr) - - if len(lst) == 0: - raise exceptions.TaskNotFoundException( - "Cannot find a task with filter %s" % fltr - ) - - elif len(lst) > 1: - raise exceptions.MultipleTasksForIDError( - "Multiple tasks matching filter {}. {}".format( - fltr, ",".join([task.id for task in lst]) - ) - ) - return lst[0] - - async def orphan_tasks(self): - return (await self.state())["orphan_tasks"] - - # XXX - need to filter on task state as well as id - async def tasks(self, fltr="", active_only=False): - return [ - task.Task(self, x) - for x in await self._task_list(active_only) - if fltr in x["id"] or fnmatch.fnmatch(x["id"], fltr) - ] - - async def framework(self, fwid): - return list(filter(lambda x: x.id == fwid, await self.frameworks()))[0] - - async def _framework_list(self, active_only=False): - keys = ["frameworks"] - if not active_only: - keys.append("completed_frameworks") - return util.merge(await self._frameworks(), *keys) - - @async_ttl_cache(ttl=15, cleanup_self=True) - async def _frameworks(self): - return await (await self.fetch("/master/frameworks", cached=True)).json() - - async def frameworks(self, active_only=False): - return [framework.Framework(f) for f in await self._framework_list(active_only)] - - async def teardown(self, framework_id): - return await self.post("/master/teardown", data="frameworkId=%s" % framework_id) - - async def metrics_snapshot(self) -> MesosMetrics: - return await (await self.fetch("/metrics/snapshot")).json() - - @property 
# type: ignore - @util.memoize - def log(self): - return mesos_file.File(self, path="/master/log") diff --git a/paasta_tools/mesos/mesos_file.py b/paasta_tools/mesos/mesos_file.py deleted file mode 100644 index f72040ff12..0000000000 --- a/paasta_tools/mesos/mesos_file.py +++ /dev/null @@ -1,169 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os - -from . import exceptions -from paasta_tools.async_utils import async_ttl_cache - - -class File: - - chunk_size = 1024 - - def __init__(self, host, task=None, path=None): - self.host = host - self.task = task - self.path = path - - if self.task is None: - self._host_path = self.path - else: - self._host_path = None # Defer until later (_fetch) so we don't make HTTP requests in __init__. 
- - self._offset = 0 - - # Used during fetch, class level so the dict isn't constantly alloc'd - self._params = { - "path": self._host_path, - "offset": -1, - "length": self.chunk_size, - } - - def __eq__(self, y): - return self.key() == y.key() - - def __hash__(self): - return hash(self.__str__()) - - def __repr__(self): - return f"" - - def __str__(self): - return f"{self._where}:{self.path}" - - def key(self): - return "{}:{}".format(self.host.key(), self._host_path) - - @property - def _where(self): - return self.task["id"] if self.task is not None else self.host.key() - - async def _fetch(self): - # fill in path if it wasn't set in __init__ - if self._params["path"] is None: - self._params["path"] = os.path.join(await self.task.directory(), self.path) - - resp = await self.host.fetch("/files/read.json", params=self._params) - if resp.status == 404: - raise exceptions.FileDoesNotExist("No such file or directory.") - return await resp.json() - - async def exists(self): - try: - await self.size() - return True - except exceptions.FileDoesNotExist: - return False - except exceptions.SlaveDoesNotExist: - return False - - # When reading a file, it is common to first check whether it exists, then - # look at the size to determine where to seek. Instead of requiring - # multiple requests to the slave, the size is cached for a very short - # period of time. 
- @async_ttl_cache(ttl=0.5, cleanup_self=True) - async def size(self): - return (await self._fetch())["offset"] - - async def seek(self, offset, whence=os.SEEK_SET): - if whence == os.SEEK_SET: - self._offset = 0 + offset - elif whence == os.SEEK_CUR: - self._offset += offset - elif whence == os.SEEK_END: - self._offset = await self.size() + offset - - def tell(self): - return self._offset - - def _length(self, start, size): - if size and self.tell() - start + self.chunk_size > size: - return size - (self.tell() - start) - return self.chunk_size - - async def _get_chunk(self, loc, size=None): - if size is None: - size = self.chunk_size - - await self.seek(loc, os.SEEK_SET) - self._params["offset"] = loc - self._params["length"] = size - - data = (await self._fetch())["data"] - await self.seek(len(data), os.SEEK_CUR) - return data - - async def _read(self, size=None): - start = self.tell() - - def pre(x): - return x == "" - - def post(x): - return size and (self.tell() - start) >= size - - blob = None - while blob != "" and not (size and (self.tell() - start) >= size): - blob = await self._get_chunk(self.tell(), size=self._length(start, size)) - yield blob - - async def _read_reverse(self, size=None): - fsize = await self.size() - if not size: - size = fsize - - def next_block(): - current = fsize - while (current - self.chunk_size) > (fsize - size): - current -= self.chunk_size - yield current - - for pos in next_block(): - yield await self._get_chunk(pos) - - yield await self._get_chunk(fsize - size, size % self.chunk_size) - - async def _readlines(self, size=None): - last = "" - async for blob in self._read(size): - - # This is not streaming and assumes small chunk sizes - blob_lines = (last + blob).split("\n") - for line in blob_lines[: len(blob_lines) - 1]: - yield line - - last = blob_lines[-1] - - async def _readlines_reverse(self, size=None): - buf = "" - async for blob in self._read_reverse(size): - - blob_lines = (blob + buf).split("\n") - for line in 
reversed(blob_lines[1:]): - yield line - - buf = blob_lines[0] - yield buf diff --git a/paasta_tools/mesos/parallel.py b/paasta_tools/mesos/parallel.py deleted file mode 100644 index 35b0336955..0000000000 --- a/paasta_tools/mesos/parallel.py +++ /dev/null @@ -1,52 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import concurrent.futures -import contextlib - -from . import exceptions - - -@contextlib.contextmanager -def execute(max_workers): - try: - executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) - yield executor - except KeyboardInterrupt: - # Threads in the ThreadPoolExecutor are created with - # daemon=True. There is, therefore, an atexit function registered - # that allows all the currently running threads to stop before - # allowing the interpreter to stop. Because we don't care whether - # the worker threads exit cleanly or not, we force shutdown to be - # immediate. 
- concurrent.futures.thread._threads_queues.clear() - raise - finally: - executor.shutdown(wait=False) - - -def stream(fn, elements, workers): - """Yield the results of fn as jobs complete.""" - jobs = [] - - with execute(workers) as executor: - for elem in elements: - jobs.append(executor.submit(fn, elem)) - - for job in concurrent.futures.as_completed(jobs): - try: - yield job.result() - except exceptions.SkipResult: - pass diff --git a/paasta_tools/mesos/slave.py b/paasta_tools/mesos/slave.py deleted file mode 100644 index 256936dd4c..0000000000 --- a/paasta_tools/mesos/slave.py +++ /dev/null @@ -1,115 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from urllib.parse import urljoin - -import aiohttp - -from . import exceptions -from . import mesos_file -from . 
import util -from paasta_tools.async_utils import async_ttl_cache -from paasta_tools.utils import get_user_agent - - -class MesosSlave: - def __init__(self, config, items): - self.config = config - self.__items = items - - def __getitem__(self, name): - return self.__items[name] - - def __str__(self): - return self.key() - - def key(self): - return self["pid"].split("@")[-1] - - @property - def host(self): - return "{}://{}:{}".format( - self.config["scheme"], self["hostname"], self["pid"].split(":")[-1] - ) - - async def fetch(self, url, **kwargs) -> aiohttp.ClientResponse: - headers = {"User-Agent": get_user_agent()} - async with aiohttp.ClientSession( - conn_timeout=self.config["response_timeout"], - read_timeout=self.config["response_timeout"], - ) as session: - try: - async with session.get( - urljoin(self.host, url), headers=headers, **kwargs - ) as response: - await response.text() - return response - except aiohttp.ClientConnectionError: - raise exceptions.SlaveDoesNotExist( - f"Unable to connect to the slave at {self.host}" - ) - - @async_ttl_cache(ttl=5, cleanup_self=True) - async def state(self): - return await (await self.fetch("/slave(1)/state.json")).json() - - async def frameworks(self): - return util.merge(await self.state(), "frameworks", "completed_frameworks") - - async def task_executor(self, task_id): - for fw in await self.frameworks(): - for exc in util.merge(fw, "executors", "completed_executors"): - if task_id in list( - map( - lambda x: x["id"], - util.merge(exc, "completed_tasks", "tasks", "queued_tasks"), - ) - ): - return exc - raise exceptions.MissingExecutor("No executor has a task by that id") - - async def file_list(self, path): - # The sandbox does not exist on the slave. 
- if path == "": - return [] - - resp = self.fetch("/files/browse.json", params={"path": path}) - if resp.status_code == 404: - return [] - return await resp.json() - - def file(self, task, path): - return mesos_file.File(self, task, path) - - @async_ttl_cache(ttl=30, cleanup_self=True) - async def stats(self): - return await (await self.fetch("/monitor/statistics.json")).json() - - def executor_stats(self, _id): - return list(filter(lambda x: x["executor_id"])) - - async def task_stats(self, _id): - stats = list(filter(lambda x: x["executor_id"] == _id, await self.stats())) - - # Tasks that are not yet in a RUNNING state have no stats. - if len(stats) == 0: - return {} - else: - return stats[0]["statistics"] - - @property # type: ignore - @util.memoize - def log(self): - return mesos_file.File(self, path="/slave/log") diff --git a/paasta_tools/mesos/task.py b/paasta_tools/mesos/task.py deleted file mode 100644 index e5397fc5e2..0000000000 --- a/paasta_tools/mesos/task.py +++ /dev/null @@ -1,94 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import re - -import a_sync - -from . import exceptions -from . import framework -from . 
import mesos_file -from paasta_tools.async_utils import async_ttl_cache - - -class Task: - - cmd_re = re.compile(r"\(Command: (.+)\)") - - def __init__(self, master, items): - self.master = master - self.__items = items - - def __str__(self): - return "{}:{}".format(a_sync.block(self.slave), self["id"]) - - def __getitem__(self, name): - return self.__items[name] - - async def executor(self): - return await (await self.slave()).task_executor(self["id"]) - - async def framework(self): - return framework.Framework(await self.master.framework(self["framework_id"])) - - @async_ttl_cache(cleanup_self=True) - async def directory(self): - try: - return (await self.executor())["directory"] - except exceptions.MissingExecutor: - return "" - - @async_ttl_cache(cleanup_self=True) - async def slave(self): - return await self.master.slave(self["slave_id"]) - - async def file(self, path): - return mesos_file.File(await self.slave(), self, path) - - async def file_list(self, path): - return await (await self.slave()).file_list(os.path.join(self.directory, path)) - - async def stats(self): - try: - return await (await self.slave()).task_stats(self["id"]) - except exceptions.MissingExecutor: - return {} - - async def cpu_time(self): - st = await self.stats() - secs = st.get("cpus_user_time_secs", 0) + st.get("cpus_system_time_secs", 0) - return secs - - async def cpu_limit(self): - return (await self.stats()).get("cpus_limit", 0) - - async def mem_limit(self): - return (await self.stats()).get("mem_limit_bytes", 0) - - async def rss(self): - return (await self.stats()).get("mem_rss_bytes", 0) - - async def command(self): - try: - result = self.cmd_re.search((await self.executor())["name"]) - except exceptions.MissingExecutor: - result = None - if not result: - return "none" - return result.group(1) - - async def user(self): - return (await self.framework()).user diff --git a/paasta_tools/mesos/util.py b/paasta_tools/mesos/util.py deleted file mode 100644 index 
46f5589751..0000000000 --- a/paasta_tools/mesos/util.py +++ /dev/null @@ -1,69 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import functools -import itertools -import time - - -def merge(obj, *keys): - return itertools.chain(*[obj[k] for k in keys]) - - -class CachedProperty: - def __init__(self, ttl=300): - self.ttl = ttl - - def __call__(self, fget, doc=None): - self.fget = fget - self.__doc__ = doc or fget.__doc__ - self.__name__ = fget.__name__ - self.__module__ = fget.__module__ - return self - - def __get__(self, inst, owner): - try: - value, last_update = inst._cache[self.__name__] - if self.ttl > 0 and time.time() - last_update > self.ttl: - raise AttributeError - except (KeyError, AttributeError): - value = self.fget(inst) - try: - cache = inst._cache - except AttributeError: - cache = inst._cache = {} - cache[self.__name__] = (value, time.time()) - return value - - -def memoize(obj): - cache = obj.cache = {} - - @functools.wraps(obj) - def memoizer(*args, **kwargs): - key = str(args) + str(kwargs) - if key not in cache: - cache[key] = obj(*args, **kwargs) - return cache[key] - - return memoizer - - -def humanize_bytes(b): - abbrevs = ((1 << 30, "GB"), (1 << 20, "MB"), (1 << 10, "kB"), (1, "B")) - for factor, 
suffix in abbrevs: - if b >= factor: - break - return "%.*f %s" % (2, b / float(factor), suffix) diff --git a/paasta_tools/mesos/zookeeper.py b/paasta_tools/mesos/zookeeper.py deleted file mode 100644 index e8cab501f0..0000000000 --- a/paasta_tools/mesos/zookeeper.py +++ /dev/null @@ -1,37 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import contextlib - -import kazoo.client -import kazoo.exceptions -import kazoo.handlers.threading - - -TIMEOUT = 1 - -# Helper for testing -client_class = kazoo.client.KazooClient - - -@contextlib.contextmanager -def client(*args, **kwargs): - zk = client_class(*args, **kwargs) - zk.start(timeout=TIMEOUT) - try: - yield zk - finally: - zk.stop() - zk.close() diff --git a/paasta_tools/mesos_maintenance.py b/paasta_tools/mesos_maintenance.py deleted file mode 100755 index 51ec62108b..0000000000 --- a/paasta_tools/mesos_maintenance.py +++ /dev/null @@ -1,848 +0,0 @@ -#!/usr/bin/env python -# Copyright 2015-2016 Yelp Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import datetime -import json -import logging -from socket import gaierror -from socket import getfqdn -from socket import gethostbyname -from typing import List -from typing import NamedTuple -from typing import Optional - -import a_sync -from dateutil import parser -from pytimeparse import timeparse -from requests import Request -from requests import Session -from requests.exceptions import HTTPError - -from paasta_tools.mesos_tools import get_count_running_tasks_on_slave -from paasta_tools.mesos_tools import get_mesos_config_path -from paasta_tools.mesos_tools import get_mesos_leader -from paasta_tools.mesos_tools import get_mesos_master -from paasta_tools.mesos_tools import MESOS_MASTER_PORT -from paasta_tools.utils import SystemPaastaConfig -from paasta_tools.utils import time_cache -from paasta_tools.utils import to_bytes - - -log = logging.getLogger(__name__) - - -class Hostname(NamedTuple): - host: str - ip: str - - -class Credentials(NamedTuple): - file: str - principal: str - secret: str - - -class Resource(NamedTuple): - name: str - amount: int - - -MAINTENANCE_ROLE = "maintenance" - - -def base_api(mesos_config_path: Optional[str] = None): - """Helper function for making all API requests - - :returns: a function that can be called to make a request - """ - leader = get_mesos_leader(mesos_config_path) - - def execute_request(method, endpoint, timeout=(3, 2), **kwargs): - url = "http://%s:%d%s" % (leader, MESOS_MASTER_PORT, endpoint) - s = Session() - s.auth = (get_principal(), get_secret()) - req = Request(method, url, **kwargs) 
- prepared = s.prepare_request(req) - try: - resp = s.send(prepared, timeout=timeout) - resp.raise_for_status() - return resp - except HTTPError: - raise HTTPError("Error executing API request calling %s." % url) - - return execute_request - - -def master_api(mesos_config_path: Optional[str] = None): - """Helper function for making API requests to the /master API endpoints - - :returns: a function that can be called to make a request to /master - """ - - def execute_master_api_request(method, endpoint, **kwargs): - base_api_client = base_api(mesos_config_path=mesos_config_path) - return base_api_client(method, "/master%s" % endpoint, **kwargs) - - return execute_master_api_request - - -def operator_api(mesos_config_path: Optional[str] = None): - def execute_operator_api_request(**kwargs): - base_api_client = base_api(mesos_config_path=mesos_config_path) - if "headers" in kwargs: - kwargs["headers"]["Content-Type"] = "application/json" - else: - kwargs["headers"] = {"Content-Type": "application/json"} - data = kwargs.pop("data") - return base_api_client("POST", "/api/v1", data=json.dumps(data), **kwargs) - - return execute_operator_api_request - - -def reserve_api(): - """Helper function for making API requests to the /reserve API endpoints - - :returns: a function that can be called to make a request to /reserve - """ - - def execute_reserve_api_request(method, endpoint, **kwargs): - master_api_client = master_api() - return master_api_client(method, "/reserve%s" % endpoint, **kwargs) - - return execute_reserve_api_request - - -def unreserve_api(): - """Helper function for making API requests to the /unreserve API endpoints - - :returns: a function that can be called to make a request to /unreserve - """ - - def execute_unreserve_api_request(method, endpoint, **kwargs): - master_api_client = master_api() - return master_api_client(method, "/unreserve%s" % endpoint, **kwargs) - - return execute_unreserve_api_request - - -def maintenance_api(): - """Helper function 
for making API requests to the /master/maintenance API endpoints - - :returns: a function that can be called to make a request to /master/maintenance - """ - - def execute_schedule_api_request(method, endpoint, **kwargs): - master_api_client = master_api() - return master_api_client( - method, "/maintenance%s" % endpoint, timeout=(3, 10), **kwargs - ) - - return execute_schedule_api_request - - -def get_schedule_client(): - """Helper function for making API requests to the /master/maintenance/schedule API endpoints - - :returns: a function that can be called to make a request to /master/maintenance/schedule - """ - - def execute_schedule_api_request(method, endpoint, **kwargs): - maintenance_api_client = maintenance_api() - return maintenance_api_client(method, "/schedule%s" % endpoint, **kwargs) - - return execute_schedule_api_request - - -def get_maintenance_schedule(): - """Makes a GET_MAINTENANCE_SCHEDULE request to the operator api - - :returns: a GET_MAINTENANCE_SCHEDULE response - """ - client_fn = operator_api() - return client_fn(data={"type": "GET_MAINTENANCE_SCHEDULE"}) - - -@time_cache(ttl=10) -def get_maintenance_status(mesos_config_path: Optional[str] = None): - """Makes a GET_MAINTENANCE_STATUS request to the operator api - - :returns: a GET_MAINTENANCE_STATUS response - """ - client_fn = operator_api(mesos_config_path=mesos_config_path) - return client_fn(data={"type": "GET_MAINTENANCE_STATUS"}) - - -def schedule(): - """Get the Mesos maintenance schedule. This contains hostname/ip mappings and their maintenance window. 
- :returns: GET_MAINTENANCE_SCHEDULE response text - """ - try: - schedule = get_maintenance_schedule() - except HTTPError: - raise HTTPError("Error getting maintenance schedule.") - return schedule.text - - -def get_hosts_with_state( - state, system_paasta_config: Optional[SystemPaastaConfig] = None -) -> List[str]: - """Helper function to check the maintenance status and return all hosts - listed as being in a current state - - :param state: State we are interested in ('down_machines' or 'draining_machines') - :returns: A list of hostnames in the specified state or an empty list if no machines - """ - - mesos_config_path = get_mesos_config_path(system_paasta_config) - try: - status = get_maintenance_status(mesos_config_path).json() - status = status["get_maintenance_status"]["status"] - except HTTPError: - raise HTTPError("Error getting maintenance status.") - if not status or state not in status: - return [] - if "id" in status[state][0]: - return [machine["id"]["hostname"] for machine in status[state]] - else: - return [machine["hostname"] for machine in status[state]] - - -def get_draining_hosts(system_paasta_config: Optional[SystemPaastaConfig] = None): - """Returns a list of hostnames that are marked as draining - - :returns: a list of strings representing hostnames - """ - return get_hosts_with_state( - state="draining_machines", system_paasta_config=system_paasta_config - ) - - -def get_down_hosts(): - """Returns a list of hostnames that are marked as down - - :returns: a list of strings representing hostnames - """ - return get_hosts_with_state(state="down_machines") - - -def is_host_draining(hostname=getfqdn()): - """Checks if the specified hostname is marked as draining - - :param hostname: Hostname we want to check if draining (defaults to current host) - :returns: a boolean representing whether or not the specified hostname is draining - """ - return hostname in get_draining_hosts() - - -def is_host_down(hostname=getfqdn()): - """Checks if the 
specified hostname is marked as down - - :param hostname: Hostname we want to check if down (defaults to current host) - :returns: a boolean representing whether or not the specified hostname is down - """ - return hostname in get_down_hosts() - - -def get_hosts_forgotten_draining(grace=0): - """Find hosts that are still marked as draining (rather than down) after the start - of their maintenance window. - :param grace: integer number of nanoseconds to allow a host to be left in the draining - state after the start of its maintenance window before we consider it forgotten. - :returns: a list of hostnames of hosts forgotten draining - """ - draining_hosts = get_draining_hosts() - log.debug("draining_hosts: %s" % draining_hosts) - - hosts_past_maintenance_start = get_hosts_past_maintenance_start(grace=grace) - log.debug("hosts_past_maintenance_start: %s" % hosts_past_maintenance_start) - - forgotten_draining = list( - set(draining_hosts).intersection(hosts_past_maintenance_start) - ) - log.debug("forgotten_draining: %s" % forgotten_draining) - - return forgotten_draining - - -def are_hosts_forgotten_draining(): - """Quick way to test if there are any forgotten draining hosts. - :returns: a boolean that is True if there are any forgotten draining - hosts and False otherwise - """ - return bool(get_hosts_forgotten_draining()) - - -def get_hosts_forgotten_down(grace=0): - """Find hosts that are still marked as down (rather than up) after the end - of their maintenance window. - :param grace: integer number of nanoseconds to allow a host to be left in the down - state after the end of its maintenance window before we consider it forgotten. 
- :returns: a list of hostnames of hosts forgotten down - """ - down_hosts = get_down_hosts() - log.debug("down_hosts: %s" % down_hosts) - - hosts_past_maintenance_end = get_hosts_past_maintenance_end(grace=grace) - log.debug("hosts_past_maintenance_end: %s" % hosts_past_maintenance_end) - - forgotten_down = list(set(down_hosts).intersection(hosts_past_maintenance_end)) - log.debug("forgotten_down: %s" % forgotten_down) - - return forgotten_down - - -def are_hosts_forgotten_down(): - """Quick way to test if there are any forgotten down hosts. - :returns: a boolean that is True if there are any forgotten down - hosts and False otherwise - """ - return bool(get_hosts_forgotten_down()) - - -def parse_timedelta(value): - """Return the delta in nanoseconds. - :param value: a string containing a time format supported by :mod:`pytimeparse` - :returns: an integer (or float) representing the specified delta in nanoseconds - """ - error_msg = "'%s' is not a valid time expression" % value - try: - seconds = timeparse.timeparse(value) - except TypeError: - raise argparse.ArgumentTypeError(error_msg) - if not seconds: - raise argparse.ArgumentTypeError(error_msg) - return seconds_to_nanoseconds(seconds) - - -def parse_datetime(value): - """Return the datetime in nanoseconds. - :param value: a string containing a datetime supported by :mod:`dateutil.parser` - :returns: an integer (or float) representing the specified datetime in nanoseconds - """ - error_msg = "'%s' is not a valid datetime expression" % value - try: - dt = parser.parse(value) - except Exception: - raise argparse.ArgumentTypeError(error_msg) - if not dt: - raise argparse.ArgumentTypeError(error_msg) - return datetime_to_nanoseconds(dt) - - -def datetime_seconds_from_now(seconds): - """Given a number of seconds, returns a datetime object representing that number of seconds in the future from the - current time. 
- :param seconds: an integer representing a certain number of seconds - :returns: a datetime.timedelta representing now + the specified number of seconds - """ - return now() + datetime.timedelta(seconds=seconds) - - -def now(): - """Returns a datetime object representing the current time - - :returns: a datetime.datetime object representing the current time - """ - return datetime.datetime.now() - - -def seconds_to_nanoseconds(seconds): - """Convert the specified number of seconds to nanoseconds - :param seconds: an integer representing a certain number of seconds - :returns: an integer (or float) representation of the specified number of seconds as nanoseconds - """ - return seconds * 1000000000 - - -def datetime_to_nanoseconds(dt): - """Convert the provided datetime object into nanoseconds - - :returns: an integer (or float) representation of the specified datetime as nanoseconds - """ - return seconds_to_nanoseconds(int(dt.strftime("%s"))) - - -def build_maintenance_payload(hostnames, maint_type): - """Creates the JSON payload necessary to bring the specified hostnames up/down for maintenance. 
- :param hostnames: a list of hostnames - :returns: a dictionary representing the list of machines to bring up/down for maintenance - """ - return { - "type": maint_type.upper(), - maint_type.lower(): {"machines": get_machine_ids(hostnames)}, - } - - -def hostnames_to_components(hostnames, resolve=False): - """Converts a list of 'host[|ip]' entries into namedtuples containing 'host' and 'ip' attributes, - optionally performing a DNS lookup to resolve the hostname into an IP address - :param hostnames: a list of hostnames where each hostname can be of the form 'host[|ip]' - :param resolve: boolean representing whether to lookup the IP address corresponding to the hostname via DNS - :returns: a namedtuple containing the hostname and IP components - """ - - components = [] - for hostname in hostnames: - # This is to allow specifying a hostname as "hostname|ipaddress" - # to avoid querying DNS for the IP. - if "|" in hostname: - (host, ip) = hostname.split("|") - components.append(Hostname(host=host, ip=ip)) - else: - try: - ip = gethostbyname(hostname) if resolve else None - except gaierror: - log.error(f"Failed to resolve IP for {hostname}, continuing regardless") - continue - components.append(Hostname(host=hostname, ip=ip)) - return components - - -def get_machine_ids(hostnames): - """Helper function to convert a list of hostnames into a JSON list of hostname/ip pairs. - :param hostnames: a list of hostnames - :returns: a dictionary representing the list of machines to bring up/down for maintenance - """ - machine_ids = [] - components = hostnames_to_components(hostnames, resolve=True) - for component in components: - machine_id = {"hostname": component.host, "ip": component.ip} - machine_ids.append(machine_id) - return machine_ids - - -def build_reservation_payload(resources): - """Creates the JSON payload needed to dynamically (un)reserve resources in mesos. 
- :param resources: list of Resource named tuples specifying the name and amount of the resource to (un)reserve - :returns: a dictionary that can be sent to Mesos to (un)reserve resources - """ - payload = [] - for resource in resources: - payload.append( - { - "name": resource.name, - "type": "SCALAR", - "scalar": {"value": resource.amount}, - "role": MAINTENANCE_ROLE, - "reservation": {"principal": get_principal()}, - } - ) - return payload - - -def build_maintenance_schedule_payload( - hostnames, start=None, duration=None, drain=True -): - """Creates the JSON payload needed to (un)schedule maintenance on the specified hostnames. - :param hostnames: a list of hostnames - :param start: the time to start the maintenance, represented as number of nanoseconds since the epoch - :param duration: length of the maintenance window, represented as number of nanoseconds since the epoch - :param drain: boolean to note whether we are draining (True) the specified hosts or undraining (False) them - :returns: a dictionary that can be sent to Mesos to (un)schedule maintenance - """ - schedule = get_maintenance_schedule().json()["get_maintenance_schedule"]["schedule"] - machine_ids = get_machine_ids(hostnames) - - if drain: - unavailability = dict() - unavailability["start"] = dict() - unavailability["start"]["nanoseconds"] = int(start) - unavailability["duration"] = dict() - unavailability["duration"]["nanoseconds"] = int(duration) - - window = dict() - window["machine_ids"] = machine_ids - window["unavailability"] = unavailability - - if schedule: - for existing_window in schedule["windows"]: - for existing_machine_id in existing_window["machine_ids"]: - # If we already have a maintenance window scheduled for one of the hosts, - # replace it with the new window. 
- if existing_machine_id in machine_ids: - existing_window["machine_ids"].remove(existing_machine_id) - if not existing_window["machine_ids"]: - schedule["windows"].remove(existing_window) - if drain: - windows = schedule["windows"] + [window] - else: - windows = schedule["windows"] - elif drain: - windows = [window] - else: - windows = [] - - payload = dict() - payload["windows"] = windows - - return { - "type": "UPDATE_MAINTENANCE_SCHEDULE", - "update_maintenance_schedule": {"schedule": payload}, - } - - -def load_credentials(mesos_secrets="/nail/etc/mesos-slave-secret"): - """Loads the mesos-slave credentials from the specified file. These credentials will be used for all - maintenance API requests. - :param mesos_secrets: optional argument specifying the path to the file containing the mesos-slave credentials - :returns: a tuple of the form (username, password) - """ - try: - with open(mesos_secrets) as data_file: - data = json.load(data_file) - except EnvironmentError: - log.error( - "maintenance calls must be run on a Mesos slave containing valid credentials (%s)" - % mesos_secrets - ) - raise - try: - username = data["principal"] - password = data["secret"] - except KeyError: - log.error( - "%s does not contain Mesos slave credentials in the expected format. 
" - "See http://mesos.apache.org/documentation/latest/authentication/ for details" - % mesos_secrets - ) - raise - return Credentials(file=mesos_secrets, principal=username, secret=password) - - -def get_principal(mesos_secrets="/nail/etc/mesos-slave-secret"): - """Helper function to get the principal from the mesos-slave credentials - :param mesos_secrets: optional argument specifying the path to the file containing the mesos-slave credentials - :returns: a string containing the principal/username - """ - return load_credentials(mesos_secrets).principal - - -def get_secret(mesos_secrets="/nail/etc/mesos-slave-secret"): - """Helper function to get the secret from the mesos-slave credentials - :param mesos_secrets: optional argument specifying the path to the file containing the mesos-slave credentials - :returns: a string containing the secret/password - """ - return load_credentials(mesos_secrets).secret - - -def _make_request_payload(slave_id, reservation_payload): - return { - "slaveId": slave_id.encode("UTF-8"), - # We used to_bytes here since py2 json doesn't have a well defined - # return type. When moving to python 3, replace with .encode() - "resources": to_bytes(json.dumps(reservation_payload)).replace(b"+", b"%20"), - } - - -def _make_operator_reservation_request_payload(slave_id, payload, request_type): - return { - "type": request_type.upper(), - request_type.lower(): {"agent_id": {"value": slave_id}}, - "resources": payload, - } - - -def reserve(slave_id, resources): - """Dynamically reserve resources in mesos to prevent tasks from using them. 
- :param slave_id: the id of the mesos slave - :param resources: list of Resource named tuples specifying the name and amount of the resource to (un)reserve - :returns: boolean where 0 represents success and 1 is a failure - """ - log.info(f"Dynamically reserving resources on {slave_id}: {resources}") - payload = _make_operator_reservation_request_payload( - slave_id=slave_id, - payload=build_reservation_payload(resources), - request_type="reserve_resources", - ) - client_fn = operator_api() - try: - print(payload) - reserve_output = client_fn(data=payload).text - except HTTPError: - raise HTTPError("Error adding dynamic reservation.") - return reserve_output - - -def unreserve(slave_id, resources): - """Dynamically unreserve resources in mesos to allow tasks to using them. - :param slave_id: the id of the mesos slave - :param resources: list of Resource named tuples specifying the name and amount of the resource to (un)reserve - :returns: boolean where 0 represents success and 1 is a failure - """ - log.info(f"Dynamically unreserving resources on {slave_id}: {resources}") - payload = _make_operator_reservation_request_payload( - slave_id=slave_id, - payload=build_reservation_payload(resources), - request_type="unreserve_resources", - ) - client_fn = operator_api() - try: - unreserve_output = client_fn(data=payload).text - except HTTPError: - raise HTTPError("Error adding dynamic unreservation.") - return unreserve_output - - -def components_to_hosts(components): - """Convert a list of Component namedtuples to a list of their hosts - :param components: a list of Component namedtuples - :returns: list of the hosts associated with each Component - """ - hosts = [] - for component in components: - hosts.append(component.host) - return hosts - - -def reserve_all_resources(hostnames): - """Dynamically reserve all available resources on the specified hosts - :param hostnames: list of hostnames to reserve resources on - """ - mesos_state = 
a_sync.block(get_mesos_master().state_summary) - components = hostnames_to_components(hostnames) - hosts = components_to_hosts(components) - known_slaves = [ - slave for slave in mesos_state["slaves"] if slave["hostname"] in hosts - ] - for slave in known_slaves: - hostname = slave["hostname"] - log.info("Reserving all resources on %s" % hostname) - slave_id = slave["id"] - resources = [] - for resource in ["disk", "mem", "cpus", "gpus"]: - free_resource = ( - slave["resources"][resource] - slave["used_resources"][resource] - ) - for role in slave["reserved_resources"]: - free_resource -= slave["reserved_resources"][role][resource] - resources.append(Resource(name=resource, amount=free_resource)) - try: - reserve(slave_id=slave_id, resources=resources) - except HTTPError: - raise HTTPError( - f"Failed reserving all of the resources on {hostname} ({slave_id}). Aborting." - ) - - -def unreserve_all_resources(hostnames): - """Dynamically unreserve all available resources on the specified hosts - :param hostnames: list of hostnames to unreserve resources on - """ - mesos_state = a_sync.block(get_mesos_master().state_summary) - components = hostnames_to_components(hostnames) - hosts = components_to_hosts(components) - known_slaves = [ - slave for slave in mesos_state["slaves"] if slave["hostname"] in hosts - ] - for slave in known_slaves: - hostname = slave["hostname"] - log.info("Unreserving all resources on %s" % hostname) - slave_id = slave["id"] - resources = [] - if MAINTENANCE_ROLE in slave["reserved_resources"]: - for resource in ["disk", "mem", "cpus", "gpus"]: - reserved_resource = slave["reserved_resources"][MAINTENANCE_ROLE][ - resource - ] - resources.append(Resource(name=resource, amount=reserved_resource)) - try: - unreserve(slave_id=slave_id, resources=resources) - except HTTPError: - raise HTTPError( - f"Failed unreserving all of the resources on {hostname} ({slave_id}). Aborting." 
- ) - - -def drain(hostnames, start, duration, reserve_resources=True): - """Schedules a maintenance window for the specified hosts and marks them as draining. - :param hostnames: a list of hostnames - :param start: the time to start the maintenance, represented as number of nanoseconds since the epoch - :param duration: length of the maintenance window, represented as number of nanoseconds since the epoch - :param reserve_resources: bool setting to also reserve the free resources on the agent before the drain call - :returns: None - """ - log.info("Draining: %s" % hostnames) - if reserve_resources: - try: - reserve_all_resources(hostnames) - except HTTPError as e: - log.warning("Failed to reserve resources, will continue to drain: %s" % e) - payload = build_maintenance_schedule_payload(hostnames, start, duration, drain=True) - client_fn = operator_api() - try: - drain_output = client_fn(data=payload).text - except HTTPError: - raise HTTPError("Error performing maintenance drain.") - return drain_output - - -def undrain(hostnames, unreserve_resources=True): - """Unschedules the maintenance window for the specified hosts and unmarks them as draining. They are ready for - regular use. 
- :param hostnames: a list of hostnames - :param unreserve_resources: bool setting to also unreserve resources on the agent before the undrain call - :returns: None - """ - log.info("Undraining: %s" % hostnames) - if unreserve_resources: - try: - unreserve_all_resources(hostnames) - except HTTPError as e: - log.warning( - "Failed to unreserve resources, will continue to undrain: %s" % e - ) - payload = build_maintenance_schedule_payload(hostnames, drain=False) - client_fn = get_schedule_client() - client_fn = operator_api() - try: - undrain_output = client_fn(data=payload).text - except HTTPError: - raise HTTPError("Error performing maintenance undrain.") - return undrain_output - - -def down(hostnames): - """Marks the specified hostnames as being down for maintenance, and makes them unavailable for use. - :param hostnames: a list of hostnames - :returns: None - """ - log.info("Bringing down: %s" % hostnames) - payload = build_maintenance_payload(hostnames, "start_maintenance") - client_fn = operator_api() - try: - down_output = client_fn(data=payload).text - except HTTPError: - raise HTTPError("Error performing maintenance down.") - return down_output - - -def up(hostnames): - """Marks the specified hostnames as no longer being down for maintenance, and makes them available for use. - :param hostnames: a list of hostnames - :returns: None - """ - log.info("Bringing up: %s" % hostnames) - payload = build_maintenance_payload(hostnames, "stop_maintenance") - client_fn = operator_api() - try: - up_output = client_fn(data=payload).text - except HTTPError: - raise HTTPError("Error performing maintenance up.") - return up_output - - -def raw_status(): - """Get the Mesos maintenance status. This contains hostname/ip mappings for hosts that are either marked as being - down for maintenance or draining. 
- :returns: Response Object containing status - """ - try: - status = get_maintenance_status() - except HTTPError: - raise HTTPError("Error performing maintenance status.") - return status - - -def status(): - """Get the Mesos maintenance status. This contains hostname/ip mappings for hosts that are either marked as being - down for maintenance or draining. - :returns: Text representation of the status - """ - return raw_status().text - - -def friendly_status(): - """Display the Mesos maintenance status in a human-friendly way. - :returns: Text representation of the human-friendly status - """ - status = raw_status().json()["get_maintenance_status"]["status"] - ret = "" - for machine in status.get("draining_machines", []): - ret += "{} ({}): Draining\n".format( - machine["id"]["hostname"], machine["id"]["ip"] - ) - for machine in status.get("down_machines", []): - ret += "{} ({}): Down\n".format(machine["hostname"], machine["ip"]) - return ret - - -def is_host_drained(hostname): - """Checks if a host has drained successfully by confirming it is - draining and currently running 0 tasks - :param hostname: hostname to check - :returns: True or False - """ - return ( - is_host_draining(hostname=hostname) - and get_count_running_tasks_on_slave(hostname) == 0 - ) - - -def is_host_past_maintenance_start(hostname): - """Checks if a host has reached the start of its maintenance window - :param hostname: hostname to check - :returns: True or False - """ - return hostname in get_hosts_past_maintenance_start() - - -def is_host_past_maintenance_end(hostname): - """Checks if a host has reached the end of its maintenance window - :param hostname: hostname to check - :returns: True or False - """ - return hostname in get_hosts_past_maintenance_end() - - -def get_hosts_past_maintenance_start(grace=0): - """Get a list of hosts that have reached the start of their maintenance window - :param grace: integer number of nanoseconds to allow a host to be left in the draining - state after 
the start of its maintenance window before we consider it past its maintenance start - :returns: List of hostnames - """ - schedules = get_maintenance_schedule().json()["get_maintenance_schedule"][ - "schedule" - ] - current_time = datetime_to_nanoseconds(now()) - grace - ret = [] - if "windows" in schedules: - for window in schedules["windows"]: - if window["unavailability"]["start"]["nanoseconds"] < current_time: - ret += [host["hostname"] for host in window["machine_ids"]] - log.debug(f"Hosts past maintenance start: {ret}") - return ret - - -def get_hosts_past_maintenance_end(grace=0): - """Get a list of hosts that have reached the end of their maintenance window - :param grace: integer number of nanoseconds to allow a host to be left in the down - state after the end of its maintenance window before we consider it past its maintenance end - :returns: List of hostnames - """ - schedules = get_maintenance_schedule().json()["get_maintenance_schedule"][ - "schedule" - ] - current_time = datetime_to_nanoseconds(now()) - grace - ret = [] - if "windows" in schedules: - for window in schedules["windows"]: - end = ( - window["unavailability"]["start"]["nanoseconds"] - + window["unavailability"]["duration"]["nanoseconds"] - ) - if end < current_time: - ret += [host["hostname"] for host in window["machine_ids"]] - log.debug(f"Hosts past maintenance end: {ret}") - return ret diff --git a/paasta_tools/mesos_tools.py b/paasta_tools/mesos_tools.py deleted file mode 100644 index e219937392..0000000000 --- a/paasta_tools/mesos_tools.py +++ /dev/null @@ -1,1051 +0,0 @@ -# Copyright 2015-2016 Yelp Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import asyncio -import datetime -import itertools -import json -import logging -import re -import socket -from collections import namedtuple -from pathlib import Path -from typing import Any -from typing import Awaitable -from typing import Callable -from typing import Collection -from typing import Dict -from typing import List -from typing import Mapping -from typing import MutableMapping -from typing import NamedTuple -from typing import Optional -from typing import Sequence -from typing import Tuple -from typing import Union -from urllib.parse import urlparse - -import a_sync -import humanize -import requests -from kazoo.client import KazooClient -from mypy_extensions import TypedDict - -import paasta_tools.mesos.cluster as cluster -import paasta_tools.mesos.exceptions as mesos_exceptions -from paasta_tools.async_utils import aiter_to_list -from paasta_tools.async_utils import async_timeout -from paasta_tools.async_utils import async_ttl_cache -from paasta_tools.long_running_service_tools import host_passes_blacklist -from paasta_tools.long_running_service_tools import host_passes_whitelist -from paasta_tools.mesos.cfg import load_mesos_config -from paasta_tools.mesos.exceptions import SlaveDoesNotExist -from paasta_tools.mesos.master import MesosMaster -from paasta_tools.mesos.master import MesosState -from paasta_tools.mesos.task import Task -from paasta_tools.utils import DeployBlacklist -from paasta_tools.utils import DeployWhitelist -from paasta_tools.utils import format_table -from paasta_tools.utils import get_user_agent -from paasta_tools.utils 
import load_system_paasta_config -from paasta_tools.utils import PaastaColors -from paasta_tools.utils import SystemPaastaConfig -from paasta_tools.utils import TimeoutError - -ZookeeperHostPath = namedtuple("ZookeeperHostPath", ["host", "path"]) -SlaveTaskCount = namedtuple("SlaveTaskCount", ["count", "slave"]) - -DEFAULT_MESOS_CLI_CONFIG_LOCATION = "/nail/etc/mesos-cli.json" - -TERMINAL_STATES = ( - "TASK_ERROR", - "TASK_KILLED", - "TASK_FAILED", - "TASK_FINISHED", - "TASK_DROPPED", - "TASK_GONE", - "TASK_GONE_BY_OPERATOR", -) - -log = logging.getLogger(__name__) -log.addHandler(logging.NullHandler()) - - -def get_mesos_config_path( - system_paasta_config: Optional[SystemPaastaConfig] = None, -) -> str: - """ - Determine where to find the configuration for mesos-cli. - """ - if system_paasta_config is None: - system_paasta_config = load_system_paasta_config() - - return system_paasta_config.get_mesos_cli_config().get( - "path", DEFAULT_MESOS_CLI_CONFIG_LOCATION - ) - - -def get_mesos_config(mesos_config_path: Optional[str] = None) -> Dict: - if mesos_config_path is None: - mesos_config_path = get_mesos_config_path() - return load_mesos_config(mesos_config_path) - - -def get_mesos_master( - mesos_config_path: Optional[str] = None, **overrides: Any -) -> MesosMaster: - config = get_mesos_config(mesos_config_path) - for k, v in overrides.items(): - config[k] = v - return MesosMaster(config) - - -MY_HOSTNAME = socket.getfqdn() -MESOS_MASTER_PORT = 5050 -MESOS_SLAVE_PORT = "5051" - - -class MesosSlaveConnectionError(Exception): - pass - - -class MesosTailLines(NamedTuple): - stdout: List[str] - stderr: List[str] - error_message: str - - -def get_mesos_leader(mesos_config_path: Optional[str] = None) -> str: - """Get the current mesos-master leader's hostname. - Attempts to determine this by using mesos.cli to query ZooKeeper. 
- - :returns: The current mesos-master hostname""" - try: - url = get_mesos_master(mesos_config_path).host - except mesos_exceptions.MasterNotAvailableException: - log.debug("mesos.cli failed to provide the master host") - raise - log.debug("mesos.cli thinks the master host is: %s" % url) - hostname = urlparse(url).hostname - log.debug("The parsed master hostname is: %s" % hostname) - # This check is necessary, as if we parse a value such as 'localhost:5050', - # it won't have a hostname attribute - if hostname: - try: - host = socket.gethostbyaddr(hostname)[0] - fqdn = socket.getfqdn(host) - except (socket.error, socket.herror, socket.gaierror, socket.timeout): - log.debug("Failed to convert mesos leader hostname to fqdn!") - raise - log.debug("Mesos Leader: %s" % fqdn) - return fqdn - else: - raise ValueError("Expected to receive a valid URL, got: %s" % url) - - -class MesosLeaderUnavailable(Exception): - pass - - -def find_mesos_leader(cluster): - """Find the leader with redirect given one mesos master.""" - master = ( - load_system_paasta_config().get_cluster_fqdn_format().format(cluster=cluster) - ) - if master is None: - raise ValueError("Mesos master is required to find leader") - - url = f"http://{master}:{MESOS_MASTER_PORT}/redirect" - try: - # Timeouts here are for connect, read - response = requests.get(url, timeout=(5, 30)) - except Exception as e: - raise MesosLeaderUnavailable(e) - hostname = urlparse(response.url).hostname - return f"{hostname}:{MESOS_MASTER_PORT}" - - -async def get_current_tasks(job_id: str) -> List[Task]: - """Returns a list of all the tasks with a given job id. - :param job_id: the job id of the tasks. - :return tasks: a list of mesos.cli.Task. 
- """ - mesos_master = get_mesos_master() - framework_tasks = await mesos_master.tasks(fltr=job_id, active_only=False) - return framework_tasks - - -def is_task_running(task: Task) -> bool: - return task["state"] == "TASK_RUNNING" - - -def filter_running_tasks(tasks: Collection[Task]) -> List[Task]: - """Filters those tasks where it's state is TASK_RUNNING. - :param tasks: a list of mesos.cli.Task - :return filtered: a list of running tasks - """ - return [task for task in tasks if is_task_running(task)] - - -def filter_not_running_tasks(tasks: Collection[Task]) -> List[Task]: - """Filters those tasks where it's state is *not* TASK_RUNNING. - :param tasks: a list of mesos.cli.Task - :return filtered: a list of tasks *not* running - """ - return [task for task in tasks if not is_task_running(task)] - - -async def get_running_tasks_from_frameworks(job_id=""): - """Will include tasks from active and completed frameworks - but NOT orphaned tasks - """ - active_framework_tasks = await get_current_tasks(job_id) - running_tasks = filter_running_tasks(active_framework_tasks) - return running_tasks - - -async def get_all_running_tasks() -> Collection[Task]: - """Will include all running tasks; for now orphans are not included""" - framework_tasks = await get_current_tasks("") - mesos_master = get_mesos_master() - framework_tasks += await mesos_master.orphan_tasks() - running_tasks = filter_running_tasks(framework_tasks) - return running_tasks - - -@async_ttl_cache(ttl=600) -async def get_cached_list_of_all_current_tasks(): - """Returns a cached list of all mesos tasks. - - This function is used by 'paasta status' and 'paasta_serviceinit status' - to avoid re-querying mesos master and re-parsing json to get mesos.Task objects. - - - The async_ttl_cache decorator caches the list for 600 seconds. - ttl doesn't really matter for this function because when we run 'paasta status' - the corresponding HTTP request to mesos master is cached by requests_cache. 
- - :return tasks: a list of mesos.Task - """ - return await get_current_tasks("") - - -@async_ttl_cache(ttl=600) -async def get_cached_list_of_running_tasks_from_frameworks(): - """Returns a cached list of all running mesos tasks. - See the docstring for get_cached_list_of_all_current_tasks(). - - :return tasks: a list of mesos.Task - """ - return [ - task - for task in filter_running_tasks(await get_cached_list_of_all_current_tasks()) - ] - - -@async_ttl_cache(ttl=600) -async def get_cached_list_of_not_running_tasks_from_frameworks(): - """Returns a cached list of mesos tasks that are NOT running. - See the docstring for get_cached_list_of_all_current_tasks(). - - :return tasks: a list of mesos.Task""" - return [ - task - for task in filter_not_running_tasks( - await get_cached_list_of_all_current_tasks() - ) - ] - - -def select_tasks_by_id(tasks: Collection[Task], job_id: str = "") -> List[Task]: - """Returns a list of the tasks with a given job_id. - - :param tasks: a list of mesos.Task. - :param job_id: the job id. - :return tasks: a list of mesos.Task. 
- """ - return [task for task in tasks if job_id in task["id"]] - - -async def get_non_running_tasks_from_frameworks(job_id: str = "") -> List[Task]: - """Will include tasks from active and completed frameworks - but NOT orphaned tasks - """ - active_framework_tasks = await get_current_tasks(job_id) - not_running_tasks = filter_not_running_tasks(active_framework_tasks) - return not_running_tasks - - -async def get_short_hostname_from_task(task: Task) -> str: - try: - slave_hostname = (await task.slave())["hostname"] - return slave_hostname.split(".")[0] - except (AttributeError, SlaveDoesNotExist): - return "Unknown" - - -def get_first_status_timestamp(task: Task) -> Optional[float]: - try: - start_time_string = task["statuses"][0]["timestamp"] - return float(start_time_string) - except (IndexError, SlaveDoesNotExist): - return None - - -def get_first_status_timestamp_string(task: Task) -> str: - """Gets the first status timestamp from a task id and returns a human - readable string with the local time and a humanized duration: - ``2015-01-30T08:45 (an hour ago)`` - """ - first_status_timestamp = get_first_status_timestamp(task) - if first_status_timestamp is None: - return "Unknown" - else: - first_status_datetime = datetime.datetime.fromtimestamp(first_status_timestamp) - return "{} ({})".format( - first_status_datetime.strftime("%Y-%m-%dT%H:%M"), - humanize.naturaltime(first_status_datetime), - ) - - -async def get_mem_usage(task: Task) -> str: - try: - task_mem_limit = await task.mem_limit() - task_rss = await task.rss() - if task_mem_limit == 0: - return "Undef" - mem_percent = task_rss / task_mem_limit * 100 - mem_string = "%d/%dMB" % ( - (task_rss / 1024 / 1024), - (task_mem_limit / 1024 / 1024), - ) - if mem_percent > 90: - return PaastaColors.red(mem_string) - else: - return mem_string - except (AttributeError, SlaveDoesNotExist): - return "None" - except TimeoutError: - return "Timed Out" - - -async def get_cpu_shares(task: Task) -> float: - # The CPU 
shares has an additional .1 allocated to it for executor overhead. - # We subtract this to the true number - # (https://github.com/apache/mesos/blob/dc7c4b6d0bcf778cc0cad57bb108564be734143a/src/slave/constants.hpp#L100) - cpu_shares = await task.cpu_limit() - return cpu_shares - 0.1 - - -async def get_cpu_usage(task: Task) -> str: - """Calculates a metric of used_cpu/allocated_cpu - To do this, we take the total number of cpu-seconds the task has consumed, - (the sum of system and user time), OVER the total cpu time the task - has been allocated. - - The total time a task has been allocated is the total time the task has - been running (https://github.com/mesosphere/mesos/blob/0b092b1b0/src/webui/master/static/js/controllers.js#L140) - multiplied by the "shares" a task has. - """ - try: - start_time = round(task["statuses"][0]["timestamp"]) - current_time = int(datetime.datetime.now().strftime("%s")) - duration_seconds = current_time - start_time - cpu_shares = await get_cpu_shares(task) - allocated_seconds = duration_seconds * cpu_shares - task_stats = await task.stats() - used_seconds = task_stats.get("cpus_system_time_secs", 0.0) + task_stats.get( - "cpus_user_time_secs", 0.0 - ) - if allocated_seconds == 0: - return "Undef" - percent = round(100 * (used_seconds / allocated_seconds), 1) - percent_string = "%s%%" % percent - if percent > 90: - return PaastaColors.red(percent_string) - else: - return percent_string - except (AttributeError, SlaveDoesNotExist): - return "None" - except TimeoutError: - return "Timed Out" - - -async def results_or_unknown(future: Awaitable[str]) -> str: - try: - return await future - except Exception: - return PaastaColors.red("Unknown") - - -async def format_running_mesos_task_row( - task: Task, get_short_task_id: Callable[[str], str] -) -> Tuple[str, ...]: - """Returns a pretty formatted string of a running mesos task attributes""" - - short_task_id = get_short_task_id(task["id"]) - short_hostname_future = asyncio.ensure_future( - 
results_or_unknown(get_short_hostname_from_task(task)) - ) - mem_usage_future = asyncio.ensure_future(results_or_unknown(get_mem_usage(task))) - cpu_usage_future = asyncio.ensure_future(results_or_unknown(get_cpu_usage(task))) - first_status_timestamp = get_first_status_timestamp_string(task) - - await asyncio.wait([short_hostname_future, mem_usage_future, cpu_usage_future]) - - return ( - short_task_id, - short_hostname_future.result(), - mem_usage_future.result(), - cpu_usage_future.result(), - first_status_timestamp, - ) - - -async def format_non_running_mesos_task_row( - task: Task, get_short_task_id: Callable[[str], str] -) -> Tuple[str, ...]: - """Returns a pretty formatted string of a running mesos task attributes""" - return ( - PaastaColors.grey(get_short_task_id(task["id"])), - PaastaColors.grey(await results_or_unknown(get_short_hostname_from_task(task))), - PaastaColors.grey(get_first_status_timestamp_string(task)), - PaastaColors.grey(task["state"]), - ) - - -@async_timeout() -async def get_tail_lines_for_mesos_task( - task: Task, get_short_task_id: Callable[[str], str], num_tail_lines: int -) -> MutableMapping[str, Sequence[str]]: - tail_lines_dict: MutableMapping[str, Sequence[str]] = {} - mesos_cli_config = get_mesos_config() - - try: - fobjs = await aiter_to_list( - cluster.get_files_for_tasks( - task_list=[task], - file_list=["stdout", "stderr"], - max_workers=mesos_cli_config["max_workers"], - ) - ) - if not fobjs: - return {"stdout": [], "stderr": []} - - fobjs.sort(key=lambda fobj: fobj.path, reverse=True) - - for fobj in fobjs: - # read nlines, starting from EOF - tail = [] - lines_seen = 0 - - async for line in fobj._readlines_reverse(): - tail.append(line) - lines_seen += 1 - if lines_seen >= num_tail_lines: - break - - # reverse the tail, so that EOF is at the bottom again - tail_lines_dict[fobj.path] = tail[::-1] - except ( - mesos_exceptions.MasterNotAvailableException, - mesos_exceptions.SlaveDoesNotExist, - 
mesos_exceptions.TaskNotFoundException, - mesos_exceptions.FileNotFoundForTaskException, - TimeoutError, - ) as e: - short_task_id = get_short_task_id(task["id"]) - error_name = e.__class__.__name__ - return { - "error_message": f"couldn't read stdout/stderr for {short_task_id} ({error_name})" - } - - return tail_lines_dict - - -def format_tail_lines_for_mesos_task(tail_lines, task_id): - rows = [] - if (tail_lines.stderr or tail_lines.stdout) is not None: - if len(tail_lines.stderr) + len(tail_lines.stdout) == 0: - rows.append(PaastaColors.blue(f" no stdout/stderrr for {task_id}")) - else: - for stdstream in ("stdout", "stderr"): - rows.append(PaastaColors.blue(f"{stdstream} tail for {task_id}")) - rows.extend(f" {line}" for line in getattr(tail_lines, stdstream, [])) - elif tail_lines.error_message is not None: - rows.append(PaastaColors.red(f" {tail_lines.error_message}")) - - return rows - - -@async_timeout() -async def format_stdstreams_tail_for_task(task, get_short_task_id, nlines=10): - tail_lines_dict = await get_tail_lines_for_mesos_task( - task, get_short_task_id, nlines - ) - tail_lines = MesosTailLines( - stdout=tail_lines_dict.get("stdout"), - stderr=tail_lines_dict.get("stderr"), - error_message=tail_lines_dict.get("error_message"), - ) - return [ - f" {line}" - for line in format_tail_lines_for_mesos_task(tail_lines, task["id"]) - ] - - -def zip_tasks_verbose_output(table, stdstreams): - """Zip a list of strings (table) with a list of lists (stdstreams) - :param table: a formatted list of tasks - :param stdstreams: for each task, a list of lines from stdout/stderr tail - """ - if len(table) != len(stdstreams): - raise ValueError("Can only zip same-length lists") - output = [] - for i in range(len(table)): - output.append(table[i]) - output.extend([line for line in stdstreams[i]]) - return output - - -async def format_task_list( - tasks: Sequence[Task], - list_title: str, - table_header: Sequence[str], - get_short_task_id: Callable[[str], str], - 
format_task_row: Callable[ - [Task, Callable[[str], str]], Awaitable[Union[Sequence[str], str]] - ], - grey: bool, - tail_lines: int, -) -> List[str]: - """Formats a list of tasks, returns a list of output lines - :param tasks: List of tasks as returned by get_*_tasks_from_all_frameworks. - :param list_title: 'Running Tasks:' or 'Non-Running Tasks'. - :param table_header: List of column names used in the tasks table. - :param get_short_task_id: A function which given a task_id returns a short task_id suitable for printing. - :param format_task_row: Formatting function, works on a task and a get_short_task_id function. - :param tail_lines (int): number of lines of stdout/stderr to tail, as obtained from the Mesos sandbox. - :param grey: If True, the list will be made less visually prominent. - :return output: Formatted output (list of output lines). - """ - if not grey: - - def colorize(x): - return x - - else: - - def colorize(x): - return PaastaColors.grey(x) - - output = [] - output.append(colorize(" %s" % list_title)) - table_rows: List[Union[str, Sequence[str]]] = [ - [colorize(th) for th in table_header] - ] - - if tasks: - task_row_futures = [ - asyncio.ensure_future(format_task_row(task, get_short_task_id)) - for task in tasks - ] - await asyncio.wait(task_row_futures) - - for future in task_row_futures: - table_rows.append(future.result()) - - tasks_table = [" %s" % row for row in format_table(table_rows)] - if tail_lines == 0: - output.extend(tasks_table) - else: - stdstreams = [] - for task in tasks: - stdstreams.append( - await format_stdstreams_tail_for_task( - task, get_short_task_id, nlines=tail_lines - ) - ) - output.append(tasks_table[0]) # header - output.extend(zip_tasks_verbose_output(tasks_table[1:], stdstreams)) - - return output - - -@a_sync.to_blocking -async def status_mesos_tasks_verbose( - filter_string: str, get_short_task_id: Callable[[str], str], tail_lines: int = 0 -) -> str: - """Returns detailed information about the mesos tasks for 
a service. - - :param filter_string: An id used for looking up Mesos tasks - :param get_short_task_id: A function which given a - task_id returns a short task_id suitable for - printing. - :param tail_lines: int representing the number of lines of stdout/err to - report. - """ - output: List[str] = [] - running_and_active_tasks = select_tasks_by_id( - await get_cached_list_of_running_tasks_from_frameworks(), filter_string - ) - list_title = "Running Tasks:" - table_header = [ - "Mesos Task ID", - "Host deployed to", - "Ram", - "CPU", - "Deployed at what localtime", - ] - output.extend( - await format_task_list( - tasks=running_and_active_tasks, - list_title=list_title, - table_header=table_header, - get_short_task_id=get_short_task_id, - format_task_row=format_running_mesos_task_row, - grey=False, - tail_lines=tail_lines, - ) - ) - - non_running_tasks = select_tasks_by_id( - await get_cached_list_of_not_running_tasks_from_frameworks(), filter_string - ) - # Order the tasks by timestamp - non_running_tasks.sort(key=lambda task: get_first_status_timestamp_string(task)) - non_running_tasks_ordered = list(reversed(non_running_tasks[-10:])) - - list_title = "Non-Running Tasks" - table_header = [ - "Mesos Task ID", - "Host deployed to", - "Deployed at what localtime", - "Status", - ] - output.extend( - await format_task_list( - tasks=non_running_tasks_ordered, - list_title=list_title, - table_header=table_header, - get_short_task_id=get_short_task_id, - format_task_row=format_non_running_mesos_task_row, - grey=True, - tail_lines=tail_lines, - ) - ) - - return "\n".join(output) - - -def get_local_slave_state(hostname=None): - """Fetches mesos slave state and returns it as a dict. - - :param hostname: The host from which to fetch slave state. 
If not specified, defaults to the local machine.""" - if hostname is None: - hostname = socket.getfqdn() - stats_uri = f"http://{hostname}:{MESOS_SLAVE_PORT}/state" - try: - headers = {"User-Agent": get_user_agent()} - response = requests.get(stats_uri, timeout=10, headers=headers) - if response.status_code == 404: - fallback_stats_uri = f"http://{hostname}:{MESOS_SLAVE_PORT}/state.json" - response = requests.get(fallback_stats_uri, timeout=10, headers=headers) - except requests.ConnectionError as e: - raise MesosSlaveConnectionError( - "Could not connect to the mesos slave to see which services are running\n" - "on %s. Is the mesos-slave running?\n" - "Error was: %s\n" % (e.request.url, str(e)) - ) - response.raise_for_status() - return json.loads(response.text) - - -async def get_mesos_quorum(): - """Returns the configured quorum size.""" - return int((await get_master_flags())["flags"]["quorum"]) - - -MesosResources = Mapping[str, Any] - - -class MesosTask(TypedDict): - resources: MesosResources - slave_id: str - id: str - state: str - - -def get_all_tasks_from_state( - mesos_state: MesosState, include_orphans: bool = False -) -> Sequence[MesosTask]: - """Given a mesos state, find the tasks from all frameworks. - :param mesos_state: the mesos_state - :returns: a list of tasks - """ - tasks = [ - task - for framework in mesos_state.get("frameworks", []) - for task in framework.get("tasks", []) - ] - if include_orphans: - tasks += mesos_state.get("orphan_tasks", []) - return tasks - - -async def get_master_flags(): - res = await get_mesos_master().fetch("/master/flags") - return await res.json() - - -def get_zookeeper_host_path(): - zk_url = "zk://%s" % load_system_paasta_config().get_zk_hosts() - parsed = urlparse(zk_url) - return ZookeeperHostPath(host=parsed.netloc, path=parsed.path) - - -def get_zookeeper_config(state): - """Returns dict, containing the zookeeper hosts and path. 
- :param state: mesos state dictionary""" - re_zk = re.match(r"^zk://([^/]*)/(.*)$", state["flags"]["zk"]) - return {"hosts": re_zk.group(1), "path": re_zk.group(2)} - - -def get_number_of_mesos_masters(host, path): - """Returns an array, containing mesos masters - :param zk_config: dict containing information about zookeeper config. - Masters register themselves in zookeeper by creating ``info_`` entries. - We count these entries to get the number of masters. - """ - zk = KazooClient(hosts=host, read_only=True) - zk.start() - try: - root_entries = zk.get_children(path) - result = [ - info - for info in root_entries - if info.startswith("json.info_") or info.startswith("info_") - ] - return len(result) - finally: - zk.stop() - zk.close() - - -def get_all_slaves_for_blacklist_whitelist( - blacklist: DeployBlacklist, whitelist: DeployWhitelist -): - """ - A wrapper function to get all slaves and filter according to - provided blacklist and whitelist. - - :param blacklist: a blacklist, used to filter mesos slaves by attribute - :param whitelist: a whitelist, used to filter mesos slaves by attribute - - :returns: a list of mesos slave objects, filtered by those which are acceptable - according to the provided blacklist and whitelists. 
- """ - all_slaves = get_slaves() - return filter_mesos_slaves_by_blacklist(all_slaves, blacklist, whitelist) - - -def get_mesos_slaves_grouped_by_attribute(slaves, attribute): - """Returns a dictionary of unique values and the corresponding hosts for a given Mesos attribute - - :param slaves: a list of mesos slaves to group - :param attribute: an attribute to filter - :returns: a dictionary of the form {'': []} - (response can contain multiple 'attribute_value) - """ - sorted_slaves = sorted( - slaves, - key=lambda slave: ( - slave["attributes"].get(attribute) is None, - slave["attributes"].get(attribute), - ), - ) - return { - key: list(group) - for key, group in itertools.groupby( - sorted_slaves, key=lambda slave: slave["attributes"].get(attribute) - ) - if key - } - - -# TODO: remove to_blocking, convert call sites (smartstack_tools and marathon_serviceinit) to asyncio. -@a_sync.to_blocking -async def get_slaves(): - return (await (await get_mesos_master().fetch("/master/slaves")).json())["slaves"] - - -def filter_mesos_slaves_by_blacklist( - slaves, blacklist: DeployBlacklist, whitelist: DeployWhitelist -): - """Takes an input list of slaves and filters them based on the given blacklist. 
- The blacklist is in the form of: - - [["location_type", "location]] - - Where the list inside is something like ["region", "uswest1-prod"] - - :returns: The list of mesos slaves after the filter - """ - filtered_slaves = [] - for slave in slaves: - if host_passes_blacklist( - slave["attributes"], blacklist - ) and host_passes_whitelist(slave["attributes"], whitelist): - filtered_slaves.append(slave) - return filtered_slaves - - -def get_container_id_for_mesos_id(client, mesos_task_id): - running_containers = client.containers() - - container_id = None - for container in running_containers: - info = client.inspect_container(container) - if info["Config"]["Env"]: - for env_var in info["Config"]["Env"]: - if ("MESOS_TASK_ID=%s" % mesos_task_id) in env_var: - container_id = info["Id"] - break - - return container_id - - -def get_mesos_id_from_container(container, client): - mesos_id = None - info = client.inspect_container(container) - if info["Config"]["Env"]: - for env_var in info["Config"]["Env"]: - # In marathon it is like this - if "MESOS_TASK_ID=" in env_var: - mesos_id = re.match("MESOS_TASK_ID=(.*)", env_var).group(1) - break - # Chronos it is like this? - if "mesos_task_id=" in env_var: - mesos_id = re.match("mesos_task_id=(.*)", env_var).group(1) - break - return mesos_id - - -def get_mesos_network_for_net(net): - docker_mesos_net_mapping = {"none": "NONE", "bridge": "BRIDGE", "host": "HOST"} - return docker_mesos_net_mapping.get(net, net) - - -async def get_mesos_task_count_by_slave( - mesos_state: MesosState, - slaves_list: Sequence[Dict] = None, - pool: Optional[str] = None, -) -> List[Dict]: - """Get counts of running tasks per mesos slave. - - :param mesos_state: mesos state dict - :param slaves_list: a list of slave dicts to count running tasks for. 
- :param pool: pool of slaves to return (None means all) - :returns: list of slave dicts {'task_count': SlaveTaskCount} - """ - all_mesos_tasks = await get_all_running_tasks() # empty string = all app ids - slaves = { - slave["id"]: {"count": 0, "slave": slave} - for slave in mesos_state.get("slaves", []) - } - for task in all_mesos_tasks: - try: - task_slave = await task.slave() - if task_slave["id"] not in slaves: - log.debug("Slave {} not found for task".format(task_slave["id"])) - continue - else: - slaves[task_slave["id"]]["count"] += 1 - task_framework = await task.framework() - log.debug(f"Task framework: {task_framework.name}") - except SlaveDoesNotExist: - log.debug( - "Tried to get mesos slaves for task {}, but none existed.".format( - task["id"] - ) - ) - continue - if slaves_list: - for slave in slaves_list: - slave["task_counts"] = SlaveTaskCount( - **slaves[slave["task_counts"].slave["id"]] - ) - slaves_with_counts = list(slaves_list) - elif pool: - slaves_with_counts = [ - {"task_counts": SlaveTaskCount(**slave_counts)} - for slave_counts in slaves.values() - if slave_counts["slave"]["attributes"].get("pool", "default") == pool - ] - else: - slaves_with_counts = [ - {"task_counts": SlaveTaskCount(**slave_counts)} - for slave_counts in slaves.values() - ] - for slave in slaves_with_counts: - log.debug( - "Slave: {}, running {} tasks".format( - slave["task_counts"].slave["hostname"], - slave["task_counts"].count, - ) - ) - return slaves_with_counts - - -def get_count_running_tasks_on_slave(hostname: str) -> int: - """Return the number of tasks running on a particular slave - or 0 if the slave is not found. 
- :param hostname: hostname of the slave - :returns: integer count of mesos tasks""" - mesos_state = a_sync.block(get_mesos_master().state_summary) - task_counts = a_sync.block(get_mesos_task_count_by_slave, mesos_state) - counts = [ - slave["task_counts"].count - for slave in task_counts - if slave["task_counts"].slave["hostname"] == hostname - ] - if counts: - return counts[0] - else: - return 0 - - -def slave_pid_to_ip(slave_pid: str) -> str: - """Convert slave_pid to IP - - :param: slave pid e.g. slave(1)@10.40.31.172:5051 - :returns: ip address""" - regex = re.compile(r".+?@([\d\.]+):\d+") - return regex.match(slave_pid).group(1) - - -async def list_framework_ids(active_only=False): - return [f.id for f in await get_mesos_master().frameworks(active_only=active_only)] - - -@a_sync.to_blocking -async def get_all_frameworks(active_only=False): - return await get_mesos_master().frameworks(active_only=active_only) - - -def terminate_framework(framework_id): - resp = requests.post( - "http://%s:%d/master/teardown" % (get_mesos_leader(), MESOS_MASTER_PORT), - data={"frameworkId": framework_id}, - ) - resp.raise_for_status() - - -async def get_tasks_from_app_id(app_id, slave_hostname=None): - tasks = await get_running_tasks_from_frameworks(app_id) - if slave_hostname: - tasks = [ - task - for task in tasks - if await filter_task_by_hostname(task, slave_hostname) - ] - return tasks - - -async def get_task(task_id: str, app_id: str = "") -> MesosTask: - tasks = await get_running_tasks_from_frameworks(app_id) - tasks = [task for task in tasks if filter_task_by_task_id(task, task_id)] - if len(tasks) < 1: - raise TaskNotFound(f"Couldn't find task for given id: {task_id}") - if len(tasks) > 1: - raise TooManyTasks( - f"Found more than one task with id: {task_id}, this should not happen!" 
- ) - return tasks[0] - - -def filter_task_by_task_id(task: MesosTask, task_id: str) -> bool: - return task["id"] == task_id - - -async def filter_task_by_hostname(task, hostname): - return (await task.slave())["hostname"].startswith(hostname) - - -class TaskNotFound(Exception): - pass - - -class TooManyTasks(Exception): - pass - - -# TODO: async this -def mesos_services_running_here( - framework_filter, parse_service_instance_from_executor_id, hostname=None -): - """See what paasta_native services are being run by a mesos-slave on this host. - - :param framework_filter: a function that returns true if we should consider a given framework. - :param parse_service_instance_from_executor_id: A function that returns a tuple of (service, instance) from the - executor ID. - :param hostname: Hostname to fetch mesos slave state from. See get_local_slave_state. - - :returns: A list of triples of (service, instance, port)""" - slave_state = get_local_slave_state(hostname=hostname) - frameworks = [ - fw for fw in slave_state.get("frameworks", []) if framework_filter(fw) - ] - executors = [ - ex - for fw in frameworks - for ex in fw.get("executors", []) - if "TASK_RUNNING" in [t["state"] for t in ex.get("tasks", [])] - ] - srv_list = [] - for executor in executors: - try: - srv_name, srv_instance = parse_service_instance_from_executor_id( - executor["id"] - ) - except ValueError: - log.error( - "Failed to decode paasta service instance from {}".format( - executor["id"] - ) - ) - continue - if "ports" in executor["resources"]: - srv_port = int(re.findall("[0-9]+", executor["resources"]["ports"])[0]) - else: - srv_port = None - srv_list.append((srv_name, srv_instance, srv_port)) - return srv_list - - -def is_task_terminal( - task: MesosTask, -) -> bool: - """Return whether a given mesos task is terminal. 
- - Terminal states are documented in - http://mesos.apache.org/api/latest/java/org/apache/mesos/Protos.TaskState.html - - :param task: the task to be inspected - :returns: a boolean indicating if the task is considered to be in a terminal state - """ - return task["state"] in TERMINAL_STATES - - -def is_mesos_available() -> bool: - return Path(get_mesos_config_path()).exists() diff --git a/paasta_tools/metrics/metastatus_lib.py b/paasta_tools/metrics/metastatus_lib.py index 0a2dc96efe..a0b97b1d2c 100755 --- a/paasta_tools/metrics/metastatus_lib.py +++ b/paasta_tools/metrics/metastatus_lib.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import copy import itertools import math import re @@ -26,7 +25,6 @@ from typing import Tuple from typing import TypeVar -import a_sync from humanize import naturalsize from kubernetes.client import V1Node from kubernetes.client import V1Pod @@ -41,16 +39,6 @@ from paasta_tools.kubernetes_tools import list_all_deployments from paasta_tools.kubernetes_tools import paasta_prefixed from paasta_tools.kubernetes_tools import PodStatus -from paasta_tools.mesos.master import MesosMetrics -from paasta_tools.mesos.master import MesosState -from paasta_tools.mesos_maintenance import MAINTENANCE_ROLE -from paasta_tools.mesos_tools import get_all_tasks_from_state -from paasta_tools.mesos_tools import get_mesos_quorum -from paasta_tools.mesos_tools import get_number_of_mesos_masters -from paasta_tools.mesos_tools import get_zookeeper_host_path -from paasta_tools.mesos_tools import is_task_terminal -from paasta_tools.mesos_tools import MesosResources -from paasta_tools.mesos_tools import MesosTask from paasta_tools.utils import PaastaColors from paasta_tools.utils import print_with_indent @@ -76,34 +64,6 @@ class ResourceUtilization(NamedTuple): free: int -def get_num_masters() -> int: - """Gets 
the number of masters from mesos state""" - zookeeper_host_path = get_zookeeper_host_path() - return get_number_of_mesos_masters( - zookeeper_host_path.host, zookeeper_host_path.path - ) - - -def get_mesos_cpu_status( - metrics: MesosMetrics, mesos_state: MesosState -) -> Tuple[int, int, int]: - """Takes in the mesos metrics and analyzes them, returning the status. - - :param metrics: mesos metrics dictionary. - :param mesos_state: mesos state dictionary. - :returns: Tuple of total, used, and available CPUs. - """ - - total = metrics["master/cpus_total"] - used = metrics["master/cpus_used"] - - for slave in mesos_state["slaves"]: - used += reserved_maintenence_resources(slave["reserved_resources"])["cpus"] - - available = total - used - return total, used, available - - def get_kube_cpu_status( nodes: Sequence[V1Node], ) -> Tuple[float, float, float]: @@ -123,26 +83,6 @@ def get_kube_cpu_status( return total, used, available -def get_mesos_memory_status( - metrics: MesosMetrics, mesos_state: MesosState -) -> Tuple[int, int, int]: - """Takes in the mesos metrics and analyzes them, returning the status. - - :param metrics: mesos metrics dictionary. - :param mesos_state: mesos state dictionary. - :returns: Tuple of total, used, and available memory in Mi. - """ - total = metrics["master/mem_total"] - used = metrics["master/mem_used"] - - for slave in mesos_state["slaves"]: - used += reserved_maintenence_resources(slave["reserved_resources"])["mem"] - - available = total - used - - return total, used, available - - def get_kube_memory_status( nodes: Sequence[V1Node], ) -> Tuple[float, float, float]: @@ -163,26 +103,6 @@ def get_kube_memory_status( return total, used, available -def get_mesos_disk_status( - metrics: MesosMetrics, mesos_state: MesosState -) -> Tuple[int, int, int]: - """Takes in the mesos metrics and analyzes them, returning the status. - - :param metrics: mesos metrics dictionary. - :param mesos_state: mesos state dictionary. 
- :returns: Tuple of total, used, and available disk space in Mi. - """ - - total = metrics["master/disk_total"] - used = metrics["master/disk_used"] - - for slave in mesos_state["slaves"]: - used += reserved_maintenence_resources(slave["reserved_resources"])["disk"] - - available = total - used - return total, used, available - - def get_kube_disk_status( nodes: Sequence[V1Node], ) -> Tuple[float, float, float]: @@ -204,25 +124,6 @@ def get_kube_disk_status( return total, used, available -def get_mesos_gpu_status( - metrics: MesosMetrics, mesos_state: MesosState -) -> Tuple[int, int, int]: - """Takes in the mesos metrics and analyzes them, returning gpus status. - - :param metrics: mesos metrics dictionary. - :param mesos_state: mesos state dictionary. - :returns: Tuple of total, used, and available GPUs. - """ - total = metrics["master/gpus_total"] - used = metrics["master/gpus_used"] - - for slave in mesos_state["slaves"]: - used += reserved_maintenence_resources(slave["reserved_resources"])["gpus"] - - available = total - used - return total, used, available - - def get_kube_gpu_status( nodes: Sequence[V1Node], ) -> Tuple[float, float, float]: @@ -244,11 +145,6 @@ def get_kube_gpu_status( return total, used, available -def filter_mesos_state_metrics(dictionary: Mapping[str, Any]) -> Mapping[str, Any]: - valid_keys = ["cpus", "mem", "disk", "gpus"] - return {key: value for (key, value) in dictionary.items() if key in valid_keys} - - def filter_kube_resources(dictionary: Mapping[str, str]) -> Mapping[str, str]: valid_keys = ["cpu", "memory", "ephemeral-storage", "nvidia.com/gpu"] return {key: value for (key, value) in dictionary.items() if key in valid_keys} @@ -339,7 +235,7 @@ def assert_cpu_health( perc_used = percent_used(total, used) except ZeroDivisionError: return HealthCheckResult( - message="Error reading total available cpu from mesos!", healthy=False + message="Error reading total available cpu!", healthy=False ) if check_threshold(perc_used, 
threshold): @@ -370,7 +266,7 @@ def assert_memory_health( perc_used = percent_used(total, used) except ZeroDivisionError: return HealthCheckResult( - message="Error reading total available memory from mesos!", healthy=False + message="Error reading total available memory!", healthy=False ) if check_threshold(perc_used, threshold): @@ -401,7 +297,7 @@ def assert_disk_health( perc_used = percent_used(total, used) except ZeroDivisionError: return HealthCheckResult( - message="Error reading total available disk from mesos!", healthy=False + message="Error reading total available disk!", healthy=False ) if check_threshold(perc_used, threshold): @@ -444,19 +340,6 @@ def assert_gpu_health( ) -def assert_mesos_tasks_running( - metrics: MesosMetrics, -) -> HealthCheckResult: - running = metrics["master/tasks_running"] - staging = metrics["master/tasks_staging"] - starting = metrics["master/tasks_starting"] - return HealthCheckResult( - message="Tasks: running: %d staging: %d starting: %d" - % (running, staging, starting), - healthy=True, - ) - - def assert_kube_pods_running( kube_client: KubeClient, namespace: str ) -> HealthCheckResult: @@ -473,12 +356,6 @@ def assert_kube_pods_running( ) -def get_mesos_slaves_health_status( - metrics: MesosMetrics, -) -> Tuple[int, int]: - return metrics["master/slaves_active"], metrics["master/slaves_inactive"] - - def get_kube_nodes_health_status( nodes: Sequence[V1Node], ) -> Tuple[int, int]: @@ -496,32 +373,10 @@ def assert_nodes_health( ) -def assert_quorum_size() -> HealthCheckResult: - masters, quorum = get_num_masters(), a_sync.block(get_mesos_quorum) - if quorum_ok(masters, quorum): - return HealthCheckResult( - message="Quorum: masters: %d configured quorum: %d " % (masters, quorum), - healthy=True, - ) - else: - return HealthCheckResult( - message="CRITICAL: Number of masters (%d) less than configured quorum(%d)." 
- % (masters, quorum), - healthy=False, - ) - - _KeyFuncRetT = Sequence[Tuple[str, str]] -class _SlaveT(TypedDict): - id: str - resources: MesosResources - reserved_resources: MesosResources - attributes: Mapping[str, str] - - -_GenericNodeT = TypeVar("_GenericNodeT", _SlaveT, V1Node) +_GenericNodeT = TypeVar("_GenericNodeT", bound=V1Node) _GenericNodeGroupingFunctionT = Callable[[_GenericNodeT], _KeyFuncRetT] @@ -530,44 +385,6 @@ class _SlaveT(TypedDict): _GenericNodeSortFunctionT = Callable[[Sequence[_GenericNodeT]], Sequence[_GenericNodeT]] -def key_func_for_attribute( - attribute: str, -) -> Callable[[_SlaveT], str]: - """Return a closure that given a slave, will return the value of a specific - attribute. - - :param attribute: the attribute to inspect in the slave - :returns: a closure, which takes a slave and returns the value of an attribute - """ - - def key_func(slave): - return slave["attributes"].get(attribute, "unknown") - - return key_func - - -def key_func_for_attribute_multi( - attributes: Sequence[str], -) -> _GenericNodeGroupingFunctionT: - """Return a closure that given a slave, will return the value of a list of - attributes, compiled into a hashable tuple - - :param attributes: the attributes to inspect in the slave - :returns: a closure, which takes a slave and returns the value of those attributes - """ - - def get_attribute(slave, attribute): - if attribute == "hostname": - return slave["hostname"] - else: - return slave["attributes"].get(attribute, "unknown") - - def key_func(slave): - return tuple((a, get_attribute(slave, a)) for a in attributes) - - return key_func - - def key_func_for_attribute_multi_kube( attributes: Sequence[str], ) -> Callable[[V1Node], _KeyFuncRetT]: @@ -587,17 +404,6 @@ def key_func(node): return key_func -def sort_func_for_attributes( - attributes: Sequence[str], -) -> _GenericNodeSortFunctionT: - def sort(slaves): - for attribute in attributes: - slaves = sorted(slaves, key=key_func_for_attribute(attribute)) - 
return slaves - - return sort - - def group_slaves_by_key_func( key_func: _GenericNodeGroupingFunctionT, slaves: Sequence[_GenericNodeT], @@ -627,48 +433,6 @@ class ResourceUtilizationDict(TypedDict): slave_count: int -def calculate_resource_utilization_for_slaves( - slaves: Sequence[_SlaveT], tasks: Sequence[MesosTask] -) -> ResourceUtilizationDict: - """Given a list of slaves and a list of tasks, calculate the total available - resource available in that list of slaves, and the resources consumed by tasks - running on those slaves. - - :param slaves: a list of slaves to calculate resource usage for - :param tasks: the list of tasks running in the mesos cluster - :returns: a dict, containing keys for "free" and "total" resources. Each of these keys - is a ResourceInfo tuple, exposing a number for cpu, disk and mem. - """ - resource_total_dict: _Counter[str] = Counter() - for slave in slaves: - filtered_resources = filter_mesos_state_metrics(slave["resources"]) - resource_total_dict.update(Counter(filtered_resources)) - resource_free_dict = copy.deepcopy(resource_total_dict) - for task in tasks: - task_resources = task["resources"] - resource_free_dict.subtract(Counter(filter_mesos_state_metrics(task_resources))) - for slave in slaves: - filtered_resources = filter_mesos_state_metrics( - reserved_maintenence_resources(slave["reserved_resources"]) - ) - resource_free_dict.subtract(Counter(filtered_resources)) - return { - "free": ResourceInfo( - cpus=resource_free_dict["cpus"], - disk=resource_free_dict["disk"], - mem=resource_free_dict["mem"], - gpus=resource_free_dict.get("gpus", 0), - ), - "total": ResourceInfo( - cpus=resource_total_dict["cpus"], - disk=resource_total_dict["disk"], - mem=resource_total_dict["mem"], - gpus=resource_total_dict.get("gpus", 0), - ), - "slave_count": len(slaves), - } - - _IEC_NUMBER_SUFFIXES = { "k": 1000, "m": 1000**-1, @@ -746,32 +510,6 @@ def calculate_resource_utilization_for_kube_nodes( } -def filter_tasks_for_slaves( - slaves: 
Sequence[_SlaveT], tasks: Sequence[MesosTask] -) -> Sequence[MesosTask]: - """Given a list of slaves and a list of tasks, return a filtered - list of tasks, where those returned belong to slaves in the list of - slaves - - :param slaves: the list of slaves which the tasks provided should be - running on. - :param tasks: the tasks to filter :returns: a list of tasks, - identical to that provided by the tasks param, but with only those where - the task is running on one of the provided slaves included. - """ - slave_ids = [slave["id"] for slave in slaves] - return [task for task in tasks if task["slave_id"] in slave_ids] - - -def make_filter_slave_func( - attribute: str, values: Sequence[str] -) -> _GenericNodeFilterFunctionT: - def filter_func(slave): - return slave["attributes"].get(attribute, None) in values - - return filter_func - - def filter_slaves( slaves: Sequence[_GenericNodeT], filters: Sequence[_GenericNodeFilterFunctionT] ) -> Sequence[_GenericNodeT]: @@ -787,43 +525,6 @@ def filter_slaves( return [s for s in slaves if all([f(s) for f in filters])] -def get_resource_utilization_by_grouping( - grouping_func: _GenericNodeGroupingFunctionT, - mesos_state: MesosState, - filters: Sequence[_GenericNodeFilterFunctionT] = [], - sort_func: _GenericNodeSortFunctionT = None, -) -> Mapping[_KeyFuncRetT, ResourceUtilizationDict]: - """Given a function used to group slaves and mesos state, calculate - resource utilization for each value of a given attribute. - - :grouping_func: a function that given a slave, will return the value of an - attribute to group by. - :param mesos_state: the mesos state - :param filters: filters to apply to the slaves in the calculation, with - filtering preformed by filter_slaves - :param sort_func: a function that given a list of slaves, will return the - sorted list of slaves. 
- :returns: a dict of {attribute_value: resource_usage}, where resource usage - is the dict returned by ``calculate_resource_utilization_for_slaves`` for - slaves grouped by attribute value. - """ - slaves: Sequence[_SlaveT] = mesos_state.get("slaves", []) - slaves = filter_slaves(slaves, filters) - if not has_registered_slaves(mesos_state): - raise ValueError("There are no slaves registered in the mesos state.") - - tasks = get_all_tasks_from_state(mesos_state, include_orphans=True) - non_terminal_tasks = [task for task in tasks if not is_task_terminal(task)] - slave_groupings = group_slaves_by_key_func(grouping_func, slaves, sort_func) - - return { - attribute_value: calculate_resource_utilization_for_slaves( - slaves=slaves, tasks=filter_tasks_for_slaves(slaves, non_terminal_tasks) - ) - for attribute_value, slaves in slave_groupings.items() - } - - def get_resource_utilization_by_grouping_kube( grouping_func: _GenericNodeGroupingFunctionT, kube_client: KubeClient, @@ -884,35 +585,6 @@ def resource_utillizations_from_resource_info( ] -def has_registered_slaves( - mesos_state: MesosState, -) -> bool: - """Return a boolean indicating if there are any slaves registered - to the master according to the mesos state. - :param mesos_state: the mesos state from the master - :returns: a boolean, indicating if there are > 0 slaves - """ - return len(mesos_state.get("slaves", [])) > 0 - - -def get_mesos_resource_utilization_health( - mesos_metrics: MesosMetrics, mesos_state: MesosState -) -> Sequence[HealthCheckResult]: - """Perform healthchecks against mesos metrics. 
- :param mesos_metrics: a dict exposing the mesos metrics described in - https://mesos.apache.org/documentation/latest/monitoring/ - :returns: a list of HealthCheckResult tuples - """ - return [ - assert_cpu_health(get_mesos_cpu_status(mesos_metrics, mesos_state)), - assert_memory_health(get_mesos_memory_status(mesos_metrics, mesos_state)), - assert_disk_health(get_mesos_disk_status(mesos_metrics, mesos_state)), - assert_gpu_health(get_mesos_gpu_status(mesos_metrics, mesos_state)), - assert_mesos_tasks_running(mesos_metrics), - assert_nodes_health(get_mesos_slaves_health_status(mesos_metrics)), - ] - - def get_kube_resource_utilization_health( kube_client: KubeClient, ) -> Sequence[HealthCheckResult]: @@ -932,19 +604,6 @@ def get_kube_resource_utilization_health( ] -def get_mesos_state_status( - mesos_state: MesosState, -) -> Sequence[HealthCheckResult]: - """Perform healthchecks against mesos state. - :param mesos_state: a dict exposing the mesos state described in - https://mesos.apache.org/documentation/latest/endpoints/master/state.json/ - :returns: a list of HealthCheckResult tuples - """ - return [ - assert_quorum_size(), - ] - - def run_healthchecks_with_param( param: Any, healthcheck_functions: Sequence[Callable[..., HealthCheckResult]], @@ -1102,9 +761,3 @@ def get_table_rows_for_resource_info_dict( return attribute_values + format_row_for_resource_utilization_healthchecks( healthcheck_utilization_pairs ) - - -def reserved_maintenence_resources( - resources: MesosResources, -): - return resources.get(MAINTENANCE_ROLE, {"cpus": 0, "mem": 0, "disk": 0, "gpus": 0}) diff --git a/paasta_tools/paasta_execute_docker_command.py b/paasta_tools/paasta_execute_docker_command.py deleted file mode 100755 index 0b74abd40e..0000000000 --- a/paasta_tools/paasta_execute_docker_command.py +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/env python -# Copyright 2015-2016 Yelp Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Usage: ./paasta_execute_docker_command.py [options] - -This script will attempt to find a running container that contains an environment variable matching -the specified Mesos task ID and then execute the given command with the specified timeout. The script will -print the output of the command and exit with the same return code as the command. - -Command line options: - -- -i , --mesos-id : Specify a Mesos task ID to search for -- -c , --cmd : Shell command to execute in container -- -t , --timeout : Timeout for command -""" -import argparse -import signal -import sys -from contextlib import contextmanager - -from paasta_tools.mesos_tools import get_container_id_for_mesos_id -from paasta_tools.utils import get_docker_client -from paasta_tools.utils import is_using_unprivileged_containers - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Executes given command in Docker container for given Mesos task ID" - ) - parser.add_argument("-i", "--mesos-id", required=True, help="Mesos task ID") - parser.add_argument( - "-c", "--cmd", required=True, help="command to execute in container" - ) - parser.add_argument( - "-t", "--timeout", default=45, type=int, help="timeout for command" - ) - args = parser.parse_args() - return args - - -class TimeoutException(Exception): - pass - - -@contextmanager -def time_limit(seconds): # From http://stackoverflow.com/a/601168/1576438 - def signal_handler(signum, 
frame): - raise TimeoutException("Timed out!") - - signal.signal(signal.SIGALRM, signal_handler) - signal.alarm(seconds) - try: - yield - finally: - signal.alarm(0) - - -def execute_in_container(docker_client, container_id, cmd, timeout): - container_info = docker_client.inspect_container(container_id) - if ( - container_info["ExecIDs"] - and len(container_info["ExecIDs"]) > 0 - and not is_using_unprivileged_containers() - ): - for possible_exec_id in container_info["ExecIDs"]: - exec_info = docker_client.exec_inspect(possible_exec_id)["ProcessConfig"] - if exec_info["entrypoint"] == "/bin/sh" and exec_info["arguments"] == [ - "-c", - cmd, - ]: - exec_id = possible_exec_id - break - else: - exec_id = docker_client.exec_create(container_id, ["/bin/sh", "-c", cmd])["Id"] - output = docker_client.exec_start(exec_id, stream=False) - return_code = docker_client.exec_inspect(exec_id)["ExitCode"] - return (output, return_code) - - -def main(): - args = parse_args() - - if not args.mesos_id: - print( - "The Mesos task id you supplied seems to be an empty string! Please provide a valid task id." - ) - sys.exit(2) - - docker_client = get_docker_client() - - container_id = get_container_id_for_mesos_id(docker_client, args.mesos_id) - - if container_id: - try: - with time_limit(args.timeout): - output, return_code = execute_in_container( - docker_client, container_id, args.cmd, args.timeout - ) - print(output) - except TimeoutException: - print("Command timed out!") - return_code = 1 - finally: - sys.exit(return_code) - else: - print("Could not find container with MESOS_TASK_ID '%s'." 
% args.mesos_id) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/paasta_tools/paasta_native_serviceinit.py b/paasta_tools/paasta_native_serviceinit.py deleted file mode 100644 index 6f8ada3dbf..0000000000 --- a/paasta_tools/paasta_native_serviceinit.py +++ /dev/null @@ -1,21 +0,0 @@ -from paasta_tools.frameworks.native_scheduler import MESOS_TASK_SPACER -from paasta_tools.mesos_tools import status_mesos_tasks_verbose -from paasta_tools.utils import calculate_tail_lines -from paasta_tools.utils import compose_job_id - - -def perform_command(command, service, instance, cluster, verbose, soa_dir): - tail_lines = calculate_tail_lines(verbose_level=verbose) - - # We have to add a spacer at the end to make sure we only return - # things for service.main and not service.main_foo - task_id_prefix = "{}{}".format(compose_job_id(service, instance), MESOS_TASK_SPACER) - - if command == "status": - print( - status_mesos_tasks_verbose( - job_id=task_id_prefix, - get_short_task_id=lambda x: x, - tail_lines=tail_lines, - ) - ) diff --git a/paasta_tools/smartstack_tools.py b/paasta_tools/smartstack_tools.py index 75eccbc09c..1db059b601 100644 --- a/paasta_tools/smartstack_tools.py +++ b/paasta_tools/smartstack_tools.py @@ -28,7 +28,6 @@ from typing import Optional from typing import Sequence from typing import Tuple -from typing import TypeVar from typing import Union import requests @@ -39,9 +38,7 @@ from paasta_tools import envoy_tools from paasta_tools import kubernetes_tools from paasta_tools import long_running_service_tools -from paasta_tools import mesos_tools from paasta_tools.long_running_service_tools import LongRunningServiceConfig -from paasta_tools.mesos.exceptions import NoSlavesAvailableError from paasta_tools.monitoring_tools import ReplicationChecker from paasta_tools.utils import compose_job_id from paasta_tools.utils import DEFAULT_SOA_DIR @@ -226,31 +223,8 @@ def get_smartstack_replication_for_attribute( :returns: a dictionary of the form 
{'': } (the dictionary will contain keys for unique all attribute values) """ - replication_info = {} - filtered_slaves = mesos_tools.get_all_slaves_for_blacklist_whitelist( - blacklist=blacklist, whitelist=None - ) - if not filtered_slaves: - raise NoSlavesAvailableError - - attribute_slave_dict = mesos_tools.get_mesos_slaves_grouped_by_attribute( - slaves=filtered_slaves, attribute=attribute - ) - - full_name = compose_job_id(service, namespace) - - for value, hosts in attribute_slave_dict.items(): - # arbitrarily choose the first host with a given attribute to query for replication stats - synapse_host = hosts[0]["hostname"] - repl_info = get_replication_for_services( - synapse_host=synapse_host, - synapse_port=system_paasta_config.get_synapse_port(), - synapse_haproxy_url_format=system_paasta_config.get_synapse_haproxy_url_format(), - services=[full_name], - ) - replication_info[value] = repl_info - - return replication_info + # Mesos support has been removed, so no replication info is available via Mesos slaves + return {} def get_replication_for_all_services( @@ -408,11 +382,6 @@ def match_backends_and_pods( return backend_pod_pairs -_MesosSlaveDict = TypeVar( - "_MesosSlaveDict", bound=Dict -) # no type has been defined in mesos_tools for these yet. 
- - class DiscoveredHost(NamedTuple): hostname: str pool: str diff --git a/paasta_tools/tron_tools.py b/paasta_tools/tron_tools.py index f5c7758a30..76617cf7a9 100644 --- a/paasta_tools/tron_tools.py +++ b/paasta_tools/tron_tools.py @@ -33,7 +33,6 @@ from service_configuration_lib.spark_config import SparkConfBuilder from paasta_tools import yaml_tools as yaml -from paasta_tools.mesos_tools import mesos_services_running_here try: from yaml.cyaml import CSafeDumper as Dumper @@ -95,7 +94,6 @@ pkgutil.get_data("paasta_tools.cli", "schemas/tron_schema.json").decode() )["definitions"]["job"]["properties"]["monitoring"]["properties"].keys() ) -MESOS_EXECUTOR_NAMES = ("paasta",) KUBERNETES_EXECUTOR_NAMES = ("paasta", "spark") EXECUTOR_NAME_TO_TRON_EXECUTOR_TYPE = {"paasta": "kubernetes", "spark": "spark"} KUBERNETES_NAMESPACE = "tron" @@ -670,10 +668,7 @@ def validate(self): error_msgs.extend(super().validate()) # Tron is a little special, because it can *not* have a deploy group # But only if an action is running via ssh and not via paasta - if ( - self.get_deploy_group() is None - and self.get_executor() in MESOS_EXECUTOR_NAMES - ): + if self.get_deploy_group() is None and self.get_executor() == "mesos": error_msgs.append( f"{self.get_job_name()}.{self.get_action_name()} must have a deploy_group set" ) @@ -1136,22 +1131,9 @@ def format_tron_action_dict(action_config: TronActionConfig): result["annotations"].update(monitoring_annotations) result["labels"].update(monitoring_labels) - elif executor in MESOS_EXECUTOR_NAMES: - result["executor"] = "mesos" - constraint_labels = ["attribute", "operator", "value"] - result["constraints"] = [ - dict(zip(constraint_labels, constraint)) - for constraint in action_config.get_calculated_constraints() - ] - result["docker_parameters"] = [ - {"key": param["key"], "value": param["value"]} - for param in action_config.format_docker_parameters() - ] - result["env"] = action_config.get_env() - - # the following config is only valid for 
k8s/Mesos since we're not running SSH actions + # the following config is only valid for k8s since we're not running SSH actions # in a containerized fashion - if executor in (KUBERNETES_EXECUTOR_NAMES + MESOS_EXECUTOR_NAMES): + if executor in KUBERNETES_EXECUTOR_NAMES: result["cpus"] = action_config.get_cpus() result["mem"] = action_config.get_mem() result["disk"] = action_config.get_disk() @@ -1438,10 +1420,8 @@ def get_tron_dashboard_for_cluster(cluster: str): def tron_jobs_running_here() -> List[Tuple[str, str, int]]: - return mesos_services_running_here( - framework_filter=lambda fw: fw["name"].startswith("tron"), - parse_service_instance_from_executor_id=parse_service_instance_from_executor_id, - ) + # Mesos support has been removed, so no Tron jobs are running via Mesos + return [] def parse_service_instance_from_executor_id(task_id: str) -> Tuple[str, str]: diff --git a/setup.py b/setup.py index 50d90a8fd9..c62d16c8dc 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,6 @@ def get_install_requires(): "paasta_tools/kubernetes/bin/paasta_cleanup_stale_nodes.py", "paasta_tools/kubernetes/bin/paasta_secrets_sync.py", "paasta_tools/paasta_deploy_tron_jobs", - "paasta_tools/paasta_execute_docker_command.py", "paasta_tools/setup_istio_mesh.py", "paasta_tools/setup_kubernetes_cr.py", "paasta_tools/setup_kubernetes_crd.py", diff --git a/tests/api/test_resources.py b/tests/api/test_resources.py index 4996854498..74b7352afd 100644 --- a/tests/api/test_resources.py +++ b/tests/api/test_resources.py @@ -13,13 +13,10 @@ # limitations under the License. 
import json -import asynctest -import mock from pyramid import testing from paasta_tools.api.views.resources import parse_filters from paasta_tools.api.views.resources import resources_utilization -from paasta_tools.metrics import metastatus_lib def test_parse_filters_empty(): @@ -41,35 +38,16 @@ def test_parse_filters_good(): assert "zol" in parsed["qux"] -@mock.patch( - "paasta_tools.api.views.resources.metastatus_lib.get_resource_utilization_by_grouping", - autospec=True, -) -@mock.patch("paasta_tools.api.views.resources.get_mesos_master", autospec=True) -def test_resources_utilization_nothing_special( - mock_get_mesos_master, mock_get_resource_utilization_by_grouping -): +def test_resources_utilization_nothing_special(): request = testing.DummyRequest() request.swagger_data = {"groupings": None, "filter": None} - mock_mesos_state = mock.Mock() - mock_master = mock.Mock( - state=asynctest.CoroutineMock(return_value=mock_mesos_state) - ) - mock_get_mesos_master.return_value = mock_master - - mock_get_resource_utilization_by_grouping.return_value = { - frozenset([("superregion", "unknown")]): { - "total": metastatus_lib.ResourceInfo(cpus=10.0, mem=512.0, disk=100.0), - "free": metastatus_lib.ResourceInfo(cpus=8.0, mem=312.0, disk=20.0), - } - } + # Since Mesos is removed, resources_utilization should return empty response resp = resources_utilization(request) body = json.loads(resp.body.decode("utf-8")) assert resp.status_int == 200 - assert len(body) == 1 - assert set(body[0].keys()) == {"disk", "mem", "groupings", "cpus", "gpus"} + assert len(body) == 0 mock_mesos_state = { @@ -130,46 +108,31 @@ def test_resources_utilization_nothing_special( } -@mock.patch("paasta_tools.api.views.resources.get_mesos_master", autospec=True) -def test_resources_utilization_with_grouping(mock_get_mesos_master): +def test_resources_utilization_with_grouping(): request = testing.DummyRequest() request.swagger_data = {"groupings": ["region", "pool"], "filter": None} - mock_master = 
mock.Mock( - state=asynctest.CoroutineMock( - func=asynctest.CoroutineMock(), # https://github.com/notion/a_sync/pull/40 - return_value=mock_mesos_state, - ) - ) - mock_get_mesos_master.return_value = mock_master + # Since Mesos is removed, resources_utilization should return empty response resp = resources_utilization(request) body = json.loads(resp.body.decode("utf-8")) assert resp.status_int == 200 - # 4 groupings, 2x2 attrs for 5 slaves - assert len(body) == 4 + assert len(body) == 0 -@mock.patch("paasta_tools.api.views.resources.get_mesos_master", autospec=True) -def test_resources_utilization_with_filter(mock_get_mesos_master): +def test_resources_utilization_with_filter(): request = testing.DummyRequest() request.swagger_data = { "groupings": ["region", "pool"], "filter": ["region:top", "pool:default,other"], } - mock_master = mock.Mock( - state=asynctest.CoroutineMock( - func=asynctest.CoroutineMock(), # https://github.com/notion/a_sync/pull/40 - return_value=mock_mesos_state, - ) - ) - mock_get_mesos_master.return_value = mock_master + # Since Mesos is removed, resources_utilization should return empty response resp = resources_utilization(request) body = json.loads(resp.body.decode("utf-8")) assert resp.status_int == 200 - assert len(body) == 2 + assert len(body) == 0 request.swagger_data = { "groupings": ["region", "pool"], diff --git a/tests/frameworks/test_adhoc_scheduler.py b/tests/frameworks/test_adhoc_scheduler.py deleted file mode 100644 index b27ec19197..0000000000 --- a/tests/frameworks/test_adhoc_scheduler.py +++ /dev/null @@ -1,249 +0,0 @@ -import mock -import pytest -from addict import Dict - -from paasta_tools import utils -from paasta_tools.frameworks import adhoc_scheduler -from paasta_tools.frameworks import native_scheduler -from paasta_tools.frameworks.native_service_config import NativeServiceConfig -from paasta_tools.frameworks.native_service_config import UnknownNativeServiceError -from paasta_tools.frameworks.task_store import 
DictTaskStore - - -@pytest.fixture -def system_paasta_config(): - return utils.SystemPaastaConfig( - {"docker_registry": "fake", "volumes": []}, "/fake/system/configs" - ) - - -def make_fake_offer( - cpu=50000, mem=50000, port_begin=31000, port_end=32000, pool="default" -): - offer = Dict( - agent_id=Dict(value="super_big_slave"), - resources=[ - Dict(name="cpus", scalar=Dict(value=cpu)), - Dict(name="mem", scalar=Dict(value=mem)), - Dict( - name="ports", ranges=Dict(range=[Dict(begin=port_begin, end=port_end)]) - ), - ], - attributes=[], - ) - - if pool is not None: - offer.attributes = [Dict(name="pool", text=Dict(value=pool))] - - return offer - - -class TestAdhocScheduler: - def test_raise_error_when_cmd_missing(self, system_paasta_config): - service_name = "service_name" - instance_name = "instance_name" - cluster = "cluster" - - service_configs = [ - NativeServiceConfig( - service=service_name, - instance=instance_name, - cluster=cluster, - config_dict={ - "cpus": 0.1, - "mem": 50, - "instances": 3, - "drain_method": "test", - }, - branch_dict={"docker_image": "busybox", "desired_state": "start"}, - soa_dir="/nail/etc/services", - ) - ] - - with pytest.raises(UnknownNativeServiceError): - adhoc_scheduler.AdhocScheduler( - service_name=service_name, - instance_name=instance_name, - cluster=cluster, - system_paasta_config=system_paasta_config, - service_config=service_configs[0], - dry_run=False, - reconcile_start_time=0, - staging_timeout=30, - task_store_type=DictTaskStore, - ) - - @mock.patch("paasta_tools.frameworks.native_scheduler._log", autospec=True) - def test_can_only_launch_task_once(self, mock_log, system_paasta_config): - service_name = "service_name" - instance_name = "instance_name" - cluster = "cluster" - - service_configs = [ - NativeServiceConfig( - service=service_name, - instance=instance_name, - cluster=cluster, - config_dict={ - "cpus": 0.1, - "mem": 50, - "instances": 3, - "cmd": "sleep 50", - "drain_method": "test", - }, - branch_dict={ 
- "docker_image": "busybox", - "image_version": None, - "desired_state": "start", - "force_bounce": None, - }, - soa_dir="/nail/etc/services", - ) - ] - - scheduler = adhoc_scheduler.AdhocScheduler( - service_name=service_name, - instance_name=instance_name, - cluster=cluster, - system_paasta_config=system_paasta_config, - service_config=service_configs[0], - dry_run=False, - reconcile_start_time=0, - staging_timeout=30, - task_store_type=DictTaskStore, - ) - - fake_driver = mock.Mock() - - scheduler.registered( - driver=fake_driver, frameworkId={"value": "foo"}, masterInfo=mock.Mock() - ) - - with mock.patch( - "paasta_tools.utils.load_system_paasta_config", - autospec=True, - return_value=system_paasta_config, - ): - # Check that offers with invalid pool don't get accepted - tasks, _ = scheduler.tasks_and_state_for_offer( - fake_driver, make_fake_offer(pool="notdefault"), {} - ) - assert len(tasks) == 0 - - tasks, _ = scheduler.tasks_and_state_for_offer( - fake_driver, make_fake_offer(pool=None), {} - ) - assert len(tasks) == 0 - - tasks = scheduler.launch_tasks_for_offers(fake_driver, [make_fake_offer()]) - task_id = tasks[0]["task_id"]["value"] - task_name = tasks[0]["name"] - assert len(scheduler.task_store.get_all_tasks()) == 1 - assert len(tasks) == 1 - assert ( - scheduler.need_more_tasks( - task_name, scheduler.task_store.get_all_tasks(), [] - ) - is False - ) - assert scheduler.need_to_stop() is False - - no_tasks = scheduler.launch_tasks_for_offers( - fake_driver, [make_fake_offer()] - ) - assert len(scheduler.task_store.get_all_tasks()) == 1 - assert len(no_tasks) == 0 - assert scheduler.need_to_stop() is False - - scheduler.statusUpdate( - fake_driver, - { - "task_id": {"value": task_id}, - "state": native_scheduler.TASK_FINISHED, - }, - ) - assert len(scheduler.task_store.get_all_tasks()) == 1 - assert scheduler.need_to_stop() is True - - @mock.patch("paasta_tools.frameworks.native_scheduler._log", autospec=True) - def 
test_can_run_multiple_copies(self, mock_log, system_paasta_config): - service_name = "service_name" - instance_name = "instance_name" - cluster = "cluster" - - service_configs = [ - NativeServiceConfig( - service=service_name, - instance=instance_name, - cluster=cluster, - config_dict={ - "cpus": 0.1, - "mem": 50, - "instances": 3, - "cmd": "sleep 50", - "drain_method": "test", - }, - branch_dict={ - "docker_image": "busybox", - "image_version": None, - "desired_state": "start", - "force_bounce": None, - }, - soa_dir="/nail/etc/services", - ) - ] - - scheduler = adhoc_scheduler.AdhocScheduler( - service_name=service_name, - instance_name=instance_name, - cluster=cluster, - system_paasta_config=system_paasta_config, - service_config=service_configs[0], - dry_run=False, - reconcile_start_time=0, - staging_timeout=30, - service_config_overrides={"instances": 5}, - task_store_type=DictTaskStore, - ) - - fake_driver = mock.Mock() - - scheduler.registered( - driver=fake_driver, frameworkId={"value": "foo"}, masterInfo=mock.Mock() - ) - - with mock.patch( - "paasta_tools.utils.load_system_paasta_config", - autospec=True, - return_value=system_paasta_config, - ): - tasks = scheduler.launch_tasks_for_offers(fake_driver, [make_fake_offer()]) - task_name = tasks[0]["name"] - task_ids = [t["task_id"]["value"] for t in tasks] - - assert len(scheduler.task_store.get_all_tasks()) == 5 - assert len(tasks) == 5 - assert ( - scheduler.need_more_tasks( - task_name, scheduler.task_store.get_all_tasks(), [] - ) - is False - ) - assert scheduler.need_to_stop() is False - - no_tasks = scheduler.launch_tasks_for_offers( - fake_driver, [make_fake_offer()] - ) - assert len(scheduler.task_store.get_all_tasks()) == 5 - assert len(no_tasks) == 0 - assert scheduler.need_to_stop() is False - - for idx, task_id in enumerate(task_ids): - scheduler.statusUpdate( - fake_driver, - { - "task_id": {"value": task_id}, - "state": native_scheduler.TASK_FINISHED, - }, - ) - assert scheduler.need_to_stop() 
is (idx == 4) diff --git a/tests/frameworks/test_native_scheduler.py b/tests/frameworks/test_native_scheduler.py deleted file mode 100644 index 892eec75ba..0000000000 --- a/tests/frameworks/test_native_scheduler.py +++ /dev/null @@ -1,489 +0,0 @@ -import mock -import pytest -from addict import Dict - -from paasta_tools import utils -from paasta_tools.frameworks import native_scheduler -from paasta_tools.frameworks.native_scheduler import TASK_KILLED -from paasta_tools.frameworks.native_scheduler import TASK_RUNNING -from paasta_tools.frameworks.native_service_config import NativeServiceConfig -from paasta_tools.frameworks.task_store import DictTaskStore - - -@pytest.fixture -def system_paasta_config(): - return utils.SystemPaastaConfig( - {"docker_registry": "fake", "volumes": [], "dockercfg_location": "/foo/bar"}, - "/fake/system/configs", - ) - - -def make_fake_offer( - cpu=50000, mem=50000, port_begin=31000, port_end=32000, pool="default" -): - offer = Dict( - agent_id=Dict(value="super_big_slave"), - resources=[ - Dict(name="cpus", scalar=Dict(value=cpu)), - Dict(name="mem", scalar=Dict(value=mem)), - Dict( - name="ports", ranges=Dict(range=[Dict(begin=port_begin, end=port_end)]) - ), - ], - attributes=[], - ) - - if pool is not None: - offer.attributes = [Dict(name="pool", text=Dict(value=pool))] - - return offer - - -class TestNativeScheduler: - @mock.patch("paasta_tools.frameworks.native_scheduler._log", autospec=True) - def test_start_upgrade_rollback_scaledown(self, mock_log, system_paasta_config): - service_name = "service_name" - instance_name = "instance_name" - cluster = "cluster" - - service_configs = [] - for force_bounce in range(2): - service_configs.append( - NativeServiceConfig( - service=service_name, - instance=instance_name, - cluster=cluster, - config_dict={ - "cpus": 0.1, - "mem": 50, - "instances": 3, - "cmd": "sleep 50", - "drain_method": "test", - }, - branch_dict={ - "docker_image": "busybox", - "image_version": None, - "desired_state": 
"start", - "force_bounce": str(force_bounce), - }, - soa_dir="/nail/etc/services", - ) - ) - - scheduler = native_scheduler.NativeScheduler( - service_name=service_name, - instance_name=instance_name, - cluster=cluster, - system_paasta_config=system_paasta_config, - service_config=service_configs[0], - staging_timeout=1, - task_store_type=DictTaskStore, - ) - fake_driver = mock.Mock() - scheduler.registered( - driver=fake_driver, frameworkId={"value": "foo"}, masterInfo=mock.Mock() - ) - - with mock.patch( - "paasta_tools.utils.load_system_paasta_config", - autospec=True, - return_value=system_paasta_config, - ): - # First, start up 3 old tasks - old_tasks = scheduler.launch_tasks_for_offers( - fake_driver, [make_fake_offer()] - ) - assert len(scheduler.task_store.get_all_tasks()) == 3 - # and mark the old tasks as up - for task in old_tasks: - scheduler.statusUpdate( - fake_driver, dict(task_id=task["task_id"], state=TASK_RUNNING) - ) - assert len(scheduler.drain_method.downed_task_ids) == 0 - - # Now, change force_bounce - scheduler.service_config = service_configs[1] - - # and start 3 more tasks - new_tasks = scheduler.launch_tasks_for_offers( - fake_driver, [make_fake_offer()] - ) - assert len(scheduler.task_store.get_all_tasks()) == 6 - # It should not drain anything yet, since the new tasks aren't up. - scheduler.kill_tasks_if_necessary(fake_driver) - assert len(scheduler.task_store.get_all_tasks()) == 6 - assert len(scheduler.drain_method.downed_task_ids) == 0 - - # Now we mark the new tasks as up. - for i, task in enumerate(new_tasks): - scheduler.statusUpdate( - fake_driver, dict(task_id=task["task_id"], state=TASK_RUNNING) - ) - # As each of these new tasks come up, we should drain an old one. - assert len(scheduler.drain_method.downed_task_ids) == i + 1 - - # Now let's roll back and make sure it undrains the old ones and drains new. 
- scheduler.service_config = service_configs[0] - scheduler.kill_tasks_if_necessary(fake_driver) - assert scheduler.drain_method.downed_task_ids == set() - scheduler.kill_tasks_if_necessary(fake_driver) - assert scheduler.drain_method.downed_task_ids == { - t["task_id"]["value"] for t in new_tasks - } - - # Once we drain the new tasks, it should kill them. - assert fake_driver.killTask.call_count == 0 - - # we issue duplicate kills for tasks until we get notified about TASK_KILLED, so we keep track of - # the unique IDs of tasks being killed. - killed_tasks = set() - - def killTask_side_effect(task_id): - killed_tasks.add(task_id["value"]) - - fake_driver.killTask.side_effect = killTask_side_effect - - scheduler.drain_method.mark_arbitrary_task_as_safe_to_kill() - scheduler.kill_tasks_if_necessary(fake_driver) - assert len(killed_tasks) == 1 - scheduler.drain_method.mark_arbitrary_task_as_safe_to_kill() - scheduler.kill_tasks_if_necessary(fake_driver) - assert len(killed_tasks) == 2 - scheduler.drain_method.mark_arbitrary_task_as_safe_to_kill() - scheduler.kill_tasks_if_necessary(fake_driver) - assert scheduler.drain_method.safe_to_kill_task_ids == { - t["task_id"]["value"] for t in new_tasks - } - assert len(killed_tasks) == 3 - - for task in new_tasks: - fake_driver.killTask.assert_any_call(task["task_id"]) - - # Now tell the scheduler those tasks have died. - for task in new_tasks: - scheduler.statusUpdate( - fake_driver, dict(task_id=task["task_id"], state=TASK_KILLED) - ) - - # Clean up the TestDrainMethod for the rest of this test. - assert not list(scheduler.drain_method.downed_task_ids) - - # Now scale down old app - scheduler.service_config.config_dict["instances"] = 2 - scheduler.kill_tasks_if_necessary(fake_driver) - assert len(scheduler.drain_method.downed_task_ids) == 1 - - # mark it as drained and let the scheduler kill it. 
- scheduler.drain_method.mark_arbitrary_task_as_safe_to_kill() - killed_tasks.clear() - scheduler.kill_tasks_if_necessary(fake_driver) - assert len(killed_tasks) == 1 - - def test_tasks_for_offer_chooses_port(self, system_paasta_config): - service_name = "service_name" - instance_name = "instance_name" - cluster = "cluster" - - service_configs = [] - service_configs.append( - NativeServiceConfig( - service=service_name, - instance=instance_name, - cluster=cluster, - config_dict={ - "cpus": 0.1, - "mem": 50, - "instances": 1, - "cmd": "sleep 50", - "drain_method": "test", - }, - branch_dict={ - "docker_image": "busybox", - "image_version": None, - "desired_state": "start", - "force_bounce": "0", - }, - soa_dir="/nail/etc/services", - ) - ) - - scheduler = native_scheduler.NativeScheduler( - service_name=service_name, - instance_name=instance_name, - cluster=cluster, - system_paasta_config=system_paasta_config, - service_config=service_configs[0], - reconcile_start_time=0, - staging_timeout=1, - task_store_type=DictTaskStore, - ) - scheduler.registered( - driver=mock.Mock(), frameworkId={"value": "foo"}, masterInfo=mock.Mock() - ) - - with mock.patch( - "paasta_tools.utils.load_system_paasta_config", - autospec=True, - return_value=system_paasta_config, - ): - tasks, _ = scheduler.tasks_and_state_for_offer( - mock.Mock(), make_fake_offer(port_begin=12345, port_end=12345), {} - ) - - assert { - "name": "ports", - "ranges": {"range": [{"begin": 12345, "end": 12345}]}, - "type": "RANGES", - } in tasks[0]["resources"] - - def test_offer_matches_pool(self): - service_name = "service_name" - instance_name = "instance_name" - cluster = "cluster" - - service_config = NativeServiceConfig( - service=service_name, - instance=instance_name, - cluster=cluster, - config_dict={ - "cpus": 0.1, - "mem": 50, - "instances": 1, - "cmd": "sleep 50", - "drain_method": "test", - "pool": "default", - }, - branch_dict={ - "docker_image": "busybox", - "image_version": None, - "desired_state": 
"start", - "force_bounce": "0", - }, - soa_dir="/nail/etc/services", - ) - - scheduler = native_scheduler.NativeScheduler( - service_name=service_name, - instance_name=instance_name, - cluster=cluster, - system_paasta_config=system_paasta_config, - service_config=service_config, - staging_timeout=1, - task_store_type=DictTaskStore, - ) - scheduler.registered( - driver=mock.Mock(), frameworkId={"value": "foo"}, masterInfo=mock.Mock() - ) - - assert scheduler.offer_matches_pool( - make_fake_offer(port_begin=12345, port_end=12345, pool="default") - ) - assert not scheduler.offer_matches_pool( - make_fake_offer(port_begin=12345, port_end=12345, pool="somethingelse") - ) - assert not scheduler.offer_matches_pool( - make_fake_offer(port_begin=12345, port_end=12345, pool=None) - ) - - -class TestNativeServiceConfig: - def test_base_task(self, system_paasta_config): - service_name = "service_name" - instance_name = "instance_name" - cluster = "cluster" - - service_config = NativeServiceConfig( - service=service_name, - instance=instance_name, - cluster=cluster, - config_dict={ - "cpus": 0.1, - "mem": 50, - "instances": 3, - "cmd": "sleep 50", - "drain_method": "test", - "extra_volumes": [ - {"containerPath": "/foo", "hostPath": "/bar", "mode": "RW"} - ], - "uses_bulkdata": False, - }, - branch_dict={ - "docker_image": "busybox", - "image_version": None, - "desired_state": "start", - "force_bounce": "0", - }, - soa_dir="/nail/etc/services", - ) - - with mock.patch( - "paasta_tools.utils.load_system_paasta_config", - autospec=True, - return_value=system_paasta_config, - ), mock.patch( - "paasta_tools.utils.InstanceConfig.use_docker_disk_quota", - autospec=True, - return_value=True, - ): - task = service_config.base_task(system_paasta_config) - - assert task == { - "container": { - "type": "DOCKER", - "docker": { - "image": "fake/busybox", - "parameters": [ - {"key": "memory-swap", "value": mock.ANY}, - {"key": "cpu-period", "value": mock.ANY}, - {"key": "cpu-quota", "value": 
mock.ANY}, - {"key": "storage-opt", "value": mock.ANY}, - {"key": "label", "value": mock.ANY}, # service - {"key": "label", "value": mock.ANY}, # instance - {"key": "init", "value": "true"}, - {"key": "cap-drop", "value": "SETPCAP"}, - {"key": "cap-drop", "value": "MKNOD"}, - {"key": "cap-drop", "value": "AUDIT_WRITE"}, - {"key": "cap-drop", "value": "CHOWN"}, - {"key": "cap-drop", "value": "NET_RAW"}, - {"key": "cap-drop", "value": "DAC_OVERRIDE"}, - {"key": "cap-drop", "value": "FOWNER"}, - {"key": "cap-drop", "value": "FSETID"}, - {"key": "cap-drop", "value": "KILL"}, - {"key": "cap-drop", "value": "SETGID"}, - {"key": "cap-drop", "value": "SETUID"}, - {"key": "cap-drop", "value": "NET_BIND_SERVICE"}, - {"key": "cap-drop", "value": "SYS_CHROOT"}, - {"key": "cap-drop", "value": "SETFCAP"}, - ], - "network": "BRIDGE", - "port_mappings": [ - {"container_port": 8888, "host_port": 0, "protocol": "tcp"} - ], - }, - "volumes": [ - {"mode": "RW", "container_path": "/foo", "host_path": "/bar"} - ], - }, - "command": { - "value": "sleep 50", - "uris": [ - { - "value": system_paasta_config.get_dockercfg_location(), - "extract": False, - } - ], - }, - "resources": [ - {"name": "cpus", "scalar": {"value": 0.1}, "type": "SCALAR"}, - {"name": "mem", "scalar": {"value": 50}, "type": "SCALAR"}, - {"name": "ports", "ranges": mock.ANY, "type": "RANGES"}, - ], - "name": mock.ANY, - "agent_id": {"value": ""}, - "task_id": {"value": ""}, - } - - assert task["name"].startswith("service_name.instance_name.gitbusybox.config") - - def test_resource_offers_ignores_blacklisted_slaves(self, system_paasta_config): - service_name = "service_name" - instance_name = "instance_name" - cluster = "cluster" - - service_configs = [ - NativeServiceConfig( - service=service_name, - instance=instance_name, - cluster=cluster, - config_dict={ - "cpus": 0.1, - "mem": 50, - "instances": 3, - "cmd": "sleep 50", - "drain_method": "test", - }, - branch_dict={"docker_image": "busybox", "desired_state": 
"start"}, - soa_dir="/nail/etc/services", - ) - ] - - scheduler = native_scheduler.NativeScheduler( - service_name=service_name, - instance_name=instance_name, - cluster=cluster, - system_paasta_config=system_paasta_config, - service_config=service_configs[0], - staging_timeout=1, - task_store_type=DictTaskStore, - ) - fake_driver = mock.Mock() - scheduler.registered( - driver=fake_driver, frameworkId={"value": "foo"}, masterInfo=mock.Mock() - ) - - scheduler.blacklist_slave("super big slave") - assert len(scheduler.blacklisted_slaves) == 1 - scheduler.resourceOffers(fake_driver, [make_fake_offer()]) - assert len(scheduler.task_store.get_all_tasks()) == 0 - - def test_make_drain_task_works_with_hacheck_drain_method( - self, system_paasta_config - ): - service_name = "service_name" - instance_name = "instance_name" - cluster = "cluster" - - service_config = NativeServiceConfig( - service=service_name, - instance=instance_name, - cluster=cluster, - config_dict={ - "cpus": 0.1, - "mem": 50, - "instances": 1, - "cmd": "sleep 50", - "drain_method": "hacheck", - "pool": "default", - }, - branch_dict={ - "docker_image": "busybox", - "image_version": None, - "desired_state": "start", - "force_bounce": "0", - }, - soa_dir="/nail/etc/services", - ) - - scheduler = native_scheduler.NativeScheduler( - service_name=service_name, - instance_name=instance_name, - cluster=cluster, - system_paasta_config=system_paasta_config, - service_config=service_config, - staging_timeout=1, - task_store_type=DictTaskStore, - ) - - fake_driver = mock.Mock() - scheduler.registered( - driver=fake_driver, frameworkId={"value": "foo"}, masterInfo=mock.Mock() - ) - - # launch a task - offer = make_fake_offer(port_begin=31337, port_end=31337) - with mock.patch( - "paasta_tools.utils.load_system_paasta_config", - autospec=True, - return_value=system_paasta_config, - ): - scheduler.launch_tasks_for_offers(driver=fake_driver, offers=[offer]) - - expected = [ - 
"http://super_big_slave:6666/spool/service_name.instance_name/31337/status" - ] - actual = scheduler.drain_method.spool_urls( - scheduler.make_drain_task( - list(scheduler.task_store.get_all_tasks().keys())[0] - ) - ) - assert actual == expected diff --git a/tests/frameworks/test_task_store.py b/tests/frameworks/test_task_store.py deleted file mode 100644 index 6f48a400fa..0000000000 --- a/tests/frameworks/test_task_store.py +++ /dev/null @@ -1,168 +0,0 @@ -import json - -import mock -import pytest -from kazoo.client import KazooClient -from kazoo.exceptions import BadVersionError -from kazoo.exceptions import NodeExistsError -from kazoo.exceptions import NoNodeError - -from paasta_tools.frameworks.task_store import DictTaskStore -from paasta_tools.frameworks.task_store import MesosTaskParameters -from paasta_tools.frameworks.task_store import ZKTaskStore - - -def test_DictTaskStore(): - task_store = DictTaskStore( - service_name="foo", - instance_name="bar", - framework_id="foo", - system_paasta_config=None, - ) - task_store.add_task_if_doesnt_exist("task_id", mesos_task_state="foo") - - task_store.update_task("task_id", is_draining=True) - - assert task_store.get_all_tasks() == { - "task_id": MesosTaskParameters(mesos_task_state="foo", is_draining=True) - } - - task_store.update_task("task_id", mesos_task_state="bar") - - assert task_store.get_all_tasks() == { - "task_id": MesosTaskParameters(mesos_task_state="bar", is_draining=True) - } - - -class TestMesosTaskParameters: - def test_serdes(self): - param_dict = { - "health": "health", - "mesos_task_state": "mesos_task_state", - "is_draining": True, - "is_healthy": True, - "offer": "offer", - "resources": "resources", - } - - assert json.loads(MesosTaskParameters(**param_dict).serialize()) == param_dict - assert MesosTaskParameters.deserialize( - json.dumps(param_dict) - ) == MesosTaskParameters(**param_dict) - - -class TestZKTaskStore: - @pytest.fixture - def mock_zk_client(self): - spec_zk_client = 
KazooClient() - mock_zk_client = mock.Mock(spec=spec_zk_client) - with mock.patch( - "paasta_tools.frameworks.task_store.KazooClient", - autospec=True, - return_value=mock_zk_client, - ): - yield mock_zk_client - - def test_get_task(self, mock_zk_client): - zk_task_store = ZKTaskStore( - service_name="a", - instance_name="b", - framework_id="c", - system_paasta_config=mock.Mock(), - ) - - fake_znodestat = mock.Mock() - zk_task_store.zk_client.get.return_value = ( - '{"health": "healthy"}', - fake_znodestat, - ) - params, stat = zk_task_store._get_task("d") - zk_task_store.zk_client.get.assert_called_once_with("/d") - assert stat == fake_znodestat - assert params.health == "healthy" - - def test_update_task(self, mock_zk_client): - zk_task_store = ZKTaskStore( - service_name="a", - instance_name="b", - framework_id="c", - system_paasta_config=mock.Mock(), - ) - - # Happy case - task exists, no conflict on update. - fake_znodestat = mock.Mock(version=1) - zk_task_store.zk_client.get.return_value = ( - '{"health": "healthy"}', - fake_znodestat, - ) - new_params = zk_task_store.update_task("task_id", is_draining=True) - assert new_params.is_draining is True - assert new_params.health == "healthy" - - # Second happy case - no task exists. - fake_znodestat = mock.Mock(version=1) - zk_task_store.zk_client.get.side_effect = NoNodeError() - new_params = zk_task_store.update_task("task_id", is_draining=True) - assert new_params.is_draining is True - assert new_params.health is None - - # Someone changed our data out from underneath us. 
- zk_task_store.zk_client.get.reset_mock() - zk_task_store.zk_client.set.reset_mock() - zk_task_store.zk_client.get.side_effect = [ - ('{"health": "healthy"}', mock.Mock(version=1)), - ('{"health": "healthy", "offer": "offer"}', mock.Mock(version=2)), - ( - '{"health": "healthy", "offer": "offer", "resources": "resources"}', - mock.Mock(version=3), - ), - ] - zk_task_store.zk_client.set.side_effect = [ - BadVersionError, - BadVersionError, - None, - ] - new_params = zk_task_store.update_task("task_id", is_draining=True) - assert zk_task_store.zk_client.get.call_count == 3 - zk_task_store.zk_client.get.assert_has_calls( - [mock.call("/task_id"), mock.call("/task_id"), mock.call("/task_id")] - ) - assert zk_task_store.zk_client.set.call_count == 3 - zk_task_store.zk_client.set.assert_has_calls( - [ - mock.call("/task_id", mock.ANY, version=1), - mock.call("/task_id", mock.ANY, version=2), - mock.call("/task_id", mock.ANY, version=3), - ] - ) - assert new_params.is_draining is True - assert new_params.health == "healthy" - assert new_params.offer == "offer" - assert new_params.resources == "resources" - - # Data wasn't there when we read it, but then was when we tried to create it - zk_task_store.zk_client.get.reset_mock() - zk_task_store.zk_client.set.reset_mock() - zk_task_store.zk_client.create.reset_mock() - zk_task_store.zk_client.get.side_effect = [ - NoNodeError, - ('{"health": "healthy"}', mock.Mock(version=1)), - ] - zk_task_store.zk_client.create.side_effect = [NodeExistsError] - zk_task_store.zk_client.set.side_effect = [None] - new_params = zk_task_store.update_task("task_id", is_draining=True) - assert zk_task_store.zk_client.get.call_count == 2 - zk_task_store.zk_client.get.assert_has_calls( - [mock.call("/task_id"), mock.call("/task_id")] - ) - assert zk_task_store.zk_client.create.call_count == 1 - zk_task_store.zk_client.create.assert_has_calls( - [mock.call("/task_id", mock.ANY)] - ) - assert zk_task_store.zk_client.set.call_count == 1 - 
zk_task_store.zk_client.set.assert_has_calls( - [mock.call("/task_id", mock.ANY, version=1)] - ) - assert new_params.is_draining is True - assert new_params.health == "healthy" - assert new_params.offer is None diff --git a/tests/mesos/test_cluster.py b/tests/mesos/test_cluster.py deleted file mode 100644 index 239928c588..0000000000 --- a/tests/mesos/test_cluster.py +++ /dev/null @@ -1,46 +0,0 @@ -import a_sync -import asynctest -from mock import Mock -from pytest import raises - -from paasta_tools.async_utils import aiter_to_list -from paasta_tools.mesos import cluster -from paasta_tools.mesos import exceptions -from paasta_tools.mesos import task - - -def test_get_files_for_tasks_no_files(): - attrs = {"id": "foo"} - mock_task = asynctest.MagicMock(spec=task.Task) - mock_task.__getitem__.side_effect = lambda x: attrs[x] - mock_file = Mock() - mock_file.exists = asynctest.CoroutineMock(return_value=False) - mock_task.file.return_value = mock_file - files = cluster.get_files_for_tasks([mock_task], ["myfile"], 1) - with raises(exceptions.FileNotFoundForTaskException) as excinfo: - files = a_sync.block(aiter_to_list, files) - assert "None of the tasks in foo contain the files in list myfile" in str( - excinfo.value - ) - - -def test_get_files_for_tasks_all(): - mock_task = asynctest.MagicMock(spec=task.Task) - mock_file = Mock() - mock_file.exists = asynctest.CoroutineMock(return_value=True) - mock_task.file.return_value = mock_file - files = cluster.get_files_for_tasks([mock_task], ["myfile"], 1) - files = a_sync.block(aiter_to_list, files) - assert files == [mock_file] - - -def test_get_files_for_tasks_some(): - mock_task = asynctest.MagicMock(spec=task.Task) - mock_file = Mock() - mock_file_2 = Mock() - mock_file.exists = asynctest.CoroutineMock(return_value=False) - mock_file_2.exists = asynctest.CoroutineMock(return_value=True) - mock_task.file.side_effect = [mock_file, mock_file_2] - files = cluster.get_files_for_tasks([mock_task], ["myfile", "myotherfile"], 
1) - files = a_sync.block(aiter_to_list, files) - assert files == [mock_file_2] diff --git a/tests/mesos/test_master.py b/tests/mesos/test_master.py deleted file mode 100644 index 28e1bb17eb..0000000000 --- a/tests/mesos/test_master.py +++ /dev/null @@ -1,127 +0,0 @@ -from asynctest import CoroutineMock -from asynctest import patch -from mock import call -from mock import Mock -from pytest import mark - -from paasta_tools.mesos import framework -from paasta_tools.mesos import master -from paasta_tools.mesos import task - - -@mark.asyncio -async def test_frameworks(): - with patch.object( - master.MesosMaster, "_framework_list", autospec=True - ) as mock_framework_list: - fake_frameworks = [{"name": "test_framework1"}, {"name": "test_framework2"}] - mock_framework_list.return_value = fake_frameworks - expected_frameworks = [ - framework.Framework(config) for config in fake_frameworks - ] - mesos_master = master.MesosMaster({}) - assert expected_frameworks == await mesos_master.frameworks() - - -@mark.asyncio -async def test_framework_list_includes_completed_frameworks(): - with patch.object( - master.MesosMaster, "_framework_list", autospec=True - ) as mock_framework_list: - fake_frameworks = [{"name": "test_framework1"}, {"name": "test_framework2"}] - mock_framework_list.return_value = fake_frameworks - expected_frameworks = [ - framework.Framework(config) for config in fake_frameworks - ] - mesos_master = master.MesosMaster({}) - assert expected_frameworks == await mesos_master.frameworks() - - -@mark.asyncio -async def test__frameworks(): - with patch.object(master.MesosMaster, "fetch", autospec=True) as mock_fetch: - mesos_master = master.MesosMaster({}) - mock_frameworks = Mock() - mock_fetch.return_value = CoroutineMock( - json=CoroutineMock(return_value=mock_frameworks) - ) - ret = await mesos_master._frameworks() - mock_fetch.assert_called_with(mesos_master, "/master/frameworks", cached=True) - assert ret == mock_frameworks - - -@mark.asyncio -async def 
test__framework_list(): - mock_frameworks = Mock() - mock_completed = Mock() - with patch.object( - master.MesosMaster, - "_frameworks", - autospec=True, - return_value={ - "frameworks": [mock_frameworks], - "completed_frameworks": [mock_completed], - }, - ): - mesos_master = master.MesosMaster({}) - ret = await mesos_master._framework_list() - expected = [mock_frameworks, mock_completed] - assert list(ret) == expected - - ret = await mesos_master._framework_list(active_only=True) - expected = [mock_frameworks] - assert list(ret) == expected - - -@mark.asyncio -async def test__task_list(): - mock_task_1 = Mock() - mock_task_2 = Mock() - mock_framework = {"tasks": [mock_task_1], "completed_tasks": [mock_task_2]} - with patch.object( - master.MesosMaster, - "_framework_list", - autospec=True, - return_value=[mock_framework], - ) as mock__frameworks_list: - mesos_master = master.MesosMaster({}) - ret = await mesos_master._task_list() - mock__frameworks_list.assert_called_with(mesos_master, False) - expected = [mock_task_1, mock_task_2] - assert list(ret) == expected - - ret = await mesos_master._task_list(active_only=True) - expected = [mock_task_1] - assert list(ret) == expected - - ret = await mesos_master._task_list(active_only=False) - expected = [mock_task_1, mock_task_2] - assert list(ret) == expected - - -@mark.asyncio -async def test_tasks(): - with patch.object( - master.MesosMaster, "_task_list", autospec=True - ) as mock__task_list, patch.object(task, "Task", autospec=True) as mock_task: - - mock_task_1 = {"id": "aaa"} - mock_task_2 = {"id": "bbb"} - mock__task_list.return_value = [mock_task_1, mock_task_2] - mock_task.return_value = Mock() - mesos_master = master.MesosMaster({}) - ret = await mesos_master.tasks() - mock_task.assert_has_calls( - [call(mesos_master, mock_task_1), call(mesos_master, mock_task_2)] - ) - mock__task_list.assert_called_with(mesos_master, False) - expected = [mock_task.return_value, mock_task.return_value] - assert list(ret) == 
expected - - -@mark.asyncio -async def test_orphan_tasks(): - mesos_master = master.MesosMaster({}) - mock_task_1 = Mock() - mesos_master.state = CoroutineMock(return_value={"orphan_tasks": [mock_task_1]}) - assert await mesos_master.orphan_tasks() == [mock_task_1] diff --git a/tests/metrics/test_metastatus_lib.py b/tests/metrics/test_metastatus_lib.py index 45c25b860a..090f20b29a 100644 --- a/tests/metrics/test_metastatus_lib.py +++ b/tests/metrics/test_metastatus_lib.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import inspect import re import mock @@ -39,17 +38,7 @@ def test_fail_check_threshold(): assert not metastatus_lib.check_threshold(80, 30) -def test_get_mesos_cpu_status(): - fake_metrics = {"master/cpus_total": 3, "master/cpus_used": 1} - fake_mesos_state = { - "slaves": [{"reserved_resources": {"maintenance": {"cpus": 1}}}] - } - total, used, available = metastatus_lib.get_mesos_cpu_status( - fake_metrics, fake_mesos_state - ) - assert total == 3 - assert used == 2 - assert available == 1 +# test_get_mesos_cpu_status removed - function deleted with Mesos cleanup def test_get_kube_cpu_status(): @@ -139,25 +128,7 @@ def test_assert_bad_gpu_health(): ) -def test_cpu_health_mesos_reports_zero(): - status = (0, 1, 42) - failure_output, failure_health = metastatus_lib.assert_cpu_health(status) - assert failure_output == "Error reading total available cpu from mesos!" - assert failure_health is False - - -def test_memory_health_mesos_reports_zero(): - status = (0, 1, 42) - failure_output, failure_health = metastatus_lib.assert_memory_health(status) - assert failure_output == "Error reading total available memory from mesos!" 
- assert failure_health is False - - -def test_disk_health_mesos_reports_zero(): - status = (0, 1, 42) - failure_output, failure_health = metastatus_lib.assert_disk_health(status) - assert failure_output == "Error reading total available disk from mesos!" - assert failure_health is False +# Mesos health check tests removed - functions deleted with Mesos cleanup def test_assert_kube_deployments(): @@ -196,42 +167,7 @@ def test_assert_nodes_health(): assert ok -def test_get_mesos_slaves_health_status(): - fake_slave_info = {"master/slaves_active": 10, "master/slaves_inactive": 7} - active, inactive = metastatus_lib.get_mesos_slaves_health_status(fake_slave_info) - assert active == 10 - assert inactive == 7 - - -def test_assert_mesos_tasks_running(): - fake_tasks_info = { - "master/tasks_running": 20, - "master/tasks_staging": 10, - "master/tasks_starting": 10, - } - output, ok = metastatus_lib.assert_mesos_tasks_running(fake_tasks_info) - assert "Tasks: running: 20 staging: 10 starting: 10" in output - assert ok - - -@patch("paasta_tools.metrics.metastatus_lib.get_mesos_quorum", autospec=True) -@patch("paasta_tools.metrics.metastatus_lib.get_num_masters", autospec=True) -def test_healthy_asssert_quorum_size(mock_num_masters, mock_quorum_size): - mock_num_masters.return_value = 5 - mock_quorum_size.return_value = 3 - output, health = metastatus_lib.assert_quorum_size() - assert health - assert "Quorum: masters: 5 configured quorum: 3 " in output - - -@patch("paasta_tools.metrics.metastatus_lib.get_mesos_quorum", autospec=True) -@patch("paasta_tools.metrics.metastatus_lib.get_num_masters", autospec=True) -def test_unhealthy_asssert_quorum_size(mock_num_masters, mock_quorum_size): - mock_num_masters.return_value = 1 - mock_quorum_size.return_value = 3 - output, health = metastatus_lib.assert_quorum_size() - assert not health - assert "CRITICAL: Number of masters (1) less than configured quorum(3)." 
in output +# Mesos-related test functions removed - underlying functions deleted with Mesos cleanup def test_status_for_results(): @@ -264,19 +200,7 @@ def test_critical_events_in_outputs(): ) == [("myservice_false", False)] -def test_filter_mesos_state_metrics(): - test_resource_dictionary = { - "cpus": 0, - "mem": 1, - "MEM": 2, - "garbage_data": 3, - "disk": 4, - "gpus": 5, - } - expected = {"cpus": 0, "mem": 1, "disk": 4, "gpus": 5} - assert ( - metastatus_lib.filter_mesos_state_metrics(test_resource_dictionary) == expected - ) +# test_filter_mesos_state_metrics removed - function deleted with Mesos cleanup def test_filter_kube_resources(): @@ -292,21 +216,7 @@ def test_filter_kube_resources(): assert metastatus_lib.filter_kube_resources(test_resource_dictionary) == expected -def test_filter_slaves(): - filters = {"foo": ["one", "two"], "bar": ["three", "four"]} - fns = [metastatus_lib.make_filter_slave_func(k, v) for k, v in filters.items()] - - data = [ - {"name": "aaa", "attributes": {"foo": "one", "bar": "three"}}, - {"name": "bbb", "attributes": {"foo": "one"}}, - {"name": "ccc", "attributes": {"foo": "wrong", "bar": "four"}}, - ] - - slaves = metastatus_lib.filter_slaves(data, fns) - names = [s["name"] for s in slaves] - assert "aaa" in names - assert "bbb" not in names - assert "ccc" not in names +# test_filter_slaves removed - function deleted with Mesos cleanup def test_group_slaves_by_key_func(): @@ -333,195 +243,19 @@ def test_group_slaves_by_key_func(): assert len(list(v)) == 1 -@patch("paasta_tools.metrics.metastatus_lib.group_slaves_by_key_func", autospec=True) -@patch( - "paasta_tools.metrics.metastatus_lib.calculate_resource_utilization_for_slaves", - autospec=True, -) -@patch("paasta_tools.metrics.metastatus_lib.get_all_tasks_from_state", autospec=True) -def test_get_resource_utilization_by_grouping( - mock_get_all_tasks_from_state, - mock_calculate_resource_utilization_for_slaves, - mock_group_slaves_by_key_func, -): - 
mock_group_slaves_by_key_func.return_value = { - "somenametest-habitat": [{"id": "abcd", "hostname": "test.somewhere.www"}], - "somenametest-habitat-2": [{"id": "abcd", "hostname": "test2.somewhere.www"}], - } - mock_calculate_resource_utilization_for_slaves.return_value = { - "free": metastatus_lib.ResourceInfo(cpus=10, mem=10, disk=10), - "total": metastatus_lib.ResourceInfo(cpus=20, mem=20, disk=20), - } - state = {"frameworks": Mock(), "slaves": [{"id": "abcd"}]} - actual = metastatus_lib.get_resource_utilization_by_grouping( - grouping_func=mock.sentinel.grouping_func, mesos_state=state - ) - mock_get_all_tasks_from_state.assert_called_with(state, include_orphans=True) - assert sorted(actual.keys()) == sorted( - ["somenametest-habitat", "somenametest-habitat-2"] - ) - for k, v in actual.items(): - assert v["total"] == metastatus_lib.ResourceInfo(cpus=20, disk=20, mem=20) - assert v["free"] == metastatus_lib.ResourceInfo(cpus=10, disk=10, mem=10) - - -def test_get_resource_utilization_by_grouping_correctly_groups(): - fake_state = { - "slaves": [ - { - "id": "foo", - "resources": {"disk": 100, "cpus": 10, "mem": 50}, - "reserved_resources": {}, - }, - { - "id": "bar", - "resources": {"disk": 100, "cpus": 10, "mem": 50}, - "reserved_resources": {}, - }, - ], - "frameworks": [ - { - "tasks": [ - { - "state": "TASK_RUNNING", - "resources": {"cpus": 1, "mem": 10, "disk": 10}, - "slave_id": "foo", - }, - { - "state": "TASK_RUNNING", - "resources": {"cpus": 1, "mem": 10, "disk": 10}, - "slave_id": "bar", - }, - ] - } - ], - } +# test_get_resource_utilization_by_grouping removed - function deleted with Mesos cleanup - def grouping_func(x): - return x["id"] - - free_cpus = metastatus_lib.get_resource_utilization_by_grouping( - mesos_state=fake_state, grouping_func=grouping_func - )["foo"]["free"].cpus - assert free_cpus == 9 - - -def test_get_resource_utilization_by_grouping_correctly_multi_groups(): - fake_state = { - "slaves": [ - { - "id": "foo1", - "resources": 
{"disk": 100, "cpus": 10, "mem": 50}, - "attributes": {"one": "yes", "two": "yes"}, - "reserved_resources": {}, - }, - { - "id": "bar1", - "resources": {"disk": 100, "cpus": 10, "mem": 50}, - "attributes": {"one": "yes", "two": "no"}, - "reserved_resources": {}, - }, - { - "id": "foo2", - "resources": {"disk": 100, "cpus": 10, "mem": 50}, - "attributes": {"one": "no", "two": "yes"}, - "reserved_resources": {}, - }, - { - "id": "bar2", - "resources": {"disk": 100, "cpus": 10, "mem": 50}, - "attributes": {"one": "no", "two": "no"}, - "reserved_resources": {}, - }, - ], - "frameworks": [ - { - "tasks": [ - { - "state": "TASK_RUNNING", - "resources": {"cpus": 1, "mem": 10, "disk": 10}, - "slave_id": "foo1", - }, - { - "state": "TASK_RUNNING", - "resources": {"cpus": 1, "mem": 10, "disk": 10}, - "slave_id": "bar1", - }, - ] - } - ], - } - grouping_func = metastatus_lib.key_func_for_attribute_multi(["one", "two"]) - resp = metastatus_lib.get_resource_utilization_by_grouping( - mesos_state=fake_state, grouping_func=grouping_func - ) - # resp should have 4 keys... - assert len(resp.keys()) == 4 - # Each key should be a set with 2 items... 
- assert len(list(resp.keys())[0]) == 2 - # Each item in the set should have 2 values (original key, value) - assert len(list(list(resp.keys())[0])[0]) == 2 - - -def test_get_resource_utilization_per_slave(): - tasks = [ - {"resources": {"cpus": 10, "mem": 10, "disk": 10}, "state": "TASK_RUNNING"}, - {"resources": {"cpus": 10, "mem": 10, "disk": 10}, "state": "TASK_RUNNING"}, - ] - slaves = [ - { - "id": "somenametest-slave", - "hostname": "test.somewhere.www", - "resources": {"cpus": 75, "disk": 250, "mem": 100}, - "reserved_resources": {}, - "attributes": {"habitat": "somenametest-habitat"}, - }, - { - "id": "somenametest-slave2", - "hostname": "test2.somewhere.www", - "resources": {"cpus": 500, "disk": 200, "mem": 750}, - "reserved_resources": {"maintenance": {"cpus": 10, "disk": 0, "mem": 150}}, - "attributes": {"habitat": "somenametest-habitat-2"}, - }, - ] - actual = metastatus_lib.calculate_resource_utilization_for_slaves( - slaves=slaves, tasks=tasks - ) - assert sorted(actual.keys()) == sorted(["total", "free", "slave_count"]) - assert actual["total"] == metastatus_lib.ResourceInfo(cpus=575, disk=450, mem=850) - assert actual["free"] == metastatus_lib.ResourceInfo(cpus=545, disk=430, mem=680) - assert actual["slave_count"] == 2 +# test_get_resource_utilization_by_grouping_correctly_groups removed - function deleted with Mesos cleanup -def test_calculate_resource_utilization_for_slaves(): - fake_slaves = [ - { - "id": "somenametest-slave2", - "hostname": "test2.somewhere.www", - "resources": {"cpus": 500, "disk": 200, "mem": 750, "gpus": 5}, - "reserved_resources": {}, - "attributes": {"habitat": "somenametest-habitat-2"}, - } - ] - tasks = [ - { - "resources": {"cpus": 10, "mem": 10, "disk": 10, "gpus": 1}, - "state": "TASK_RUNNING", - }, - { - "resources": {"cpus": 10, "mem": 10, "disk": 10, "gpus": 2}, - "state": "TASK_RUNNING", - }, - ] - free = metastatus_lib.calculate_resource_utilization_for_slaves( - slaves=fake_slaves, tasks=tasks - )["free"] +# 
test_get_resource_utilization_by_grouping_correctly_multi_groups removed - function deleted with Mesos cleanup + + +# test_get_resource_utilization_per_slave removed - function deleted with Mesos cleanup - assert free.cpus == 480 - assert free.mem == 730 - assert free.disk == 180 - assert free.gpus == 2 + +# test_calculate_resource_utilization_for_slaves removed - function deleted with Mesos cleanup def test_calculate_resource_utilization_for_kube_nodes(): @@ -699,17 +433,10 @@ def test_get_table_rows_for_resource_usage_dict(mock_format_row): assert actual == ["myhabitat", "10/10", "10/10", "10/10"] -def test_key_func_for_attribute(): - assert inspect.isfunction(metastatus_lib.key_func_for_attribute("habitat")) +# test_key_func_for_attribute removed - function deleted with Mesos cleanup -def test_get_mesos_memory_status(): - metrics = {"master/mem_total": 100, "master/mem_used": 50} - fake_mesos_state = { - "slaves": [{"reserved_resources": {"maintenance": {"mem": 33}}}] - } - actual = metastatus_lib.get_mesos_memory_status(metrics, fake_mesos_state) - assert actual == (100, 83, 17) +# test_get_mesos_memory_status removed - function deleted with Mesos cleanup def test_get_kube_memory_status(): @@ -726,13 +453,7 @@ def test_get_kube_memory_status(): assert available == 1 * 1024 -def test_get_mesos_disk_status(): - metrics = {"master/disk_total": 100, "master/disk_used": 50} - fake_mesos_state = { - "slaves": [{"reserved_resources": {"maintenance": {"disk": 33}}}] - } - actual = metastatus_lib.get_mesos_disk_status(metrics, fake_mesos_state) - assert actual == (100, 83, 17) +# test_get_mesos_disk_status removed - function deleted with Mesos cleanup def test_get_kube_disk_status(): @@ -750,13 +471,7 @@ def test_get_kube_disk_status(): assert available == 1 * 1024**2 -def test_get_mesos_gpu_status(): - metrics = {"master/gpus_total": 10, "master/gpus_used": 5} - fake_mesos_state = { - "slaves": [{"reserved_resources": {"maintenance": {"gpus": 2}}}] - } - actual = 
metastatus_lib.get_mesos_gpu_status(metrics, fake_mesos_state) - assert actual == (10, 7, 3) +# test_get_mesos_gpu_status removed - function deleted with Mesos cleanup def test_get_kube_gpu_status(): @@ -773,26 +488,7 @@ def test_get_kube_gpu_status(): assert available == 1 -def test_reserved_maintenence_resources_no_maintenenance(): - actual = metastatus_lib.reserved_maintenence_resources({}) - assert all([actual[x] == 0 for x in ["cpus", "mem", "disk"]]) - - -def test_reserved_maintenence_resources(): - actual = metastatus_lib.reserved_maintenence_resources( - {"maintenance": {"cpus": 5, "mem": 5, "disk": 5}} - ) - assert all([actual[x] == 5 for x in ["cpus", "mem", "disk"]]) - - -def test_reserved_maintenence_resources_ignores_non_maintenance(): - actual = metastatus_lib.reserved_maintenence_resources( - { - "maintenance": {"cpus": 5, "mem": 5, "disk": 5}, - "myotherole": {"cpus": 5, "mem": 5, "disk": 5}, - } - ) - assert all([actual[x] == 5 for x in ["cpus", "mem", "disk"]]) +# test_reserved_maintenence_resources* functions removed - function deleted with Mesos cleanup def test_suffixed_number_value(): diff --git a/tests/test_check_spark_jobs.py b/tests/test_check_spark_jobs.py index 95551f21e4..3d9c519d2f 100644 --- a/tests/test_check_spark_jobs.py +++ b/tests/test_check_spark_jobs.py @@ -4,7 +4,6 @@ import pytest import paasta_tools.check_spark_jobs as check_spark_jobs -from paasta_tools.mesos.framework import Framework @pytest.fixture @@ -33,49 +32,12 @@ def mock_datetime(mock_current_time): @pytest.fixture def mock_get_frameworks(mock_current_time): - mock_current_timestamp = mock_current_time.timestamp() - mock_frameworks = [ - { - "id": "uuid1", - "name": "not_spark_25_hours", - "user": "test_user", - "active": True, - "registered_time": mock_current_timestamp - 60 * 60 * 25, - }, - { - "id": "uuid2", - "name": "spark_2_hours", - "principal": "spark", - "user": "test_user", - "active": True, - "webui_url": "url2", - "registered_time": 
mock_current_timestamp - 60 * 60 * 2, - }, - { - "id": "uuid3", - "name": "spark_25_hours", - "principal": "spark", - "user": "test_user", - "active": True, - "webui_url": "url3", - "registered_time": mock_current_timestamp - 60 * 60 * 25, - }, - { - "id": "uuid4", - "name": "spark_25_hours_inactive", - "principal": "spark", - "user": "test_user", - "active": False, - "webui_url": "url4", - "registered_time": mock_current_timestamp - 60 * 60 * 25, - }, - ] with mock.patch( - "paasta_tools.check_spark_jobs.mesos_tools.get_all_frameworks", autospec=True + "paasta_tools.check_spark_jobs.get_matching_framework_info", autospec=True ) as mock_get_frameworks: - mock_get_frameworks.return_value = [ - Framework(config) for config in mock_frameworks - ] + # Since mesos_tools.get_all_frameworks now returns empty list, + # we mock get_matching_framework_info directly for testing + mock_get_frameworks.return_value = [] yield mock_get_frameworks @@ -95,29 +57,13 @@ def test_guess_service(properties, expected): assert check_spark_jobs.guess_service(properties) == expected -@mock.patch("paasta_tools.check_spark_jobs.get_spark_properties", autospec=True) -@mock.patch("paasta_tools.check_spark_jobs.guess_service", autospec=True) -def test_get_matching_framework_info( - mock_guess_service, mock_get_spark_properties, mock_get_frameworks -): - mock_get_spark_properties.return_value = None - - more_than_20_hours = check_spark_jobs.get_matching_framework_info(min_hours=20) - assert len(more_than_20_hours) == 1 - assert more_than_20_hours == [ - { - "id": "uuid3", - "name": "spark_25_hours", - "webui_url": "url3", - "service": mock_guess_service.return_value, - "user": "test_user", - "time_running": "1 day, 1:00:00", - } - ] +def test_get_matching_framework_info(): + # Since Mesos is removed, get_matching_framework_info should return empty list + result = check_spark_jobs.get_matching_framework_info(min_hours=20) + assert result == [] - more_than_one_hour = 
check_spark_jobs.get_matching_framework_info(min_hours=1) - result_names = [result["name"] for result in more_than_one_hour] - assert result_names == ["spark_2_hours", "spark_25_hours"] + result = check_spark_jobs.get_matching_framework_info(min_hours=1) + assert result == [] @pytest.mark.parametrize( diff --git a/tests/test_mesos_tools.py b/tests/test_mesos_tools.py deleted file mode 100644 index a394bad50f..0000000000 --- a/tests/test_mesos_tools.py +++ /dev/null @@ -1,945 +0,0 @@ -# Copyright 2015-2016 Yelp Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import datetime -import random -import socket - -import asynctest -import docker -import mock -import requests -from pytest import mark -from pytest import raises - -from paasta_tools import mesos -from paasta_tools import mesos_tools -from paasta_tools import utils -from paasta_tools.utils import PaastaColors - - -def test_filter_running_tasks(): - tasks = [ - {"id": 1, "state": "TASK_RUNNING", "framework": {"active": True}}, - {"id": 2, "state": "TASK_FAILED", "framework": {"active": True}}, - ] - running = mesos_tools.filter_running_tasks(tasks) - assert len(running) == 1 - assert running[0]["id"] == 1 - - -def test_filter_not_running_tasks(): - tasks = [{"id": 1, "state": "TASK_RUNNING"}, {"id": 2, "state": "TASK_FAILED"}] - not_running = mesos_tools.filter_not_running_tasks(tasks) - assert len(not_running) == 1 - assert not_running[0]["id"] == 2 - - -@mark.parametrize( - "test_case", - [[0, 0], [10, 1 + 10]], # 1 running task, 10 non-running taks (truncated) -) -def test_status_mesos_tasks_verbose(test_case): - tail_lines, expected_format_tail_call_count = test_case - filter_string = "fake--service.fake--instance" - - with asynctest.patch( - "paasta_tools.mesos_tools.get_cached_list_of_running_tasks_from_frameworks", - autospec=True, - return_value=[{"id": filter_string}], - ), asynctest.patch( - "paasta_tools.mesos_tools.get_cached_list_of_not_running_tasks_from_frameworks", - autospec=True, - ) as get_cached_list_of_not_running_tasks_from_frameworks_patch, asynctest.patch( - "paasta_tools.mesos_tools.format_running_mesos_task_row", autospec=True - ) as format_running_mesos_task_row_patch, asynctest.patch( - "paasta_tools.mesos_tools.format_non_running_mesos_task_row", autospec=True - ) as format_non_running_mesos_task_row_patch, asynctest.patch( - "paasta_tools.mesos_tools.format_stdstreams_tail_for_task", autospec=True - ) as format_stdstreams_tail_for_task_patch: - - template_task_return = { - "id": filter_string, - "statuses": [{"timestamp": 
"##########"}], - "state": "NOT_RUNNING", - } - non_running_mesos_tasks = [] - for _ in range( - 15 - ): # exercise the code that sorts/truncates the list of non running tasks - task_return = template_task_return.copy() - task_return["statuses"][0]["timestamp"] = str( - 1457109986 + random.randrange(-60 * 60 * 24, 60 * 60 * 24) - ) - non_running_mesos_tasks.append(task_return) - get_cached_list_of_not_running_tasks_from_frameworks_patch.return_value = ( - non_running_mesos_tasks - ) - - format_running_mesos_task_row_patch.return_value = [ - "id", - "host", - "mem", - "cpu", - "time", - ] - format_non_running_mesos_task_row_patch.return_value = [ - "id", - "host", - "time", - "state", - ] - format_stdstreams_tail_for_task_patch.return_value = ["tail"] - - actual = mesos_tools.status_mesos_tasks_verbose( - filter_string=filter_string, - get_short_task_id=mock.sentinel.get_short_task_id, - tail_lines=tail_lines, - ) - assert "Running Tasks" in actual - assert "Non-Running Tasks" in actual - format_running_mesos_task_row_patch.assert_called_once_with( - {"id": filter_string}, mock.sentinel.get_short_task_id - ) - assert ( - format_non_running_mesos_task_row_patch.call_count == 10 - ) # maximum n of tasks we display - assert ( - format_stdstreams_tail_for_task_patch.call_count - == expected_format_tail_call_count - ) - - -@mark.asyncio -async def test_get_cpu_usage_good(): - fake_task = mock.create_autospec(mesos.task.Task) - fake_task.cpu_limit = asynctest.CoroutineMock(return_value=0.35) - fake_duration = 100 - fake_task.stats = asynctest.CoroutineMock( - return_value={"cpus_system_time_secs": 2.5, "cpus_user_time_secs": 0.0} - ) - current_time = datetime.datetime.now() - fake_task.__getitem__.return_value = [ - { - "state": "TASK_RUNNING", - "timestamp": int(current_time.strftime("%s")) - fake_duration, - } - ] - with asynctest.patch( - "paasta_tools.mesos_tools.datetime.datetime", autospec=True - ) as mock_datetime: - mock_datetime.now.return_value = current_time - 
actual = await mesos_tools.get_cpu_usage(fake_task) - assert "10.0%" == actual - - -@mark.asyncio -async def test_get_cpu_usage_bad(): - fake_task = mock.create_autospec(mesos.task.Task) - fake_task.cpu_limit = asynctest.CoroutineMock(return_value=1.1) - fake_duration = 100 - fake_task.stats = asynctest.CoroutineMock( - return_value={"cpus_system_time_secs": 50.0, "cpus_user_time_secs": 50.0} - ) - current_time = datetime.datetime.now() - fake_task.__getitem__.return_value = [ - { - "state": "TASK_RUNNING", - "timestamp": int(current_time.strftime("%s")) - fake_duration, - } - ] - with asynctest.patch( - "paasta_tools.mesos_tools.datetime.datetime", autospec=True - ) as mock_datetime: - mock_datetime.now.return_value = current_time - actual = await mesos_tools.get_cpu_usage(fake_task) - assert PaastaColors.red("100.0%") in actual - - -@mark.asyncio -async def test_get_cpu_usage_handles_missing_stats(): - fake_task = mock.create_autospec(mesos.task.Task) - fake_task.cpu_limit = asynctest.CoroutineMock(return_value=1.1) - fake_duration = 100 - fake_task.stats = asynctest.CoroutineMock(return_value={}) - fake_task.__getitem__.return_value = [ - { - "state": "TASK_RUNNING", - "timestamp": int(datetime.datetime.now().strftime("%s")) - fake_duration, - } - ] - actual = await mesos_tools.get_cpu_usage(fake_task) - assert "0.0%" in actual - - -@mark.asyncio -async def test_get_mem_usage_good(): - fake_task = mock.create_autospec(mesos.task.Task) - fake_task.rss = asynctest.CoroutineMock(return_value=1024 * 1024 * 10) - fake_task.mem_limit = asynctest.CoroutineMock(return_value=1024 * 1024 * 10 * 10) - actual = await mesos_tools.get_mem_usage(fake_task) - assert actual == "10/100MB" - - -@mark.asyncio -async def test_get_mem_usage_bad(): - fake_task = mock.create_autospec(mesos.task.Task) - fake_task.rss = asynctest.CoroutineMock(return_value=1024 * 1024 * 100) - fake_task.mem_limit = fake_task.rss - actual = await mesos_tools.get_mem_usage(fake_task) - assert actual == 
PaastaColors.red("100/100MB") - - -@mark.asyncio -async def test_get_mem_usage_divide_by_zero(): - fake_task = mock.create_autospec(mesos.task.Task) - fake_task.rss = asynctest.CoroutineMock(return_value=1024 * 1024 * 10) - fake_task.mem_limit = asynctest.CoroutineMock(return_value=0) - actual = await mesos_tools.get_mem_usage(fake_task) - assert actual == "Undef" - - -def test_get_zookeeper_config(): - zk_hosts = "1.1.1.1:1111,2.2.2.2:2222,3.3.3.3:3333" - zk_path = "fake_path" - fake_state = {"flags": {"zk": f"zk://{zk_hosts}/{zk_path}"}} - expected = {"hosts": zk_hosts, "path": zk_path} - assert mesos_tools.get_zookeeper_config(fake_state) == expected - - -def test_get_mesos_leader(): - fake_url = "http://93.184.216.34:5050" - with asynctest.patch( - "paasta_tools.mesos_tools.get_mesos_master", autospec=True - ) as mock_get_master, asynctest.patch( - "paasta_tools.mesos_tools.socket.gethostbyaddr", autospec=True - ) as mock_gethostbyaddr, asynctest.patch( - "paasta_tools.mesos_tools.socket.getfqdn", autospec=True - ) as mock_getfqdn: - mock_master = mock.Mock() - mock_master.host = fake_url - mock_get_master.return_value = mock_master - mock_gethostbyaddr.return_value = "example.org" - mock_getfqdn.return_value = "example.org" - assert mesos_tools.get_mesos_leader() == "example.org" - - -def test_get_mesos_leader_socket_error(): - fake_url = "http://93.184.216.34:5050" - with asynctest.patch( - "paasta_tools.mesos_tools.get_mesos_master", autospec=True - ) as mock_get_master, asynctest.patch( - "paasta_tools.mesos_tools.socket.gethostbyaddr", - side_effect=socket.error, - autospec=True, - ): - mock_master = mock.Mock() - mock_master.host = fake_url - mock_get_master.return_value = mock_master - with raises(socket.error): - mesos_tools.get_mesos_leader() - - -def test_get_mesos_leader_no_hostname(): - fake_url = "localhost:5050" - with asynctest.patch( - "paasta_tools.mesos_tools.get_mesos_master", autospec=True - ) as mock_get_master: - mock_master = mock.Mock() 
- mock_master.host = fake_url - mock_get_master.return_value = mock_master - with raises(ValueError): - mesos_tools.get_mesos_leader() - - -@mock.patch( - "paasta_tools.mesos_tools.get_mesos_config", - autospec=True, - return_value={"scheme": "http", "master": "test"}, -) -def test_get_mesos_leader_cli_mesosmasterconnectionerror(mock_get_mesos_config): - with asynctest.patch( - "paasta_tools.mesos.master.MesosMaster.resolve", - side_effect=mesos.exceptions.MasterNotAvailableException, - autospec=True, - ): - with raises(mesos.exceptions.MasterNotAvailableException): - mesos_tools.get_mesos_leader() - - -@mock.patch("paasta_tools.mesos_tools.KazooClient", autospec=True) -def test_get_number_of_mesos_masters( - mock_kazoo, -): - host = "1.1.1.1" - path = "fake_path" - - zk = mock_kazoo.return_value - zk.get_children.return_value = ["log_11", "state", "json.info_1", "info_2"] - assert mesos_tools.get_number_of_mesos_masters(host, path) == 2 - - -@mock.patch("requests.get", autospec=True) -@mock.patch("socket.getfqdn", autospec=True) -def test_get_local_slave_state_connection_error(mock_getfqdn, mock_requests_get): - fake_request = requests.Request("GET", url="doesnt_matter") - mock_getfqdn.return_value = "fake_hostname" - mock_requests_get.side_effect = requests.ConnectionError( - "fake_message", request=fake_request - ) - - with raises(mesos_tools.MesosSlaveConnectionError): - mesos_tools.get_local_slave_state() - - -def test_get_mesos_slaves_grouped_by_attribute(): - fake_value_1 = "fake_value_1" - fake_value_2 = "fake_value_2" - fake_slaves = [ - {"hostname": "fake_host_1", "attributes": {"fake_attribute": fake_value_1}}, - {"hostname": "fake_host_2", "attributes": {"fake_attribute": fake_value_2}}, - {"hostname": "fake_host_3", "attributes": {"fake_attribute": fake_value_1}}, - { - "hostname": "fake_host_4", - "attributes": {"fake_attribute": "fake_other_value"}, - }, - ] - expected = { - "fake_value_1": [ - {"hostname": "fake_host_1", "attributes": 
{"fake_attribute": fake_value_1}}, - {"hostname": "fake_host_3", "attributes": {"fake_attribute": fake_value_1}}, - ], - "fake_value_2": [ - {"hostname": "fake_host_2", "attributes": {"fake_attribute": fake_value_2}} - ], - "fake_other_value": [ - { - "hostname": "fake_host_4", - "attributes": {"fake_attribute": "fake_other_value"}, - } - ], - } - actual = mesos_tools.get_mesos_slaves_grouped_by_attribute( - fake_slaves, "fake_attribute" - ) - assert actual == expected - - -@mock.patch("paasta_tools.mesos_tools.host_passes_blacklist", autospec=True) -def test_filter_mesos_slaves_by_blacklist_when_unfiltered(mock_slave_passes_blacklist): - mock_slave_passes_blacklist.return_value = True - slaves = [ - {"hostname": "fake_host_1", "attributes": {"fake_attribute": "fake_value_1"}}, - {"hostname": "fake_host_2", "attributes": {"fake_attribute": "fake_value_1"}}, - ] - blacklist = [] - whitelist = None - actual = mesos_tools.filter_mesos_slaves_by_blacklist( - slaves=slaves, blacklist=blacklist, whitelist=whitelist - ) - assert mock_slave_passes_blacklist.call_count == 2 - assert actual == slaves - - -@mock.patch("paasta_tools.mesos_tools.host_passes_blacklist", autospec=True) -def test_filter_mesos_slaves_by_blacklist_when_filtered(mock_slave_passes_blacklist): - mock_slave_passes_blacklist.return_value = False - slaves = [ - {"hostname": "fake_host_1", "attributes": {"fake_attribute": "fake_value_1"}}, - {"hostname": "fake_host_2", "attributes": {"fake_attribute": "fake_value_1"}}, - ] - blacklist = [] - whitelist = None - actual = mesos_tools.filter_mesos_slaves_by_blacklist( - slaves=slaves, blacklist=blacklist, whitelist=whitelist - ) - assert mock_slave_passes_blacklist.call_count == 2 - assert actual == [] - - -def test_get_paasta_execute_docker_healthcheck(): - mock_docker_client = mock.MagicMock(spec_set=docker.APIClient) - fake_container_id = "fake_container_id" - fake_mesos_id = "fake_mesos_id" - fake_container_info = [ - {"Config": {"Env": None}}, - { - 
"Config": { - "Env": ["fake_key1=fake_value1", "MESOS_TASK_ID=fake_other_mesos_id"] - }, - "Id": "11111", - }, - { - "Config": { - "Env": ["fake_key2=fake_value2", "MESOS_TASK_ID=%s" % fake_mesos_id] - }, - "Id": fake_container_id, - }, - ] - mock_docker_client.containers = mock.MagicMock( - spec_set=docker.APIClient, - return_value=["fake_container_1", "fake_container_2", "fake_container_3"], - ) - mock_docker_client.inspect_container = mock.MagicMock( - spec_set=docker.APIClient, side_effect=fake_container_info - ) - assert ( - mesos_tools.get_container_id_for_mesos_id(mock_docker_client, fake_mesos_id) - == fake_container_id - ) - - -def test_get_paasta_execute_docker_healthcheck_when_not_found(): - mock_docker_client = mock.MagicMock(spec_set=docker.APIClient) - fake_mesos_id = "fake_mesos_id" - fake_container_info = [ - { - "Config": { - "Env": ["fake_key1=fake_value1", "MESOS_TASK_ID=fake_other_mesos_id"] - }, - "Id": "11111", - }, - { - "Config": { - "Env": ["fake_key2=fake_value2", "MESOS_TASK_ID=fake_other_mesos_id2"] - }, - "Id": "2222", - }, - ] - mock_docker_client.containers = mock.MagicMock( - spec_set=docker.APIClient, return_value=["fake_container_1", "fake_container_2"] - ) - mock_docker_client.inspect_container = mock.MagicMock( - spec_set=docker.APIClient, side_effect=fake_container_info - ) - assert ( - mesos_tools.get_container_id_for_mesos_id(mock_docker_client, fake_mesos_id) - is None - ) - - -@mark.parametrize( - "test_case", - [ - [ - ["taska", "taskb"], # test_case0 - OK - [["outlna1", "outlna2", "errlna1"], ["outlnb1", "errlnb1", "errlnb2"]], - [ - "taska", - "outlna1", - "outlna2", - "errlna1", - "taskb", - "outlnb1", - "errlnb1", - "errlnb2", - ], - False, - ], - [["a"], [1, 2], None, True], # test_case1 - can't zip different length lists - ], -) -def test_zip_tasks_verbose_output(test_case): - table, stdstreams, expected, should_raise = test_case - result = None - raised = False - try: - result = 
mesos_tools.zip_tasks_verbose_output(table, stdstreams) - except ValueError: - raised = True - - assert raised == should_raise - assert result == expected - - -@mark.asyncio -@mark.parametrize( - "test_case", - [ - # task_id, file1, file2, nlines, raise_what - [ - "a_task", # test_case0 - OK - ["stdout", [str(x) for x in range(20)]], - ["stderr", [str(x) for x in range(30)]], - 10, - None, - ], - [ - "a_task", # test_case1 - OK, short stdout, swapped stdout/stderr - ["stderr", [str(x) for x in range(30)]], - ["stdout", ["1", "2"]], - 10, - None, - ], - ["a_task", None, None, 10, mesos.exceptions.MasterNotAvailableException], - ["a_task", None, None, 10, mesos.exceptions.SlaveDoesNotExist], - ["a_task", None, None, 10, mesos.exceptions.TaskNotFoundException], - ["a_task", None, None, 10, mesos.exceptions.FileNotFoundForTaskException], - ["a_task", None, None, 10, utils.TimeoutError], - ], -) -async def test_format_stdstreams_tail_for_task( - test_case, -): - def gen_mesos_cli_fobj(file_path, file_lines): - """mesos.cli.cluster.files (0.1.5), - returns a list of mesos.cli.mesos_file.File - `File` is an iterator-like object. - """ - - async def _readlines_reverse(): - for line in reversed(file_lines): - yield line - - fobj = mock.create_autospec(mesos.mesos_file.File) - fobj.path = file_path - fobj._readlines_reverse = _readlines_reverse - return fobj - - def get_short_task_id(task_id): - return task_id - - def gen_mock_cluster_files(file1, file2, raise_what): - async def mock_cluster_files(*args, **kwargs): - # If we're asked to raise a particular exception we do so. - # .message is set to the exception class name. 
- if raise_what: - raise raise_what(raise_what) - yield gen_mesos_cli_fobj(file1[0], file1[1]) - yield gen_mesos_cli_fobj(file2[0], file2[1]) - - return mock_cluster_files - - def gen_output(task_id, file1, file2, nlines, raise_what): - error_message = " " + PaastaColors.red( - " couldn't read stdout/stderr for %s (%s)" - ) - output = [] - if not raise_what: - files = [file1, file2] - # reverse sort because stdout is supposed to always come before stderr in the output - files.sort(key=lambda f: f[0], reverse=True) - for f in files: - output.append( - " " + PaastaColors.blue("{} tail for {}".format(f[0], task_id)) - ) - output.extend(f" {line}" for line in f[1][-nlines:]) - else: - output.append(error_message % (task_id, raise_what.__name__)) - return output - - task_id, file1, file2, nlines, raise_what = test_case - - mock_cluster_files = gen_mock_cluster_files(file1, file2, raise_what) - fake_task = {"id": task_id} - expected = gen_output(task_id, file1, file2, nlines, raise_what) - with asynctest.patch("paasta_tools.mesos_tools.get_mesos_config", autospec=True): - with asynctest.patch( - "paasta_tools.mesos_tools.cluster.get_files_for_tasks", - mock_cluster_files, - autospec=None, - ): - result = await mesos_tools.format_stdstreams_tail_for_task( - fake_task, get_short_task_id - ) - assert result == expected - - -def test_slave_pid_to_ip(): - ret = mesos_tools.slave_pid_to_ip("slave(1)@10.40.31.172:5051") - assert ret == "10.40.31.172" - - -@mark.asyncio -async def test_get_mesos_task_count_by_slave(): - with asynctest.patch( - "paasta_tools.mesos_tools.get_all_running_tasks", autospec=True - ) as mock_get_all_running_tasks: - mock_tron = mock.Mock() - mock_tron.name = "tron" - mock_somethingelse = mock.Mock() - mock_somethingelse.name = "somethingelse" - mock_task1 = mock.Mock() - mock_task1.slave = asynctest.CoroutineMock(return_value={"id": "slave1"}) - mock_task1.framework = asynctest.CoroutineMock(return_value=mock_tron) - mock_task2 = mock.Mock() - 
mock_task2.slave = asynctest.CoroutineMock(return_value={"id": "slave1"}) - mock_task2.framework = asynctest.CoroutineMock(return_value=mock_somethingelse) - mock_task3 = mock.Mock() - mock_task3.slave = asynctest.CoroutineMock(return_value={"id": "slave1"}) - mock_task3.framework = asynctest.CoroutineMock(return_value=mock_somethingelse) - mock_task4 = mock.Mock() - mock_task4.slave = asynctest.CoroutineMock(return_value={"id": "slave2"}) - mock_task4.framework = asynctest.CoroutineMock(return_value=mock_somethingelse) - mock_tasks = [mock_task1, mock_task2, mock_task3, mock_task4] - mock_get_all_running_tasks.return_value = mock_tasks - mock_slave_1 = { - "id": "slave1", - "attributes": {"pool": "default"}, - "hostname": "host1", - } - mock_slave_2 = { - "id": "slave2", - "attributes": {"pool": "default"}, - "hostname": "host2", - } - mock_slave_3 = { - "id": "slave3", - "attributes": {"pool": "another"}, - "hostname": "host3", - } - mock_mesos_state = {"slaves": [mock_slave_1, mock_slave_2, mock_slave_3]} - ret = await mesos_tools.get_mesos_task_count_by_slave( - mock_mesos_state, pool="default" - ) - assert mock_get_all_running_tasks.called - expected = [ - {"task_counts": mesos_tools.SlaveTaskCount(count=3, slave=mock_slave_1)}, - {"task_counts": mesos_tools.SlaveTaskCount(count=1, slave=mock_slave_2)}, - ] - assert len(ret) == len(expected) and utils.sort_dicts(ret) == utils.sort_dicts( - expected - ) - ret = await mesos_tools.get_mesos_task_count_by_slave( - mock_mesos_state, pool=None - ) - assert mock_get_all_running_tasks.called - expected = [ - {"task_counts": mesos_tools.SlaveTaskCount(count=3, slave=mock_slave_1)}, - {"task_counts": mesos_tools.SlaveTaskCount(count=1, slave=mock_slave_2)}, - {"task_counts": mesos_tools.SlaveTaskCount(count=0, slave=mock_slave_3)}, - ] - assert len(ret) == len(expected) and utils.sort_dicts(ret) == utils.sort_dicts( - expected - ) - - # test slaves_list override - mock_task2 = mock.Mock() - mock_task2.slave = 
asynctest.CoroutineMock(return_value={"id": "slave2"}) - mock_task2.framework = asynctest.CoroutineMock(return_value=mock_somethingelse) - mock_task3 = mock.Mock() - mock_task3.slave = asynctest.CoroutineMock(return_value={"id": "slave2"}) - mock_task3.framework = asynctest.CoroutineMock(return_value=mock_somethingelse) - mock_tasks = [mock_task1, mock_task2, mock_task3, mock_task4] - mock_get_all_running_tasks.return_value = mock_tasks - mock_slaves_list = [ - {"task_counts": mesos_tools.SlaveTaskCount(count=0, slave=mock_slave_1)}, - {"task_counts": mesos_tools.SlaveTaskCount(count=0, slave=mock_slave_2)}, - {"task_counts": mesos_tools.SlaveTaskCount(count=0, slave=mock_slave_3)}, - ] - ret = await mesos_tools.get_mesos_task_count_by_slave( - mock_mesos_state, slaves_list=mock_slaves_list - ) - expected = [ - {"task_counts": mesos_tools.SlaveTaskCount(count=1, slave=mock_slave_1)}, - {"task_counts": mesos_tools.SlaveTaskCount(count=3, slave=mock_slave_2)}, - {"task_counts": mesos_tools.SlaveTaskCount(count=0, slave=mock_slave_3)}, - ] - assert len(ret) == len(expected) and utils.sort_dicts(ret) == utils.sort_dicts( - expected - ) - - # test SlaveDoesNotExist exception handling - mock_task2.__getitem__ = mock.Mock(side_effect="fakeid") - mock_task2.slave = asynctest.CoroutineMock( - return_value=mock.Mock( - __getitem__=mock.Mock(side_effect=mesos.exceptions.SlaveDoesNotExist) - ) - ) - # we expect to handle this SlaveDoesNotExist exception gracefully, and continue on to handle other tasks - mock_tasks = [mock_task1, mock_task2, mock_task3, mock_task4] - mock_get_all_running_tasks.return_value = mock_tasks - mock_slaves_list = [ - {"task_counts": mesos_tools.SlaveTaskCount(count=0, slave=mock_slave_1)}, - {"task_counts": mesos_tools.SlaveTaskCount(count=0, slave=mock_slave_2)}, - {"task_counts": mesos_tools.SlaveTaskCount(count=0, slave=mock_slave_3)}, - ] - ret = await mesos_tools.get_mesos_task_count_by_slave( - mock_mesos_state, slaves_list=mock_slaves_list - ) 
- # we expect mock_slave_2 to only count 2 tasks, as one of them returned a SlaveDoesNotExist exception - expected = [ - {"task_counts": mesos_tools.SlaveTaskCount(count=1, slave=mock_slave_1)}, - {"task_counts": mesos_tools.SlaveTaskCount(count=2, slave=mock_slave_2)}, - {"task_counts": mesos_tools.SlaveTaskCount(count=0, slave=mock_slave_3)}, - ] - assert len(ret) == len(expected) and utils.sort_dicts(ret) == utils.sort_dicts( - expected - ) - - -def test_get_count_running_tasks_on_slave(): - with asynctest.patch( - "paasta_tools.mesos_tools.get_mesos_master", autospec=True - ) as mock_get_master, asynctest.patch( - "paasta_tools.mesos_tools.get_mesos_task_count_by_slave", autospec=True - ) as mock_get_mesos_task_count_by_slave: - mock_master = mock.Mock() - mock_mesos_state = mock.Mock() - mock_master.state_summary = asynctest.CoroutineMock( - func=asynctest.CoroutineMock(), # https://github.com/notion/a_sync/pull/40 - return_value=mock_mesos_state, - ) - mock_get_master.return_value = mock_master - - mock_slave_counts = [ - {"task_counts": mock.Mock(count=3, slave={"hostname": "host1"})}, - {"task_counts": mock.Mock(count=0, slave={"hostname": "host2"})}, - ] - mock_get_mesos_task_count_by_slave.return_value = mock_slave_counts - - assert mesos_tools.get_count_running_tasks_on_slave("host1") == 3 - assert mesos_tools.get_count_running_tasks_on_slave("host2") == 0 - assert mesos_tools.get_count_running_tasks_on_slave("host3") == 0 - assert mock_master.state_summary.called - mock_get_mesos_task_count_by_slave.assert_called_with(mock_mesos_state) - - -def _ids(list_of_mocks): - return {id(mck) for mck in list_of_mocks} - - -@mark.asyncio -async def test_get_tasks_from_app_id(): - with asynctest.patch( - "paasta_tools.mesos_tools.get_running_tasks_from_frameworks", autospec=True - ) as mock_get_running_tasks_from_frameworks: - mock_task_1 = mock.Mock( - slave=asynctest.CoroutineMock(return_value={"hostname": "host1"}) - ) - mock_task_2 = mock.Mock( - 
slave=asynctest.CoroutineMock(return_value={"hostname": "host2"}) - ) - mock_task_3 = mock.Mock( - slave=asynctest.CoroutineMock(return_value={"hostname": "host2.domain"}) - ) - mock_get_running_tasks_from_frameworks.return_value = [ - mock_task_1, - mock_task_2, - mock_task_3, - ] - - ret = await mesos_tools.get_tasks_from_app_id("app_id") - mock_get_running_tasks_from_frameworks.assert_called_with("app_id") - expected = [mock_task_1, mock_task_2, mock_task_3] - assert len(expected) == len(ret) and _ids(ret) == _ids(expected) - - ret = await mesos_tools.get_tasks_from_app_id("app_id", slave_hostname="host2") - mock_get_running_tasks_from_frameworks.assert_called_with("app_id") - expected = [mock_task_2, mock_task_3] - assert len(expected) == len(ret) and _ids(ret) == _ids(expected) - - -@mark.asyncio -async def test_get_task(): - with asynctest.patch( - "paasta_tools.mesos_tools.get_running_tasks_from_frameworks", autospec=True - ) as mock_get_running_tasks_from_frameworks: - mock_task_1 = {"id": "123"} - mock_task_2 = {"id": "789"} - mock_task_3 = {"id": "789"} - mock_get_running_tasks_from_frameworks.return_value = [ - mock_task_1, - mock_task_2, - mock_task_3, - ] - ret = await mesos_tools.get_task("123", app_id="app_id") - mock_get_running_tasks_from_frameworks.assert_called_with("app_id") - assert ret == mock_task_1 - - with raises(mesos_tools.TaskNotFound): - await mesos_tools.get_task("111", app_id="app_id") - - with raises(mesos_tools.TooManyTasks): - await mesos_tools.get_task("789", app_id="app_id") - - -@mark.asyncio -async def test_filter_task_by_hostname(): - mock_task = mock.Mock( - slave=asynctest.CoroutineMock(return_value={"hostname": "host1"}) - ) - assert await mesos_tools.filter_task_by_hostname(mock_task, "host1") - assert not await mesos_tools.filter_task_by_hostname(mock_task, "host2") - - -def test_filter_task_by_task_id(): - mock_task = {"id": "123"} - assert mesos_tools.filter_task_by_task_id(mock_task, "123") - assert not 
mesos_tools.filter_task_by_task_id(mock_task, "456") - - -def test_get_all_tasks_from_state(): - mock_task_1 = mock.Mock() - mock_task_2 = mock.Mock() - mock_task_3 = mock.Mock() - mock_task_4 = mock.Mock() - mock_state = { - "frameworks": [{"tasks": [mock_task_1, mock_task_2]}, {"tasks": [mock_task_3]}], - "orphan_tasks": [mock_task_4], - } - ret = mesos_tools.get_all_tasks_from_state(mock_state) - expected = [mock_task_1, mock_task_2, mock_task_3] - assert len(ret) == len(expected) and ret == expected - - ret = mesos_tools.get_all_tasks_from_state(mock_state, include_orphans=True) - expected = [mock_task_1, mock_task_2, mock_task_3, mock_task_4] - assert len(ret) == len(expected) and ret == expected - - -@mark.asyncio -async def test_get_running_tasks_from_frameworks(): - with asynctest.patch( - "paasta_tools.mesos_tools.get_current_tasks", autospec=True - ) as mock_get_current_tasks, asynctest.patch( - "paasta_tools.mesos_tools.filter_running_tasks", autospec=True - ) as mock_filter_running_tasks: - ret = await mesos_tools.get_running_tasks_from_frameworks(job_id="") - mock_get_current_tasks.assert_called_with("") - mock_filter_running_tasks.assert_called_with( - mock_get_current_tasks.return_value - ) - assert ret == mock_filter_running_tasks.return_value - - -@mark.asyncio -async def test_get_all_running_tasks(): - with asynctest.patch( - "paasta_tools.mesos_tools.get_current_tasks", autospec=True - ) as mock_get_current_tasks, asynctest.patch( - "paasta_tools.mesos_tools.filter_running_tasks", autospec=True - ) as mock_filter_running_tasks, asynctest.patch( - "paasta_tools.mesos_tools.get_mesos_master", autospec=True - ) as mock_get_mesos_master: - mock_task_1 = mock.Mock() - mock_task_2 = mock.Mock() - mock_task_3 = mock.Mock() - - mock_get_current_tasks.return_value = [mock_task_1, mock_task_2] - mock_orphan_tasks = asynctest.CoroutineMock(return_value=[mock_task_3]) - mock_mesos_master = mock.Mock(orphan_tasks=mock_orphan_tasks) - 
mock_get_mesos_master.return_value = mock_mesos_master - - ret = await mesos_tools.get_all_running_tasks() - mock_get_current_tasks.assert_called_with("") - mock_filter_running_tasks.assert_called_with( - [mock_task_1, mock_task_2, mock_task_3] - ) - assert ret == mock_filter_running_tasks.return_value - - -@mark.asyncio -async def test_get_non_running_tasks_from_frameworks(): - with asynctest.patch( - "paasta_tools.mesos_tools.get_current_tasks", autospec=True - ) as mock_get_current_tasks, asynctest.patch( - "paasta_tools.mesos_tools.filter_not_running_tasks", autospec=True - ) as mock_filter_not_running_tasks: - ret = await mesos_tools.get_non_running_tasks_from_frameworks(job_id="") - mock_get_current_tasks.assert_called_with("") - mock_filter_not_running_tasks.assert_called_with( - mock_get_current_tasks.return_value - ) - assert ret == mock_filter_not_running_tasks.return_value - - -@mark.asyncio -async def test_get_current_tasks(): - with asynctest.patch( - "paasta_tools.mesos_tools.get_mesos_master", autospec=True - ) as mock_get_mesos_master: - mock_task_1 = mock.Mock() - mock_task_2 = mock.Mock() - mock_tasks = asynctest.CoroutineMock(return_value=[mock_task_1, mock_task_2]) - mock_mesos_master = mock.Mock(tasks=mock_tasks) - mock_get_mesos_master.return_value = mock_mesos_master - - expected = [mock_task_1, mock_task_2] - ret = await mesos_tools.get_current_tasks("") - assert ret == expected and len(ret) == len(expected) - - -def test_mesos_services_running_here(): - with mock.patch( - "paasta_tools.mesos_tools.get_local_slave_state", autospec=True - ) as mock_get_local_slave_state: - mock_state = { - "frameworks": [ - { - "name": "marathon2", - "executors": [ - { - "id": "thing.main", - "resources": {"ports": "[31062-31062]"}, - "tasks": [{"state": "TASK_RUNNING"}], - }, - {"id": "thing.another", "tasks": [{"state": "TASK_LOST"}]}, - ], - }, - { - "name": "tron", - "executors": [ - { - "id": "c.main", - "resources": {}, - "tasks": [{"state": 
"TASK_RUNNING"}], - }, - { - "id": "c.another", - "resources": {}, - "tasks": [{"state": "TASK_RUNNING"}], - }, - ], - }, - ] - } - mock_get_local_slave_state.return_value = {} - assert ( - mesos_tools.mesos_services_running_here( - lambda _: True, lambda id_str: id_str.split(".") - ) - == [] - ) - - mock_get_local_slave_state.return_value = mock_state - expected = [ - ("thing", "main", 31062), - ("c", "main", None), - ("c", "another", None), - ] - assert ( - mesos_tools.mesos_services_running_here( - lambda _: True, lambda id_str: id_str.split(".") - ) - == expected - ) - - mock_fw_filter = mock.Mock(side_effect=[True, False]) - expected = [("thing", "main", 31062)] - assert ( - mesos_tools.mesos_services_running_here( - mock_fw_filter, lambda id_str: id_str.split(".") - ) - == expected - ) - - mock_parse_service_instance_from_executor_id = mock.Mock( - side_effect=[("thing", "main"), ValueError, ("c", "another")] - ) - expected = [("thing", "main", 31062), ("c", "another", None)] - assert ( - mesos_tools.mesos_services_running_here( - lambda _: True, mock_parse_service_instance_from_executor_id - ) - == expected - ) diff --git a/tests/test_paasta_execute_docker_command.py b/tests/test_paasta_execute_docker_command.py deleted file mode 100644 index 82c06637b0..0000000000 --- a/tests/test_paasta_execute_docker_command.py +++ /dev/null @@ -1,220 +0,0 @@ -# Copyright 2015-2016 Yelp Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import docker -import mock -import pytest - -from paasta_tools.paasta_execute_docker_command import execute_in_container -from paasta_tools.paasta_execute_docker_command import main -from paasta_tools.paasta_execute_docker_command import TimeoutException - - -def test_execute_in_container(): - fake_container_id = "fake_container_id" - fake_return_code = 0 - fake_output = "fake_output" - fake_command = "fake_cmd" - mock_docker_client = mock.MagicMock(spec_set=docker.APIClient) - mock_docker_client.exec_start.return_value = fake_output - mock_docker_client.exec_inspect.return_value = {"ExitCode": fake_return_code} - - assert execute_in_container( - mock_docker_client, fake_container_id, fake_command, 1 - ) == (fake_output, fake_return_code) - expected_cmd = ["/bin/sh", "-c", fake_command] - mock_docker_client.exec_create.assert_called_once_with( - fake_container_id, expected_cmd - ) - - -@mock.patch( - "paasta_tools.paasta_execute_docker_command.is_using_unprivileged_containers", - lambda: False, - autospec=None, -) -def test_execute_in_container_reuses_exec(): - fake_container_id = "fake_container_id" - fake_execid = "fake_execid" - fake_return_code = 0 - fake_output = "fake_output" - fake_command = "fake_cmd" - mock_docker_client = mock.MagicMock(spec_set=docker.APIClient) - mock_docker_client.inspect_container.return_value = {"ExecIDs": [fake_execid]} - mock_docker_client.exec_start.return_value = fake_output - mock_docker_client.exec_inspect.return_value = { - "ExitCode": fake_return_code, - "ProcessConfig": {"entrypoint": "/bin/sh", "arguments": ["-c", fake_command]}, - } - - assert execute_in_container( - mock_docker_client, fake_container_id, fake_command, 1 - ) == (fake_output, fake_return_code) - assert mock_docker_client.exec_create.call_count == 0 - mock_docker_client.exec_start.assert_called_once_with(fake_execid, stream=False) - - -@mock.patch( - "paasta_tools.paasta_execute_docker_command.is_using_unprivileged_containers", - lambda: False, - 
autospec=None, -) -def test_execute_in_container_reuses_only_valid_exec(): - fake_container_id = "fake_container_id" - fake_execid = "fake_execid" - fake_return_code = 0 - fake_output = "fake_output" - fake_command = "fake_cmd" - bad_exec = { - "ExitCode": fake_return_code, - "ProcessConfig": { - "entrypoint": "/bin/sh", - "arguments": ["-c", "some_other_command"], - }, - } - good_exec = { - "ExitCode": fake_return_code, - "ProcessConfig": {"entrypoint": "/bin/sh", "arguments": ["-c", fake_command]}, - } - mock_docker_client = mock.MagicMock(spec_set=docker.APIClient) - mock_docker_client.inspect_container.return_value = { - "ExecIDs": ["fake_other_exec", fake_execid, "fake_other_exec"] - } - mock_docker_client.exec_start.return_value = fake_output - # the last side effect is used to check the exit code of the command - mock_docker_client.exec_inspect.side_effect = [ - bad_exec, - good_exec, - bad_exec, - good_exec, - ] - - assert execute_in_container( - mock_docker_client, fake_container_id, fake_command, 1 - ) == (fake_output, fake_return_code) - assert mock_docker_client.exec_create.call_count == 0 - mock_docker_client.exec_start.assert_called_once_with(fake_execid, stream=False) - - -def test_main(): - fake_container_id = "fake_container_id" - fake_timeout = 3 - with mock.patch( - "paasta_tools.paasta_execute_docker_command.get_container_id_for_mesos_id", - return_value=fake_container_id, - autospec=True, - ), mock.patch( - "paasta_tools.paasta_execute_docker_command.parse_args", autospec=True - ) as args_patch, mock.patch( - "paasta_tools.paasta_execute_docker_command.execute_in_container", - return_value=("fake_output", 0), - autospec=True, - ), mock.patch( - "paasta_tools.paasta_execute_docker_command.time_limit", autospec=True - ) as time_limit_patch: - args_patch.return_value.mesos_id = "fake_task_id" - args_patch.return_value.timeout = fake_timeout - with pytest.raises(SystemExit) as excinfo: - main() - 
time_limit_patch.assert_called_once_with(fake_timeout) - assert excinfo.value.code == 0 - - -def test_main_with_empty_task_id(): - fake_container_id = "fake_container_id" - fake_timeout = 3 - with mock.patch( - "paasta_tools.paasta_execute_docker_command.get_container_id_for_mesos_id", - return_value=fake_container_id, - autospec=True, - ), mock.patch( - "paasta_tools.paasta_execute_docker_command.parse_args", autospec=True - ) as args_patch, mock.patch( - "paasta_tools.paasta_execute_docker_command.execute_in_container", - return_value=("fake_output", 0), - autospec=True, - ), mock.patch( - "paasta_tools.paasta_execute_docker_command.time_limit", autospec=True - ): - args_patch.return_value.mesos_id = "" - args_patch.return_value.timeout = fake_timeout - with pytest.raises(SystemExit) as excinfo: - main() - assert excinfo.value.code == 2 - - -def test_main_container_not_found_failure(): - with mock.patch( - "paasta_tools.paasta_execute_docker_command.get_container_id_for_mesos_id", - return_value=None, - autospec=True, - ), mock.patch( - "paasta_tools.paasta_execute_docker_command.execute_in_container", - return_value=("fake_output", 2), - autospec=True, - ), mock.patch( - "paasta_tools.paasta_execute_docker_command.parse_args", autospec=True - ) as args_patch, mock.patch( - "paasta_tools.paasta_execute_docker_command.time_limit", autospec=True - ): - args_patch.return_value.mesos_id = "fake_task_id" - with pytest.raises(SystemExit) as excinfo: - main() - assert excinfo.value.code == 1 - - -def test_main_cmd_unclean_exit_failure(): - fake_container_id = "fake_container_id" - with mock.patch( - "paasta_tools.paasta_execute_docker_command.get_container_id_for_mesos_id", - return_value=fake_container_id, - autospec=True, - ), mock.patch( - "paasta_tools.paasta_execute_docker_command.execute_in_container", - return_value=("fake_output", 2), - autospec=True, - ), mock.patch( - "paasta_tools.paasta_execute_docker_command.parse_args", autospec=True - ) as args_patch, 
mock.patch( - "paasta_tools.paasta_execute_docker_command.time_limit", autospec=True - ): - args_patch.return_value.mesos_id = "fake_task_id" - with pytest.raises(SystemExit) as excinfo: - main() - assert excinfo.value.code == 2 - - -def test_main_timeout_failure(): - fake_container_id = "fake_container_id" - fake_timeout = 3 - with mock.patch( - "paasta_tools.paasta_execute_docker_command.get_container_id_for_mesos_id", - return_value=fake_container_id, - autospec=True, - ), mock.patch( - "paasta_tools.paasta_execute_docker_command.parse_args", autospec=True - ) as args_patch, mock.patch( - "paasta_tools.paasta_execute_docker_command.execute_in_container", - return_value=("fake_output", 0), - autospec=True, - ), mock.patch( - "paasta_tools.paasta_execute_docker_command.time_limit", - side_effect=TimeoutException, - autospec=True, - ) as time_limit_patch: - args_patch.return_value.mesos_id = "fake_task_id" - args_patch.return_value.timeout = fake_timeout - with pytest.raises(SystemExit) as excinfo: - main() - time_limit_patch.assert_called_once_with(fake_timeout) - assert excinfo.value.code == 1 diff --git a/tests/test_smartstack_tools.py b/tests/test_smartstack_tools.py index dd223c23e0..e129485742 100644 --- a/tests/test_smartstack_tools.py +++ b/tests/test_smartstack_tools.py @@ -47,40 +47,17 @@ def test_load_smartstack_info_for_service(system_paasta_config): def test_get_smartstack_replication_for_attribute(system_paasta_config): fake_namespace = "fake_main" fake_service = "fake_service" - mock_filtered_slaves = [ - {"hostname": "hostone", "attributes": {"fake_attribute": "foo"}}, - {"hostname": "hostone", "attributes": {"fake_attribute": "bar"}}, - ] - with mock.patch( - "paasta_tools.mesos_tools.get_all_slaves_for_blacklist_whitelist", - return_value=mock_filtered_slaves, - autospec=True, - ) as mock_get_all_slaves_for_blacklist_whitelist, mock.patch( - "paasta_tools.smartstack_tools.get_replication_for_services", - return_value={}, - autospec=True, - ) as 
mock_get_replication_for_services: - expected = {"foo": {}, "bar": {}} - actual = smartstack_tools.get_smartstack_replication_for_attribute( - attribute="fake_attribute", - service=fake_service, - namespace=fake_namespace, - blacklist=[], - system_paasta_config=system_paasta_config, - ) - mock_get_all_slaves_for_blacklist_whitelist.assert_called_once_with( - blacklist=[], whitelist=None - ) - assert actual == expected - assert mock_get_replication_for_services.call_count == 2 - - mock_get_replication_for_services.assert_any_call( - synapse_host="hostone", - synapse_port=system_paasta_config.get_synapse_port(), - synapse_haproxy_url_format=system_paasta_config.get_synapse_haproxy_url_format(), - services=["fake_service.fake_main"], - ) + # Since Mesos support has been removed, this function now returns an empty dict + expected = {} + actual = smartstack_tools.get_smartstack_replication_for_attribute( + attribute="fake_attribute", + service=fake_service, + namespace=fake_namespace, + blacklist=[], + system_paasta_config=system_paasta_config, + ) + assert actual == expected def test_get_replication_for_service(): diff --git a/tests/test_tron_tools.py b/tests/test_tron_tools.py index 69ae4b3b5d..acdd4ff8b8 100644 --- a/tests/test_tron_tools.py +++ b/tests/test_tron_tools.py @@ -10,8 +10,8 @@ from paasta_tools import utils from paasta_tools import yaml_tools as yaml from paasta_tools.secret_tools import SHARED_SECRET_SERVICE +from paasta_tools.tron_tools import KUBERNETES_EXECUTOR_NAMES from paasta_tools.tron_tools import MASTER_NAMESPACE -from paasta_tools.tron_tools import MESOS_EXECUTOR_NAMES from paasta_tools.tron_tools import TronActionConfigDict from paasta_tools.utils import CAPS_DROP from paasta_tools.utils import InvalidInstanceConfig @@ -103,7 +103,7 @@ def test_action_config(self, action_config): assert action_config.get_action_name() == "print" assert action_config.get_cluster() == "fake-cluster" - @pytest.mark.parametrize("executor", MESOS_EXECUTOR_NAMES) + 
@pytest.mark.parametrize("executor", KUBERNETES_EXECUTOR_NAMES) def test_get_env( self, mock_read_soa_metadata, action_config, executor, monkeypatch ): @@ -213,7 +213,7 @@ def test_get_secret_volume_name( def test_get_executor_default(self, action_config): assert action_config.get_executor() == "paasta" - @pytest.mark.parametrize("executor", MESOS_EXECUTOR_NAMES) + @pytest.mark.parametrize("executor", KUBERNETES_EXECUTOR_NAMES) def test_get_executor_paasta(self, executor, action_config): action_config.config_dict["executor"] = executor assert action_config.get_executor() == executor From af85b81414bf0203a6e83bbd0952e502fa8a9475 Mon Sep 17 00:00:00 2001 From: Evan Krall Date: Tue, 24 Jun 2025 16:32:02 -0700 Subject: [PATCH 02/15] Remove tron_jobs_running_here since this was mesos-only --- paasta_tools/broadcast_log_to_services.py | 6 +----- paasta_tools/tron_tools.py | 5 ----- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/paasta_tools/broadcast_log_to_services.py b/paasta_tools/broadcast_log_to_services.py index 08f3f594bc..8cc314b2f4 100755 --- a/paasta_tools/broadcast_log_to_services.py +++ b/paasta_tools/broadcast_log_to_services.py @@ -15,7 +15,6 @@ import sys from paasta_tools.kubernetes_tools import get_all_kubernetes_services_running_here -from paasta_tools.tron_tools import tron_jobs_running_here from paasta_tools.utils import _log from paasta_tools.utils import DEFAULT_SOA_DIR from paasta_tools.utils import load_system_paasta_config @@ -40,15 +39,12 @@ def broadcast_log_all_services_running_here(line: str, soa_dir=DEFAULT_SOA_DIR) def get_all_services_running_here(cluster, soa_dir): - # Tron jobs no longer run via Mesos, so this will return an empty list - tron_services = tron_jobs_running_here() - try: kubernetes_services = get_all_kubernetes_services_running_here() except Exception: kubernetes_services = [] - return tron_services + kubernetes_services + return kubernetes_services def main() -> None: diff --git a/paasta_tools/tron_tools.py 
b/paasta_tools/tron_tools.py index 76617cf7a9..f1f0c99830 100644 --- a/paasta_tools/tron_tools.py +++ b/paasta_tools/tron_tools.py @@ -1419,11 +1419,6 @@ def get_tron_dashboard_for_cluster(cluster: str): return dashboards["Tron"] -def tron_jobs_running_here() -> List[Tuple[str, str, int]]: - # Mesos support has been removed, so no Tron jobs are running via Mesos - return [] - - def parse_service_instance_from_executor_id(task_id: str) -> Tuple[str, str]: """Parses tron mesos task ids, like schematizer.traffic_generator.28414.turnstyle.46da87d7-6092-4ed4-b926-ffa7b21c7785""" try: From 4fc64a780eb92999cf843d59d3b34715109a45b3 Mon Sep 17 00:00:00 2001 From: Evan Krall Date: Tue, 24 Jun 2025 16:34:12 -0700 Subject: [PATCH 03/15] Remove mesos-only check_spark_jobs.py and related tests --- .../paasta_tools.check_spark_jobs.rst | 7 - docs/source/generated/paasta_tools.rst | 1 - paasta_tools/check_spark_jobs.py | 214 ------------------ setup.py | 1 - tests/test_check_spark_jobs.py | 154 ------------- 5 files changed, 377 deletions(-) delete mode 100644 docs/source/generated/paasta_tools.check_spark_jobs.rst delete mode 100644 paasta_tools/check_spark_jobs.py delete mode 100644 tests/test_check_spark_jobs.py diff --git a/docs/source/generated/paasta_tools.check_spark_jobs.rst b/docs/source/generated/paasta_tools.check_spark_jobs.rst deleted file mode 100644 index 3484afa185..0000000000 --- a/docs/source/generated/paasta_tools.check_spark_jobs.rst +++ /dev/null @@ -1,7 +0,0 @@ -paasta\_tools.check\_spark\_jobs module -======================================= - -.. 
automodule:: paasta_tools.check_spark_jobs - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/generated/paasta_tools.rst b/docs/source/generated/paasta_tools.rst index de7acffeea..522e15ea3d 100644 --- a/docs/source/generated/paasta_tools.rst +++ b/docs/source/generated/paasta_tools.rst @@ -38,7 +38,6 @@ Submodules paasta_tools.check_kubernetes_services_replication paasta_tools.check_oom_events paasta_tools.check_services_replication_tools - paasta_tools.check_spark_jobs paasta_tools.cleanup_expired_autoscaling_overrides paasta_tools.cleanup_kubernetes_cr paasta_tools.cleanup_kubernetes_crd diff --git a/paasta_tools/check_spark_jobs.py b/paasta_tools/check_spark_jobs.py deleted file mode 100644 index d865baeb13..0000000000 --- a/paasta_tools/check_spark_jobs.py +++ /dev/null @@ -1,214 +0,0 @@ -#!/usr/bin/env python -import argparse -import datetime -import logging -import smtplib -import sys -from collections import defaultdict -from email.message import EmailMessage -from socket import getfqdn - -import pysensu_yelp -import requests - -from paasta_tools.monitoring_tools import send_event -from paasta_tools.utils import DEFAULT_SOA_DIR -from paasta_tools.utils import list_services - - -logger = logging.getLogger(__name__) -email_from_address = f"paasta@{getfqdn()}" - - -JUPYTER_PREFIX = "jupyterhub_" - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Reports long-running Spark frameworks." 
- ) - parser.add_argument( - "--min-hours", - type=float, - help="Report frameworks that have been registered for more than this duration", - default=0, - ) - parser.add_argument( - "--no-notify", - action="store_true", - help="Skip notifying the teams that own each framework", - ) - parser.add_argument( - "--email-domain", default=None, help="Email domain for notifying users" - ) - return parser.parse_args() - - -def get_time_running(framework): - registered_time = datetime.datetime.fromtimestamp(framework["registered_time"]) - return datetime.datetime.now() - registered_time - - -def get_spark_properties(framework): - webui_url = framework.get("webui_url") - if not webui_url: - return None - - env_endpoint = f"{webui_url}/api/v1/applications/{framework.id}/environment" - try: - response = requests.get(env_endpoint, timeout=5) - except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e: - logger.warning(f"Unable to connect to {env_endpoint}: {e!r}") - return None - - if response.status_code != 200: - logger.warning(f"Bad response from {env_endpoint}: {response.status_code}") - return None - - try: - return response.json()["sparkProperties"] - except (ValueError, KeyError): - logger.warning( - f"Unable to get sparkProperties for {framework.id}: got response {response.text}" - ) - return None - - -def guess_service(properties): - if not properties: - return None - for key, value in properties: - if key == "spark.executorEnv.PAASTA_SERVICE": - service = value - break - else: - return None - if service.startswith(JUPYTER_PREFIX): - return service[len(JUPYTER_PREFIX) :] - else: - return service - - -def get_matching_framework_info(min_hours): - # Mesos support has been removed - Spark frameworks no longer run on Mesos - return [] - - -def format_framework(info): - result = [f'{info["name"]} (running for {info["time_running"]})'] - result.append(f' user: {info["user"]}') - result.append(f' job UI: {info["webui_url"]}') - return "\n".join(result) - - 
-def format_message_for_service(service, frameworks): - output = f"Found the following long-running Spark frameworks associated with service {service}.\n" - output += ( - f"Please check why they are still running and terminate if appropriate.\n\n" - ) - output += "\n".join(format_framework(f) for f in frameworks) - return output - - -def get_messages_by_service(frameworks): - frameworks_by_service = defaultdict(list) - for framework in frameworks: - service = framework["service"] - frameworks_by_service[service].append(framework) - - return { - service: format_message_for_service(service, frameworks) - for service, frameworks in frameworks_by_service.items() - } - - -def update_check_status(service, output, status): - overrides = { - "page": False, - "alert_after": 0, - "tip": "Ask the user to check the job UI and terminate the job if appropriate.", - "runbook": "http://y/spark-debug", - "ticket": True, - } - send_event( - service=service, - check_name=f"long_running_spark_jobs.{service}", - overrides=overrides, - status=status, - output=output, - soa_dir=DEFAULT_SOA_DIR, - ) - - -def email_user(framework_info, email_domain): - guessed_user = None - if framework_info["user"] != "root": - guessed_user = framework_info["user"] - elif framework_info["name"].startswith(JUPYTER_PREFIX): - try: - # the job format is now `___` - guessed_user = framework_info["name"].split("_")[-3] - except IndexError: - pass - - if guessed_user: - print( - f'Guessed {framework_info["name"]} belongs to {guessed_user}, sending email' - ) - else: - print(f"Could not guess user from {framework_info}, skipping user email") - return - - msg = EmailMessage() - msg["From"] = email_from_address - msg["To"] = f"{guessed_user}@{email_domain}" - msg["Subject"] = f'Long-running Spark framework {framework_info["name"]}' - content = "Please check why it is still running and terminate if appropriate.\n" - content += format_framework(framework_info) - msg.set_content(content) - with 
smtplib.SMTP("localhost") as s: - s.send_message(msg) - - -def report_spark_jobs(min_hours, no_notify, email_domain=None): - frameworks = get_matching_framework_info(min_hours=min_hours) - messages_by_service = get_messages_by_service(frameworks) - valid_services = set(list_services()) - - messages_for_unknown_services = [] - for service, message in messages_by_service.items(): - if service in valid_services: - print(f"{message}\n") - else: - messages_for_unknown_services.append(message) - if messages_for_unknown_services: - print("\nINVALID SERVICES") - print("----------------") - print( - "The following frameworks are associated with services that are not configured in PaaSTA.\n" - ) - print("\n\n".join(messages_for_unknown_services)) - - if not no_notify: - for service in valid_services: - if service in messages_by_service: - update_check_status(service, message, pysensu_yelp.Status.WARNING) - else: - update_check_status( - service, "No long running spark jobs", pysensu_yelp.Status.OK - ) - if email_domain: - for framework in frameworks: - email_user(framework, email_domain) - - return 0 if len(frameworks) == 0 else 1 - - -def main(): - args = parse_args() - logging.basicConfig() - return report_spark_jobs(args.min_hours, args.no_notify, args.email_domain) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/setup.py b/setup.py index c62d16c8dc..493e6a2adb 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,6 @@ def get_install_requires(): "paasta_tools/check_kubernetes_api.py", "paasta_tools/check_kubernetes_services_replication.py", "paasta_tools/check_oom_events.py", - "paasta_tools/check_spark_jobs.py", "paasta_tools/cleanup_kubernetes_cr.py", "paasta_tools/cleanup_kubernetes_crd.py", "paasta_tools/cleanup_kubernetes_jobs.py", diff --git a/tests/test_check_spark_jobs.py b/tests/test_check_spark_jobs.py deleted file mode 100644 index 3d9c519d2f..0000000000 --- a/tests/test_check_spark_jobs.py +++ /dev/null @@ -1,154 +0,0 @@ -import datetime - -import 
mock -import pytest - -import paasta_tools.check_spark_jobs as check_spark_jobs - - -@pytest.fixture -def mock_smtp(): - with mock.patch( - "paasta_tools.check_spark_jobs.smtplib", autospec=True - ) as mock_smtp: - yield mock_smtp - - -@pytest.fixture -def mock_current_time(): - return datetime.datetime(2019, 4, 3, 0, 0, 0) - - -@pytest.fixture(autouse=True) -def mock_datetime(mock_current_time): - with mock.patch( - "paasta_tools.check_spark_jobs.datetime", autospec=True - ) as mock_time: - mock_time.datetime.now.return_value = mock_current_time - mock_time.datetime.fromtimestamp = datetime.datetime.fromtimestamp - mock_time.timedelta = datetime.timedelta - yield mock_time - - -@pytest.fixture -def mock_get_frameworks(mock_current_time): - with mock.patch( - "paasta_tools.check_spark_jobs.get_matching_framework_info", autospec=True - ) as mock_get_frameworks: - # Since mesos_tools.get_all_frameworks now returns empty list, - # we mock get_matching_framework_info directly for testing - mock_get_frameworks.return_value = [] - yield mock_get_frameworks - - -@pytest.mark.parametrize( - "properties,expected", - [ - (None, None), - ([["something_else", "value"]], None), - ( - [["something_else", "value"], ["spark.executorEnv.PAASTA_SERVICE", "foo"]], - "foo", - ), - ([["spark.executorEnv.PAASTA_SERVICE", "jupyterhub_foo"]], "foo"), - ], -) -def test_guess_service(properties, expected): - assert check_spark_jobs.guess_service(properties) == expected - - -def test_get_matching_framework_info(): - # Since Mesos is removed, get_matching_framework_info should return empty list - result = check_spark_jobs.get_matching_framework_info(min_hours=20) - assert result == [] - - result = check_spark_jobs.get_matching_framework_info(min_hours=1) - assert result == [] - - -@pytest.mark.parametrize( - "framework_user,framework_name,expected", - [ - ("me", "paasta_spark_run_something", "me"), - ("root", "Custom Spark App", None), - ("root", 
"jupyterhub_bill-search-learning_bill_39904_1234", "bill"), - ], -) -def test_email_user(mock_smtp, framework_user, framework_name, expected): - info = { - "id": "id1", - "name": framework_name, - "webui_url": "url1", - "service": "something", - "user": framework_user, - "time_running": "1 day, 03:32:00", - } - check_spark_jobs.email_user(info, "test.com") - - mock_send_message = mock_smtp.SMTP.return_value.__enter__.return_value.send_message - if expected: - assert mock_send_message.call_count == 1 - msg = mock_send_message.call_args[0][0] - assert msg["To"] == f"{expected}@test.com" - else: - assert mock_send_message.call_count == 0 - - -@mock.patch("paasta_tools.check_spark_jobs.email_user", autospec=True) -@mock.patch("paasta_tools.check_spark_jobs.list_services", autospec=True) -@mock.patch("paasta_tools.check_spark_jobs.get_matching_framework_info", autospec=True) -@mock.patch("paasta_tools.check_spark_jobs.update_check_status", autospec=True) -@pytest.mark.parametrize("no_notify", [True, False]) -def test_report_spark_jobs( - mock_check, mock_get_info, mock_list_services, mock_email_user, no_notify -): - mock_list_services.return_value = ["service1", "service2", "other_service"] - mock_get_info.return_value = [ - { - "id": "uuid1", - "name": "spark1", - "webui_url": "url1", - "service": "service1", - "user": "test_user", - "time_running": "7:00:00", - }, - { - "id": "uuid2", - "name": "spark2", - "webui_url": "url2", - "service": "service2", - "user": "test_user", - "time_running": "7:00:00", - }, - { - "id": "uuid3", - "name": "spark3", - "webui_url": "url3", - "service": "service2", - "user": "test_user", - "time_running": "7:00:00", - }, - { - "id": "uuid3", - "name": "spark3", - "webui_url": "url3", - "service": "service_dne", - "user": "test_user", - "time_running": "7:00:00", - }, - ] - assert check_spark_jobs.report_spark_jobs(1, no_notify, "test.com") == 1 - assert mock_get_info.call_args_list == [mock.call(min_hours=1)] - if no_notify: - assert 
mock_check.call_count == 0 - else: - assert sorted(mock_check.call_args_list) == sorted( - [ - mock.call("service1", mock.ANY, 1), - mock.call("service2", mock.ANY, 1), - mock.call("other_service", mock.ANY, 0), - ] - ) - assert mock_email_user.call_args_list == [ - mock.call(info, "test.com") for info in mock_get_info.return_value - ] From 048811e76ee5daaa1f27ef9c8a2f76a2762d9510 Mon Sep 17 00:00:00 2001 From: Evan Krall Date: Tue, 24 Jun 2025 17:03:50 -0700 Subject: [PATCH 04/15] Use concrete types in metastatus_lib now that we've dropped mesos support --- paasta_tools/metrics/metastatus_lib.py | 29 ++++++++++++-------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/paasta_tools/metrics/metastatus_lib.py b/paasta_tools/metrics/metastatus_lib.py index a0b97b1d2c..cc79208713 100755 --- a/paasta_tools/metrics/metastatus_lib.py +++ b/paasta_tools/metrics/metastatus_lib.py @@ -23,7 +23,6 @@ from typing import NamedTuple from typing import Sequence from typing import Tuple -from typing import TypeVar from humanize import naturalsize from kubernetes.client import V1Node @@ -376,13 +375,11 @@ def assert_nodes_health( _KeyFuncRetT = Sequence[Tuple[str, str]] -_GenericNodeT = TypeVar("_GenericNodeT", bound=V1Node) +_NodeGroupingFunctionT = Callable[[V1Node], _KeyFuncRetT] -_GenericNodeGroupingFunctionT = Callable[[_GenericNodeT], _KeyFuncRetT] +_NodeFilterFunctionT = Callable[[V1Node], bool] -_GenericNodeFilterFunctionT = Callable[[_GenericNodeT], bool] - -_GenericNodeSortFunctionT = Callable[[Sequence[_GenericNodeT]], Sequence[_GenericNodeT]] +_NodeSortFunctionT = Callable[[Sequence[V1Node]], Sequence[V1Node]] def key_func_for_attribute_multi_kube( @@ -405,10 +402,10 @@ def key_func(node): def group_slaves_by_key_func( - key_func: _GenericNodeGroupingFunctionT, - slaves: Sequence[_GenericNodeT], - sort_func: _GenericNodeSortFunctionT = None, -) -> Mapping[_KeyFuncRetT, Sequence[_GenericNodeT]]: + key_func: _NodeGroupingFunctionT, + slaves: 
Sequence[V1Node], + sort_func: _NodeSortFunctionT = None, +) -> Mapping[_KeyFuncRetT, Sequence[V1Node]]: """Given a function for grouping slaves, return a dict where keys are the unique values returned by the key_func and the values are all those slaves which @@ -418,7 +415,7 @@ def group_slaves_by_key_func( :param slaves: a list of slaves :returns: a dict of key: [slaves] """ - sorted_slaves: Sequence[_GenericNodeT] + sorted_slaves: Sequence[V1Node] if sort_func is None: sorted_slaves = sorted(slaves, key=key_func) else: @@ -511,8 +508,8 @@ def calculate_resource_utilization_for_kube_nodes( def filter_slaves( - slaves: Sequence[_GenericNodeT], filters: Sequence[_GenericNodeFilterFunctionT] -) -> Sequence[_GenericNodeT]: + slaves: Sequence[V1Node], filters: Sequence[_NodeFilterFunctionT] +) -> Sequence[V1Node]: """Filter slaves by attributes :param slaves: list of slaves to filter @@ -526,12 +523,12 @@ def filter_slaves( def get_resource_utilization_by_grouping_kube( - grouping_func: _GenericNodeGroupingFunctionT, + grouping_func: _NodeGroupingFunctionT, kube_client: KubeClient, *, namespace: str, - filters: Sequence[_GenericNodeFilterFunctionT] = [], - sort_func: _GenericNodeSortFunctionT = None, + filters: Sequence[_NodeFilterFunctionT] = [], + sort_func: _NodeSortFunctionT = None, ) -> Mapping[_KeyFuncRetT, ResourceUtilizationDict]: """Given a function used to group nodes, calculate resource utilization for each value of a given attribute. 
From a667b42e6c138934bb7cd662996c7b80e6bc5250 Mon Sep 17 00:00:00 2001 From: Evan Krall Date: Tue, 24 Jun 2025 17:15:51 -0700 Subject: [PATCH 05/15] Delete now-unused code in smartstack_tools --- paasta_tools/smartstack_tools.py | 65 -------------------------------- tests/test_smartstack_tools.py | 34 ----------------- 2 files changed, 99 deletions(-) diff --git a/paasta_tools/smartstack_tools.py b/paasta_tools/smartstack_tools.py index 1db059b601..d89948a6e2 100644 --- a/paasta_tools/smartstack_tools.py +++ b/paasta_tools/smartstack_tools.py @@ -37,12 +37,9 @@ from paasta_tools import envoy_tools from paasta_tools import kubernetes_tools -from paasta_tools import long_running_service_tools from paasta_tools.long_running_service_tools import LongRunningServiceConfig from paasta_tools.monitoring_tools import ReplicationChecker from paasta_tools.utils import compose_job_id -from paasta_tools.utils import DEFAULT_SOA_DIR -from paasta_tools.utils import DeployBlacklist from paasta_tools.utils import get_user_agent from paasta_tools.utils import SystemPaastaConfig @@ -165,68 +162,6 @@ def get_multiple_backends( return backends -def load_smartstack_info_for_service( - service: str, - namespace: str, - blacklist: DeployBlacklist, - system_paasta_config: SystemPaastaConfig, - soa_dir: str = DEFAULT_SOA_DIR, -) -> Dict[str, Dict[str, int]]: - """Retrieves number of available backends for given service - - :param service: A service name - :param namespace: A Smartstack namespace - :param blacklist: A list of blacklisted location tuples in the form (location, value) - :param system_paasta_config: A SystemPaastaConfig object representing the system configuration. - :param soa_dir: SOA dir - :returns: a dictionary of the form - - :: - - { - 'location_type': { - 'unique_location_name': { - 'service.instance': <# ofavailable backends> - }, - 'other_unique_location_name': ... 
- } - } - - """ - service_namespace_config = long_running_service_tools.load_service_namespace_config( - service=service, namespace=namespace, soa_dir=soa_dir - ) - discover_location_type = service_namespace_config.get_discover() - return get_smartstack_replication_for_attribute( - attribute=discover_location_type, - service=service, - namespace=namespace, - blacklist=blacklist, - system_paasta_config=system_paasta_config, - ) - - -def get_smartstack_replication_for_attribute( - attribute: str, - service: str, - namespace: str, - blacklist: DeployBlacklist, - system_paasta_config: SystemPaastaConfig, -) -> Dict[str, Dict[str, int]]: - """Loads smartstack replication from a host with the specified attribute - - :param attribute: a Mesos attribute - :param service: A service name, like 'example_service' - :param namespace: A particular smartstack namespace to inspect, like 'main' - :param blacklist: A list of blacklisted location tuples in the form of (location, value) - :param system_paasta_config: A SystemPaastaConfig object representing the system configuration. 
- :returns: a dictionary of the form {'': } - (the dictionary will contain keys for unique all attribute values) - """ - # Mesos support has been removed, so no replication info is available via Mesos slaves - return {} - - def get_replication_for_all_services( synapse_host: str, synapse_port: int, synapse_haproxy_url_format: str ) -> Dict[str, int]: diff --git a/tests/test_smartstack_tools.py b/tests/test_smartstack_tools.py index e129485742..44205482d4 100644 --- a/tests/test_smartstack_tools.py +++ b/tests/test_smartstack_tools.py @@ -26,40 +26,6 @@ from paasta_tools.utils import DEFAULT_SYNAPSE_HAPROXY_URL_FORMAT -def test_load_smartstack_info_for_service(system_paasta_config): - with mock.patch( - "paasta_tools.smartstack_tools.long_running_service_tools.load_service_namespace_config", - autospec=True, - ), mock.patch( - "paasta_tools.smartstack_tools.get_smartstack_replication_for_attribute", - autospec=True, - ): - # just a smoke test for now. - smartstack_tools.load_smartstack_info_for_service( - service="service", - namespace="namespace", - soa_dir="fake", - blacklist=[], - system_paasta_config=system_paasta_config, - ) - - -def test_get_smartstack_replication_for_attribute(system_paasta_config): - fake_namespace = "fake_main" - fake_service = "fake_service" - - # Since Mesos support has been removed, this function now returns an empty dict - expected = {} - actual = smartstack_tools.get_smartstack_replication_for_attribute( - attribute="fake_attribute", - service=fake_service, - namespace=fake_namespace, - blacklist=[], - system_paasta_config=system_paasta_config, - ) - assert actual == expected - - def test_get_replication_for_service(): testdir = os.path.dirname(os.path.realpath(__file__)) testdata = os.path.join(testdir, "haproxy_snapshot.txt") From 197d14371e2c1c6bd0061eb561254cec1e42aec0 Mon Sep 17 00:00:00 2001 From: Evan Krall Date: Tue, 24 Jun 2025 17:17:16 -0700 Subject: [PATCH 06/15] fix bug introduced by roocode --- paasta_tools/tron_tools.py | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paasta_tools/tron_tools.py b/paasta_tools/tron_tools.py index f1f0c99830..9fac672c30 100644 --- a/paasta_tools/tron_tools.py +++ b/paasta_tools/tron_tools.py @@ -668,7 +668,7 @@ def validate(self): error_msgs.extend(super().validate()) # Tron is a little special, because it can *not* have a deploy group # But only if an action is running via ssh and not via paasta - if self.get_deploy_group() is None and self.get_executor() == "mesos": + if self.get_deploy_group() is None and self.get_executor() != "ssh": error_msgs.append( f"{self.get_job_name()}.{self.get_action_name()} must have a deploy_group set" ) From 4a46e29c392e18fe19514955c8d8afed5635c6d2 Mon Sep 17 00:00:00 2001 From: Evan Krall Date: Wed, 25 Jun 2025 11:29:00 -0700 Subject: [PATCH 07/15] Remove ability for API to serve instance status of adhoc (only supported mesos anyway) --- paasta_tools/api/views/instance.py | 14 +----------- tests/api/test_instance.py | 34 ------------------------------ 2 files changed, 1 insertion(+), 47 deletions(-) diff --git a/paasta_tools/api/views/instance.py b/paasta_tools/api/views/instance.py index b742d53b13..edc4c33d91 100644 --- a/paasta_tools/api/views/instance.py +++ b/paasta_tools/api/views/instance.py @@ -20,7 +20,6 @@ import traceback from typing import Any from typing import Dict -from typing import List from typing import Mapping from typing import Optional @@ -91,13 +90,6 @@ def tron_instance_status( return status -def adhoc_instance_status( - instance_status: Mapping[str, Any], service: str, instance: str, verbose: int -) -> List[Dict[str, Any]]: - # Mesos support has been removed - adhoc instances no longer run on Mesos - return [] - - async def _task_result_or_error(future): try: return {"value": await future} @@ -174,11 +166,7 @@ def instance_status(request): instance_status["version"] = "" instance_status["git_sha"] = "" try: - if instance_type == "adhoc": - instance_status["adhoc"] = 
adhoc_instance_status( - instance_status, service, instance, verbose - ) - elif pik.can_handle(instance_type): + if pik.can_handle(instance_type): instance_status.update( pik.instance_status( service=service, diff --git a/tests/api/test_instance.py b/tests/api/test_instance.py index bcbaf96737..b0ba69598d 100644 --- a/tests/api/test_instance.py +++ b/tests/api/test_instance.py @@ -29,7 +29,6 @@ from paasta_tools.long_running_service_tools import ServiceNamespaceConfig from paasta_tools.smartstack_tools import DiscoveredHost from paasta_tools.smartstack_tools import HaproxyBackend -from paasta_tools.utils import DeploymentVersion from paasta_tools.utils import NoConfigurationForServiceError from tests.conftest import wrap_value_in_task @@ -129,39 +128,6 @@ async def test_kubernetes_smartstack_status(mock_job_config): } -@mock.patch("paasta_tools.api.views.instance.adhoc_instance_status", autospec=True) -@mock.patch("paasta_tools.api.views.instance.validate_service_instance", autospec=True) -@mock.patch("paasta_tools.api.views.instance.get_actual_deployments", autospec=True) -def test_instances_status_adhoc( - mock_get_actual_deployments, - mock_validate_service_instance, - mock_adhoc_instance_status, -): - settings.cluster = "fake_cluster" - mock_deployment_version = DeploymentVersion("GIT_SHA", "20220101T000000") - mock_get_actual_deployments.return_value = { - "fake_cluster.fake_instance": mock_deployment_version, - "fake_cluster.fake_instance2": mock_deployment_version, - "fake_cluster2.fake_instance": mock_deployment_version, - "fake_cluster2.fake_instance2": mock_deployment_version, - } - mock_validate_service_instance.return_value = "adhoc" - mock_adhoc_instance_status.return_value = {} - - request = testing.DummyRequest() - request.swagger_data = {"service": "fake_service", "instance": "fake_instance"} - - response = instance.instance_status(request) - assert mock_adhoc_instance_status.called - assert response == { - "service": "fake_service", - "instance": 
"fake_instance", - "git_sha": "GIT_SHA", - "version": mock_deployment_version.short_sha_repr(), - "adhoc": {}, - } - - def test_add_executor_info(): mock_mesos_task = mock.Mock() mock_executor = { From cfa6352e15ffa6cb93d8b37f94a594f0e9222edb Mon Sep 17 00:00:00 2001 From: Evan Krall Date: Wed, 25 Jun 2025 11:52:10 -0700 Subject: [PATCH 08/15] Remove resources.utilization --- paasta_tools/api/api.py | 1 - paasta_tools/api/api_docs/oapi.yaml | 37 ----- paasta_tools/api/api_docs/swagger.json | 45 ------ paasta_tools/api/client.py | 2 - paasta_tools/api/views/resources.py | 36 ----- paasta_tools/paastaapi/api/resources_api.py | 157 -------------------- paasta_tools/paastaapi/apis/__init__.py | 1 - tests/api/test_resources.py | 145 ------------------ 8 files changed, 424 deletions(-) delete mode 100644 paasta_tools/api/views/resources.py delete mode 100644 paasta_tools/paastaapi/api/resources_api.py delete mode 100644 tests/api/test_resources.py diff --git a/paasta_tools/api/api.py b/paasta_tools/api/api.py index eaa33d8f53..b73cadf1b5 100644 --- a/paasta_tools/api/api.py +++ b/paasta_tools/api/api.py @@ -137,7 +137,6 @@ def make_app(global_config=None): ) config.include(profiling) - config.add_route("resources.utilization", "/v1/resources/utilization") config.add_route( "service.instance.status", "/v1/services/{service}/{instance}/status" ) diff --git a/paasta_tools/api/api_docs/oapi.yaml b/paasta_tools/api/api_docs/oapi.yaml index 8bdc93720d..627cfbd80b 100644 --- a/paasta_tools/api/api_docs/oapi.yaml +++ b/paasta_tools/api/api_docs/oapi.yaml @@ -1383,43 +1383,6 @@ paths: description: Service instance not found "500": description: Failure - /resources/utilization: - get: - operationId: resources - parameters: - - description: comma separated list of keys to group by - in: query - name: groupings - required: false - schema: - items: - type: string - type: array - style: simple - - description: List of slave filters in format 
'filter=attr_name:value1,value2&filter=attr2:value3,value4'. - Matches attr_name=(value1 OR value2) AND attr2=(value3 OR value4) - explode: true - in: query - name: filter - required: false - style: form - schema: - items: - pattern: (.*):(.*,)*(.*) - type: string - type: array - responses: - "200": - content: - application/json: - schema: - $ref: '#/components/schemas/Resource' - description: Resources in the cluster, filtered and grouped by parameters - "400": - description: Poorly formated query parameters - summary: Get resources in the cluster - tags: - - resources /service_autoscaler/pause: delete: operationId: delete_service_autoscaler_pause diff --git a/paasta_tools/api/api_docs/swagger.json b/paasta_tools/api/api_docs/swagger.json index fb6e003a2c..c75ea9f85b 100644 --- a/paasta_tools/api/api_docs/swagger.json +++ b/paasta_tools/api/api_docs/swagger.json @@ -352,51 +352,6 @@ ] } }, - "/resources/utilization": { - "get": { - "responses": { - "200": { - "description": "Resources in the cluster, filtered and grouped by parameters", - "schema": { - "$ref": "#/definitions/Resource" - } - }, - "400": { - "description": "Poorly formated query parameters" - } - }, - "summary": "Get resources in the cluster", - "operationId": "resources", - "tags": [ - "resources" - ], - "parameters": [ - { - "in": "query", - "description": "comma separated list of keys to group by", - "name": "groupings", - "required": false, - "type": "array", - "collectionFormat": "csv", - "items": { - "type": "string" - } - }, - { - "in": "query", - "description": "List of slave filters in format 'filter=attr_name:value1,value2&filter=attr2:value3,value4'. 
Matches attr_name=(value1 OR value2) AND attr2=(value3 OR value4)", - "name": "filter", - "required": false, - "type": "array", - "collectionFormat": "multi", - "items": { - "type": "string", - "pattern": "(.*):(.*,)*(.*)" - } - } - ] - } - }, "/services": { "get": { "responses": { diff --git a/paasta_tools/api/client.py b/paasta_tools/api/client.py index 2a1ce3de13..179b055f89 100644 --- a/paasta_tools/api/client.py +++ b/paasta_tools/api/client.py @@ -35,7 +35,6 @@ class PaastaOApiClient: autoscaler: paastaapis.AutoscalerApi default: paastaapis.DefaultApi - resources: paastaapis.ResourcesApi service: paastaapis.ServiceApi remote_run: paastaapis.RemoteRunApi api_error: Type[paastaapi.ApiException] @@ -71,7 +70,6 @@ def get_paasta_oapi_client_by_url( return PaastaOApiClient( autoscaler=paastaapis.AutoscalerApi(client), default=paastaapis.DefaultApi(client), - resources=paastaapis.ResourcesApi(client), service=paastaapis.ServiceApi(client), remote_run=paastaapis.RemoteRunApi(client), api_error=paastaapi.ApiException, diff --git a/paasta_tools/api/views/resources.py b/paasta_tools/api/views/resources.py deleted file mode 100644 index 1b7b0e2a8a..0000000000 --- a/paasta_tools/api/views/resources.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python -# Copyright 2015-2016 Yelp Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -PaaSTA resource utilization, etc. 
-""" -from pyramid.response import Response -from pyramid.view import view_config - - -def parse_filters(filters): - # The swagger config verifies that the data is in this format - # "pattern": "(.*):(.*,)*(.*)" - if filters is None: - return {} - f = {s[0]: s[1] for s in [e.split(":") for e in filters]} - f = {k: v.split(",") for k, v in f.items()} - return f - - -@view_config(route_name="resources.utilization", request_method="GET", renderer="json") -def resources_utilization(request): - # Mesos support has been removed - resource utilization now only available via Kubernetes - response_body = [] - return Response(json_body=response_body, status_code=200) diff --git a/paasta_tools/paastaapi/api/resources_api.py b/paasta_tools/paastaapi/api/resources_api.py deleted file mode 100644 index 9bc3555e75..0000000000 --- a/paasta_tools/paastaapi/api/resources_api.py +++ /dev/null @@ -1,157 +0,0 @@ -# coding: utf-8 - -""" - Paasta API - - No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator) # noqa: E501 - - The version of the OpenAPI document: 1.2.0 - Generated by: https://openapi-generator.tech -""" - - -import re # noqa: F401 -import sys # noqa: F401 - -from paasta_tools.paastaapi.api_client import ApiClient, Endpoint -from paasta_tools.paastaapi.model_utils import ( # noqa: F401 - check_allowed_values, - check_validations, - date, - datetime, - file_type, - none_type, - validate_and_convert_types -) -from paasta_tools.paastaapi.model.resource import Resource - - -class ResourcesApi(object): - """NOTE: This class is auto generated by OpenAPI Generator - Ref: https://openapi-generator.tech - - Do not edit the class manually. - """ - - def __init__(self, api_client=None): - if api_client is None: - api_client = ApiClient() - self.api_client = api_client - - def __resources( - self, - **kwargs - ): - """Get resources in the cluster # noqa: E501 - - This method makes a synchronous HTTP request by default. 
To make an - asynchronous HTTP request, please pass async_req=True - - >>> thread = api.resources(async_req=True) - >>> result = thread.get() - - - Keyword Args: - groupings ([str]): comma separated list of keys to group by. [optional] - filter ([str]): List of slave filters in format 'filter=attr_name:value1,value2&filter=attr2:value3,value4'. Matches attr_name=(value1 OR value2) AND attr2=(value3 OR value4). [optional] - _return_http_data_only (bool): response data without head status - code and headers. Default is True. - _preload_content (bool): if False, the urllib3.HTTPResponse object - will be returned without reading/decoding response data. - Default is True. - _request_timeout (float/tuple): timeout setting for this request. If one - number provided, it will be total request timeout. It can also - be a pair (tuple) of (connection, read) timeouts. - Default is None. - _check_input_type (bool): specifies if type checking - should be done one the data sent to the server. - Default is True. - _check_return_type (bool): specifies if type checking - should be done one the data received from the server. - Default is True. - _host_index (int/None): specifies the index of the server - that we want to use. - Default is read from the configuration. - async_req (bool): execute request asynchronously - - Returns: - Resource - If the method is called asynchronously, returns the request - thread. 
- """ - kwargs['async_req'] = kwargs.get( - 'async_req', False - ) - kwargs['_return_http_data_only'] = kwargs.get( - '_return_http_data_only', True - ) - kwargs['_preload_content'] = kwargs.get( - '_preload_content', True - ) - kwargs['_request_timeout'] = kwargs.get( - '_request_timeout', None - ) - kwargs['_check_input_type'] = kwargs.get( - '_check_input_type', True - ) - kwargs['_check_return_type'] = kwargs.get( - '_check_return_type', True - ) - kwargs['_host_index'] = kwargs.get('_host_index') - return self.call_with_http_info(**kwargs) - - self.resources = Endpoint( - settings={ - 'response_type': (Resource,), - 'auth': [], - 'endpoint_path': '/resources/utilization', - 'operation_id': 'resources', - 'http_method': 'GET', - 'servers': None, - }, - params_map={ - 'all': [ - 'groupings', - 'filter', - ], - 'required': [], - 'nullable': [ - ], - 'enum': [ - ], - 'validation': [ - ] - }, - root_map={ - 'validations': { - }, - 'allowed_values': { - }, - 'openapi_types': { - 'groupings': - ([str],), - 'filter': - ([str],), - }, - 'attribute_map': { - 'groupings': 'groupings', - 'filter': 'filter', - }, - 'location_map': { - 'groupings': 'query', - 'filter': 'query', - }, - 'collection_format_map': { - 'groupings': 'csv', - 'filter': 'multi', - } - }, - headers_map={ - 'accept': [ - 'application/json' - ], - 'content_type': [], - }, - api_client=api_client, - callable=__resources - ) diff --git a/paasta_tools/paastaapi/apis/__init__.py b/paasta_tools/paastaapi/apis/__init__.py index ab815d8c70..d921029cc4 100644 --- a/paasta_tools/paastaapi/apis/__init__.py +++ b/paasta_tools/paastaapi/apis/__init__.py @@ -18,5 +18,4 @@ from paasta_tools.paastaapi.api.autoscaler_api import AutoscalerApi from paasta_tools.paastaapi.api.default_api import DefaultApi from paasta_tools.paastaapi.api.remote_run_api import RemoteRunApi -from paasta_tools.paastaapi.api.resources_api import ResourcesApi from paasta_tools.paastaapi.api.service_api import ServiceApi diff --git 
a/tests/api/test_resources.py b/tests/api/test_resources.py deleted file mode 100644 index 74b7352afd..0000000000 --- a/tests/api/test_resources.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright 2015-2016 Yelp Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import json - -from pyramid import testing - -from paasta_tools.api.views.resources import parse_filters -from paasta_tools.api.views.resources import resources_utilization - - -def test_parse_filters_empty(): - filters = None - parsed = parse_filters(filters) - - assert parsed == {} - - -def test_parse_filters_good(): - filters = ["foo:bar,baz", "qux:zol"] - - parsed = parse_filters(filters) - - assert "foo" in parsed.keys() - assert "qux" in parsed.keys() - assert "bar" in parsed["foo"] - assert "baz" in parsed["foo"] - assert "zol" in parsed["qux"] - - -def test_resources_utilization_nothing_special(): - request = testing.DummyRequest() - request.swagger_data = {"groupings": None, "filter": None} - - # Since Mesos is removed, resources_utilization should return empty response - resp = resources_utilization(request) - body = json.loads(resp.body.decode("utf-8")) - - assert resp.status_int == 200 - assert len(body) == 0 - - -mock_mesos_state = { - "slaves": [ - { - "id": "foo1", - "resources": {"disk": 100, "cpus": 10, "mem": 50}, - "attributes": {"pool": "default", "region": "top"}, - "reserved_resources": {}, - }, - { - "id": "bar1", - "resources": {"disk": 100, "cpus": 10, "mem": 50}, - "attributes": 
{"pool": "default", "region": "bottom"}, - "reserved_resources": {}, - }, - { - "id": "foo2", - "resources": {"disk": 100, "cpus": 10, "mem": 50}, - "attributes": {"pool": "other", "region": "top"}, - "reserved_resources": {}, - }, - { - "id": "bar2", - "resources": {"disk": 100, "cpus": 10, "mem": 50}, - "attributes": {"pool": "other", "region": "bottom"}, - "reserved_resources": {}, - }, - { - "id": "foo3", - "resources": {"disk": 100, "cpus": 10, "mem": 50}, - "attributes": {"pool": "other", "region": "top"}, - "reserved_resources": {}, - }, - { - "id": "bar2", - "resources": {"disk": 100, "cpus": 10, "mem": 50}, - "attributes": {"pool": "other", "region": "bottom"}, - "reserved_resources": {}, - }, - ], - "frameworks": [ - { - "tasks": [ - { - "state": "TASK_RUNNING", - "resources": {"cpus": 1, "mem": 10, "disk": 10}, - "slave_id": "foo1", - }, - { - "state": "TASK_RUNNING", - "resources": {"cpus": 1, "mem": 10, "disk": 10}, - "slave_id": "bar1", - }, - ] - } - ], -} - - -def test_resources_utilization_with_grouping(): - request = testing.DummyRequest() - request.swagger_data = {"groupings": ["region", "pool"], "filter": None} - - # Since Mesos is removed, resources_utilization should return empty response - resp = resources_utilization(request) - body = json.loads(resp.body.decode("utf-8")) - - assert resp.status_int == 200 - assert len(body) == 0 - - -def test_resources_utilization_with_filter(): - request = testing.DummyRequest() - request.swagger_data = { - "groupings": ["region", "pool"], - "filter": ["region:top", "pool:default,other"], - } - - # Since Mesos is removed, resources_utilization should return empty response - resp = resources_utilization(request) - body = json.loads(resp.body.decode("utf-8")) - - assert resp.status_int == 200 - assert len(body) == 0 - - request.swagger_data = { - "groupings": ["region", "pool"], - "filter": ["region:non-exist", "pool:default,other"], - } - resp = resources_utilization(request) - body = 
json.loads(resp.body.decode("utf-8")) - - assert resp.status_int == 200 - assert len(body) == 0 From e5ae96819761dec50ef50536601bf91aa1bf21cf Mon Sep 17 00:00:00 2001 From: Evan Krall Date: Wed, 25 Jun 2025 12:03:36 -0700 Subject: [PATCH 09/15] Delete stupid comments added by roocode --- tests/metrics/test_metastatus_lib.py | 45 ---------------------------- 1 file changed, 45 deletions(-) diff --git a/tests/metrics/test_metastatus_lib.py b/tests/metrics/test_metastatus_lib.py index 090f20b29a..48f716238e 100644 --- a/tests/metrics/test_metastatus_lib.py +++ b/tests/metrics/test_metastatus_lib.py @@ -38,9 +38,6 @@ def test_fail_check_threshold(): assert not metastatus_lib.check_threshold(80, 30) -# test_get_mesos_cpu_status removed - function deleted with Mesos cleanup - - def test_get_kube_cpu_status(): fake_nodes = [ V1Node(status=V1NodeStatus(allocatable={"cpu": "1"}, capacity={"cpu": "3"})) @@ -128,9 +125,6 @@ def test_assert_bad_gpu_health(): ) -# Mesos health check tests removed - functions deleted with Mesos cleanup - - def test_assert_kube_deployments(): with mock.patch( "paasta_tools.metrics.metastatus_lib.list_all_deployments", autospec=True @@ -167,9 +161,6 @@ def test_assert_nodes_health(): assert ok -# Mesos-related test functions removed - underlying functions deleted with Mesos cleanup - - def test_status_for_results(): assert metastatus_lib.status_for_results( [ @@ -200,9 +191,6 @@ def test_critical_events_in_outputs(): ) == [("myservice_false", False)] -# test_filter_mesos_state_metrics removed - function deleted with Mesos cleanup - - def test_filter_kube_resources(): test_resource_dictionary = { "cpu": 0, @@ -216,9 +204,6 @@ def test_filter_kube_resources(): assert metastatus_lib.filter_kube_resources(test_resource_dictionary) == expected -# test_filter_slaves removed - function deleted with Mesos cleanup - - def test_group_slaves_by_key_func(): slaves = [ { @@ -243,21 +228,6 @@ def test_group_slaves_by_key_func(): assert len(list(v)) == 1 -# 
test_get_resource_utilization_by_grouping removed - function deleted with Mesos cleanup - - -# test_get_resource_utilization_by_grouping_correctly_groups removed - function deleted with Mesos cleanup - - -# test_get_resource_utilization_by_grouping_correctly_multi_groups removed - function deleted with Mesos cleanup - - -# test_get_resource_utilization_per_slave removed - function deleted with Mesos cleanup - - -# test_calculate_resource_utilization_for_slaves removed - function deleted with Mesos cleanup - - def test_calculate_resource_utilization_for_kube_nodes(): fake_nodes = [ V1Node( @@ -433,12 +403,6 @@ def test_get_table_rows_for_resource_usage_dict(mock_format_row): assert actual == ["myhabitat", "10/10", "10/10", "10/10"] -# test_key_func_for_attribute removed - function deleted with Mesos cleanup - - -# test_get_mesos_memory_status removed - function deleted with Mesos cleanup - - def test_get_kube_memory_status(): fake_nodes = [ V1Node( @@ -453,9 +417,6 @@ def test_get_kube_memory_status(): assert available == 1 * 1024 -# test_get_mesos_disk_status removed - function deleted with Mesos cleanup - - def test_get_kube_disk_status(): fake_nodes = [ V1Node( @@ -471,9 +432,6 @@ def test_get_kube_disk_status(): assert available == 1 * 1024**2 -# test_get_mesos_gpu_status removed - function deleted with Mesos cleanup - - def test_get_kube_gpu_status(): fake_nodes = [ V1Node( @@ -488,9 +446,6 @@ def test_get_kube_gpu_status(): assert available == 1 -# test_reserved_maintenence_resources* functions removed - function deleted with Mesos cleanup - - def test_suffixed_number_value(): assert metastatus_lib.suffixed_number_value("5k") == 5 * 1000 assert metastatus_lib.suffixed_number_value("5m") == 5 * 1000**-1 From 7a7b30a4fdc9ba84d50e8d86305356b126a72ff7 Mon Sep 17 00:00:00 2001 From: Evan Krall Date: Wed, 25 Jun 2025 16:48:47 -0700 Subject: [PATCH 10/15] Delete metastatus_lib.py, moving suffixed_number_value to utils --- .../check_autoscaler_max_instances.py | 2 +- 
paasta_tools/metrics/metastatus_lib.py | 760 ------------------ paasta_tools/utils.py | 26 + tests/metrics/test_metastatus_lib.py | 460 ----------- tests/test_utils.py | 14 + 5 files changed, 41 insertions(+), 1221 deletions(-) delete mode 100755 paasta_tools/metrics/metastatus_lib.py delete mode 100644 tests/metrics/test_metastatus_lib.py diff --git a/paasta_tools/check_autoscaler_max_instances.py b/paasta_tools/check_autoscaler_max_instances.py index 6d86a4f098..3186e465db 100755 --- a/paasta_tools/check_autoscaler_max_instances.py +++ b/paasta_tools/check_autoscaler_max_instances.py @@ -11,12 +11,12 @@ from paasta_tools.kubernetes_tools import get_kubernetes_app_name from paasta_tools.kubernetes_tools import KubeClient from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig -from paasta_tools.metrics.metastatus_lib import suffixed_number_value from paasta_tools.monitoring_tools import send_event from paasta_tools.paasta_service_config_loader import PaastaServiceConfigLoader from paasta_tools.utils import DEFAULT_SOA_DIR from paasta_tools.utils import list_services from paasta_tools.utils import load_system_paasta_config +from paasta_tools.utils import suffixed_number_value from paasta_tools.utils import SystemPaastaConfig log = logging.getLogger(__name__) diff --git a/paasta_tools/metrics/metastatus_lib.py b/paasta_tools/metrics/metastatus_lib.py deleted file mode 100755 index cc79208713..0000000000 --- a/paasta_tools/metrics/metastatus_lib.py +++ /dev/null @@ -1,760 +0,0 @@ -#!/usr/bin/env python -# Copyright 2015-2016 Yelp Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import itertools -import math -import re -from collections import Counter -from collections import namedtuple -from typing import Any -from typing import Callable -from typing import Mapping -from typing import NamedTuple -from typing import Sequence -from typing import Tuple - -from humanize import naturalsize -from kubernetes.client import V1Node -from kubernetes.client import V1Pod -from mypy_extensions import TypedDict -from typing_extensions import Counter as _Counter - -from paasta_tools.kubernetes_tools import get_all_nodes_cached -from paasta_tools.kubernetes_tools import get_all_pods_cached -from paasta_tools.kubernetes_tools import get_pod_status -from paasta_tools.kubernetes_tools import is_node_ready -from paasta_tools.kubernetes_tools import KubeClient -from paasta_tools.kubernetes_tools import list_all_deployments -from paasta_tools.kubernetes_tools import paasta_prefixed -from paasta_tools.kubernetes_tools import PodStatus -from paasta_tools.utils import PaastaColors -from paasta_tools.utils import print_with_indent - - -DEFAULT_KUBERNETES_CPU_REQUEST = "100m" -DEFAULT_KUBERNETES_MEMORY_REQUEST = "200M" -DEFAULT_KUBERNETES_DISK_REQUEST = "0" - - -class ResourceInfo(namedtuple("ResourceInfo", ["cpus", "mem", "disk", "gpus"])): - def __new__(cls, cpus, mem, disk, gpus=0): - return super().__new__(cls, cpus, mem, disk, gpus) - - -class HealthCheckResult(NamedTuple): - message: str - healthy: bool - - -class ResourceUtilization(NamedTuple): - metric: str - total: int - free: int - - -def get_kube_cpu_status( - nodes: Sequence[V1Node], -) -> 
Tuple[float, float, float]: - """Takes in the list of Kubernetes nodes and analyzes them, returning the status. - - :param nodes: list of Kubernetes nodes. - :returns: Tuple of total, used, and available CPUs. - """ - - total = 0.0 - available = 0.0 - for node in nodes: - available += suffixed_number_value(node.status.allocatable["cpu"]) - total += suffixed_number_value(node.status.capacity["cpu"]) - - used = total - available - return total, used, available - - -def get_kube_memory_status( - nodes: Sequence[V1Node], -) -> Tuple[float, float, float]: - """Takes in the list of Kubernetes nodes and analyzes them, returning the status. - - :param nodes: list of Kubernetes nodes. - :returns: Tuple of total, used, and available memory in Mi. - """ - total = 0.0 - available = 0.0 - for node in nodes: - available += suffixed_number_value(node.status.allocatable["memory"]) - total += suffixed_number_value(node.status.capacity["memory"]) - - total //= 1024 * 1024 - available //= 1024 * 1024 - used = total - available - return total, used, available - - -def get_kube_disk_status( - nodes: Sequence[V1Node], -) -> Tuple[float, float, float]: - """Takes in the list of Kubernetes nodes and analyzes them, returning the status. - - :param nodes: list of Kubernetes nodes. - :returns: Tuple of total, used, and available disk space in Mi. - """ - - total = 0.0 - available = 0.0 - for node in nodes: - available += suffixed_number_value(node.status.allocatable["ephemeral-storage"]) - total += suffixed_number_value(node.status.capacity["ephemeral-storage"]) - - total //= 1024 * 1024 - available //= 1024 * 1024 - used = total - available - return total, used, available - - -def get_kube_gpu_status( - nodes: Sequence[V1Node], -) -> Tuple[float, float, float]: - """Takes in the list of Kubernetes nodes and analyzes them, returning the status. - - :param nodes: list of Kubernetes nodes. - :returns: Tuple of total, used, and available GPUs. 
- """ - - total = 0.0 - available = 0.0 - for node in nodes: - available += suffixed_number_value( - node.status.allocatable.get("nvidia.com/gpu", "0") - ) - total += suffixed_number_value(node.status.capacity.get("nvidia.com/gpu", "0")) - - used = total - available - return total, used, available - - -def filter_kube_resources(dictionary: Mapping[str, str]) -> Mapping[str, str]: - valid_keys = ["cpu", "memory", "ephemeral-storage", "nvidia.com/gpu"] - return {key: value for (key, value) in dictionary.items() if key in valid_keys} - - -class ResourceParser: - @staticmethod - def cpus(resources): - resources = resources or {} - return suffixed_number_value( - resources.get("cpu", DEFAULT_KUBERNETES_CPU_REQUEST) - ) - - @staticmethod - def mem(resources): - resources = resources or {} - return suffixed_number_value( - resources.get("memory", DEFAULT_KUBERNETES_MEMORY_REQUEST) - ) - - @staticmethod - def disk(resources): - resources = resources or {} - return suffixed_number_value( - resources.get("ephemeral-storage", DEFAULT_KUBERNETES_DISK_REQUEST) - ) - - -def allocated_node_resources(pods: Sequence[V1Pod]) -> Mapping[str, float]: - cpus = mem = disk = 0 - for pod in pods: - cpus += sum( - ResourceParser.cpus(c.resources.requests) for c in pod.spec.containers - ) - mem += sum( - ResourceParser.mem(c.resources.requests) for c in pod.spec.containers - ) - disk += sum( - ResourceParser.disk(c.resources.requests) for c in pod.spec.containers - ) - return {"cpu": cpus, "memory": mem, "ephemeral-storage": disk} - - -def healthcheck_result_for_resource_utilization( - resource_utilization: ResourceUtilization, threshold: int -) -> HealthCheckResult: - """Given a resource data dict, assert that cpu - data is ok. 
- - :param resource_utilization: the resource_utilization tuple to check - :returns: a HealthCheckResult - """ - try: - utilization = percent_used( - resource_utilization.total, - resource_utilization.total - resource_utilization.free, - ) - except ZeroDivisionError: - utilization = 0 - message = "{}: {:.2f}/{:.2f}({:.2f}%) used. Threshold ({:.2f}%)".format( - resource_utilization.metric, - float(resource_utilization.total - resource_utilization.free), - resource_utilization.total, - utilization, - threshold, - ) - healthy = utilization <= threshold - return HealthCheckResult(message=message, healthy=healthy) - - -def quorum_ok(masters: int, quorum: int) -> bool: - return masters >= quorum - - -def check_threshold(percent_used: float, threshold: int) -> bool: - return (100 - percent_used) > threshold - - -def percent_used(total: float, used: float) -> float: - return round(used / float(total) * 100.0, 2) - - -def assert_cpu_health( - cpu_status: Tuple[float, float, float], threshold: int = 10 -) -> HealthCheckResult: - total, used, available = cpu_status - try: - perc_used = percent_used(total, used) - except ZeroDivisionError: - return HealthCheckResult( - message="Error reading total available cpu!", healthy=False - ) - - if check_threshold(perc_used, threshold): - return HealthCheckResult( - message="CPUs: %.2f / %d in use (%s)" - % (used, total, PaastaColors.green("%.2f%%" % perc_used)), - healthy=True, - ) - else: - return HealthCheckResult( - message="CRITICAL: Less than %d%% CPUs available. 
(Currently using %.2f%% of %d)" - % (threshold, perc_used, total), - healthy=False, - ) - - -def assert_memory_health( - memory_status: Tuple[float, float, float], threshold: int = 10 -) -> HealthCheckResult: - total: float - used: float - total, used, _ = memory_status - - total /= 1024 - used /= 1024 - - try: - perc_used = percent_used(total, used) - except ZeroDivisionError: - return HealthCheckResult( - message="Error reading total available memory!", healthy=False - ) - - if check_threshold(perc_used, threshold): - return HealthCheckResult( - message="Memory: %0.2f / %0.2fGB in use (%s)" - % (used, total, PaastaColors.green("%.2f%%" % perc_used)), - healthy=True, - ) - else: - return HealthCheckResult( - message="CRITICAL: Less than %d%% memory available. (Currently using %.2f%% of %.2fGB)" - % (threshold, perc_used, total), - healthy=False, - ) - - -def assert_disk_health( - disk_status: Tuple[float, float, float], threshold: int = 10 -) -> HealthCheckResult: - total: float - used: float - total, used, _ = disk_status - - total /= 1024 - used /= 1024 - - try: - perc_used = percent_used(total, used) - except ZeroDivisionError: - return HealthCheckResult( - message="Error reading total available disk!", healthy=False - ) - - if check_threshold(perc_used, threshold): - return HealthCheckResult( - message="Disk: %0.2f / %0.2fGB in use (%s)" - % (used, total, PaastaColors.green("%.2f%%" % perc_used)), - healthy=True, - ) - else: - return HealthCheckResult( - message="CRITICAL: Less than %d%% disk available. 
(Currently using %.2f%%)" - % (threshold, perc_used), - healthy=False, - ) - - -def assert_gpu_health( - gpu_status: Tuple[float, float, float], threshold: int = 0 -) -> HealthCheckResult: - total, used, available = gpu_status - - if math.isclose(total, 0): - # assume that no gpus is healthy since most machines don't have them - return HealthCheckResult(message="No GPUs found!", healthy=True) - else: - perc_used = percent_used(total, used) - - if check_threshold(perc_used, threshold): - # only whole gpus can be used - return HealthCheckResult( - message="GPUs: %d / %d in use (%s)" - % (used, total, PaastaColors.green("%.2f%%" % perc_used)), - healthy=True, - ) - else: - return HealthCheckResult( - message="CRITICAL: Less than %d%% GPUs available. (Currently using %.2f%% of %d)" - % (threshold, perc_used, total), - healthy=False, - ) - - -def assert_kube_pods_running( - kube_client: KubeClient, namespace: str -) -> HealthCheckResult: - statuses = [ - get_pod_status(pod) for pod in get_all_pods_cached(kube_client, namespace) - ] - running = statuses.count(PodStatus.RUNNING) - pending = statuses.count(PodStatus.PENDING) - failed = statuses.count(PodStatus.FAILED) - healthy = running > 0 - return HealthCheckResult( - message=f"Pods: running: {running} pending: {pending} failed: {failed}", - healthy=healthy, - ) - - -def get_kube_nodes_health_status( - nodes: Sequence[V1Node], -) -> Tuple[int, int]: - statuses = [is_node_ready(node) for node in nodes] - return statuses.count(True), statuses.count(False) - - -def assert_nodes_health( - nodes_health_status: Tuple[int, int], -) -> HealthCheckResult: - active, inactive = nodes_health_status - healthy = active > 0 - return HealthCheckResult( - message="Nodes: active: %d inactive: %d" % (active, inactive), healthy=healthy - ) - - -_KeyFuncRetT = Sequence[Tuple[str, str]] - - -_NodeGroupingFunctionT = Callable[[V1Node], _KeyFuncRetT] - -_NodeFilterFunctionT = Callable[[V1Node], bool] - -_NodeSortFunctionT = 
Callable[[Sequence[V1Node]], Sequence[V1Node]] - - -def key_func_for_attribute_multi_kube( - attributes: Sequence[str], -) -> Callable[[V1Node], _KeyFuncRetT]: - """Return a closure that given a node, will return the value of a list of - attributes, compiled into a hashable tuple - - :param attributes: the attributes to inspect in the slave - :returns: a closure, which takes a node and returns the value of those attributes - """ - - def get_attribute(node, attribute): - return node.metadata.labels.get(paasta_prefixed(attribute), "unknown") - - def key_func(node): - return tuple((a, get_attribute(node, a)) for a in attributes) - - return key_func - - -def group_slaves_by_key_func( - key_func: _NodeGroupingFunctionT, - slaves: Sequence[V1Node], - sort_func: _NodeSortFunctionT = None, -) -> Mapping[_KeyFuncRetT, Sequence[V1Node]]: - """Given a function for grouping slaves, return a - dict where keys are the unique values returned by - the key_func and the values are all those slaves which - have that specific value. 
- - :param key_func: a function which consumes a slave and returns a value - :param slaves: a list of slaves - :returns: a dict of key: [slaves] - """ - sorted_slaves: Sequence[V1Node] - if sort_func is None: - sorted_slaves = sorted(slaves, key=key_func) - else: - sorted_slaves = sort_func(slaves) - - return {k: list(v) for k, v in itertools.groupby(sorted_slaves, key=key_func)} - - -class ResourceUtilizationDict(TypedDict): - free: ResourceInfo - total: ResourceInfo - slave_count: int - - -_IEC_NUMBER_SUFFIXES = { - "k": 1000, - "m": 1000**-1, - "M": 1000**2, - "G": 1000**3, - "T": 1000**4, - "P": 1000**5, - "Ki": 1024, - "Mi": 1024**2, - "Gi": 1024**3, - "Ti": 1024**4, - "Pi": 1024**5, -} - - -def suffixed_number_value(s: str) -> float: - pattern = r"(?P\d+)(?P\w*)" - match = re.match(pattern, s) - number, suff = match.groups() - - if suff in _IEC_NUMBER_SUFFIXES: - return float(number) * _IEC_NUMBER_SUFFIXES[suff] - else: - return float(number) - - -def suffixed_number_dict_values(d: Mapping[Any, str]) -> Mapping[Any, float]: - return {k: suffixed_number_value(v) for k, v in d.items()} - - -def calculate_resource_utilization_for_kube_nodes( - nodes: Sequence[V1Node], - pods_by_node: Mapping[str, Sequence[V1Pod]], -) -> ResourceUtilizationDict: - """Given a list of Kubernetes nodes, calculate the total available - resource available and the resources consumed in that list of nodes. - - :param nodes: a list of Kubernetes nodes to calculate resource usage for - :returns: a dict, containing keys for "free" and "total" resources. Each of these keys - is a ResourceInfo tuple, exposing a number for cpu, disk and mem. 
- """ - resource_total_dict: _Counter[str] = Counter() - resource_free_dict: _Counter[str] = Counter() - for node in nodes: - allocatable_resources = suffixed_number_dict_values( - filter_kube_resources(node.status.allocatable) - ) - resource_total_dict.update(Counter(allocatable_resources)) - allocated_resources = allocated_node_resources(pods_by_node[node.metadata.name]) - resource_free_dict.update( - Counter( - { - "cpu": allocatable_resources["cpu"] - allocated_resources["cpu"], - "ephemeral-storage": allocatable_resources["ephemeral-storage"] - - allocated_resources["ephemeral-storage"], - "memory": allocatable_resources["memory"] - - allocated_resources["memory"], - } - ) - ) - return { - "free": ResourceInfo( - cpus=resource_free_dict["cpu"], - disk=resource_free_dict["ephemeral-storage"] / (1024**2), - mem=resource_free_dict["memory"] / (1024**2), - gpus=resource_free_dict.get("nvidia.com/gpu", 0), - ), - "total": ResourceInfo( - cpus=resource_total_dict["cpu"], - disk=resource_total_dict["ephemeral-storage"] / (1024**2), - mem=resource_total_dict["memory"] / (1024**2), - gpus=resource_total_dict.get("nvidia.com/gpu", 0), - ), - "slave_count": len(nodes), - } - - -def filter_slaves( - slaves: Sequence[V1Node], filters: Sequence[_NodeFilterFunctionT] -) -> Sequence[V1Node]: - """Filter slaves by attributes - - :param slaves: list of slaves to filter - :param filters: list of functions that take a slave and return whether the - slave should be included - :returns: list of slaves that return true for all the filters - """ - if filters is None: - return slaves - return [s for s in slaves if all([f(s) for f in filters])] - - -def get_resource_utilization_by_grouping_kube( - grouping_func: _NodeGroupingFunctionT, - kube_client: KubeClient, - *, - namespace: str, - filters: Sequence[_NodeFilterFunctionT] = [], - sort_func: _NodeSortFunctionT = None, -) -> Mapping[_KeyFuncRetT, ResourceUtilizationDict]: - """Given a function used to group nodes, calculate resource 
utilization - for each value of a given attribute. - - :grouping_func: a function that given a node, will return the value of an - attribute to group by. - :param kube_client: the Kubernetes client - :param filters: filters to apply to the nodes in the calculation, with - filtering preformed by filter_slaves - :param sort_func: a function that given a list of nodes, will return the - sorted list of nodes. - :returns: a dict of {attribute_value: resource_usage}, where resource usage - is the dict returned by ``calculate_resource_utilization_for_kube_nodes`` for - nodes grouped by attribute value. - """ - nodes = get_all_nodes_cached(kube_client) - nodes = filter_slaves(nodes, filters) - if len(nodes) == 0: - raise ValueError("There are no nodes registered in the Kubernetes.") - - node_groupings = group_slaves_by_key_func(grouping_func, nodes, sort_func) - - pods = get_all_pods_cached(kube_client, namespace) - - pods_by_node = {} - for node in nodes: - pods_by_node[node.metadata.name] = [ - pod for pod in pods if pod.spec.node_name == node.metadata.name - ] - return { - attribute_value: calculate_resource_utilization_for_kube_nodes( - nodes, pods_by_node - ) - for attribute_value, nodes in node_groupings.items() - } - - -def resource_utillizations_from_resource_info( - total: ResourceInfo, free: ResourceInfo -) -> Sequence[ResourceUtilization]: - """ - Given two ResourceInfo tuples, one for total and one for free, - create a ResourceUtilization tuple for each metric in the ResourceInfo. - :param total: - :param free: - :returns: ResourceInfo for a metric - """ - return [ - ResourceUtilization(metric=field, total=total[index], free=free[index]) - for index, field in enumerate(ResourceInfo._fields) - ] - - -def get_kube_resource_utilization_health( - kube_client: KubeClient, -) -> Sequence[HealthCheckResult]: - """Perform healthchecks against Kubernetes. 
- :param kube_client: the KUbernetes client - :returns: a list of HealthCheckResult tuples - """ - - nodes = get_all_nodes_cached(kube_client) - - return [ - assert_cpu_health(get_kube_cpu_status(nodes)), - assert_memory_health(get_kube_memory_status(nodes)), - assert_disk_health(get_kube_disk_status(nodes)), - assert_gpu_health(get_kube_gpu_status(nodes)), - assert_nodes_health(get_kube_nodes_health_status(nodes)), - ] - - -def run_healthchecks_with_param( - param: Any, - healthcheck_functions: Sequence[Callable[..., HealthCheckResult]], - format_options: Mapping[str, Any] = {}, -) -> Sequence[HealthCheckResult]: - return [ - healthcheck(param, **format_options) for healthcheck in healthcheck_functions - ] - - -def assert_kube_deployments( - kube_client: KubeClient, namespace: str -) -> HealthCheckResult: - num_deployments = len(list_all_deployments(kube_client, namespace)) - return HealthCheckResult( - message=f"Kubernetes deployments: {num_deployments:>3}", healthy=True - ) - - -def get_kube_status( - kube_client: KubeClient, namespace: str -) -> Sequence[HealthCheckResult]: - """Gather information about Kubernetes. - :param kube_client: the KUbernetes client - :return: string containing the status - """ - return run_healthchecks_with_param( - [kube_client, namespace], [assert_kube_deployments, assert_kube_pods_running] - ) - - -def critical_events_in_outputs(healthcheck_outputs): - """Given a list of HealthCheckResults return those which are unhealthy.""" - return [ - healthcheck - for healthcheck in healthcheck_outputs - if healthcheck.healthy is False - ] - - -def generate_summary_for_check(name, ok): - """Given a check name and a boolean indicating if the service is OK, return - a formatted message. 
- """ - status = PaastaColors.green("OK") if ok is True else PaastaColors.red("CRITICAL") - summary = f"{name} Status: {status}" - return summary - - -def status_for_results(healthcheck_results): - """Given a list of HealthCheckResult tuples, return the ok status - for each one. - :param healthcheck_results: a list of HealthCheckResult tuples - :returns: a list of booleans. - """ - return [result.healthy for result in healthcheck_results] - - -def print_results_for_healthchecks(summary, ok, results, verbose, indent=2): - print(summary) - if verbose >= 1: - for health_check_result in results: - if health_check_result.healthy: - print_with_indent(health_check_result.message, indent) - else: - print_with_indent(PaastaColors.red(health_check_result.message), indent) - elif not ok: - unhealthy_results = critical_events_in_outputs(results) - for health_check_result in unhealthy_results: - print_with_indent(PaastaColors.red(health_check_result.message), indent) - - -def healthcheck_result_resource_utilization_pair_for_resource_utilization( - utilization, threshold -): - """Given a ResourceUtilization, produce a tuple of (HealthCheckResult, ResourceUtilization), - where that HealthCheckResult describes the 'health' of a given utilization. - :param utilization: a ResourceUtilization tuple - :param threshold: a threshold which decides the health of the given ResourceUtilization - :returns: a tuple of (HealthCheckResult, ResourceUtilization) - """ - return ( - healthcheck_result_for_resource_utilization(utilization, threshold), - utilization, - ) - - -def format_table_column_for_healthcheck_resource_utilization_pair( - healthcheck_utilization_pair, -): - """Given a tuple of (HealthCheckResult, ResourceUtilization), return a - string representation of the ResourceUtilization such that it is formatted - according to the value of HealthCheckResult.healthy. 
- - :param healthcheck_utilization_pair: a tuple of (HealthCheckResult, ResourceUtilization) - :returns: a string representing the ResourceUtilization. - """ - color_func = ( - PaastaColors.green - if healthcheck_utilization_pair[0].healthy - else PaastaColors.red - ) - utilization = ( - healthcheck_utilization_pair[1].total - healthcheck_utilization_pair[1].free - ) - if int(healthcheck_utilization_pair[1].total) == 0: - utilization_perc = 100 - else: - utilization_perc = ( - utilization / float(healthcheck_utilization_pair[1].total) * 100 - ) - if healthcheck_utilization_pair[1].metric not in ["cpus", "gpus"]: - return color_func( - "{}/{} ({:.2f}%)".format( - naturalsize(utilization * 1024 * 1024, gnu=True), - naturalsize( - healthcheck_utilization_pair[1].total * 1024 * 1024, gnu=True - ), - utilization_perc, - ) - ) - else: - return color_func( - "{:.2f}/{:.0f} ({:.2f}%)".format( - utilization, healthcheck_utilization_pair[1].total, utilization_perc - ) - ) - - -def format_row_for_resource_utilization_healthchecks(healthcheck_utilization_pairs): - """Given a list of (HealthCheckResult, ResourceUtilization) tuples, return a list with each of those - tuples represented by a formatted string. - - :param healthcheck_utilization_pairs: a list of (HealthCheckResult, ResourceUtilization) tuples. - :returns: a list containing a string representation of each (HealthCheckResult, ResourceUtilization) tuple. - """ - return [ - format_table_column_for_healthcheck_resource_utilization_pair(pair) - for pair in healthcheck_utilization_pairs - ] - - -def get_table_rows_for_resource_info_dict( - attribute_values, healthcheck_utilization_pairs -): - """A wrapper method to join together - - :param attribute: The attribute value and formatted columns to be shown in - a single row. :param attribute_value: The value of the attribute - associated with the row. This becomes index 0 in the array returned. 
- :param healthcheck_utilization_pairs: a list of 2-tuples, where each tuple has the elements - (HealthCheckResult, ResourceUtilization) - :returns: a list of strings, representing a row in a table to be formatted. - """ - return attribute_values + format_row_for_resource_utilization_healthchecks( - healthcheck_utilization_pairs - ) diff --git a/paasta_tools/utils.py b/paasta_tools/utils.py index 9dc25d3a2b..1ff510e897 100644 --- a/paasta_tools/utils.py +++ b/paasta_tools/utils.py @@ -4350,3 +4350,29 @@ def write_yaml_configuration_file( default_flow_style=False, allow_unicode=False, ) + + +_IEC_NUMBER_SUFFIXES = { + "k": 1000, + "m": 1000**-1, + "M": 1000**2, + "G": 1000**3, + "T": 1000**4, + "P": 1000**5, + "Ki": 1024, + "Mi": 1024**2, + "Gi": 1024**3, + "Ti": 1024**4, + "Pi": 1024**5, +} + + +def suffixed_number_value(s: str) -> float: + pattern = r"(?P<number>\d+)(?P<suff>\w*)" + match = re.match(pattern, s) + number, suff = match.groups() + + if suff in _IEC_NUMBER_SUFFIXES: + return float(number) * _IEC_NUMBER_SUFFIXES[suff] + else: + return float(number) diff --git a/tests/metrics/test_metastatus_lib.py b/tests/metrics/test_metastatus_lib.py deleted file mode 100644 index 48f716238e..0000000000 --- a/tests/metrics/test_metastatus_lib.py +++ /dev/null @@ -1,460 +0,0 @@ -#!/usr/bin/env python -# Copyright 2015-2016 Yelp Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
-import re - -import mock -from kubernetes.client import V1Container -from kubernetes.client import V1Node -from kubernetes.client import V1NodeStatus -from kubernetes.client import V1ObjectMeta -from kubernetes.client import V1Pod -from kubernetes.client import V1PodSpec -from kubernetes.client import V1PodStatus -from kubernetes.client import V1ResourceRequirements -from mock import Mock -from mock import patch - -from paasta_tools.metrics import metastatus_lib -from paasta_tools.utils import PaastaColors - - -def test_ok_check_threshold(): - assert metastatus_lib.check_threshold(10, 30) - - -def test_fail_check_threshold(): - assert not metastatus_lib.check_threshold(80, 30) - - -def test_get_kube_cpu_status(): - fake_nodes = [ - V1Node(status=V1NodeStatus(allocatable={"cpu": "1"}, capacity={"cpu": "3"})) - ] - total, used, available = metastatus_lib.get_kube_cpu_status(fake_nodes) - assert total == 3 - assert used == 2 - assert available == 1 - - -def test_ok_cpu_health(): - ok_status = (10, 1, 9) - ok_output, ok_health = metastatus_lib.assert_cpu_health(ok_status) - assert ok_health - assert "CPUs: 1.00 / 10 in use (%s)" % PaastaColors.green("10.00%") in ok_output - - -def test_bad_cpu_health(): - failure_status = (10, 9, 1) - failure_output, failure_health = metastatus_lib.assert_cpu_health(failure_status) - assert not failure_health - assert ( - "CRITICAL: Less than 10% CPUs available. (Currently using 90.00% of 10)" - in failure_output - ) - - -def test_assert_memory_health(): - ok_status = (1024, 512, 512) - ok_output, ok_health = metastatus_lib.assert_memory_health(ok_status) - assert ok_health - assert ( - "Memory: 0.50 / 1.00GB in use (%s)" % PaastaColors.green("50.00%") in ok_output - ) - - -def test_failing_memory_health(): - failure_status = (1024, 1000, 24) - failure_output, failure_health = metastatus_lib.assert_memory_health(failure_status) - assert not failure_health - assert ( - "CRITICAL: Less than 10% memory available. 
(Currently using 97.66% of 1.00GB)" - in failure_output - ) - - -def test_assert_disk_health(): - ok_status = (1024, 512, 512) - ok_output, ok_health = metastatus_lib.assert_disk_health(ok_status) - assert ok_health - assert "Disk: 0.50 / 1.00GB in use (%s)" % PaastaColors.green("50.00%") in ok_output - - -def test_failing_disk_health(): - failure_status = (1024, 1000, 24) - failure_output, failure_health = metastatus_lib.assert_disk_health(failure_status) - assert not failure_health - assert ( - "CRITICAL: Less than 10% disk available. (Currently using 97.66%)" - in failure_output - ) - - -def test_assert_gpu_health(): - ok_status = (3, 1, 2) - ok_output, ok_health = metastatus_lib.assert_gpu_health(ok_status) - assert ok_health - assert "GPUs: 1 / 3 in use (%s)" % PaastaColors.green("33.33%") in ok_output - - -def test_assert_no_gpu_health(): - zero_status = (0, 0, 0) - zero_output, zero_health = metastatus_lib.assert_gpu_health(zero_status) - assert zero_health - assert "No GPUs found!" in zero_output - - -def test_assert_bad_gpu_health(): - bad_status = (4, 3, 1) - bad_output, bad_health = metastatus_lib.assert_gpu_health(bad_status, threshold=50) - assert not bad_health - assert ( - "CRITICAL: Less than 50% GPUs available. 
(Currently using 75.00% of 4)" - in bad_output - ) - - -def test_assert_kube_deployments(): - with mock.patch( - "paasta_tools.metrics.metastatus_lib.list_all_deployments", autospec=True - ) as mock_list_all_deployments: - client = Mock() - mock_list_all_deployments.return_value = ["KubeDeployment:1"] - output, ok = metastatus_lib.assert_kube_deployments(client, namespace="paasta") - assert re.match("Kubernetes deployments: 1", output) - assert ok - - -def test_assert_kube_pods_running(): - with mock.patch( - "paasta_tools.metrics.metastatus_lib.get_all_pods_cached", autospec=True - ) as mock_get_all_pods: - client = Mock() - mock_get_all_pods.return_value = [ - V1Pod(status=V1PodStatus(phase="Running")), - V1Pod(status=V1PodStatus(phase="Pending")), - V1Pod(status=V1PodStatus(phase="Pending")), - V1Pod(status=V1PodStatus(phase="Failed")), - V1Pod(status=V1PodStatus(phase="Failed")), - V1Pod(status=V1PodStatus(phase="Failed")), - ] - output, ok = metastatus_lib.assert_kube_pods_running(client, namespace="paasta") - assert re.match("Pods: running: 1 pending: 2 failed: 3", output) - assert ok - - -def test_assert_nodes_health(): - nodes_health_status = (10, 10) - output, ok = metastatus_lib.assert_nodes_health(nodes_health_status) - assert "Nodes: active: 10 inactive: 10" in output - assert ok - - -def test_status_for_results(): - assert metastatus_lib.status_for_results( - [ - metastatus_lib.HealthCheckResult(message="message", healthy=True), - metastatus_lib.HealthCheckResult(message="message", healthy=False), - ] - ) == [True, False] - - -def test_generate_summary_for_results_ok(): - assert metastatus_lib.generate_summary_for_check( - "Myservice", True - ) == "Myservice Status: %s" % PaastaColors.green("OK") - - -def test_generate_summary_for_results_critical(): - assert metastatus_lib.generate_summary_for_check( - "Myservice", False - ) == "Myservice Status: %s" % PaastaColors.red("CRITICAL") - - -def test_critical_events_in_outputs(): - assert 
metastatus_lib.critical_events_in_outputs( - [ - metastatus_lib.HealthCheckResult("myservice", True), - metastatus_lib.HealthCheckResult("myservice_false", False), - ] - ) == [("myservice_false", False)] - - -def test_filter_kube_resources(): - test_resource_dictionary = { - "cpu": 0, - "memory": 1, - "MEMORY": 2, - "garbage_data": 3, - "ephemeral-storage": 4, - "nvidia.com/gpu": 5, - } - expected = {"cpu": 0, "memory": 1, "ephemeral-storage": 4, "nvidia.com/gpu": 5} - assert metastatus_lib.filter_kube_resources(test_resource_dictionary) == expected - - -def test_group_slaves_by_key_func(): - slaves = [ - { - "id": "somenametest-slave", - "hostname": "test.somewhere.www", - "resources": {"cpus": 75, "disk": 250, "mem": 100}, - "attributes": {"habitat": "somenametest-habitat"}, - }, - { - "id": "somenametest-slave2", - "hostname": "test2.somewhere.www", - "resources": {"cpus": 500, "disk": 200, "mem": 750}, - "attributes": {"habitat": "somenametest-habitat-2"}, - }, - ] - actual = metastatus_lib.group_slaves_by_key_func( - lambda x: x["attributes"]["habitat"], slaves - ) - assert len(actual.items()) == 2 - for k, v in actual.items(): - print(k, v) - assert len(list(v)) == 1 - - -def test_calculate_resource_utilization_for_kube_nodes(): - fake_nodes = [ - V1Node( - metadata=V1ObjectMeta(name="fake_node1"), - status=V1NodeStatus( - allocatable={ - "cpu": "500", - "ephemeral-storage": "200Mi", - "memory": "750Mi", - }, - ), - ) - ] - fake_pods_by_node = { - "fake_node1": [ - V1Pod( - metadata=V1ObjectMeta(name="pod1"), - status=V1PodStatus(phase="Running"), - spec=V1PodSpec( - containers=[ - V1Container( - name="container1", - resources=V1ResourceRequirements( - requests={ - "cpu": "20", - "ephemeral-storage": "20Mi", - "memory": "20Mi", - } - ), - ) - ] - ), - ) - ] - } - free = metastatus_lib.calculate_resource_utilization_for_kube_nodes( - nodes=fake_nodes, pods_by_node=fake_pods_by_node - )["free"] - - assert free.cpus == 480 - assert free.mem == 730 - assert 
free.disk == 180 - - -def test_healthcheck_result_for_resource_utilization_ok(): - expected_message = "cpus: 5.00/10.00(50.00%) used. Threshold (90.00%)" - expected = metastatus_lib.HealthCheckResult(message=expected_message, healthy=True) - resource_utilization = metastatus_lib.ResourceUtilization( - metric="cpus", total=10, free=5 - ) - assert ( - metastatus_lib.healthcheck_result_for_resource_utilization( - resource_utilization=resource_utilization, threshold=90 - ) - == expected - ) - - -def test_healthcheck_result_for_resource_utilization_unhealthy(): - expected_message = "cpus: 5.00/10.00(50.00%) used. Threshold (10.00%)" - expected = metastatus_lib.HealthCheckResult(message=expected_message, healthy=False) - resource_utilization = metastatus_lib.ResourceUtilization( - metric="cpus", total=10, free=5 - ) - assert ( - metastatus_lib.healthcheck_result_for_resource_utilization( - resource_utilization=resource_utilization, threshold=10 - ) - == expected - ) - - -def test_healthcheck_result_for_resource_utilization_zero(): - expected_message = "cpus: 0.00/0.00(0.00%) used. 
Threshold (10.00%)" - expected = metastatus_lib.HealthCheckResult(message=expected_message, healthy=True) - resource_utilization = metastatus_lib.ResourceUtilization( - metric="cpus", total=0, free=0 - ) - assert ( - metastatus_lib.healthcheck_result_for_resource_utilization( - resource_utilization=resource_utilization, threshold=10 - ) - == expected - ) - - -def test_format_table_column_for_healthcheck_resource_utilization_pair_healthy_human_non_cpu(): - fake_healthcheckresult = Mock() - fake_healthcheckresult.healthy = True - fake_healthcheckresult.metric = "mem" - fake_resource_utilization = Mock() - fake_resource_utilization.free = 10 - fake_resource_utilization.total = 20 - fake_resource_utilization.metric = "mem" - expected = PaastaColors.green("10.0M/20.0M (50.00%)") - assert ( - metastatus_lib.format_table_column_for_healthcheck_resource_utilization_pair( - (fake_healthcheckresult, fake_resource_utilization) - ) - == expected - ) - - -def test_format_table_column_for_healthcheck_resource_utilization_pair_healthy_human_cpu(): - fake_healthcheckresult = Mock() - fake_healthcheckresult.healthy = True - fake_healthcheckresult.metric = "mem" - fake_resource_utilization = Mock() - fake_resource_utilization.free = 10.114 - fake_resource_utilization.total = 20 - fake_resource_utilization.metric = "cpus" - expected = PaastaColors.green("9.89/20 (49.43%)") - assert ( - metastatus_lib.format_table_column_for_healthcheck_resource_utilization_pair( - (fake_healthcheckresult, fake_resource_utilization) - ) - == expected - ) - - -def test_format_table_column_for_healthcheck_resource_utilization_pair_unhealthy_human(): - fake_healthcheckresult = Mock() - fake_healthcheckresult.healthy = False - fake_healthcheckresult.metric = "mem" - fake_resource_utilization = Mock() - fake_resource_utilization.free = 10 - fake_resource_utilization.total = 20 - expected = PaastaColors.red("10.0M/20.0M (50.00%)") - assert ( - 
metastatus_lib.format_table_column_for_healthcheck_resource_utilization_pair( - (fake_healthcheckresult, fake_resource_utilization) - ) - == expected - ) - - -def test_format_table_column_for_healthcheck_resource_utilization_pair_zero_human(): - fake_healthcheckresult = Mock() - fake_healthcheckresult.healthy = False - fake_healthcheckresult.metric = "mem" - fake_resource_utilization = Mock() - fake_resource_utilization.free = 0 - fake_resource_utilization.total = 0 - expected = PaastaColors.red("0B/0B (100.00%)") - assert ( - metastatus_lib.format_table_column_for_healthcheck_resource_utilization_pair( - (fake_healthcheckresult, fake_resource_utilization) - ) - == expected - ) - - -@patch( - "paasta_tools.metrics.metastatus_lib.format_table_column_for_healthcheck_resource_utilization_pair", - autospec=True, -) -def test_format_row_for_resource_utilization_checks(mock_format_row): - fake_pairs = [(Mock(), Mock()), (Mock(), Mock()), (Mock(), Mock())] - assert metastatus_lib.format_row_for_resource_utilization_healthchecks(fake_pairs) - assert mock_format_row.call_count == len(fake_pairs) - - -@patch( - "paasta_tools.metrics.metastatus_lib.format_row_for_resource_utilization_healthchecks", - autospec=True, -) -def test_get_table_rows_for_resource_usage_dict(mock_format_row): - fake_pairs = [(Mock(), Mock()), (Mock(), Mock()), (Mock(), Mock())] - mock_format_row.return_value = ["10/10", "10/10", "10/10"] - actual = metastatus_lib.get_table_rows_for_resource_info_dict( - ["myhabitat"], fake_pairs - ) - assert actual == ["myhabitat", "10/10", "10/10", "10/10"] - - -def test_get_kube_memory_status(): - fake_nodes = [ - V1Node( - status=V1NodeStatus( - allocatable={"memory": "1Gi"}, capacity={"memory": "4Gi"} - ) - ) - ] - total, used, available = metastatus_lib.get_kube_memory_status(fake_nodes) - assert total == 4 * 1024 - assert used == 3 * 1024 - assert available == 1 * 1024 - - -def test_get_kube_disk_status(): - fake_nodes = [ - V1Node( - status=V1NodeStatus( - 
allocatable={"ephemeral-storage": "1Ti"}, - capacity={"ephemeral-storage": "4Ti"}, - ) - ) - ] - total, used, available = metastatus_lib.get_kube_disk_status(fake_nodes) - assert total == 4 * 1024**2 - assert used == 3 * 1024**2 - assert available == 1 * 1024**2 - - -def test_get_kube_gpu_status(): - fake_nodes = [ - V1Node( - status=V1NodeStatus( - allocatable={"nvidia.com/gpu": "1"}, capacity={"nvidia.com/gpu": "4"} - ) - ) - ] - total, used, available = metastatus_lib.get_kube_gpu_status(fake_nodes) - assert total == 4 - assert used == 3 - assert available == 1 - - -def test_suffixed_number_value(): - assert metastatus_lib.suffixed_number_value("5k") == 5 * 1000 - assert metastatus_lib.suffixed_number_value("5m") == 5 * 1000**-1 - assert metastatus_lib.suffixed_number_value("5M") == 5 * 1000**2 - assert metastatus_lib.suffixed_number_value("5G") == 5 * 1000**3 - assert metastatus_lib.suffixed_number_value("5T") == 5 * 1000**4 - assert metastatus_lib.suffixed_number_value("5P") == 5 * 1000**5 - assert metastatus_lib.suffixed_number_value("5Ki") == 5 * 1024 - assert metastatus_lib.suffixed_number_value("5Mi") == 5 * 1024**2 - assert metastatus_lib.suffixed_number_value("5Gi") == 5 * 1024**3 - assert metastatus_lib.suffixed_number_value("5Ti") == 5 * 1024**4 - assert metastatus_lib.suffixed_number_value("5Pi") == 5 * 1024**5 diff --git a/tests/test_utils.py b/tests/test_utils.py index b42fe21054..ec6bb1c5b4 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3012,3 +3012,17 @@ def test_validate_pool_error(cluster, pool, system_paasta_config): ) def test_get_git_sha_from_dockerurl(docker_url, long, expected): assert utils.get_git_sha_from_dockerurl(docker_url, long) == expected + + +def test_suffixed_number_value(): + assert utils.suffixed_number_value("5k") == 5 * 1000 + assert utils.suffixed_number_value("5m") == 5 * 1000**-1 + assert utils.suffixed_number_value("5M") == 5 * 1000**2 + assert utils.suffixed_number_value("5G") == 5 * 1000**3 + assert 
utils.suffixed_number_value("5T") == 5 * 1000**4 + assert utils.suffixed_number_value("5P") == 5 * 1000**5 + assert utils.suffixed_number_value("5Ki") == 5 * 1024 + assert utils.suffixed_number_value("5Mi") == 5 * 1024**2 + assert utils.suffixed_number_value("5Gi") == 5 * 1024**3 + assert utils.suffixed_number_value("5Ti") == 5 * 1024**4 + assert utils.suffixed_number_value("5Pi") == 5 * 1024**5 From f1a16e2697c9dd27272495912507b0f06ee9f446 Mon Sep 17 00:00:00 2001 From: Evan Krall Date: Wed, 25 Jun 2025 16:54:09 -0700 Subject: [PATCH 11/15] Clean up unused --scheduler parameter from get_running_task_allocation.py --- .../contrib/get_running_task_allocation.py | 27 ++----------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/paasta_tools/contrib/get_running_task_allocation.py b/paasta_tools/contrib/get_running_task_allocation.py index c87d3ce80b..78c9c324e5 100644 --- a/paasta_tools/contrib/get_running_task_allocation.py +++ b/paasta_tools/contrib/get_running_task_allocation.py @@ -36,7 +36,6 @@ class TaskAllocationInfo(NamedTuple): host_ip: str git_sha: str config_sha: str - mesos_container_id: str # Because Mesos task info does not have docker id namespace: Optional[str] @@ -173,7 +172,6 @@ def get_kubernetes_task_allocation_info( host_ip=info.get("host_ip"), git_sha=info.get("git_sha"), config_sha=info.get("config_sha"), - mesos_container_id=None, namespace=namespace, ) ) @@ -181,26 +179,8 @@ def get_kubernetes_task_allocation_info( return info_list -def get_task_allocation_info( - scheduler: str, - namespace: str, - kube_client: Optional[KubeClient], -) -> Iterable[TaskAllocationInfo]: - if scheduler == "kubernetes": - return get_kubernetes_task_allocation_info(namespace, kube_client) - else: - return [] - - def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="") - parser.add_argument( - "--scheduler", - help="Scheduler to get task info from", - dest="scheduler", - default="kubernetes", - 
choices=["kubernetes"], - ) parser.add_argument( "--additional-namespaces-exclude", help="full names of namespaces to not fetch allocation info for those that don't match --namespace-prefix-exlude", @@ -244,18 +224,15 @@ def main(args: argparse.Namespace) -> None: all_namespaces, args.additional_namespaces_exclude, ): - display_task_allocation_info( - cluster, args.scheduler, matching_namespace, kube_client - ) + display_task_allocation_info(cluster, matching_namespace, kube_client) def display_task_allocation_info( cluster: str, - scheduler: str, namespace: str, kube_client: Optional[KubeClient], ) -> None: - info_list = get_task_allocation_info(scheduler, namespace, kube_client) + info_list = get_kubernetes_task_allocation_info(namespace, kube_client) timestamp = time.time() for info in info_list: info_dict = info._asdict() From 6893455fc0f162608a0986a3ed4296c10459c6b1 Mon Sep 17 00:00:00 2001 From: Evan Krall Date: Wed, 25 Jun 2025 17:03:37 -0700 Subject: [PATCH 12/15] Put back old execute_in_container function, but fix a bug that can happen if something besides paasta creates an exec before paasta does. 
--- paasta_tools/cli/cmds/local_run.py | 37 ++++++++++++++++++------------ 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/paasta_tools/cli/cmds/local_run.py b/paasta_tools/cli/cmds/local_run.py index 26b52fe468..5ee7b7e24a 100755 --- a/paasta_tools/cli/cmds/local_run.py +++ b/paasta_tools/cli/cmds/local_run.py @@ -59,6 +59,7 @@ from paasta_tools.utils import get_username from paasta_tools.utils import InstanceConfig from paasta_tools.utils import is_secrets_for_teams_enabled +from paasta_tools.utils import is_using_unprivileged_containers from paasta_tools.utils import list_clusters from paasta_tools.utils import list_services from paasta_tools.utils import load_system_paasta_config @@ -133,22 +134,28 @@ def perform_tcp_healthcheck(url, timeout): return (False, "%s (timeout %d seconds)" % (os.strerror(result), timeout)) -def execute_in_container(docker_client, container_id, command, timeout): - """Execute a command inside a Docker container +def execute_in_container(docker_client, container_id, cmd, timeout): + container_info = docker_client.inspect_container(container_id) + exec_id = None + if ( + container_info["ExecIDs"] + and len(container_info["ExecIDs"]) > 0 + and not is_using_unprivileged_containers() + ): + for possible_exec_id in container_info["ExecIDs"]: + exec_info = docker_client.exec_inspect(possible_exec_id)["ProcessConfig"] + if exec_info["entrypoint"] == "/bin/sh" and exec_info["arguments"] == [ + "-c", + cmd, + ]: + exec_id = possible_exec_id + break - :param docker_client: Docker client object - :param container_id: Docker container id - :param command: command to execute - :param timeout: timeout in seconds - :returns: tuple of (output, return_code) - """ - try: - exec_result = docker_client.exec_create(container_id, command) - exec_output = docker_client.exec_start(exec_result["Id"]) - exec_inspect = docker_client.exec_inspect(exec_result["Id"]) - return (exec_output.decode("utf-8"), exec_inspect["ExitCode"]) - except Exception 
as e: - return (str(e), 1) + if exec_id is None: + exec_id = docker_client.exec_create(container_id, ["/bin/sh", "-c", cmd])["Id"] + output = docker_client.exec_start(exec_id, stream=False) + return_code = docker_client.exec_inspect(exec_id)["ExitCode"] + return (output, return_code) def perform_cmd_healthcheck(docker_client, container_id, command, timeout): From cae5485bc58c6294e358237a333cd308ef155319 Mon Sep 17 00:00:00 2001 From: Evan Krall Date: Wed, 25 Jun 2025 17:07:38 -0700 Subject: [PATCH 13/15] Remove linting of now-deleted code. --- tox.ini | 1 - 1 file changed, 1 deletion(-) diff --git a/tox.ini b/tox.ini index 52399ba10d..d001b2a006 100644 --- a/tox.ini +++ b/tox.ini @@ -153,7 +153,6 @@ deps = commands = # TODO: upgrade behave if they ever take this reasonable PR pip install git+https://github.com/Yelp/behave@1.2.5-issue_533-fork - pylint -E {toxinidir}/paasta_tools/mesos/ --ignore master.py,task.py behave {posargs} [testenv:mypy] From f0e5b44d1ad55e2bdb78463d647a4157f6a4459b Mon Sep 17 00:00:00 2001 From: Evan Krall Date: Wed, 25 Jun 2025 17:17:11 -0700 Subject: [PATCH 14/15] Move steps from previously-deleted general_itests/steps/paasta_execute_docker_command.py to general_itests/steps/local_run_steps.py to fix itests --- .../paasta_execute_docker_command.feature | 20 ------------------- 1 file changed, 20 deletions(-) delete mode 100644 general_itests/paasta_execute_docker_command.feature diff --git a/general_itests/paasta_execute_docker_command.feature b/general_itests/paasta_execute_docker_command.feature deleted file mode 100644 index 5a7ea25ec0..0000000000 --- a/general_itests/paasta_execute_docker_command.feature +++ /dev/null @@ -1,20 +0,0 @@ -Feature: paasta_execute_docker_command can find and run commands inside a docker container - - Scenario: paasta_execute_docker_command can run a trivial command - Given Docker is available - And a running docker container with task id foo and image ubuntu:xenial - When we paasta_execute_docker_command a 
command with exit code 0 in container with task id foo - Then the exit code is 0 - - Scenario: paasta_execute_docker_command exits when it cannot find the container - Given Docker is available - And a running docker container with task id foo and image ubuntu:xenial - When we paasta_execute_docker_command a command with exit code 0 in container with task id bar - Then the exit code is 1 - - Scenario: paasta_execute_docker_command reuses exec instances - Given Docker is available - And a running docker container with task id foo and image ubuntu:xenial - When we paasta_execute_docker_command a command with exit code 0 in container with task id foo - And we paasta_execute_docker_command a command with exit code 0 in container with task id foo - Then the docker container has at most 1 exec instances From ebaffaa07f2f53e89f3ac829c9ceeb25ac908489 Mon Sep 17 00:00:00 2001 From: Evan Krall Date: Wed, 25 Jun 2025 17:17:35 -0700 Subject: [PATCH 15/15] Move steps from previously-deleted general_itests/steps/paasta_execute_docker_command.py to general_itests/steps/local_run_steps.py to fix itests --- general_itests/steps/local_run_steps.py | 28 +++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/general_itests/steps/local_run_steps.py b/general_itests/steps/local_run_steps.py index 8d08c02fe5..2b5f6d444d 100644 --- a/general_itests/steps/local_run_steps.py +++ b/general_itests/steps/local_run_steps.py @@ -16,9 +16,11 @@ from behave import given from behave import then from behave import when +from docker.errors import APIError from path import Path from paasta_tools.utils import _run +from paasta_tools.utils import get_docker_client @given("a simple service to test") @@ -86,3 +88,29 @@ def local_run_on_tron_action(context): "--build " ) context.return_code, context.output = _run(command=local_run_cmd, timeout=90) + + +@given("Docker is available") +def docker_is_available(context): + docker_client = get_docker_client() + assert docker_client.ping() + 
context.docker_client = docker_client + + +@given("a running docker container with task id {task_id} and image {image_name}") +def create_docker_container(context, task_id, image_name): + container_name = "paasta-itest-execute-in-containers" + image_name = os.getenv("DOCKER_REGISTRY", "docker-dev.yelpcorp.com/") + image_name + try: + context.docker_client.remove_container(container_name, force=True) + except APIError: + pass + context.docker_client.pull(image_name) + container = context.docker_client.create_container( + name=container_name, + image=image_name, + command="/bin/sleep infinity", + environment={"MESOS_TASK_ID": task_id}, + ) + context.docker_client.start(container=container.get("Id")) + context.running_container_id = container.get("Id")