From 1f4c34a3b9fd6f6a051e963307a694ec8d8e75a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jelizaveta=20Leme=C5=A1eva?= Date: Wed, 4 Sep 2024 16:37:42 +0200 Subject: [PATCH] feat(job_manager): log pod errors to warning (#468) --- .../kubernetes_job_manager.py | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/reana_job_controller/kubernetes_job_manager.py b/reana_job_controller/kubernetes_job_manager.py index 8f1783bd..09309e29 100644 --- a/reana_job_controller/kubernetes_job_manager.py +++ b/reana_job_controller/kubernetes_job_manager.py @@ -287,12 +287,21 @@ def _get_containers_logs(cls, job_pod) -> Optional[str]: ) pod_logs += "{}: :\n {}\n".format(container.name, container_log) if hasattr(container.state.terminated, "reason"): + if container.state.terminated.reason != "Completed": + message = "Job pod {} was terminated, reason: {}, message: {}".format( + job_pod.metadata.name, + container.state.terminated.reason, + container.state.terminated.message, + ) + logging.warning(message) pod_logs += "\n{}\n".format(container.state.terminated.reason) elif container.state.waiting: # No need to fetch logs, as the container has not started yet. - pod_logs += "Container {} failed, error: {}".format( + message = "Container {} failed, error: {}".format( container.name, container.state.waiting.message ) + logging.warning(message) + pod_logs += message return pod_logs except client.rest.ApiException as e: @@ -334,7 +343,9 @@ def get_logs(cls, backend_job_id, **kwargs): if not logs: logs = "" - message = f"\n{job_pod.status.reason}\nThe job was killed due to exceeding timeout" + message = ( + f"{job_pod.status.reason}: The job was killed due to exceeding timeout" + ) try: specified_timeout = job_pod.spec.active_deadline_seconds @@ -345,8 +356,9 @@ def get_logs(cls, backend_job_id, **kwargs): f"Kubernetes job id: {backend_job_id}. Could not get job timeout from Job spec."
) - logs += message - logging.info( + logs += f"\n{message}\n" + logging.warning(message) + logging.warning( f"Kubernetes job id: {backend_job_id} was killed due to timeout." )