Skip to content

Commit

Permalink
feat(job_manager): log pod errors to warning (reanahub#468)
Browse files: browse the repository at this point in the history
  • Loading branch information
jlemesh committed Sep 6, 2024
1 parent 9f5a367 commit 1f4c34a
Showing 1 changed file with 16 additions and 4 deletions.
20 changes: 16 additions & 4 deletions reana_job_controller/kubernetes_job_manager.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -287,12 +287,21 @@ def _get_containers_logs(cls, job_pod) -> Optional[str]:
)
pod_logs += "{}: :\n {}\n".format(container.name, container_log)
if hasattr(container.state.terminated, "reason"):
if container.state.terminated.reason != "Completed":
message = "Job pod {} was terminated, reason: {}, message: {}".format(
job_pod.metadata.name,
container.state.terminated.reason,
container.state.terminated.message,
)
logging.warn(message)
pod_logs += "\n{}\n".format(container.state.terminated.reason)
elif container.state.waiting:
# No need to fetch logs, as the container has not started yet.
pod_logs += "Container {} failed, error: {}".format(
message = "Container {} failed, error: {}".format(
container.name, container.state.waiting.message
)
logging.warn(message)
pod_logs += message

return pod_logs
except client.rest.ApiException as e:
Expand Down Expand Up @@ -334,7 +343,9 @@ def get_logs(cls, backend_job_id, **kwargs):
if not logs:
logs = ""

message = f"\n{job_pod.status.reason}\nThe job was killed due to exceeding timeout"
message = (
f"{job_pod.status.reason}: The job was killed due to exceeding timeout"
)

try:
specified_timeout = job_pod.spec.active_deadline_seconds
Expand All @@ -345,8 +356,9 @@ def get_logs(cls, backend_job_id, **kwargs):
f"Kubernetes job id: {backend_job_id}. Could not get job timeout from Job spec."
)

logs += message
logging.info(
logs += "\n{message}\n"
logging.warn(message)
logging.warn(
f"Kubernetes job id: {backend_job_id} was killed due to timeout."
)

Expand Down

0 comments on commit 1f4c34a

Please sign in to comment.