From 21f5ba8833a2e21df17601497a08396c9bae9ab2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Georg=20G=C3=B6ri?= Date: Thu, 9 Jun 2022 14:44:01 +0200 Subject: [PATCH] Fix exitCode check (#190) The exitCode check works differently than described in the API: https://github.com/kubeflow/common/blob/master/pkg/apis/common/v1/types.go\#L163-L168 In addition, the documented exitCodes for Docker will not appear in a Kubernetes pod status (anymore?) --- pkg/util/train/train_util.go | 35 +---------------------------------- 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/pkg/util/train/train_util.go b/pkg/util/train/train_util.go index e412058f..fbb120a8 100644 --- a/pkg/util/train/train_util.go +++ b/pkg/util/train/train_util.go @@ -16,38 +16,5 @@ package train func IsRetryableExitCode(exitCode int32) bool { - if exitCode == 1 || exitCode == 2 || exitCode == 126 || - exitCode == 127 || exitCode == 128 || exitCode == 139 { - // Refers to http://tldp.org/LDP/abs/html/exitcodes.html, we identify the following exit codes - // as permanent errors: - // 1: General errors - // 2: Misuse of shell builtins - // 126: Command invoked cannot execute - // 127: Command not found - // 128: Invalid argument to exit - // 139(128+11): terminated by SIGSEGV(Invalid memory reference) - return false - } - - if exitCode == 130 || exitCode == 137 || exitCode == 143 { - // We think it's retryable error if the container exits due to the following sys signals - // that are usually caused by transient issues(e.g. VM was rescheduled): - // 130(128+2): Container terminated by Control-C - // 137(128+9): Container received a SIGKILL - // 143(128+15): Container received a SIGTERM - // The exit code of container will be 128 + n for fatal error signals. 
- // More info can be found in: - // http://tldp.org/LDP/abs/html/exitcodes.html, - // https://stackoverflow.com/questions/31297616/what-is-the-authoritative-list-of-docker-run-exit-codes - return true - } - - if exitCode == 138 { - // We allow users to specify exit code for the cases that they think should retry. - // We decide to take the exit code of SIGUSR1(138 = 128 + 10) for user defined retryable error. - return true - } - - // We make no guarantee for other exit status. Currently handling them same as permanent errors. - return false + return exitCode >= 128 }