diff --git a/configure.ac b/configure.ac index deac11f89894..b94ca82d45f7 100644 --- a/configure.ac +++ b/configure.ac @@ -651,10 +651,7 @@ AC_CONFIG_FILES( \ etc/flux-hostlist.pc \ etc/flux-taskmap.pc \ etc/flux.service \ - etc/flux-housekeeping@.service \ src/cmd/flux-run-housekeeping \ - etc/flux-prolog@.service \ - etc/flux-epilog@.service \ src/cmd/flux-run-prolog \ src/cmd/flux-run-epilog \ doc/Makefile \ diff --git a/etc/Makefile.am b/etc/Makefile.am index 546a94814228..6addab1b0375 100644 --- a/etc/Makefile.am +++ b/etc/Makefile.am @@ -1,9 +1,6 @@ #if HAVE_SYSTEMD systemdsystemunit_DATA = \ - flux.service \ - flux-housekeeping@.service \ - flux-prolog@.service \ - flux-epilog@.service + flux.service #endif tmpfilesdir = $(prefix)/lib/tmpfiles.d diff --git a/etc/flux-epilog@.service.in b/etc/flux-epilog@.service.in deleted file mode 100644 index 15c229dd84d8..000000000000 --- a/etc/flux-epilog@.service.in +++ /dev/null @@ -1,9 +0,0 @@ -[Unit] -Description=Epilog for Flux job %I -CollectMode=inactive-or-failed - -[Service] -Type=oneshot -EnvironmentFile=-@X_RUNSTATEDIR@/flux-epilog@%I.env -ExecStart=@X_SYSCONFDIR@/flux/system/epilog -ExecStopPost=-rm -f @X_RUNSTATEDIR@/flux-epilog@%I.env diff --git a/etc/flux-housekeeping@.service.in b/etc/flux-housekeeping@.service.in deleted file mode 100644 index cd6012351faf..000000000000 --- a/etc/flux-housekeeping@.service.in +++ /dev/null @@ -1,18 +0,0 @@ -[Unit] -Description=Housekeeping for Flux job %I -CollectMode=inactive-or-failed - -[Service] -Type=oneshot -EnvironmentFile=-@X_RUNSTATEDIR@/flux-housekeeping@%I.env -ExecStart=@X_SYSCONFDIR@/flux/system/housekeeping -ExecStopPost=-rm -f @X_RUNSTATEDIR@/flux-housekeeping@%I.env -ExecStopPost=-sh -c '\ - if test "$SERVICE_RESULT" != "success"; then \ - message="housekeeping@%I ${SERVICE_RESULT:-failure}"; \ - if test "${EXIT_CODE}${EXIT_STATUS}"; then \ - message="$message: $EXIT_CODE $EXIT_STATUS"; \ - fi; \ - flux resource drain $(flux getattr rank) $message; \ - fi \ -' diff --git a/etc/flux-prolog@.service.in b/etc/flux-prolog@.service.in deleted file mode 100644 index 14d6a1a92ca3..000000000000 --- a/etc/flux-prolog@.service.in +++ /dev/null @@ -1,9 +0,0 @@ -[Unit] -Description=Prolog for Flux job %I -CollectMode=inactive-or-failed - -[Service] -Type=oneshot -EnvironmentFile=-@X_RUNSTATEDIR@/flux-prolog@%I.env -ExecStart=@X_SYSCONFDIR@/flux/system/prolog -ExecStopPost=-rm -f @X_RUNSTATEDIR@/flux-prolog@%I.env diff --git a/src/cmd/flux-run-epilog.in b/src/cmd/flux-run-epilog.in index f996b7f0a576..222fa8b804c3 100755 --- a/src/cmd/flux-run-epilog.in +++ b/src/cmd/flux-run-epilog.in @@ -1,21 +1,7 @@ #!/bin/sh -if test $FLUX_JOB_ID; then - FLUX_JOB_ID=$(flux job id --to=f58plain $FLUX_JOB_ID) -fi -unitname=flux-epilog@${FLUX_JOB_ID:-unknown} +# This script exists for transition purposes and will be removed. +# Please reconfigure the IMP [run.epilog] table to use +# path = "@X_SYSCONFDIR@/flux/system/epilog". -terminate() { - systemctl stop $unitname - exit 1 -} - -trap terminate INT TERM - -umask 022 -printenv >@X_RUNSTATEDIR@/${unitname}.env - -# Run systemctl start in background and `wait` for it so that the trap -# will run immediately when signal is received: -systemctl start $unitname --quiet & -wait $! +exec @X_SYSCONFDIR@/flux/system/epilog diff --git a/src/cmd/flux-run-housekeeping.in b/src/cmd/flux-run-housekeeping.in index 6d4bea11cb0a..71f67b97671f 100755 --- a/src/cmd/flux-run-housekeeping.in +++ b/src/cmd/flux-run-housekeeping.in @@ -1,21 +1,7 @@ #!/bin/sh -if test $FLUX_JOB_ID; then - FLUX_JOB_ID=$(flux job id --to=f58plain $FLUX_JOB_ID) -fi -unitname=flux-housekeeping@${FLUX_JOB_ID:-unknown} +# This script exists for transition purposes and will be removed. +# Please reconfigure the IMP [run.housekeeping] table to use +# path = "@X_SYSCONFDIR@/flux/system/housekeeping". -terminate() { - systemctl stop $unitname - exit 1 -} - -trap terminate INT TERM - -umask 022 -printenv >@X_RUNSTATEDIR@/${unitname}.env - -# Run systemctl start in background and `wait` for it so that the trap -# will run immediately when signal is received: -systemctl start $unitname --quiet & -wait $! +exec @X_SYSCONFDIR@/flux/system/housekeeping diff --git a/src/cmd/flux-run-prolog.in b/src/cmd/flux-run-prolog.in index 944001659958..131039c1784e 100755 --- a/src/cmd/flux-run-prolog.in +++ b/src/cmd/flux-run-prolog.in @@ -1,21 +1,7 @@ #!/bin/sh -if test $FLUX_JOB_ID; then - FLUX_JOB_ID=$(flux job id --to=f58plain $FLUX_JOB_ID) -fi -unitname=flux-prolog@${FLUX_JOB_ID:-unknown} +# This script exists for transition purposes and will be removed. +# Please reconfigure the IMP [run.prolog] table to use +# path = "@X_SYSCONFDIR@/flux/system/prolog". -terminate() { - systemctl stop $unitname - exit 1 -} - -trap terminate INT TERM - -umask 022 -printenv >@X_RUNSTATEDIR@/${unitname}.env - -# Run systemctl start in background and `wait` for it so that the trap -# will run immediately when signal is received: -systemctl start $unitname --quiet & -wait $! +exec @X_SYSCONFDIR@/flux/system/prolog diff --git a/src/modules/job-manager/housekeeping.c b/src/modules/job-manager/housekeeping.c index 6ea9573f356a..17923a98f1a8 100644 --- a/src/modules/job-manager/housekeeping.c +++ b/src/modules/job-manager/housekeeping.c @@ -90,6 +90,7 @@ #include "src/common/libhostlist/hostlist.h" #include "src/common/libutil/fsd.h" #include "src/common/libutil/errprintf.h" +#include "src/common/libutil/errno_safe.h" #include "src/common/libjob/idf58.h" #include "src/common/libsubprocess/bulk-exec.h" #include "src/common/libsubprocess/command.h" @@ -107,6 +108,8 @@ extern char **environ; // -1 = never, 0 = immediate, >0 = time in seconds static const double default_release_after = -1; +static const char *default_exec_service = "rexec"; + struct allocation { flux_jobid_t id; struct rlist *rl; // R, diminished each time a subset is released @@ -119,13 +122,15 @@ struct allocation { double t_start; struct bulk_exec *bulk_exec; void *list_handle; + int kill_signal; }; struct housekeeping { struct job_manager *ctx; flux_cmd_t *cmd; // NULL if not configured + char *exec_service; double release_after; - char *imp_path; + int kill_signal; zlistx_t *allocations; flux_msg_handler_t **handlers; }; @@ -181,6 +186,8 @@ static struct allocation *allocation_create (struct housekeeping *hk, return NULL; a->hk = hk; a->id = id; + a->kill_signal = hk->kill_signal; + a->t_start = flux_reactor_now (flux_get_reactor (hk->ctx->h)); if (!(a->rl = rlist_from_json (R, NULL)) || !(a->pending = rlist_ranks (a->rl)) @@ -191,9 +198,9 @@ static struct allocation *allocation_create (struct housekeeping *hk, allocation_timeout, a)) || !(a->bulk_exec = bulk_exec_create (&bulk_ops, - "rexec", + hk->exec_service, id, - "housekeeping", + "flux-housekeeping", a)) || update_cmd_env (hk->cmd, id, userid) < 0 || bulk_exec_push_cmd (a->bulk_exec, a->pending, hk->cmd, 0) < 0) { @@ -698,7 +705,10 @@ static void housekeeping_kill_cb (flux_t *h, while (a) { if (a->id == jobid || jobid == FLUX_JOBID_ANY) { if (a->bulk_exec) { - f = bulk_exec_kill (a->bulk_exec, ids, signum); + int sig = signum; + if (sig == SIGKILL) + sig = a->kill_signal; + f = bulk_exec_kill (a->bulk_exec, ids, sig); if (flux_future_then (f, -1, kill_continuation, hk) < 0) flux_future_destroy (f); } @@ -742,6 +752,43 @@ static flux_cmd_t *create_cmd (json_t *cmdline) return cmd; } +/* Create the housekeeping command template based on parsed configuration. + * If no 'cmdline' was configured, assume "imp run housekeeping". + * In addition, if the IMP will run as the main process in a transient + * systemd unit, set options for correct signal forwarding semantics. + * 'kill_signal' is assigned (only) if a SIGKILL proxy signal is needed. + */ +static flux_cmd_t *create_cmd_template (json_t *cmdline, + const char *exec_service, + const char *imp_path, + int *kill_signal) +{ + flux_cmd_t *cmd = NULL; + json_t *o = NULL; + bool use_imp = false; + + if (!cmdline) { + if (!(o = json_pack ("[sss]", imp_path, "run", "housekeeping"))) + return NULL; + cmdline = o; + use_imp = true; + } + if (!(cmd = create_cmd (cmdline))) + goto error; + if (use_imp && streq (exec_service, "sdexec")) { + if (flux_cmd_setopt (cmd, "SDEXEC_PROP_KillMode", "process") < 0 + || flux_cmd_setopt (cmd, "SDEXEC_PROP_SendSIGKILL", "off") < 0) + goto error; + *kill_signal = SIGUSR1; + } + json_decref (o); + return cmd; +error: + flux_cmd_destroy (cmd); + ERRNO_SAFE_WRAP (json_decref, o); + return NULL; +} + static int housekeeping_parse_config (const flux_conf_t *conf, flux_error_t *error, void *arg) @@ -755,8 +802,10 @@ static int housekeeping_parse_config (const flux_conf_t *conf, double release_after = default_release_after; flux_cmd_t *cmd = NULL; const char *imp_path = NULL; - char *imp_path_cpy = NULL; int use_systemd_unit = 0; + const char *exec_service = default_exec_service; + char *exec_service_cpy = NULL; + int kill_signal = SIGKILL; if (flux_conf_unpack (conf, &e, @@ -785,8 +834,19 @@ static int housekeeping_parse_config (const flux_conf_t *conf, " - ignoring"); } - // let job-exec handle exec errors - (void)flux_conf_unpack (conf, NULL, "{s?{s?s}}", "exec", "imp", &imp_path); + // let job-exec handle exec parse errors + (void)flux_conf_unpack (conf, + NULL, + "{s?{s?s s?s}}", + "exec", + "imp", &imp_path, + "service", &exec_service); + + if (!cmdline && !imp_path) { + return errprintf (error, + "job-manager.housekeeping implies IMP" + " but exec.imp is undefined"); + } if (release_after_fsd) { if (fsd_parse_duration (release_after_fsd, &release_after) < 0) @@ -795,40 +855,28 @@ static int housekeeping_parse_config (const flux_conf_t *conf, " FSD parse error"); } - if (cmdline) { - if (!(cmd = create_cmd (cmdline))) - return errprintf (error, "error creating housekeeping command"); + if (!(exec_service_cpy = strdup (exec_service))) + return errprintf (error, "error duplicating exec service"); + if (!(cmd = create_cmd_template (cmdline, + exec_service, + imp_path, + &kill_signal))) { + ERRNO_SAFE_WRAP (free, exec_service_cpy); + return errprintf (error, "could not create command template"); } - // if no command line was defined, assume "imp exec housekeeping" - else { - if (!imp_path) { - return errprintf (error, - "job-manager.housekeeping implies IMP" - " but exec.imp is undefined"); - } - json_t *o; - if ((o = json_pack ("[sss]", imp_path, "run", "housekeeping"))) - cmd = create_cmd (o); - json_decref (o); - if (!cmd) - return errprintf (error, "error creating housekeeping command"); - if (!(imp_path_cpy = strdup (imp_path))) { - flux_cmd_destroy (cmd); - return errprintf (error, "error duplicating IMP path"); - } - } done: flux_cmd_destroy (hk->cmd); hk->cmd = cmd; - free (hk->imp_path); - hk->imp_path = imp_path_cpy; + free (hk->exec_service); + hk->exec_service = exec_service_cpy; hk->release_after = release_after; + hk->kill_signal = kill_signal; flux_log (hk->ctx->h, LOG_DEBUG, "housekeeping is %sconfigured%s", hk->cmd ? "" : "not ", - hk->imp_path ? " with IMP" : ""); + (imp_path && !cmdline) ? " with IMP" : ""); return 1; // allow dynamic changes } @@ -850,7 +898,7 @@ void housekeeping_ctx_destroy (struct housekeeping *hk) flux_cmd_destroy (hk->cmd); zlistx_destroy (&hk->allocations); flux_msg_handler_delvec (hk->handlers); - free (hk->imp_path); + free (hk->exec_service); free (hk); errno = saved_errno; } @@ -865,10 +913,13 @@ struct housekeeping *housekeeping_ctx_create (struct job_manager *ctx) return NULL; hk->ctx = ctx; hk->release_after = default_release_after; + if (!(hk->exec_service = strdup (default_exec_service))) + goto error; if (!(hk->allocations = zlistx_new ())) { errno = ENOMEM; goto error; } + hk->kill_signal = SIGKILL; zlistx_set_destructor (hk->allocations, allocation_destructor); if (conf_register_callback (ctx->conf, &error, diff --git a/src/modules/job-manager/plugins/perilog.c b/src/modules/job-manager/plugins/perilog.c index 291b725d807a..07b0bc0f1317 100644 --- a/src/modules/job-manager/plugins/perilog.c +++ b/src/modules/job-manager/plugins/perilog.c @@ -75,12 +75,12 @@ extern char **environ; */ struct perilog_procdesc { flux_cmd_t *cmd; - bool uses_imp; bool prolog; bool per_rank; bool cancel_on_exception; double timeout; double kill_timeout; + int kill_signal; }; /* Global prolog/epilog configuration @@ -89,6 +89,7 @@ static struct perilog_conf { bool initialized; char *imp_path; + char *exec_service; struct perilog_procdesc *prolog; struct perilog_procdesc *epilog; @@ -126,6 +127,8 @@ struct perilog_proc { */ static double default_kill_timeout = 60.; +static const char *default_exec_service = "rexec"; + static struct perilog_proc *procdesc_run (flux_t *h, flux_plugin_t *p, struct perilog_procdesc *pd, @@ -182,9 +185,9 @@ static struct perilog_procdesc *perilog_procdesc_create (json_t *o, int cancel_on_exception = -1; const char *timeout; double kill_timeout = -1.; + bool use_imp = false; flux_cmd_t *cmd = NULL; json_t *command = NULL; - bool uses_imp = false; json_error_t error; const char *name = prolog ? "prolog" : "epilog"; @@ -230,7 +233,7 @@ static struct perilog_procdesc *perilog_procdesc_create (json_t *o, errprintf (errp, "error creating %s command", name); return NULL; } - uses_imp = true; + use_imp = true; } if (!(pd = calloc (1, sizeof (*pd)))) { errprintf (errp, "Out of memory"); @@ -253,11 +256,21 @@ static struct perilog_procdesc *perilog_procdesc_create (json_t *o, goto error; } + /* If the IMP will run as the main process in a transient + * systemd unit, set options for correct signal forwarding semantics. + */ + if (use_imp && streq (perilog_config.exec_service, "sdexec")) { + if (flux_cmd_setopt (cmd, "SDEXEC_PROP_KillMode", "process") < 0 + || flux_cmd_setopt (cmd, "SDEXEC_PROP_SendSIGKILL", "off") < 0) { + errprintf (errp, "error setting sdexec options"); + goto error; + } + pd->kill_signal = SIGUSR1; + } pd->cmd = cmd; pd->kill_timeout = kill_timeout > 0. ? kill_timeout : default_kill_timeout; pd->per_rank = per_rank; pd->prolog = prolog; - pd->uses_imp = uses_imp; /* If cancel_on_exception unset, default to prolog=true, epilog=false * Otherwise, use set value: @@ -819,10 +832,16 @@ static struct perilog_proc *procdesc_run (flux_t *h, goto error; } + /* If using sdexec, bulk-exec sets the unit name (and hence the cgroup + * name) to "NAME-RANK-JOBID.service", where NAME is the bulk_name below. + * The IMP checks the cgroup for a "flux-" prefix to change its signal + * forwarding and straggler handling behavior, so use that prefix here. + */ + char *bulk_name = proc->prolog ? "flux-prolog" : "flux-epilog"; if (!(bulk_exec = bulk_exec_create (&ops, - "rexec", + perilog_config.exec_service, id, - perilog_proc_name (proc), + bulk_name, NULL)) || bulk_exec_push_cmd (bulk_exec, ranks, pd->cmd, 0) < 0) { flux_log_error (h, @@ -1228,6 +1247,7 @@ static void free_config (struct perilog_conf *conf) perilog_procdesc_destroy (conf->epilog); zhashx_destroy (&conf->processes); zlistx_destroy (&conf->log_ignore); + free (conf->exec_service); } /* Initialize a perilog_config object @@ -1245,6 +1265,10 @@ static int conf_init (flux_plugin_t *p, struct perilog_conf *conf) } zhashx_set_destructor (conf->processes, perilog_proc_destructor); + if (!(conf->exec_service = strdup (default_exec_service))) { + flux_log_error (h, "perilog: failed to duplicate exec service name"); + goto error; + } /* Watch for broker transition to CLEANUP. */ @@ -1444,6 +1468,7 @@ static int conf_update_cb (flux_plugin_t *p, flux_error_t error; json_error_t jerror; const char *imp_path = NULL; + const char *exec_service = default_exec_service; json_t *prolog_config = NULL; json_t *epilog_config = NULL; struct perilog_procdesc *prolog = NULL; @@ -1472,9 +1497,10 @@ static int conf_update_cb (flux_plugin_t *p, if (json_unpack_ex (conf, &jerror, 0, - "{s?{s?s} s?{s?o s?o s?{s?o}}}", + "{s?{s?s s?s} s?{s?o s?o s?{s?o}}}", "exec", "imp", &imp_path, + "service", &exec_service, "job-manager", "prolog", &prolog_config, "epilog", &epilog_config, @@ -1486,6 +1512,12 @@ static int conf_update_cb (flux_plugin_t *p, goto error; } + free (perilog_config.exec_service); + if (!(perilog_config.exec_service = strdup (exec_service))) { + errprintf (&error, "failed to duplicate exec service name"); + goto error; + } + /* Capture IMP path before first call to perilog_procdesc_create() */ free (perilog_config.imp_path);