From e0642daccd0724030162a1c142640e2ad5a686e1 Mon Sep 17 00:00:00 2001 From: Kamoltat Sirivadhna Date: Wed, 13 Mar 2024 17:45:59 -0400 Subject: [PATCH] src/teuthology_api/suite: Modify how we handle Error and Success runs The changes include: 1. Make the suite route return {"run": run_details, "logs": logs, "job_count": job_count} 2. Improve how we handle exceptions by using Queue from the Python multiprocessing library. Signed-off-by: Kamoltat Sirivadhna --- src/teuthology_api/routes/suite.py | 7 +++- src/teuthology_api/schemas/suite.py | 2 +- src/teuthology_api/services/helpers.py | 37 +++++++++++++++----- src/teuthology_api/services/suite.py | 48 ++++++++++++-------------- 4 files changed, 58 insertions(+), 36 deletions(-) diff --git a/src/teuthology_api/routes/suite.py b/src/teuthology_api/routes/suite.py index 13f1905..5737a16 100644 --- a/src/teuthology_api/routes/suite.py +++ b/src/teuthology_api/routes/suite.py @@ -24,4 +24,9 @@ def create_run( ): args = args.model_dump(by_alias=True) args["--user"] = get_username(request) - return run(args, logs, access_token) + try: + created_run = run(args, logs, access_token) + log.debug(created_run) + except Exception as e: + raise HTTPException(status_code=400, detail=str(e)) + return created_run diff --git a/src/teuthology_api/schemas/suite.py b/src/teuthology_api/schemas/suite.py index 409e219..14b4042 100644 --- a/src/teuthology_api/schemas/suite.py +++ b/src/teuthology_api/schemas/suite.py @@ -37,7 +37,7 @@ class SuiteArgs(BaseArgs): default="https://github.com/ceph/ceph-ci.git", alias="--suite_repo" ) teuthology_branch: Union[str, None] = Field( - default="main", alias="--teuthology-branch" + default=None, alias="--teuthology-branch" ) validate_sha1: Union[str, None] = Field(default="true", alias="--validate-sha1") wait: Union[bool, None] = Field(default=False, alias="--wait") diff --git a/src/teuthology_api/services/helpers.py b/src/teuthology_api/services/helpers.py index ef7abc3..7a93b62 100644 --- 
a/src/teuthology_api/services/helpers.py +++ b/src/teuthology_api/services/helpers.py @@ -1,4 +1,4 @@ -from multiprocessing import Process +from multiprocessing import Process, Queue import logging import os import uuid @@ -26,26 +26,45 @@ def logs_run(func, args): _id = str(uuid.uuid4()) archive = Path(ARCHIVE_DIR) log_file = archive / f"{_id}.log" - - teuthology_process = Process(target=_execute_with_logs, args=(func, args, log_file)) - teuthology_process.start() - teuthology_process.join() - + teuth_queue = Queue() + teuth_process = Process( + target=_execute_with_logs, + args=(func, args, log_file, teuth_queue) + ) + teuth_process.daemon = True # Set the process as a daemon + teuth_process.start() + teuth_process.join(timeout=180) # Set the timeout value in seconds + if teuth_process.is_alive(): + teuth_process.terminate() # Terminate the process if it exceeds the timeout + teuth_process.join() + raise TimeoutError("Process execution timed out") logs = "" with open(log_file, encoding="utf-8") as file: logs = file.readlines() if os.path.isfile(log_file): os.remove(log_file) - return logs + log.debug(logs) + if teuth_process.exitcode > 0: + e = teuth_queue.get() + log.error(e) + return "fail", e, 0 + else: + job_count = teuth_queue.get() + return "success", logs, job_count -def _execute_with_logs(func, args, log_file): +def _execute_with_logs(func, args, log_file, teuth_queue): """ To store logs, set a new FileHandler for teuthology root logger and then execute the command function. 
""" teuthology.setup_log_file(log_file) - func(args) + try: + job_count = func(args) + teuth_queue.put(job_count) + except Exception as e: + teuth_queue.put(e) + raise def get_run_details(run_name: str): diff --git a/src/teuthology_api/services/suite.py b/src/teuthology_api/services/suite.py index 99d2d6d..e0047ee 100644 --- a/src/teuthology_api/services/suite.py +++ b/src/teuthology_api/services/suite.py @@ -20,31 +20,29 @@ def run(args, send_logs: bool, access_token: str): detail="You need to be logged in", headers={"WWW-Authenticate": "Bearer"}, ) - try: - args["--timestamp"] = datetime.now().strftime("%Y-%m-%d_%H:%M:%S") - - logs = logs_run(teuthology.suite.main, args) - - # get run details from paddles - run_name = make_run_name( - { - "machine_type": args["--machine-type"], - "user": args["--user"], - "timestamp": args["--timestamp"], - "suite": args["--suite"], - "ceph_branch": args["--ceph"], - "kernel_branch": args["--kernel"], - "flavor": args["--flavor"], - } - ) - run_details = get_run_details(run_name) - if send_logs or args["--dry-run"]: - return {"run": run_details, "logs": logs} - return {"run": run_details} - except Exception as exc: - log.error("teuthology.suite.main failed with the error: %s", repr(exc)) - raise HTTPException(status_code=500, detail=repr(exc)) from exc - + args["--timestamp"] = datetime.now().strftime("%Y-%m-%d_%H:%M:%S") + status, logs, job_count = logs_run(teuthology.suite.main, args) + if status == "fail": + raise logs + if args["--dry-run"] or job_count < 1: + return {"run": {}, "logs": logs, "job_count": job_count} + # get run details from paddles + run_name = make_run_name( + { + "machine_type": args["--machine-type"], + "user": args["--user"], + "timestamp": args["--timestamp"], + "suite": args["--suite"], + "ceph_branch": args["--ceph"], + "kernel_branch": args["--kernel"], + "flavor": args["--flavor"], + } + ) + run_details = get_run_details(run_name) + if send_logs: + return {"run": run_details, "logs":logs, 
"job_count": job_count} + else: + return {"run": run_details, "job_count": job_count} def make_run_name(run_dic): """