
Commit 5c9057f

Merge pull request #1817 from ceph/exporter
2 parents: a2dfb05 + be1f601

File tree: 13 files changed, +322 −97 lines


docs/siteconfig.rst

Lines changed: 4 additions & 0 deletions
@@ -22,6 +22,10 @@ Here is a sample configuration with many of the options set and documented::
 # Teuthology can use the entire cluster.
 reserve_machines: 5

+# The machine types currently in active use; currently only used by
+# teuthology-exporter
+active_machine_types: ['smithi']
+
 # The host and port to use for the beanstalkd queue. This is required
 # for scheduled jobs.
 queue_host: localhost

scripts/exporter.py

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+import docopt
+
+import teuthology.exporter
+
+doc = """
+usage: teuthology-exporter --help
+       teuthology-exporter [--interval INTERVAL]
+
+optional arguments:
+  -h, --help           show this help message and exit
+  --interval INTERVAL  update metrics this often, in seconds
+                       [default: 60]
+"""
+
+
+def main():
+    args = docopt.docopt(doc)
+    teuthology.exporter.main(args)
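
The teuthology/exporter.py module this script delegates to is among the 13 changed files but does not appear in this excerpt. As a rough sketch only, with the metric name, port, and update logic assumed rather than taken from the PR, an exporter loop built on prometheus_client (added to install_requires below) and the new active_machine_types setting could look like this:

    import time

    from prometheus_client import Gauge, start_http_server

    from teuthology.config import config
    from teuthology.dispatcher import find_dispatcher_processes

    # Hypothetical metric; the real exporter's metrics are not shown in this diff.
    DISPATCHERS = Gauge(
        'teuthology_dispatchers',
        'Number of running teuthology-dispatcher processes',
        ['machine_type'],
    )

    def main(args):
        interval = int(args['--interval'])  # docopt applies [default: 60]
        start_http_server(8000)  # placeholder port
        while True:
            update_metrics()
            time.sleep(interval)

    def update_metrics():
        procs = find_dispatcher_processes()
        # Emit a sample for every active machine type, including those
        # with no dispatcher currently running.
        for machine_type in config.active_machine_types:
            DISPATCHERS.labels(machine_type=machine_type).set(
                len(procs.get(machine_type, []))
            )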

setup.cfg

Lines changed: 5 additions & 0 deletions
@@ -58,6 +58,7 @@ install_requires =
     python-openstackclient
     requests>2.13.0
     sentry-sdk
+    prometheus_client>=0.16.0
 python_requires = >=3.6

 [options.entry_points]
@@ -81,6 +82,7 @@ console_scripts =
     teuthology-reimage = scripts.reimage:main
     teuthology-dispatcher = scripts.dispatcher:main
     teuthology-wait = scripts.wait:main
+    teuthology-exporter = scripts.exporter:main

 [options.extras_require]
 manhole =
@@ -126,3 +128,6 @@ teuthology.task.install =
     daemon-helper
 teuthology.task.internal =
     edit_sudoers.sh
+
+[flake8]
+max-line-length = 100

teuthology/beanstalk.py

Lines changed: 2 additions & 1 deletion
@@ -6,8 +6,9 @@
 import sys
 from collections import OrderedDict

+import teuthology.report as report
+
 from teuthology.config import config
-from teuthology import report

 log = logging.getLogger(__name__)

teuthology/config.py

Lines changed: 1 addition & 0 deletions
@@ -192,6 +192,7 @@ class TeuthologyConfig(YamlConfig):
         'rocketchat': None,
         'sleep_before_teardown': 0,
         'ssh_key': None,
+        'active_machine_types': [],
     }

     def __init__(self, yaml_path=None):
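
Once set in the site config, the new key is exposed like any other TeuthologyConfig default, so consumers such as teuthology-exporter can read it straight off the global config object. A minimal illustration:

    from teuthology.config import config

    # [] unless the site config sets it, e.g. active_machine_types: ['smithi']
    for machine_type in config.active_machine_types:
        print(machine_type)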

teuthology/dispatcher/__init__.py

Lines changed: 20 additions & 16 deletions
@@ -1,4 +1,3 @@
-import getpass
 import logging
 import os
 import psutil
@@ -7,18 +6,20 @@
 import yaml

 from datetime import datetime
+from typing import Dict, List
+
+import teuthology.dispatcher.supervisor as supervisor
+import teuthology.lock.ops as lock_ops
+import teuthology.nuke as nuke
+import teuthology.worker as worker

 from teuthology import setup_log_file, install_except_hook
 from teuthology import beanstalk
 from teuthology import report
 from teuthology.config import config as teuth_config
 from teuthology.exceptions import SkipJob
 from teuthology.repo_utils import fetch_qa_suite, fetch_teuthology
-from teuthology.lock.ops import block_and_lock_machines
-from teuthology.dispatcher import supervisor
-from teuthology.worker import prep_job
 from teuthology import safepath
-from teuthology.nuke import nuke

 log = logging.getLogger(__name__)
 start_time = datetime.utcnow()
@@ -72,7 +73,7 @@ def main(args):
     archive_dir = teuth_config.archive_base

     # Refuse to start more than one dispatcher per machine type
-    procs = find_dispatcher_processes(tube)
+    procs = find_dispatcher_processes().get(tube)
     if procs:
         raise RuntimeError(
             "There is already a teuthology-dispatcher process running:"
@@ -134,7 +135,7 @@ def main(args):
             keep_running = False

         try:
-            job_config, teuth_bin_path = prep_job(
+            job_config, teuth_bin_path = worker.prep_job(
                 job_config,
                 log_file_path,
                 archive_dir,
@@ -175,7 +176,7 @@ def main(args):
             error_message = "Saw error while trying to spawn supervisor."
             log.exception(error_message)
             if 'targets' in job_config:
-                nuke(supervisor.create_fake_context(job_config), True)
+                nuke.nuke(supervisor.create_fake_context(job_config), True)
             report.try_push_job_info(job_config, dict(
                 status='fail',
                 failure_reason=error_message))
@@ -194,33 +195,36 @@ def main(args):
     return max(returncodes)


-def find_dispatcher_processes(machine_type):
-    user = getpass.getuser()
+def find_dispatcher_processes() -> Dict[str, List[psutil.Process]]:
     def match(proc):
-        if proc.username() != user:
-            return False
         cmdline = proc.cmdline()
         if len(cmdline) < 3:
             return False
         if not cmdline[1].endswith("/teuthology-dispatcher"):
             return False
         if cmdline[2] == "--supervisor":
             return False
-        if machine_type not in cmdline:
+        if "--tube" not in cmdline:
             return False
         if proc.pid == os.getpid():
             return False
         return True

-    attrs = ["pid", "username", "cmdline"]
-    procs = list(filter(match, psutil.process_iter(attrs=attrs)))
+    procs = {}
+    attrs = ["pid", "cmdline"]
+    for proc in psutil.process_iter(attrs=attrs):
+        if not match(proc):
+            continue
+        cmdline = proc.cmdline()
+        machine_type = cmdline[cmdline.index("--tube") + 1]
+        procs.setdefault(machine_type, []).append(proc)
     return procs


 def lock_machines(job_config):
     report.try_push_job_info(job_config, dict(status='running'))
     fake_ctx = supervisor.create_fake_context(job_config, block=True)
-    block_and_lock_machines(
+    lock_ops.block_and_lock_machines(
         fake_ctx,
         len(job_config['roles']),
         job_config['machine_type'],
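
The rewritten find_dispatcher_processes() no longer matches on the caller's machine type (or username); instead it recovers each dispatcher's machine type from the value following --tube on its command line and groups the processes into a dict. A self-contained illustration of that extraction, with hypothetical argv values:

    # Two hypothetical dispatcher command lines.
    cmdlines = [
        ["python3", "/usr/local/bin/teuthology-dispatcher", "--tube", "smithi"],
        ["python3", "/usr/local/bin/teuthology-dispatcher", "--tube", "smithi"],
    ]

    procs = {}
    for cmdline in cmdlines:
        # The same lookup the new function performs on proc.cmdline():
        machine_type = cmdline[cmdline.index("--tube") + 1]
        procs.setdefault(machine_type, []).append(cmdline)

    assert len(procs["smithi"]) == 2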

teuthology/dispatcher/supervisor.py

Lines changed: 28 additions & 21 deletions
@@ -8,17 +8,17 @@
 from urllib.parse import urljoin
 from datetime import datetime

-import teuthology
+import teuthology.lock.ops as lock_ops
+import teuthology.nuke as nuke
+
 from teuthology import report
 from teuthology import safepath
 from teuthology.config import config as teuth_config
 from teuthology.exceptions import SkipJob, MaxWhileTries
 from teuthology import setup_log_file, install_except_hook
-from teuthology.lock.ops import reimage_machines
 from teuthology.misc import get_user, archive_logs, compress_logs
 from teuthology.config import FakeNamespace
 from teuthology.job_status import get_status
-from teuthology.nuke import nuke
 from teuthology.kill import kill_job
 from teuthology.task.internal import add_remotes
 from teuthology.misc import decanonicalize_hostname as shortname
@@ -165,6 +165,7 @@ def failure_is_reimage(failure_reason):
     else:
         return False

+
 def check_for_reimage_failures_and_mark_down(targets, count=10):
     # Grab paddles history of jobs in the machine
     # and count the number of reimaging errors
@@ -173,9 +174,8 @@ def check_for_reimage_failures_and_mark_down(targets, count=10):
     for k, _ in targets.items():
         machine = k.split('@')[-1]
         url = urljoin(
-            base_url,
-            '/nodes/{0}/jobs/?count={1}'.format(
-                machine, count)
+            base_url,
+            '/nodes/{0}/jobs/?count={1}'.format(machine, count)
         )
         resp = requests.get(url)
         jobs = resp.json()
@@ -189,15 +189,16 @@ def check_for_reimage_failures_and_mark_down(targets, count=10):
             continue
         # Mark machine down
         machine_name = shortname(k)
-        teuthology.lock.ops.update_lock(
-            machine_name,
-            description='reimage failed {0} times'.format(count),
-            status='down',
-        )
+        lock_ops.update_lock(
+            machine_name,
+            description='reimage failed {0} times'.format(count),
+            status='down',
+        )
         log.error(
             'Reimage failed {0} times ... marking machine down'.format(count)
         )

+
 def reimage(job_config):
     # Reimage the targets specified in job config
     # and update their keys in config after reimaging
@@ -206,12 +207,15 @@ def reimage(job_config):
     report.try_push_job_info(ctx.config, dict(status='waiting'))
     targets = job_config['targets']
     try:
-        reimaged = reimage_machines(ctx, targets, job_config['machine_type'])
+        reimaged = lock_ops.reimage_machines(ctx, targets, job_config['machine_type'])
     except Exception as e:
         log.exception('Reimaging error. Nuking machines...')
         # Reimage failures should map to the 'dead' status instead of 'fail'
-        report.try_push_job_info(ctx.config, dict(status='dead', failure_reason='Error reimaging machines: ' + str(e)))
-        nuke(ctx, True)
+        report.try_push_job_info(
+            ctx.config,
+            dict(status='dead', failure_reason='Error reimaging machines: ' + str(e))
+        )
+        nuke.nuke(ctx, True)
         # Machine that fails to reimage after 10 times will be marked down
         check_for_reimage_failures_and_mark_down(targets)
         raise
@@ -241,18 +245,20 @@ def unlock_targets(job_config):
     if not locked:
         return
     job_status = get_status(job_info)
-    if job_status == 'pass' or \
-            (job_config.get('unlock_on_failure', False) and not job_config.get('nuke-on-error', False)):
+    if job_status == 'pass' or (job_config.get('unlock_on_failure', False)
+                                and not job_config.get('nuke-on-error', False)):
         log.info('Unlocking machines...')
         fake_ctx = create_fake_context(job_config)
         for machine in locked:
-            teuthology.lock.ops.unlock_one(fake_ctx,
-                                           machine, job_info['owner'],
-                                           job_info['archive_path'])
+            lock_ops.unlock_one(
+                fake_ctx,
+                machine, job_info['owner'],
+                job_info['archive_path']
+            )
     if job_status != 'pass' and job_config.get('nuke-on-error', False):
         log.info('Nuking machines...')
         fake_ctx = create_fake_context(job_config)
-        nuke(fake_ctx, True)
+        nuke.nuke(fake_ctx, True)


 def run_with_watchdog(process, job_config):
@@ -316,7 +322,8 @@ def run_with_watchdog(process, job_config):
     extra_info = dict(status='dead')
     if hit_max_timeout:
         extra_info['failure_reason'] = 'hit max job timeout'
-    report.try_push_job_info(job_info, extra_info)
+    if not (job_config.get('first_in_suite') or job_config.get('last_in_suite')):
+        report.try_push_job_info(job_info, extra_info)


 def create_fake_context(job_config, block=False):
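
The unlock condition in unlock_targets() is only reflowed here, not changed in behavior. Extracted as a standalone predicate for illustration:

    def should_unlock(job_status, job_config):
        # Unlock on success, or on failure when unlock_on_failure is set
        # and nuke-on-error is not.
        return job_status == 'pass' or (job_config.get('unlock_on_failure', False)
                                        and not job_config.get('nuke-on-error', False))

    # A failed job with unlock_on_failure set and nuke-on-error unset gets unlocked:
    assert should_unlock('fail', {'unlock_on_failure': True})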
