
Commit 5c9057f

Merge pull request #1817 from ceph/exporter
2 parents: a2dfb05 + be1f601

File tree: 13 files changed, +322 −97 lines


docs/siteconfig.rst

Lines changed: 4 additions & 0 deletions
@@ -22,6 +22,10 @@ Here is a sample configuration with many of the options set and documented::
 # Teuthology can use the entire cluster.
 reserve_machines: 5

+# The machine types currently in active use; currently only used by
+# teuthology-exporter
+active_machine_types: ['smithi']
+
 # The host and port to use for the beanstalkd queue. This is required
 # for scheduled jobs.
 queue_host: localhost

scripts/exporter.py

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+import docopt
+
+import teuthology.exporter
+
+doc = """
+usage: teuthology-exporter --help
+       teuthology-exporter [--interval INTERVAL]
+
+optional arguments:
+  -h, --help           show this help message and exit
+  --interval INTERVAL  update metrics this often, in seconds
+                       [default: 60]
+"""
+
+
+def main():
+    args = docopt.docopt(doc)
+    teuthology.exporter.main(args)
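
The teuthology/exporter.py module this script delegates to is among the 13 changed files but does not appear in this excerpt. As a rough sketch only, with the metric name, port, and update logic assumed rather than taken from the PR, an exporter loop built on prometheus_client (added to install_requires below) and the new active_machine_types setting could look like this:

    import time

    from prometheus_client import Gauge, start_http_server

    from teuthology.config import config
    from teuthology.dispatcher import find_dispatcher_processes

    # Hypothetical metric; the real exporter's metrics are not shown in this diff.
    DISPATCHERS = Gauge(
        'teuthology_dispatchers',
        'Number of running teuthology-dispatcher processes',
        ['machine_type'],
    )

    def main(args):
        interval = int(args['--interval'])  # docopt applies [default: 60]
        start_http_server(8000)  # placeholder port
        while True:
            update_metrics()
            time.sleep(interval)

    def update_metrics():
        procs = find_dispatcher_processes()
        # Emit a sample for every active machine type, including those
        # with no dispatcher currently running.
        for machine_type in config.active_machine_types:
            DISPATCHERS.labels(machine_type=machine_type).set(
                len(procs.get(machine_type, []))
            )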

setup.cfg

Lines changed: 5 additions & 0 deletions
@@ -58,6 +58,7 @@ install_requires =
     python-openstackclient
     requests>2.13.0
     sentry-sdk
+    prometheus_client>=0.16.0
 python_requires = >=3.6

 [options.entry_points]
@@ -81,6 +82,7 @@ console_scripts =
     teuthology-reimage = scripts.reimage:main
     teuthology-dispatcher = scripts.dispatcher:main
     teuthology-wait = scripts.wait:main
+    teuthology-exporter = scripts.exporter:main

 [options.extras_require]
 manhole =
@@ -126,3 +128,6 @@ teuthology.task.install =
     daemon-helper
 teuthology.task.internal =
     edit_sudoers.sh
+
+[flake8]
+max-line-length = 100

teuthology/beanstalk.py

Lines changed: 2 additions & 1 deletion
@@ -6,8 +6,9 @@
 import sys
 from collections import OrderedDict

+import teuthology.report as report
+
 from teuthology.config import config
-from teuthology import report

 log = logging.getLogger(__name__)

teuthology/config.py

Lines changed: 1 addition & 0 deletions
@@ -192,6 +192,7 @@ class TeuthologyConfig(YamlConfig):
         'rocketchat': None,
         'sleep_before_teardown': 0,
         'ssh_key': None,
+        'active_machine_types': [],
     }

     def __init__(self, yaml_path=None):
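
Once set in the site config, the new key is exposed like any other TeuthologyConfig default, so consumers such as teuthology-exporter can read it straight off the global config object. A minimal illustration:

    from teuthology.config import config

    # [] unless the site config sets it, e.g. active_machine_types: ['smithi']
    for machine_type in config.active_machine_types:
        print(machine_type)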

teuthology/dispatcher/__init__.py

Lines changed: 20 additions & 16 deletions
@@ -1,4 +1,3 @@
-import getpass
 import logging
 import os
 import psutil
@@ -7,18 +6,20 @@
 import yaml

 from datetime import datetime
+from typing import Dict, List
+
+import teuthology.dispatcher.supervisor as supervisor
+import teuthology.lock.ops as lock_ops
+import teuthology.nuke as nuke
+import teuthology.worker as worker

 from teuthology import setup_log_file, install_except_hook
 from teuthology import beanstalk
 from teuthology import report
 from teuthology.config import config as teuth_config
 from teuthology.exceptions import SkipJob
 from teuthology.repo_utils import fetch_qa_suite, fetch_teuthology
-from teuthology.lock.ops import block_and_lock_machines
-from teuthology.dispatcher import supervisor
-from teuthology.worker import prep_job
 from teuthology import safepath
-from teuthology.nuke import nuke

 log = logging.getLogger(__name__)
 start_time = datetime.utcnow()
@@ -72,7 +73,7 @@ def main(args):
     archive_dir = teuth_config.archive_base

     # Refuse to start more than one dispatcher per machine type
-    procs = find_dispatcher_processes(tube)
+    procs = find_dispatcher_processes().get(tube)
     if procs:
         raise RuntimeError(
             "There is already a teuthology-dispatcher process running:"
@@ -134,7 +135,7 @@ def main(args):
             keep_running = False

         try:
-            job_config, teuth_bin_path = prep_job(
+            job_config, teuth_bin_path = worker.prep_job(
                 job_config,
                 log_file_path,
                 archive_dir,
@@ -175,7 +176,7 @@ def main(args):
             error_message = "Saw error while trying to spawn supervisor."
             log.exception(error_message)
             if 'targets' in job_config:
-                nuke(supervisor.create_fake_context(job_config), True)
+                nuke.nuke(supervisor.create_fake_context(job_config), True)
             report.try_push_job_info(job_config, dict(
                 status='fail',
                 failure_reason=error_message))
@@ -194,33 +195,36 @@ def main(args):
     return max(returncodes)


-def find_dispatcher_processes(machine_type):
-    user = getpass.getuser()
+def find_dispatcher_processes() -> Dict[str, List[psutil.Process]]:
     def match(proc):
-        if proc.username() != user:
-            return False
         cmdline = proc.cmdline()
         if len(cmdline) < 3:
             return False
         if not cmdline[1].endswith("/teuthology-dispatcher"):
             return False
         if cmdline[2] == "--supervisor":
             return False
-        if machine_type not in cmdline:
+        if "--tube" not in cmdline:
             return False
         if proc.pid == os.getpid():
             return False
         return True

-    attrs = ["pid", "username", "cmdline"]
-    procs = list(filter(match, psutil.process_iter(attrs=attrs)))
+    procs = {}
+    attrs = ["pid", "cmdline"]
+    for proc in psutil.process_iter(attrs=attrs):
+        if not match(proc):
+            continue
+        cmdline = proc.cmdline()
+        machine_type = cmdline[cmdline.index("--tube") + 1]
+        procs.setdefault(machine_type, []).append(proc)
     return procs


 def lock_machines(job_config):
     report.try_push_job_info(job_config, dict(status='running'))
     fake_ctx = supervisor.create_fake_context(job_config, block=True)
-    block_and_lock_machines(
+    lock_ops.block_and_lock_machines(
         fake_ctx,
         len(job_config['roles']),
         job_config['machine_type'],
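
The rewritten find_dispatcher_processes() no longer matches on the caller's machine type (or username); instead it recovers each dispatcher's machine type from the value following --tube on its command line and groups the processes into a dict. A self-contained illustration of that extraction, with hypothetical argv values:

    # Two hypothetical dispatcher command lines.
    cmdlines = [
        ["python3", "/usr/local/bin/teuthology-dispatcher", "--tube", "smithi"],
        ["python3", "/usr/local/bin/teuthology-dispatcher", "--tube", "smithi"],
    ]

    procs = {}
    for cmdline in cmdlines:
        # The same lookup the new function performs on proc.cmdline():
        machine_type = cmdline[cmdline.index("--tube") + 1]
        procs.setdefault(machine_type, []).append(cmdline)

    assert len(procs["smithi"]) == 2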

teuthology/dispatcher/supervisor.py

Lines changed: 28 additions & 21 deletions
@@ -8,17 +8,17 @@
 from urllib.parse import urljoin
 from datetime import datetime

-import teuthology
+import teuthology.lock.ops as lock_ops
+import teuthology.nuke as nuke
+
 from teuthology import report
 from teuthology import safepath
 from teuthology.config import config as teuth_config
 from teuthology.exceptions import SkipJob, MaxWhileTries
 from teuthology import setup_log_file, install_except_hook
-from teuthology.lock.ops import reimage_machines
 from teuthology.misc import get_user, archive_logs, compress_logs
 from teuthology.config import FakeNamespace
 from teuthology.job_status import get_status
-from teuthology.nuke import nuke
 from teuthology.kill import kill_job
 from teuthology.task.internal import add_remotes
 from teuthology.misc import decanonicalize_hostname as shortname
@@ -165,6 +165,7 @@ def failure_is_reimage(failure_reason):
     else:
         return False

+
 def check_for_reimage_failures_and_mark_down(targets, count=10):
     # Grab paddles history of jobs in the machine
     # and count the number of reimaging errors
@@ -173,9 +174,8 @@ def check_for_reimage_failures_and_mark_down(targets, count=10):
     for k, _ in targets.items():
         machine = k.split('@')[-1]
         url = urljoin(
-            base_url,
-            '/nodes/{0}/jobs/?count={1}'.format(
-                machine, count)
+            base_url,
+            '/nodes/{0}/jobs/?count={1}'.format(machine, count)
         )
         resp = requests.get(url)
         jobs = resp.json()
@@ -189,15 +189,16 @@ def check_for_reimage_failures_and_mark_down(targets, count=10):
             continue
         # Mark machine down
         machine_name = shortname(k)
-        teuthology.lock.ops.update_lock(
-            machine_name,
-            description='reimage failed {0} times'.format(count),
-            status='down',
-        )
+        lock_ops.update_lock(
+            machine_name,
+            description='reimage failed {0} times'.format(count),
+            status='down',
+        )
         log.error(
             'Reimage failed {0} times ... marking machine down'.format(count)
         )

+
 def reimage(job_config):
     # Reimage the targets specified in job config
     # and update their keys in config after reimaging
@@ -206,12 +207,15 @@ def reimage(job_config):
     report.try_push_job_info(ctx.config, dict(status='waiting'))
     targets = job_config['targets']
     try:
-        reimaged = reimage_machines(ctx, targets, job_config['machine_type'])
+        reimaged = lock_ops.reimage_machines(ctx, targets, job_config['machine_type'])
     except Exception as e:
         log.exception('Reimaging error. Nuking machines...')
         # Reimage failures should map to the 'dead' status instead of 'fail'
-        report.try_push_job_info(ctx.config, dict(status='dead', failure_reason='Error reimaging machines: ' + str(e)))
-        nuke(ctx, True)
+        report.try_push_job_info(
+            ctx.config,
+            dict(status='dead', failure_reason='Error reimaging machines: ' + str(e))
+        )
+        nuke.nuke(ctx, True)
         # Machine that fails to reimage after 10 times will be marked down
         check_for_reimage_failures_and_mark_down(targets)
         raise
@@ -241,18 +245,20 @@ def unlock_targets(job_config):
     if not locked:
         return
     job_status = get_status(job_info)
-    if job_status == 'pass' or \
-            (job_config.get('unlock_on_failure', False) and not job_config.get('nuke-on-error', False)):
+    if job_status == 'pass' or (job_config.get('unlock_on_failure', False)
+                                and not job_config.get('nuke-on-error', False)):
         log.info('Unlocking machines...')
         fake_ctx = create_fake_context(job_config)
         for machine in locked:
-            teuthology.lock.ops.unlock_one(fake_ctx,
-                                           machine, job_info['owner'],
-                                           job_info['archive_path'])
+            lock_ops.unlock_one(
+                fake_ctx,
+                machine, job_info['owner'],
+                job_info['archive_path']
+            )
     if job_status != 'pass' and job_config.get('nuke-on-error', False):
         log.info('Nuking machines...')
         fake_ctx = create_fake_context(job_config)
-        nuke(fake_ctx, True)
+        nuke.nuke(fake_ctx, True)


 def run_with_watchdog(process, job_config):
@@ -316,7 +322,8 @@ def run_with_watchdog(process, job_config):
     extra_info = dict(status='dead')
     if hit_max_timeout:
         extra_info['failure_reason'] = 'hit max job timeout'
-    report.try_push_job_info(job_info, extra_info)
+    if not (job_config.get('first_in_suite') or job_config.get('last_in_suite')):
+        report.try_push_job_info(job_info, extra_info)


 def create_fake_context(job_config, block=False):
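
The unlock condition in unlock_targets() is only reflowed here, not changed in behavior. Extracted as a standalone predicate for illustration:

    def should_unlock(job_status, job_config):
        # Unlock on success, or on failure when unlock_on_failure is set
        # and nuke-on-error is not.
        return job_status == 'pass' or (job_config.get('unlock_on_failure', False)
                                        and not job_config.get('nuke-on-error', False))

    # A failed job with unlock_on_failure set and nuke-on-error unset gets unlocked:
    assert should_unlock('fail', {'unlock_on_failure': True})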
