diff --git a/test/quality/.yardstick.yaml b/test/quality/.yardstick.yaml
index 2d111bfcd0a..effef38be83 100644
--- a/test/quality/.yardstick.yaml
+++ b/test/quality/.yardstick.yaml
@@ -91,6 +91,11 @@ default_max_year: 2021
 result-sets:
   pr_vs_latest_via_sbom:
     description: "latest released grype vs grype from the current build (via SBOM ingestion)"
+    validations:
+      - max-f1-regression: 0.0
+        max-new-false-negatives: 0
+        max-unlabeled-percent: 10
+        max_year: 2021
     matrix:
       images: *images
 
@@ -112,6 +117,7 @@ result-sets:
           # for local build of grype, use for example: version: path:../../+import-db=db.tar.gz
           takes: SBOM
+          label: candidate
 
         - name: grype
           # note: we import a static (pinned) DB as to prevent changes in the DB from affecting the results. The
@@ -121,3 +127,4 @@
           # are testing with is not too stale.
           version: latest+import-db=db.tar.gz
           takes: SBOM
+          label: reference
diff --git a/test/quality/Makefile b/test/quality/Makefile
index f333eb0ad83..574e579f2d4 100644
--- a/test/quality/Makefile
+++ b/test/quality/Makefile
@@ -27,7 +27,7 @@ all: capture validate ## Fetch or capture all data and run all quality checks
 
 .PHONY: validate
 validate: venv $(VULNERABILITY_LABELS)/Makefile ## Run all quality checks against already collected data
-	$(ACTIVATE_VENV) ./gate.py
+	$(ACTIVATE_VENV) yardstick validate -r $(RESULT_SET)
 
 .PHONY: capture
 capture: sboms vulns ## Collect and store all syft and grype results
diff --git a/test/quality/gate.py b/test/quality/gate.py
deleted file mode 100755
index 74a75f557c7..00000000000
--- a/test/quality/gate.py
+++ /dev/null
@@ -1,345 +0,0 @@
-#!/usr/bin/env python3
-import logging
-import os
-import re
-import subprocess
-import sys
-from typing import Optional
-
-import click
-from tabulate import tabulate
-from dataclasses import dataclass, InitVar, field
-
-import yardstick
-from yardstick import store, comparison, artifact, arrange
-from yardstick.cli import display, config
-
-
-# see the .yardstick.yaml configuration for details
-default_result_set = "pr_vs_latest_via_sbom"
-yardstick.utils.grype_db.raise_on_failure(False)
-
-@dataclass
-class Gate:
-    label_comparisons: InitVar[Optional[list[comparison.AgainstLabels]]]
-    label_comparison_stats: InitVar[Optional[comparison.ImageToolLabelStats]]
-
-    reasons: list[str] = field(default_factory=list)
-
-    def __post_init__(self, label_comparisons: Optional[list[comparison.AgainstLabels]], label_comparison_stats: Optional[comparison.ImageToolLabelStats]):
-        if not label_comparisons and not label_comparison_stats:
-            return
-
-        reasons = []
-
-        # - fail when current F1 score drops below last release F1 score (or F1 score is indeterminate)
-        # - fail when indeterminate % > 10%
-        # - fail when there is a rise in FNs
-        latest_release_tool, current_tool = guess_tool_orientation(label_comparison_stats.tools)
-
-        latest_release_comparisons_by_image = {comp.config.image: comp for comp in label_comparisons if comp.config.tool == latest_release_tool }
-        current_comparisons_by_image = {comp.config.image: comp for comp in label_comparisons if comp.config.tool == current_tool }
-
-        for image, comp in current_comparisons_by_image.items():
-            latest_f1_score = latest_release_comparisons_by_image[image].summary.f1_score
-            current_f1_score = comp.summary.f1_score
-            if current_f1_score < latest_f1_score:
-                reasons.append(f"current F1 score is lower than the latest release F1 score: {bcolors.BOLD+bcolors.UNDERLINE}current={current_f1_score:0.2f} latest={latest_f1_score:0.2f}{bcolors.RESET} image={image}")
-
-            if comp.summary.indeterminate_percent > 10:
-                reasons.append(f"current indeterminate matches % is greater than 10%: {bcolors.BOLD+bcolors.UNDERLINE}current={comp.summary.indeterminate_percent:0.2f}%{bcolors.RESET} image={image}")
-
-            latest_fns = latest_release_comparisons_by_image[image].summary.false_negatives
-            current_fns = comp.summary.false_negatives
-            if current_fns > latest_fns:
-                reasons.append(f"current false negatives is greater than the latest release false negatives: {bcolors.BOLD+bcolors.UNDERLINE}current={current_fns} latest={latest_fns}{bcolors.RESET} image={image}")
-
-        self.reasons = reasons
-
-    def passed(self):
-        return len(self.reasons) == 0
-
-def guess_tool_orientation(tools: list[str]):
-    """
-    Given a pair of tools, guess which is latest version, and which is the one
-    being compared to the latest version.
-    Returns (latest_tool, current_tool)
-    """
-    if len(tools) != 2:
-        raise RuntimeError("expected 2 tools, got %s" % tools)
-    tool_a, tool_b = sorted(tools)
-    if tool_a == tool_b:
-        raise ValueError("latest release tool and current tool are the same")
-    if tool_a.endswith("latest"):
-        return tool_a, tool_b
-    elif tool_b.endswith("latest"):
-        return tool_b, tool_a
-
-    if "@path:" in tool_a and "@path:" not in tool_b:
-        # tool_a is a local build, so compare it against tool_b
-        return tool_b, tool_a
-
-    if "@path:" in tool_b and "@path:" not in tool_a:
-        # tool_b is a local build, so compare it against tool_a
-        return tool_a, tool_b
-
-    return tool_a, tool_b
-
-
-class bcolors:
-    HEADER = '\033[95m'
-    OKBLUE = '\033[94m'
-    OKCYAN = '\033[96m'
-    OKGREEN = '\033[92m'
-    WARNING = '\033[93m'
-    FAIL = '\033[91m'
-    BOLD = '\033[1m'
-    UNDERLINE = '\033[4m'
-    RESET = '\033[0m'
-
-if not sys.stdout.isatty():
-    bcolors.HEADER = ""
-    bcolors.OKBLUE = ""
-    bcolors.OKCYAN = ""
-    bcolors.OKGREEN = ""
-    bcolors.WARNING = ""
-    bcolors.FAIL = ""
-    bcolors.BOLD = ""
-    bcolors.UNDERLINE = ""
-    bcolors.RESET = ""
-
-def show_results_used(results: list[artifact.ScanResult]):
-    print(f"   Results used:")
-    for idx, result in enumerate(results):
-        branch = "├──"
-        if idx == len(results) - 1:
-            branch = "└──"
-        print(f"    {branch} {result.ID} : {result.config.tool} against {result.config.image}")
-    print()
-
-def validate(cfg: config.Application, result_set: str, images: list[str], always_run_label_comparison: bool, verbosity: int, label_entries: Optional[list[artifact.LabelEntry]] = None):
-    print(f"{bcolors.HEADER}{bcolors.BOLD}Validating with {result_set!r}", bcolors.RESET)
-    result_set_obj = store.result_set.load(name=result_set)
-
-    ret = []
-    for image, result_states in result_set_obj.result_state_by_image.items():
-        if images and image not in images:
-            print("Skipping image:", image)
-            continue
-        print()
-        print("Testing image:", image)
-        for state in result_states:
-            print("   ", f"with {state.request.tool}")
-        print()
-
-        gate = validate_image(cfg, [s.config.path for s in result_states], always_run_label_comparison=always_run_label_comparison, verbosity=verbosity, label_entries=label_entries)
-        ret.append(gate)
-
-        failure = not gate.passed()
-        if failure:
-            print(f"{bcolors.FAIL}{bcolors.BOLD}Failed quality gate{bcolors.RESET}")
-            for reason in gate.reasons:
-                print(f"   - {reason}")
-
-        print()
-        size = 120
-        print("▁"*size)
-        print("░"*size)
-        print("▔"*size)
-    return ret
-
-def validate_image(cfg: config.Application, descriptions: list[str], always_run_label_comparison: bool, verbosity: int, label_entries: Optional[list[artifact.LabelEntry]] = None):
-    # do a relative comparison
-    # - show comparison summary (no gating action)
-    # - list out all individual match differences
-
-    print(f"{bcolors.HEADER}Running relative comparison...", bcolors.RESET)
-    relative_comparison = yardstick.compare_results(descriptions=descriptions, year_max_limit=cfg.default_max_year)
-    show_results_used(relative_comparison.results)
-
-    # show the relative comparison results
-    if verbosity > 0:
-        details = verbosity > 1
-        display.preserved_matches(relative_comparison, details=details, summary=True, common=False)
-        print()
-
-    # bail if there are no differences found
-    if not always_run_label_comparison and not sum([len(relative_comparison.unique[result.ID]) for result in relative_comparison.results]):
-        print("no differences found between tool results")
-        return Gate(None, None)
-
-    # do a label comparison
-    print(f"{bcolors.HEADER}Running comparison against labels...", bcolors.RESET)
-    results, label_entries, comparisons_by_result_id, stats_by_image_tool_pair = yardstick.compare_results_against_labels(descriptions=descriptions, year_max_limit=cfg.default_max_year, label_entries=label_entries)
-    show_results_used(results)
-
-    if verbosity > 0:
-        show_fns = verbosity > 1
-        display.label_comparison(
-            results,
-            comparisons_by_result_id,
-            stats_by_image_tool_pair,
-            show_fns=show_fns,
-            show_summaries=True,
-        )
-
-    latest_release_tool, current_tool = guess_tool_orientation([r.config.tool for r in results])
-
-    # show the relative comparison unique differences paired up with label conclusions (TP/FP/FN/TN/Unknown)
-    all_rows: list[list[Any]] = []
-    for result in relative_comparison.results:
-        label_comparison = comparisons_by_result_id[result.ID]
-        for unique_match in relative_comparison.unique[result.ID]:
-            labels = label_comparison.labels_by_match[unique_match.ID]
-            if not labels:
-                label = "(unknown)"
-            elif len(set(labels)) > 1:
-                label = ", ".join([l.name for l in labels])
-            else:
-                label = labels[0].name
-
-
-            color = ""
-            commentary = ""
-            if result.config.tool == latest_release_tool:
-                # the tool which found the unique result is the latest release tool...
-                if label == artifact.Label.TruePositive.name:
-                    # drats! we missed a case (this is a new FN)
-                    color = bcolors.FAIL
-                    commentary = "(this is a new FN 😱)"
-                elif artifact.Label.FalsePositive.name in label:
-                    # we got rid of a FP! ["hip!", "hip!"]
-                    color = bcolors.OKBLUE
-                    commentary = "(got rid of a former FP 🙌)"
-            else:
-                # the tool which found the unique result is the current tool...
-                if label == artifact.Label.TruePositive.name:
-                    # highest of fives! we found a new TP that the previous tool release missed!
-                    color = bcolors.OKBLUE
-                    commentary = "(this is a new TP 🙌)"
-                elif artifact.Label.FalsePositive.name in label:
-                    # welp, our changes resulted in a new FP... not great, maybe not terrible?
-                    color = bcolors.FAIL
-                    commentary = "(this is a new FP 😱)"
-
-            all_rows.append(
-                [
-                    f"{color}{result.config.tool} ONLY{bcolors.RESET}",
-                    f"{color}{unique_match.package.name}@{unique_match.package.version}{bcolors.RESET}",
-                    f"{color}{unique_match.vulnerability.id}{bcolors.RESET}",
-                    f"{color}{label}{bcolors.RESET}",
-                    f"{commentary}",
-                ]
-            )
-
-    def escape_ansi(line):
-        ansi_escape = re.compile(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]')
-        return ansi_escape.sub('', line)
-
-    # sort but don't consider ansi escape codes
-    all_rows = sorted(all_rows, key=lambda x: escape_ansi(str(x[0]+x[1]+x[2]+x[3])))
-    if len(all_rows) == 0:
-        print("No differences found between tooling (with labels)")
-    else:
-        print("Match differences between tooling (with labels):")
-        indent = "    "
-        print(indent + tabulate([["TOOL PARTITION", "PACKAGE", "VULNERABILITY", "LABEL", "COMMENTARY"]]+all_rows, tablefmt="plain").replace("\n", "\n" + indent) + "\n")
-
-
-    # populate the quality gate with data that can evaluate pass/fail conditions
-    return Gate(label_comparisons=comparisons_by_result_id.values(), label_comparison_stats=stats_by_image_tool_pair)
-
-@click.command()
-@click.option("--image", "-i", "images", multiple=True, help="filter down to one or more images to validate with (don't use the full result set)")
-@click.option("--label-comparison", "-l", "always_run_label_comparison", is_flag=True, help="run label comparison irregardless of relative comparison results")
-@click.option("--breakdown-by-ecosystem", "-e", is_flag=True, help="show label comparison results broken down by ecosystem")
-@click.option("--verbose", "-v", "verbosity", count=True, help="show details of all comparisons")
-@click.option("--result-set", "-r", default=default_result_set, help="the result set to use for the quality gate")
-def main(images: list[str], always_run_label_comparison: bool, breakdown_by_ecosystem: bool, verbosity: int, result_set: str):
-    cfg = config.load()
-    setup_logging(verbosity)
-
-    # let's not load any more labels than we need to, base this off of the images we're validating
-    if not images:
-        images = set()
-        result_set_obj = store.result_set.load(name=result_set)
-        for state in result_set_obj.state:
-            images.add(state.config.image)
-        images = sorted(list(images))
-
-    print("Loading label entries...", end=" ")
-    label_entries = store.labels.load_for_image(images, year_max_limit=cfg.default_max_year)
-    print(f"done! {len(label_entries)} entries loaded")
-
-    result_sets = [result_set]  # today only one result set is supported, but more can be added
-    gates = []
-    for result_set in result_sets:
-        gates.extend(validate(cfg, result_set, images=images, always_run_label_comparison=always_run_label_comparison, verbosity=verbosity, label_entries=label_entries))
-        print()
-
-        if breakdown_by_ecosystem:
-            print(f"{bcolors.HEADER}Breaking down label comparison by ecosystem performance...", bcolors.RESET)
-            results_by_image, label_entries, stats = yardstick.compare_results_against_labels_by_ecosystem(result_set=result_set, year_max_limit=cfg.default_max_year, label_entries=label_entries)
-            display.labels_by_ecosystem_comparison(
-                results_by_image,
-                stats,
-                show_images_used=False,
-            )
-            print()
-
-    failure = not all([gate.passed() for gate in gates])
-    if failure:
-        print("Reasons for quality gate failure:")
-        for gate in gates:
-            for reason in gate.reasons:
-                print(f"   - {reason}")
-
-    if failure:
-        print()
-        print(f"{bcolors.FAIL}{bcolors.BOLD}Quality gate FAILED{bcolors.RESET}")
-        sys.exit(1)
-    else:
-        print(f"{bcolors.OKGREEN}{bcolors.BOLD}Quality gate passed!{bcolors.RESET}")
-
-
-def setup_logging(verbosity: int):
-    # pylint: disable=redefined-outer-name, import-outside-toplevel
-    import logging.config
-
-    if verbosity in [0, 1, 2]:
-        log_level = "WARN"
-    elif verbosity == 3:
-        log_level = "INFO"
-    else:
-        log_level = "DEBUG"
-
-    logging.config.dictConfig(
-        {
-            "version": 1,
-            "formatters": {
-                "standard": {
-                    # [%(module)s.%(funcName)s]
-                    "format": "%(asctime)s [%(levelname)s] %(message)s",
-                    "datefmt": "",
-                },
-            },
-            "handlers": {
-                "default": {
-                    "level": log_level,
-                    "formatter": "standard",
-                    "class": "logging.StreamHandler",
-                    "stream": "ext://sys.stderr",
-                },
-            },
-            "loggers": {
-                "": {  # root logger
-                    "handlers": ["default"],
-                    "level": log_level,
-                },
-            },
-        }
-    )
-
-if __name__ == '__main__':
-    main()
diff --git a/test/quality/requirements.txt b/test/quality/requirements.txt
index 8f8a9a2d7fe..e6857621d08 100644
--- a/test/quality/requirements.txt
+++ b/test/quality/requirements.txt
@@ -1,3 +1,3 @@
-git+https://github.com/anchore/yardstick@v0.9.1
+git+https://github.com/anchore/yardstick@feat-validate-subcommand
 # ../../../yardstick
 tabulate==0.9.0
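
Reviewer note: the new validations stanza in .yardstick.yaml encodes the same pass/fail conditions that the deleted gate.py hard-coded in Gate.__post_init__, and the new label: candidate / label: reference fields on the two grype tool entries appear to take over the role that guess_tool_orientation() used to infer from version strings. A sketch of the correspondence, assuming the keys behave as the deleted checks did (the annotations are a reading of this diff, not yardstick documentation):

    validations:
      - max-f1-regression: 0.0      # gate.py: fail when the current F1 score drops below the latest release's
        max-new-false-negatives: 0  # gate.py: fail when false negatives rise relative to the latest release
        max-unlabeled-percent: 10   # gate.py: fail when indeterminate (unlabeled) matches exceed 10%
        max_year: 2021              # mirrors default_max_year, which gate.py passed as year_max_limit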
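With gate.py removed, make validate now runs yardstick validate -r $(RESULT_SET). Assuming RESULT_SET resolves to pr_vs_latest_via_sbom (the only result set defined here, and gate.py's former default_result_set), the equivalent direct invocation from test/quality is: yardstick validate -r pr_vs_latest_via_sbom. Correspondingly, requirements.txt moves from the v0.9.1 tag to the feat-validate-subcommand branch of yardstick, where the validate subcommand presumably lives.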