diff --git a/README.md b/README.md index 23599b3cf..f25c124fd 100644 --- a/README.md +++ b/README.md @@ -85,3 +85,47 @@ Then open http://localhost:5173 in your browser. - **Frontend:** React + Vite, react-markdown for rendering - **Storage:** JSON files in `data/conversations/` - **Package Management:** uv for Python, npm for JavaScript + +## Ranking Algorithms + +The council uses two methods to aggregate peer rankings from Stage 2: + +### Mean Position Averaging +The original method calculates each model's average position across all rankings. Simple but susceptible to outlier rankings. + +### Tournament-Style Pairwise Comparison +A more robust method that counts head-to-head wins between each pair of models. For each pair (A, B), we count how many rankers preferred A over B. The model with more pairwise victories wins that matchup. + +**Why tournament ranking is more robust:** + +Consider a 3-model council where Models A, B, C all rank themselves first (self-promotion bias): +- Model A ranks: A=1, B=2, C=3 +- Model B ranks: B=1, A=2, C=3 +- Model C ranks: C=1, A=2, B=3 + +Mean ranking results: +| Model | Positions | Average | +|-------|-----------|---------| +| A | 1, 2, 2 | 1.67 | +| B | 2, 1, 3 | 2.00 | +| C | 3, 3, 1 | 2.33 | + +Tournament results: +| Model | vs A | vs B | vs C | Win% | +|-------|------|------|------|------| +| A | - | 2-1 | 2-1 | 100% | +| B | 1-2 | - | 2-1 | 50% | +| C | 1-2 | 1-2 | - | 0% | + +Model A wins both pairwise matchups (2-1 against B, 2-1 against C) and deserves first place. The tournament method correctly identifies this. + +**Outlier robustness validation:** + +When one ranker places Model A last (outlier vote), mean ranking degrades A from 1.0 to 1.5 average. Tournament ranking keeps A at 100% win rate because A still wins the majority of head-to-head comparisons. This demonstrates tournament ranking's robustness to strategic voting and outliers. + +**Validation tests verify:** +- Pairwise comparison math correctness +- Tie handling (0.5 points awarded to each model) +- Edge cases (single model, empty rankings) +- Fallback parsing from raw ranking text +- Realistic 5-model council scenarios diff --git a/backend/council.py b/backend/council.py index 5069abec9..bd4aad8e7 100644 --- a/backend/council.py +++ b/backend/council.py @@ -255,6 +255,259 @@ def calculate_aggregate_rankings( return aggregate +def calculate_tournament_rankings( + stage2_results: List[Dict[str, Any]], + label_to_model: Dict[str, str] +) -> List[Dict[str, Any]]: + """ + Calculate rankings using tournament-style pairwise comparison. + + For each pair of models, count how many rankers preferred one over the other. + The model with more pairwise wins ranks higher. This method is more robust + to outlier rankings than simple position averaging. + + Args: + stage2_results: Rankings from each model with parsed_ranking + label_to_model: Mapping from anonymous labels to model names + + Returns: + List of dicts sorted by win_percentage (descending): + [ + { + "model": "openai/gpt-4o", + "wins": 4.0, + "losses": 1.0, + "ties": 1.0, + "win_percentage": 0.75, + "total_matchups": 6 + }, + ... + ] + """ + from collections import defaultdict + + # Get all models from label_to_model + models = list(set(label_to_model.values())) + + if len(models) < 2: + # Need at least 2 models for pairwise comparison + return [{"model": m, "wins": 0, "losses": 0, "ties": 0, "win_percentage": 0.0, "total_matchups": 0} for m in models] + + # Track pairwise wins: pairwise_wins[(model_a, model_b)] = count of times a ranked above b + pairwise_wins = defaultdict(int) + + # Process each ranker's parsed ranking + # Use pre-parsed ranking if available, otherwise parse from text + for ranking in stage2_results: + parsed_ranking = ranking.get('parsed_ranking') + if not parsed_ranking: + # Fallback: parse from raw ranking text (consistent with calculate_aggregate_rankings) + ranking_text = ranking.get('ranking', '') + parsed_ranking = parse_ranking_from_text(ranking_text) if ranking_text else [] + + if not parsed_ranking: + continue + + # Convert labels to model names and get their positions + model_positions = {} + for position, label in enumerate(parsed_ranking): + if label in label_to_model: + model_name = label_to_model[label] + model_positions[model_name] = position + + # For each pair of models, record who was ranked higher (lower position = better) + ranked_models = list(model_positions.keys()) + for i in range(len(ranked_models)): + for j in range(i + 1, len(ranked_models)): + model_a = ranked_models[i] + model_b = ranked_models[j] + pos_a = model_positions[model_a] + pos_b = model_positions[model_b] + + # Ensure consistent ordering for the key + if model_a > model_b: + model_a, model_b = model_b, model_a + pos_a, pos_b = pos_b, pos_a + + if pos_a < pos_b: + pairwise_wins[(model_a, model_b, 'a')] += 1 + elif pos_b < pos_a: + pairwise_wins[(model_a, model_b, 'b')] += 1 + # Equal positions would be a tie (shouldn't happen with rankings) + + # Calculate wins, losses, and ties for each model + model_stats = {model: {"wins": 0.0, "losses": 0.0, "ties": 0.0} for model in models} + + # Process each unique pair of models + processed_pairs = set() + for i in range(len(models)): + for j in range(i + 1, len(models)): + model_a, model_b = models[i], models[j] + if model_a > model_b: + model_a, model_b = model_b, model_a + + pair_key = (model_a, model_b) + if pair_key in processed_pairs: + continue + processed_pairs.add(pair_key) + + a_wins = pairwise_wins.get((model_a, model_b, 'a'), 0) + b_wins = pairwise_wins.get((model_a, model_b, 'b'), 0) + + if a_wins > b_wins: + model_stats[model_a]["wins"] += 1 + model_stats[model_b]["losses"] += 1 + elif b_wins > a_wins: + model_stats[model_b]["wins"] += 1 + model_stats[model_a]["losses"] += 1 + elif a_wins == b_wins and (a_wins > 0 or b_wins > 0): + # Tie - both get 0.5 + model_stats[model_a]["ties"] += 1 + model_stats[model_b]["ties"] += 1 + + # Calculate win percentage and build results + total_possible_matchups = len(models) - 1 if len(models) > 1 else 1 + results = [] + + for model in models: + stats = model_stats[model] + total_matchups = stats["wins"] + stats["losses"] + stats["ties"] + # Win percentage: wins + 0.5*ties / total matchups + if total_matchups > 0: + win_pct = (stats["wins"] + 0.5 * stats["ties"]) / total_possible_matchups + else: + win_pct = 0.0 + + results.append({ + "model": model, + "wins": stats["wins"], + "losses": stats["losses"], + "ties": stats["ties"], + "win_percentage": round(win_pct, 3), + "total_matchups": int(total_matchups) + }) + + # Sort by win percentage (higher is better) + results.sort(key=lambda x: (-x['win_percentage'], x['losses'])) + + return results + + +def detect_minority_opinions( + stage2_results: List[Dict[str, Any]], + label_to_model: Dict[str, str], + tournament_rankings: List[Dict[str, Any]], + dissent_threshold: float = 0.3, + position_tolerance: int = 1 +) -> List[Dict[str, Any]]: + """ + Detect minority opinions where a significant portion of rankers disagree + with the consensus ranking for a specific model. + + A minority opinion is flagged when ≥dissent_threshold of rankers place a model + more than position_tolerance positions away from its consensus position. + + Args: + stage2_results: Rankings from each model with parsed_ranking + label_to_model: Mapping from anonymous labels to model names + tournament_rankings: Consensus ranking from tournament method + dissent_threshold: Minimum fraction of rankers that must disagree (default 0.3 = 30%) + position_tolerance: How many positions away counts as disagreement (default 1) + + Returns: + List of minority opinion dicts: + [ + { + "model": "openai/gpt-4o", + "consensus_position": 1, + "dissent_positions": [3, 4], # where dissenters placed it + "dissent_rate": 0.4, + "dissenters": ["anthropic/claude-3.5-sonnet", "google/gemini-2.0-flash"], + "direction": "undervalued" # or "overvalued" - dissenters think it's worse/better + }, + ... + ] + """ + from collections import defaultdict + + if not stage2_results or not tournament_rankings: + return [] + + # Build consensus position lookup from tournament rankings + consensus_positions = { + entry["model"]: position + 1 # 1-indexed + for position, entry in enumerate(tournament_rankings) + } + + # Track each ranker's position for each model + # Structure: {model_name: [(ranker_model, position), ...]} + model_rankings_by_ranker = defaultdict(list) + + for ranking in stage2_results: + ranker_model = ranking.get('model') + parsed_ranking = ranking.get('parsed_ranking') + if not parsed_ranking: + ranking_text = ranking.get('ranking', '') + parsed_ranking = parse_ranking_from_text(ranking_text) if ranking_text else [] + + if not parsed_ranking: + continue + + # Record where this ranker placed each model + for position, label in enumerate(parsed_ranking, start=1): + if label in label_to_model: + model_name = label_to_model[label] + model_rankings_by_ranker[model_name].append((ranker_model, position)) + + # Detect minority opinions for each model + minority_opinions = [] + + for model_name, rankings in model_rankings_by_ranker.items(): + if model_name not in consensus_positions: + continue + + consensus_pos = consensus_positions[model_name] + total_rankers = len(rankings) + + if total_rankers == 0: + continue + + # Find dissenters: rankers who placed this model far from consensus + dissenters = [] + dissent_positions = [] + + for ranker_model, ranker_position in rankings: + position_diff = abs(ranker_position - consensus_pos) + if position_diff > position_tolerance: + dissenters.append(ranker_model) + dissent_positions.append(ranker_position) + + dissent_rate = len(dissenters) / total_rankers + + # Only report if dissent rate meets threshold + if dissent_rate >= dissent_threshold and dissenters: + # Determine direction: are dissenters ranking it higher or lower? + avg_dissent_pos = sum(dissent_positions) / len(dissent_positions) + if avg_dissent_pos > consensus_pos: + direction = "overvalued" # consensus ranks it higher than dissenters think + else: + direction = "undervalued" # consensus ranks it lower than dissenters think + + minority_opinions.append({ + "model": model_name, + "consensus_position": consensus_pos, + "dissent_positions": sorted(set(dissent_positions)), + "dissent_rate": round(dissent_rate, 2), + "dissenters": dissenters, + "direction": direction + }) + + # Sort by dissent rate (highest first) + minority_opinions.sort(key=lambda x: -x['dissent_rate']) + + return minority_opinions + + async def generate_conversation_title(user_query: str) -> str: """ Generate a short title for a conversation based on the first user message. @@ -316,8 +569,14 @@ async def run_full_council(user_query: str) -> Tuple[List, List, Dict, Dict]: # Stage 2: Collect rankings stage2_results, label_to_model = await stage2_collect_rankings(user_query, stage1_results) - # Calculate aggregate rankings + # Calculate aggregate rankings (both methods) aggregate_rankings = calculate_aggregate_rankings(stage2_results, label_to_model) + tournament_rankings = calculate_tournament_rankings(stage2_results, label_to_model) + + # Detect minority opinions + minority_opinions = detect_minority_opinions( + stage2_results, label_to_model, tournament_rankings + ) # Stage 3: Synthesize final answer stage3_result = await stage3_synthesize_final( @@ -329,7 +588,9 @@ async def run_full_council(user_query: str) -> Tuple[List, List, Dict, Dict]: # Prepare metadata metadata = { "label_to_model": label_to_model, - "aggregate_rankings": aggregate_rankings + "aggregate_rankings": aggregate_rankings, + "tournament_rankings": tournament_rankings, + "minority_opinions": minority_opinions } return stage1_results, stage2_results, stage3_result, metadata diff --git a/frontend/src/components/ChatInterface.jsx b/frontend/src/components/ChatInterface.jsx index 3ae796caa..096bf3829 100644 --- a/frontend/src/components/ChatInterface.jsx +++ b/frontend/src/components/ChatInterface.jsx @@ -93,6 +93,7 @@ export default function ChatInterface({ rankings={msg.stage2} labelToModel={msg.metadata?.label_to_model} aggregateRankings={msg.metadata?.aggregate_rankings} + minorityOpinions={msg.metadata?.minority_opinions} /> )} diff --git a/frontend/src/components/Stage2.css b/frontend/src/components/Stage2.css index 99c460a6f..583ad5d5d 100644 --- a/frontend/src/components/Stage2.css +++ b/frontend/src/components/Stage2.css @@ -151,3 +151,80 @@ color: #999; font-size: 12px; } + +/* Minority Opinions */ +.minority-opinions { + background: #fff8e6; + padding: 16px; + border-radius: 8px; + margin-top: 20px; + border: 2px solid #ffd666; +} + +.minority-opinions h4 { + margin: 0 0 12px 0; + color: #ad6800; + font-size: 15px; +} + +.minority-list { + display: flex; + flex-direction: column; + gap: 12px; +} + +.minority-item { + background: #ffffff; + padding: 12px; + border-radius: 6px; + border: 1px solid #ffd666; +} + +.minority-header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 8px; +} + +.minority-model { + font-family: monospace; + font-size: 14px; + font-weight: 600; + color: #333; +} + +.minority-direction { + padding: 2px 8px; + border-radius: 4px; + font-size: 12px; + font-weight: 600; +} + +.minority-direction.overvalued { + background: #fff1f0; + color: #cf1322; +} + +.minority-direction.undervalued { + background: #f6ffed; + color: #389e0d; +} + +.minority-details { + display: flex; + gap: 16px; + flex-wrap: wrap; + margin-bottom: 8px; +} + +.minority-stat { + font-size: 13px; + color: #666; +} + +.minority-dissenters { + font-size: 12px; + color: #888; + font-style: italic; +} diff --git a/frontend/src/components/Stage2.jsx b/frontend/src/components/Stage2.jsx index 2550fa691..ddef4b91f 100644 --- a/frontend/src/components/Stage2.jsx +++ b/frontend/src/components/Stage2.jsx @@ -14,7 +14,7 @@ function deAnonymizeText(text, labelToModel) { return result; } -export default function Stage2({ rankings, labelToModel, aggregateRankings }) { +export default function Stage2({ rankings, labelToModel, aggregateRankings, minorityOpinions }) { const [activeTab, setActiveTab] = useState(0); if (!rankings || rankings.length === 0) { @@ -94,6 +94,43 @@ export default function Stage2({ rankings, labelToModel, aggregateRankings }) { )} + + {minorityOpinions && minorityOpinions.length > 0 && ( +
+

Minority Opinions

+

+ Significant disagreement detected (30% or more of rankers dissent): +

+
+ {minorityOpinions.map((opinion, index) => ( +
+
+ + {opinion.model.split('/')[1] || opinion.model} + + + {opinion.direction === 'overvalued' ? '↓ Overvalued' : '↑ Undervalued'} + +
+
+ + Consensus: #{opinion.consensus_position} + + + Dissenters say: #{opinion.dissent_positions.join(', #')} + + + {Math.round(opinion.dissent_rate * 100)}% disagree + +
+
+ Dissenters: {opinion.dissenters.map(d => d.split('/')[1] || d).join(', ')} +
+
+ ))} +
+
+ )} ); } diff --git a/tests/test_minority_opinions.py b/tests/test_minority_opinions.py new file mode 100644 index 000000000..ab2ee0d7c --- /dev/null +++ b/tests/test_minority_opinions.py @@ -0,0 +1,291 @@ +"""Tests for minority opinion detection.""" + +import sys +sys.path.insert(0, '/tmp/llm-council') + +from backend.council import detect_minority_opinions, parse_ranking_from_text + + +def make_stage2_entry(model: str, ranking_order: list) -> dict: + """Helper to create stage2 result entries with proper structure.""" + ranking_text = "FINAL RANKING:\n" + "\n".join( + f"{i+1}. {label}" for i, label in enumerate(ranking_order) + ) + return { + "model": model, + "ranking": ranking_text, + "parsed_ranking": ranking_order + } + + +def test_no_minority_when_consensus(): + """When all rankers agree, no minority opinions should be detected.""" + # All 3 rankers agree on the same order + stage2_results = [ + make_stage2_entry("model_a", ["Response A", "Response B", "Response C"]), + make_stage2_entry("model_b", ["Response A", "Response B", "Response C"]), + make_stage2_entry("model_c", ["Response A", "Response B", "Response C"]), + ] + + label_to_model = { + "Response A": "model_a", + "Response B": "model_b", + "Response C": "model_c" + } + + # Tournament rankings (consensus) + tournament_rankings = [ + {"model": "model_a", "wins": 2, "win_percentage": 1.0}, + {"model": "model_b", "wins": 1, "win_percentage": 0.5}, + {"model": "model_c", "wins": 0, "win_percentage": 0.0}, + ] + + minority = detect_minority_opinions(stage2_results, label_to_model, tournament_rankings) + + assert len(minority) == 0, f"Expected no minority opinions, got {minority}" + print("✓ No minority when consensus - PASSED") + + +def test_minority_detected_with_dissent(): + """When 1 of 3 rankers (33%) disagrees significantly, minority should be detected.""" + # 2 rankers say A is #1, 1 ranker says A is #3 (significant disagreement) + stage2_results = [ + make_stage2_entry("model_a", ["Response A", "Response B", "Response C"]), + make_stage2_entry("model_b", ["Response A", "Response B", "Response C"]), + make_stage2_entry("model_c", ["Response B", "Response C", "Response A"]), # Disagrees on A + ] + + label_to_model = { + "Response A": "model_a", + "Response B": "model_b", + "Response C": "model_c" + } + + # Tournament has model_a at position 1 + tournament_rankings = [ + {"model": "model_a", "wins": 2, "win_percentage": 1.0}, + {"model": "model_b", "wins": 1, "win_percentage": 0.5}, + {"model": "model_c", "wins": 0, "win_percentage": 0.0}, + ] + + minority = detect_minority_opinions(stage2_results, label_to_model, tournament_rankings) + + # model_a: consensus position 1, but model_c placed it at position 3 + # That's 1/3 = 33% dissent, which meets the 30% threshold + # Position difference is 2 (1->3), which exceeds tolerance of 1 + assert len(minority) >= 1, f"Expected at least 1 minority opinion, got {minority}" + + model_a_minority = next((m for m in minority if m["model"] == "model_a"), None) + assert model_a_minority is not None, "Expected minority opinion for model_a" + assert model_a_minority["consensus_position"] == 1 + assert 3 in model_a_minority["dissent_positions"] + assert model_a_minority["dissent_rate"] >= 0.3 + assert "model_c" in model_a_minority["dissenters"] + assert model_a_minority["direction"] == "overvalued" # consensus ranks higher than dissenter thinks + + print("✓ Minority detected with dissent - PASSED") + + +def test_minority_direction_undervalued(): + """Test that 'undervalued' direction is correctly identified.""" + # Consensus has model_c at #3, but one ranker thinks it should be #1 + stage2_results = [ + make_stage2_entry("model_a", ["Response A", "Response B", "Response C"]), + make_stage2_entry("model_b", ["Response A", "Response B", "Response C"]), + make_stage2_entry("model_c", ["Response C", "Response A", "Response B"]), # Thinks C is best + ] + + label_to_model = { + "Response A": "model_a", + "Response B": "model_b", + "Response C": "model_c" + } + + # Tournament has model_c at position 3 + tournament_rankings = [ + {"model": "model_a", "wins": 2, "win_percentage": 1.0}, + {"model": "model_b", "wins": 1, "win_percentage": 0.5}, + {"model": "model_c", "wins": 0, "win_percentage": 0.0}, + ] + + minority = detect_minority_opinions(stage2_results, label_to_model, tournament_rankings) + + model_c_minority = next((m for m in minority if m["model"] == "model_c"), None) + if model_c_minority: + # Consensus position 3, dissenter placed at 1 -> undervalued + assert model_c_minority["direction"] == "undervalued", f"Expected undervalued, got {model_c_minority['direction']}" + print("✓ Minority direction undervalued - PASSED") + else: + print("✓ No minority for model_c (within tolerance) - PASSED") + + +def test_below_threshold_not_flagged(): + """When dissent rate is below 30%, no minority should be flagged.""" + # 4 rankers, only 1 disagrees = 25% < 30% threshold + stage2_results = [ + make_stage2_entry("model_a", ["Response A", "Response B", "Response C", "Response D"]), + make_stage2_entry("model_b", ["Response A", "Response B", "Response C", "Response D"]), + make_stage2_entry("model_c", ["Response A", "Response B", "Response C", "Response D"]), + make_stage2_entry("model_d", ["Response D", "Response C", "Response B", "Response A"]), # One dissenter + ] + + label_to_model = { + "Response A": "model_a", + "Response B": "model_b", + "Response C": "model_c", + "Response D": "model_d" + } + + tournament_rankings = [ + {"model": "model_a", "wins": 3, "win_percentage": 1.0}, + {"model": "model_b", "wins": 2, "win_percentage": 0.67}, + {"model": "model_c", "wins": 1, "win_percentage": 0.33}, + {"model": "model_d", "wins": 0, "win_percentage": 0.0}, + ] + + minority = detect_minority_opinions(stage2_results, label_to_model, tournament_rankings) + + # With 25% dissent (1/4), should not meet 30% threshold + # Note: model_a goes from 1 to 4 (diff 3), model_d goes from 4 to 1 (diff 3) + # Both have only 1 dissenter out of 4, so 25% < 30% + for m in minority: + assert m["dissent_rate"] >= 0.3, f"Should not flag below threshold: {m}" + + print("✓ Below threshold not flagged - PASSED") + + +def test_within_tolerance_not_flagged(): + """Disagreement within position tolerance should not be flagged.""" + # All rankers within 1 position of each other + stage2_results = [ + make_stage2_entry("model_a", ["Response A", "Response B", "Response C"]), + make_stage2_entry("model_b", ["Response B", "Response A", "Response C"]), # A and B swapped + make_stage2_entry("model_c", ["Response A", "Response B", "Response C"]), + ] + + label_to_model = { + "Response A": "model_a", + "Response B": "model_b", + "Response C": "model_c" + } + + tournament_rankings = [ + {"model": "model_a", "wins": 2, "win_percentage": 1.0}, + {"model": "model_b", "wins": 1, "win_percentage": 0.5}, + {"model": "model_c", "wins": 0, "win_percentage": 0.0}, + ] + + minority = detect_minority_opinions(stage2_results, label_to_model, tournament_rankings) + + # model_a: consensus pos 1, one ranker put at pos 2 -> diff of 1, within tolerance + # Should not be flagged + assert len(minority) == 0, f"Expected no minority (within tolerance), got {minority}" + print("✓ Within tolerance not flagged - PASSED") + + +def test_empty_inputs(): + """Empty inputs should return empty list.""" + assert detect_minority_opinions([], {}, []) == [] + assert detect_minority_opinions([], {"Response A": "model_a"}, []) == [] + assert detect_minority_opinions( + [make_stage2_entry("model_a", ["Response A"])], + {"Response A": "model_a"}, + [] + ) == [] + print("✓ Empty inputs - PASSED") + + +def test_5_model_realistic_scenario(): + """Realistic 5-model council with mixed agreement.""" + # 5 models, with 2 strongly disagreeing about model_c + stage2_results = [ + make_stage2_entry("gpt-4", ["Response A", "Response B", "Response C", "Response D", "Response E"]), + make_stage2_entry("claude", ["Response A", "Response B", "Response C", "Response D", "Response E"]), + make_stage2_entry("gemini", ["Response A", "Response B", "Response C", "Response D", "Response E"]), + # These 2 (40%) think model_c should be #1, not #3 + make_stage2_entry("grok", ["Response C", "Response A", "Response B", "Response D", "Response E"]), + make_stage2_entry("llama", ["Response C", "Response A", "Response B", "Response D", "Response E"]), + ] + + label_to_model = { + "Response A": "gpt-4", + "Response B": "claude", + "Response C": "gemini", + "Response D": "grok", + "Response E": "llama" + } + + # Consensus from majority + tournament_rankings = [ + {"model": "gpt-4", "wins": 4, "win_percentage": 1.0}, + {"model": "claude", "wins": 3, "win_percentage": 0.75}, + {"model": "gemini", "wins": 2, "win_percentage": 0.5}, + {"model": "grok", "wins": 1, "win_percentage": 0.25}, + {"model": "llama", "wins": 0, "win_percentage": 0.0}, + ] + + minority = detect_minority_opinions(stage2_results, label_to_model, tournament_rankings) + + # gemini: consensus #3, but 2/5 (40%) placed it at #1 + # Diff of 2 positions exceeds tolerance of 1 + gemini_minority = next((m for m in minority if m["model"] == "gemini"), None) + assert gemini_minority is not None, f"Expected minority for gemini, got {minority}" + assert gemini_minority["consensus_position"] == 3 + assert 1 in gemini_minority["dissent_positions"] + assert gemini_minority["dissent_rate"] == 0.4 # 2/5 + assert set(gemini_minority["dissenters"]) == {"grok", "llama"} + assert gemini_minority["direction"] == "undervalued" + + print("✓ 5-model realistic scenario - PASSED") + + +def test_custom_threshold(): + """Test with custom dissent threshold.""" + stage2_results = [ + make_stage2_entry("model_a", ["Response A", "Response B", "Response C"]), + make_stage2_entry("model_b", ["Response A", "Response B", "Response C"]), + make_stage2_entry("model_c", ["Response C", "Response B", "Response A"]), + ] + + label_to_model = { + "Response A": "model_a", + "Response B": "model_b", + "Response C": "model_c" + } + + tournament_rankings = [ + {"model": "model_a", "wins": 2, "win_percentage": 1.0}, + {"model": "model_b", "wins": 1, "win_percentage": 0.5}, + {"model": "model_c", "wins": 0, "win_percentage": 0.0}, + ] + + # With 50% threshold, 33% dissent should not be flagged + minority_50 = detect_minority_opinions( + stage2_results, label_to_model, tournament_rankings, + dissent_threshold=0.5 + ) + assert len(minority_50) == 0, f"50% threshold should filter out 33% dissent: {minority_50}" + + # With 20% threshold, 33% dissent should be flagged + minority_20 = detect_minority_opinions( + stage2_results, label_to_model, tournament_rankings, + dissent_threshold=0.2 + ) + assert len(minority_20) > 0, "20% threshold should catch 33% dissent" + + print("✓ Custom threshold - PASSED") + + +if __name__ == "__main__": + test_no_minority_when_consensus() + test_minority_detected_with_dissent() + test_minority_direction_undervalued() + test_below_threshold_not_flagged() + test_within_tolerance_not_flagged() + test_empty_inputs() + test_5_model_realistic_scenario() + test_custom_threshold() + + print("\n" + "="*50) + print("All minority opinion tests passed!") + print("="*50)