44 changes: 44 additions & 0 deletions README.md
@@ -85,3 +85,47 @@ Then open http://localhost:5173 in your browser.
- **Frontend:** React + Vite, react-markdown for rendering
- **Storage:** JSON files in `data/conversations/`
- **Package Management:** uv for Python, npm for JavaScript

## Ranking Algorithms

The council uses two methods to aggregate peer rankings from Stage 2:

### Mean Position Averaging
The original method calculates each model's average position across all rankings. Simple but susceptible to outlier rankings.
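As a rough sketch (not the repository's implementation; the helper name is illustrative), mean position averaging reduces to averaging each model's 1-indexed position over all rankings, using the same three rankings as the worked example below:

```python
# Minimal sketch of mean position averaging; each ranking is a best-to-worst list.
def mean_positions(rankings):
    totals, counts = {}, {}
    for ranking in rankings:
        for position, model in enumerate(ranking, start=1):
            totals[model] = totals.get(model, 0) + position
            counts[model] = counts.get(model, 0) + 1
    return {model: totals[model] / counts[model] for model in totals}

rankings = [["A", "B", "C"], ["B", "A", "C"], ["C", "A", "B"]]
print(mean_positions(rankings))  # A: 1.67, B: 2.00, C: 2.33 (rounded)
```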

### Tournament-Style Pairwise Comparison
A more robust method that counts head-to-head wins between each pair of models. For each pair (A, B), we count how many rankers preferred A over B. The model with more pairwise victories wins that matchup.

**Why tournament ranking is more robust:**

Consider a 3-model council where Models A, B, C all rank themselves first (self-promotion bias):
- Model A ranks: A=1, B=2, C=3
- Model B ranks: B=1, A=2, C=3
- Model C ranks: C=1, A=2, B=3

Mean ranking results:
| Model | Positions | Average |
|-------|-----------|---------|
| A | 1, 2, 2 | 1.67 |
| B | 2, 1, 3 | 2.00 |
| C | 3, 3, 1 | 2.33 |

Tournament results:
| Model | vs A | vs B | vs C | Win% |
|-------|------|------|------|------|
| A | - | 2-1 | 2-1 | 100% |
| B | 1-2 | - | 2-1 | 50% |
| C | 1-2 | 1-2 | - | 0% |

Model A wins both pairwise matchups (2-1 against B, 2-1 against C), so the tournament method places it first. In this example the mean ranking reaches the same order; the tournament method's advantage shows up once outlier votes enter the picture, as the outlier example below shows.
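
The pairwise counts above can be reproduced with a short standalone sketch (illustrative only; the project's `calculate_tournament_rankings` in `backend/council.py` works on labeled Stage 2 results rather than raw lists):

```python
# Minimal sketch of head-to-head counting; each ranking is a best-to-worst list.
from itertools import combinations

def pairwise_wins(rankings):
    models = sorted({model for ranking in rankings for model in ranking})
    wins = {model: 0 for model in models}
    for a, b in combinations(models, 2):
        a_pref = sum(1 for ranking in rankings if ranking.index(a) < ranking.index(b))
        b_pref = len(rankings) - a_pref
        if a_pref > b_pref:
            wins[a] += 1          # a takes the matchup
        elif b_pref > a_pref:
            wins[b] += 1          # b takes the matchup
        # an even split would be a tie (worth 0.5 each in the real scoring)
    return wins

rankings = [["A", "B", "C"],  # Model A's ranking
            ["B", "A", "C"],  # Model B's ranking
            ["C", "A", "B"]]  # Model C's ranking
print(pairwise_wins(rankings))  # {'A': 2, 'B': 1, 'C': 0} -> 100%, 50%, 0%
```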

**Outlier robustness validation:**

When one ranker places Model A last (an outlier vote), mean ranking degrades A's average position from 1.0 to 1.5, while tournament ranking keeps A at a 100% win rate because A still wins the majority of its head-to-head comparisons. This demonstrates tournament ranking's robustness to strategic voting and outliers.
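
Reusing `pairwise_wins` from the sketch above, a four-ranker fixture with one outlier ballot reproduces this shift (the fixture is an illustrative assumption, not the repository's actual test data):

```python
# One ranker (the last ballot) places A dead last.
outlier_rankings = [["A", "B", "C"],
                    ["A", "B", "C"],
                    ["A", "C", "B"],
                    ["B", "C", "A"]]  # outlier ballot

positions_a = [ranking.index("A") + 1 for ranking in outlier_rankings]
print(sum(positions_a) / len(positions_a))   # 1.5 -- mean position degrades from 1.0
print(pairwise_wins(outlier_rankings)["A"])  # 2  -- A still wins both matchups (100%)
```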

**Validation tests verify:**
- Pairwise comparison math correctness
- Tie handling (0.5 points awarded to each model; see the sketch after this list)
- Edge cases (single model, empty rankings)
- Fallback parsing from raw ranking text
- Realistic 5-model council scenarios
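
For example, a tie-handling check might look roughly like the sketch below (the import path and fixture shape are assumptions based on `backend/council.py`; adjust to the actual test layout):

```python
# Two rankers who disagree completely produce one tied matchup,
# so each model should end up with a 0.5 win percentage.
from backend.council import calculate_tournament_rankings

def test_tie_gives_half_win_percentage():
    label_to_model = {"Response A": "model-one", "Response B": "model-two"}
    stage2_results = [
        {"model": "model-one", "parsed_ranking": ["Response A", "Response B"]},
        {"model": "model-two", "parsed_ranking": ["Response B", "Response A"]},
    ]
    results = calculate_tournament_rankings(stage2_results, label_to_model)
    assert all(entry["ties"] == 1.0 for entry in results)
    assert all(entry["win_percentage"] == 0.5 for entry in results)
```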
265 changes: 263 additions & 2 deletions backend/council.py
@@ -255,6 +255,259 @@ def calculate_aggregate_rankings(
return aggregate


def calculate_tournament_rankings(
stage2_results: List[Dict[str, Any]],
label_to_model: Dict[str, str]
) -> List[Dict[str, Any]]:
"""
Calculate rankings using tournament-style pairwise comparison.

For each pair of models, count how many rankers preferred one over the other.
The model with more pairwise wins ranks higher. This method is more robust
to outlier rankings than simple position averaging.

Args:
stage2_results: Rankings from each model with parsed_ranking
label_to_model: Mapping from anonymous labels to model names

Returns:
List of dicts sorted by win_percentage (descending):
[
{
"model": "openai/gpt-4o",
"wins": 4.0,
"losses": 1.0,
"ties": 1.0,
"win_percentage": 0.75,
"total_matchups": 6
},
...
]
"""
from collections import defaultdict

# Get all models from label_to_model
models = list(set(label_to_model.values()))

if len(models) < 2:
# Need at least 2 models for pairwise comparison
return [{"model": m, "wins": 0, "losses": 0, "ties": 0, "win_percentage": 0.0, "total_matchups": 0} for m in models]

    # Track pairwise wins, keyed by the alphabetically ordered pair plus the winning side:
    # pairwise_wins[(model_a, model_b, 'a')] = number of rankers who placed model_a above model_b
pairwise_wins = defaultdict(int)

# Process each ranker's parsed ranking
# Use pre-parsed ranking if available, otherwise parse from text
for ranking in stage2_results:
parsed_ranking = ranking.get('parsed_ranking')
if not parsed_ranking:
# Fallback: parse from raw ranking text (consistent with calculate_aggregate_rankings)
ranking_text = ranking.get('ranking', '')
parsed_ranking = parse_ranking_from_text(ranking_text) if ranking_text else []

if not parsed_ranking:
continue

# Convert labels to model names and get their positions
model_positions = {}
for position, label in enumerate(parsed_ranking):
if label in label_to_model:
model_name = label_to_model[label]
model_positions[model_name] = position

# For each pair of models, record who was ranked higher (lower position = better)
ranked_models = list(model_positions.keys())
for i in range(len(ranked_models)):
for j in range(i + 1, len(ranked_models)):
model_a = ranked_models[i]
model_b = ranked_models[j]
pos_a = model_positions[model_a]
pos_b = model_positions[model_b]

# Ensure consistent ordering for the key
if model_a > model_b:
model_a, model_b = model_b, model_a
pos_a, pos_b = pos_b, pos_a

if pos_a < pos_b:
pairwise_wins[(model_a, model_b, 'a')] += 1
elif pos_b < pos_a:
pairwise_wins[(model_a, model_b, 'b')] += 1
# Equal positions would be a tie (shouldn't happen with rankings)

# Calculate wins, losses, and ties for each model
model_stats = {model: {"wins": 0.0, "losses": 0.0, "ties": 0.0} for model in models}

# Process each unique pair of models
processed_pairs = set()
for i in range(len(models)):
for j in range(i + 1, len(models)):
model_a, model_b = models[i], models[j]
if model_a > model_b:
model_a, model_b = model_b, model_a

pair_key = (model_a, model_b)
if pair_key in processed_pairs:
continue
processed_pairs.add(pair_key)

a_wins = pairwise_wins.get((model_a, model_b, 'a'), 0)
b_wins = pairwise_wins.get((model_a, model_b, 'b'), 0)

if a_wins > b_wins:
model_stats[model_a]["wins"] += 1
model_stats[model_b]["losses"] += 1
elif b_wins > a_wins:
model_stats[model_b]["wins"] += 1
model_stats[model_a]["losses"] += 1
            elif a_wins == b_wins and a_wins > 0:
                # Tie: each model records a tie, credited as 0.5 toward its win percentage below
model_stats[model_a]["ties"] += 1
model_stats[model_b]["ties"] += 1

# Calculate win percentage and build results
total_possible_matchups = len(models) - 1 if len(models) > 1 else 1
results = []

for model in models:
stats = model_stats[model]
total_matchups = stats["wins"] + stats["losses"] + stats["ties"]
        # Win percentage: (wins + 0.5 * ties) / number of possible matchups (len(models) - 1)
if total_matchups > 0:
win_pct = (stats["wins"] + 0.5 * stats["ties"]) / total_possible_matchups
else:
win_pct = 0.0

results.append({
"model": model,
"wins": stats["wins"],
"losses": stats["losses"],
"ties": stats["ties"],
"win_percentage": round(win_pct, 3),
"total_matchups": int(total_matchups)
})

# Sort by win percentage (higher is better)
results.sort(key=lambda x: (-x['win_percentage'], x['losses']))

return results


def detect_minority_opinions(
stage2_results: List[Dict[str, Any]],
label_to_model: Dict[str, str],
tournament_rankings: List[Dict[str, Any]],
dissent_threshold: float = 0.3,
position_tolerance: int = 1
) -> List[Dict[str, Any]]:
"""
Detect minority opinions where a significant portion of rankers disagree
with the consensus ranking for a specific model.

A minority opinion is flagged when ≥dissent_threshold of rankers place a model
more than position_tolerance positions away from its consensus position.

Args:
stage2_results: Rankings from each model with parsed_ranking
label_to_model: Mapping from anonymous labels to model names
tournament_rankings: Consensus ranking from tournament method
dissent_threshold: Minimum fraction of rankers that must disagree (default 0.3 = 30%)
position_tolerance: How many positions away counts as disagreement (default 1)

Returns:
List of minority opinion dicts:
[
{
"model": "openai/gpt-4o",
"consensus_position": 1,
"dissent_positions": [3, 4], # where dissenters placed it
"dissent_rate": 0.4,
"dissenters": ["anthropic/claude-3.5-sonnet", "google/gemini-2.0-flash"],
"direction": "undervalued" # or "overvalued" - dissenters think it's worse/better
},
...
]
"""
from collections import defaultdict

if not stage2_results or not tournament_rankings:
return []

# Build consensus position lookup from tournament rankings
consensus_positions = {
entry["model"]: position + 1 # 1-indexed
for position, entry in enumerate(tournament_rankings)
}

# Track each ranker's position for each model
# Structure: {model_name: [(ranker_model, position), ...]}
model_rankings_by_ranker = defaultdict(list)

for ranking in stage2_results:
ranker_model = ranking.get('model')
parsed_ranking = ranking.get('parsed_ranking')
if not parsed_ranking:
ranking_text = ranking.get('ranking', '')
parsed_ranking = parse_ranking_from_text(ranking_text) if ranking_text else []

if not parsed_ranking:
continue

# Record where this ranker placed each model
for position, label in enumerate(parsed_ranking, start=1):
if label in label_to_model:
model_name = label_to_model[label]
model_rankings_by_ranker[model_name].append((ranker_model, position))

# Detect minority opinions for each model
minority_opinions = []

for model_name, rankings in model_rankings_by_ranker.items():
if model_name not in consensus_positions:
continue

consensus_pos = consensus_positions[model_name]
total_rankers = len(rankings)

if total_rankers == 0:
continue

# Find dissenters: rankers who placed this model far from consensus
dissenters = []
dissent_positions = []

for ranker_model, ranker_position in rankings:
position_diff = abs(ranker_position - consensus_pos)
if position_diff > position_tolerance:
dissenters.append(ranker_model)
dissent_positions.append(ranker_position)

dissent_rate = len(dissenters) / total_rankers

# Only report if dissent rate meets threshold
if dissent_rate >= dissent_threshold and dissenters:
# Determine direction: are dissenters ranking it higher or lower?
avg_dissent_pos = sum(dissent_positions) / len(dissent_positions)
if avg_dissent_pos > consensus_pos:
direction = "overvalued" # consensus ranks it higher than dissenters think
else:
direction = "undervalued" # consensus ranks it lower than dissenters think

minority_opinions.append({
"model": model_name,
"consensus_position": consensus_pos,
"dissent_positions": sorted(set(dissent_positions)),
"dissent_rate": round(dissent_rate, 2),
"dissenters": dissenters,
"direction": direction
})

# Sort by dissent rate (highest first)
minority_opinions.sort(key=lambda x: -x['dissent_rate'])

return minority_opinions


async def generate_conversation_title(user_query: str) -> str:
"""
Generate a short title for a conversation based on the first user message.
@@ -316,8 +569,14 @@ async def run_full_council(user_query: str) -> Tuple[List, List, Dict, Dict]:
# Stage 2: Collect rankings
stage2_results, label_to_model = await stage2_collect_rankings(user_query, stage1_results)

# Calculate aggregate rankings
# Calculate aggregate rankings (both methods)
aggregate_rankings = calculate_aggregate_rankings(stage2_results, label_to_model)
tournament_rankings = calculate_tournament_rankings(stage2_results, label_to_model)

# Detect minority opinions
minority_opinions = detect_minority_opinions(
stage2_results, label_to_model, tournament_rankings
)

# Stage 3: Synthesize final answer
stage3_result = await stage3_synthesize_final(
@@ -329,7 +588,9 @@ async def run_full_council(
# Prepare metadata
metadata = {
"label_to_model": label_to_model,
"aggregate_rankings": aggregate_rankings
"aggregate_rankings": aggregate_rankings,
"tournament_rankings": tournament_rankings,
"minority_opinions": minority_opinions
}

return stage1_results, stage2_results, stage3_result, metadata
1 change: 1 addition & 0 deletions frontend/src/components/ChatInterface.jsx
@@ -93,6 +93,7 @@ export default function ChatInterface({
rankings={msg.stage2}
labelToModel={msg.metadata?.label_to_model}
aggregateRankings={msg.metadata?.aggregate_rankings}
minorityOpinions={msg.metadata?.minority_opinions}
/>
)}
