44 changes: 44 additions & 0 deletions README.md
@@ -85,3 +85,47 @@ Then open http://localhost:5173 in your browser.
- **Frontend:** React + Vite, react-markdown for rendering
- **Storage:** JSON files in `data/conversations/`
- **Package Management:** uv for Python, npm for JavaScript

## Ranking Algorithms

The council uses two methods to aggregate peer rankings from Stage 2:

### Mean Position Averaging
The original method averages each model's position across all rankings. It is simple, but a single outlier ranking shifts a model's average directly.
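
As a minimal sketch of the averaging step (positions are 1-based; lower is better):

```python
from statistics import mean

# Positions one model received from three rankers (1 = ranked first).
positions = [1, 2, 2]
print(round(mean(positions), 2))  # 1.67 — one outlier position moves this number directly
```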

### Tournament-Style Pairwise Comparison
A more robust method that scores head-to-head matchups between each pair of models: for each pair (A, B), count how many rankers placed A above B, and the model preferred by more rankers wins that matchup. Models are then ranked by the share of matchups they win.

**Why tournament ranking is more robust:**

Consider a 3-model council where Models A, B, C all rank themselves first (self-promotion bias):
- Model A ranks: A=1, B=2, C=3
- Model B ranks: B=1, A=2, C=3
- Model C ranks: C=1, A=2, B=3

Mean ranking results:
| Model | Positions | Average |
|-------|-----------|---------|
| A | 1, 2, 2 | 1.67 |
| B | 2, 1, 3 | 2.00 |
| C | 3, 3, 1 | 2.33 |

Tournament results:
| Model | vs A | vs B | vs C | Win% |
|-------|------|------|------|------|
| A | - | 2-1 | 2-1 | 100% |
| B | 1-2 | - | 2-1 | 50% |
| C | 1-2 | 1-2 | - | 0% |

Model A wins both of its pairwise matchups (2-1 against B and 2-1 against C), so the tournament method places it first. In this particular example the mean ranking also puts A first; the two methods diverge once outlier votes are involved, as shown below.
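
To check the arithmetic, here is a small standalone sketch (plain Python, independent of the council code) that reproduces the pairwise counts above:

```python
from itertools import combinations

# Each ranker's ordering, best first (the self-promotion example above).
rankings = [
    ["A", "B", "C"],  # Model A's ranking
    ["B", "A", "C"],  # Model B's ranking
    ["C", "A", "B"],  # Model C's ranking
]

# For each pair, count how many rankers placed the first model above the second.
for a, b in combinations(["A", "B", "C"], 2):
    a_wins = sum(r.index(a) < r.index(b) for r in rankings)
    print(f"{a} vs {b}: {a_wins}-{len(rankings) - a_wins}")
# A vs B: 2-1
# A vs C: 2-1
# B vs C: 2-1
```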

**Outlier robustness validation:**

When every other ranker places Model A first but one ranker places it last (an outlier vote), A's mean position degrades from 1.0 to 1.5, while its tournament record stays at a 100% win rate because A still wins each head-to-head matchup on a majority of votes. This demonstrates tournament ranking's robustness to strategic voting and outliers, as the sketch below illustrates.
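
The exact fixture lives in the validation tests; as an illustration only, assume four rankers of whom three place A first and one places it last (which matches the 1.0 → 1.5 shift quoted above):

```python
from itertools import combinations
from statistics import mean

rankings = [
    ["A", "B", "C"],
    ["A", "C", "B"],
    ["A", "B", "C"],
    ["B", "C", "A"],  # outlier: ranks A last
]

# Mean position of A degrades because of the single outlier vote.
print(round(mean(r.index("A") + 1 for r in rankings), 2))  # 1.5

# Pairwise, A still beats B and C by 3 votes to 1, so its win rate stays 100%.
for a, b in combinations(["A", "B", "C"], 2):
    a_wins = sum(r.index(a) < r.index(b) for r in rankings)
    print(f"{a} vs {b}: {a_wins}-{len(rankings) - a_wins}")
```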

**Validation tests verify** (one such check is sketched after this list):
- Pairwise comparison math correctness
- Tie handling (0.5 points awarded to each model)
- Edge cases (single model, empty rankings)
- Fallback parsing from raw ranking text
- Realistic 5-model council scenarios
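
The tests themselves are not shown in this diff; a sketch of what the tie-handling check might look like (the module path, label strings, and fixture shape are assumptions, not the repository's actual test code):

```python
from backend.council import calculate_tournament_rankings  # import path is an assumption


def test_pairwise_tie_awards_half_win_to_each_model():
    label_to_model = {"A": "model-1", "B": "model-2"}  # illustrative labels
    stage2_results = [
        {"parsed_ranking": ["A", "B"]},  # first ranker prefers model-1
        {"parsed_ranking": ["B", "A"]},  # second ranker prefers model-2
    ]
    results = calculate_tournament_rankings(stage2_results, label_to_model)
    assert all(r["ties"] == 1.0 for r in results)
    assert all(r["win_percentage"] == 0.5 for r in results)
```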
144 changes: 142 additions & 2 deletions backend/council.py
@@ -255,6 +255,144 @@ def calculate_aggregate_rankings(
return aggregate


def calculate_tournament_rankings(
stage2_results: List[Dict[str, Any]],
label_to_model: Dict[str, str]
) -> List[Dict[str, Any]]:
"""
Calculate rankings using tournament-style pairwise comparison.

For each pair of models, count how many rankers preferred one over the other.
The model with more pairwise wins ranks higher. This method is more robust
to outlier rankings than simple position averaging.

Args:
stage2_results: Rankings from each model with parsed_ranking
label_to_model: Mapping from anonymous labels to model names

Returns:
List of dicts sorted by win_percentage (descending):
[
{
"model": "openai/gpt-4o",
"wins": 4.0,
"losses": 1.0,
"ties": 1.0,
"win_percentage": 0.75,
"total_matchups": 6
},
...
]
"""
from collections import defaultdict

# Get all models from label_to_model
models = list(set(label_to_model.values()))

if len(models) < 2:
# Need at least 2 models for pairwise comparison
return [{"model": m, "wins": 0, "losses": 0, "ties": 0, "win_percentage": 0.0, "total_matchups": 0} for m in models]

    # Track pairwise wins keyed by (model_a, model_b, side), where the pair is kept in
    # sorted order and side is 'a' or 'b' for whichever model was ranked higher
pairwise_wins = defaultdict(int)

# Process each ranker's parsed ranking
# Use pre-parsed ranking if available, otherwise parse from text
for ranking in stage2_results:
parsed_ranking = ranking.get('parsed_ranking')
if not parsed_ranking:
# Fallback: parse from raw ranking text (consistent with calculate_aggregate_rankings)
ranking_text = ranking.get('ranking', '')
parsed_ranking = parse_ranking_from_text(ranking_text) if ranking_text else []

if not parsed_ranking:
continue

# Convert labels to model names and get their positions
model_positions = {}
for position, label in enumerate(parsed_ranking):
if label in label_to_model:
model_name = label_to_model[label]
model_positions[model_name] = position

# For each pair of models, record who was ranked higher (lower position = better)
ranked_models = list(model_positions.keys())
for i in range(len(ranked_models)):
for j in range(i + 1, len(ranked_models)):
model_a = ranked_models[i]
model_b = ranked_models[j]
pos_a = model_positions[model_a]
pos_b = model_positions[model_b]

# Ensure consistent ordering for the key
if model_a > model_b:
model_a, model_b = model_b, model_a
pos_a, pos_b = pos_b, pos_a

if pos_a < pos_b:
pairwise_wins[(model_a, model_b, 'a')] += 1
elif pos_b < pos_a:
pairwise_wins[(model_a, model_b, 'b')] += 1
# Equal positions would be a tie (shouldn't happen with rankings)

# Calculate wins, losses, and ties for each model
model_stats = {model: {"wins": 0.0, "losses": 0.0, "ties": 0.0} for model in models}

    # Decide each pair's matchup. The i < j loop visits every unordered pair exactly
    # once; sorting within the pair matches the key order used in pairwise_wins above.
    for i in range(len(models)):
        for j in range(i + 1, len(models)):
            model_a, model_b = sorted((models[i], models[j]))

            a_wins = pairwise_wins.get((model_a, model_b, 'a'), 0)
            b_wins = pairwise_wins.get((model_a, model_b, 'b'), 0)

if a_wins > b_wins:
model_stats[model_a]["wins"] += 1
model_stats[model_b]["losses"] += 1
elif b_wins > a_wins:
model_stats[model_b]["wins"] += 1
model_stats[model_a]["losses"] += 1
            elif a_wins > 0:
                # a_wins == b_wins here; only record a tie if the pair was actually compared.
                # A tie counts as half a win when win_percentage is computed below.
                model_stats[model_a]["ties"] += 1
                model_stats[model_b]["ties"] += 1

    # Calculate win percentage and build results
    results = []

    for model in models:
        stats = model_stats[model]
        total_matchups = stats["wins"] + stats["losses"] + stats["ties"]
        # Win percentage: (wins + 0.5 * ties) / matchups actually played,
        # e.g. 4 wins, 1 loss, 1 tie -> (4 + 0.5) / 6 = 0.75.
        # Dividing by matchups played (rather than len(models) - 1) avoids penalising
        # a model that some rankers omitted from their parsed ranking.
        if total_matchups > 0:
            win_pct = (stats["wins"] + 0.5 * stats["ties"]) / total_matchups
        else:
            win_pct = 0.0

results.append({
"model": model,
"wins": stats["wins"],
"losses": stats["losses"],
"ties": stats["ties"],
"win_percentage": round(win_pct, 3),
"total_matchups": int(total_matchups)
})

# Sort by win percentage (higher is better)
results.sort(key=lambda x: (-x['win_percentage'], x['losses']))

return results
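
# Illustrative usage of calculate_tournament_rankings (the labels and model names
# below are placeholders, not the council's actual label format):
#
#     label_to_model = {"A": "model-1", "B": "model-2", "C": "model-3"}
#     stage2_results = [
#         {"parsed_ranking": ["A", "B", "C"]},
#         {"parsed_ranking": ["B", "A", "C"]},
#         {"parsed_ranking": ["C", "A", "B"]},
#     ]
#     calculate_tournament_rankings(stage2_results, label_to_model)[0]["model"]
#     # -> "model-1" (it wins both of its pairwise matchups 2-1)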


async def generate_conversation_title(user_query: str) -> str:
"""
Generate a short title for a conversation based on the first user message.
@@ -316,8 +454,9 @@ async def run_full_council(user_query: str) -> Tuple[List, List, Dict, Dict]:
# Stage 2: Collect rankings
stage2_results, label_to_model = await stage2_collect_rankings(user_query, stage1_results)

# Calculate aggregate rankings
# Calculate aggregate rankings (both methods)
aggregate_rankings = calculate_aggregate_rankings(stage2_results, label_to_model)
tournament_rankings = calculate_tournament_rankings(stage2_results, label_to_model)

# Stage 3: Synthesize final answer
stage3_result = await stage3_synthesize_final(
@@ -329,7 +468,8 @@
# Prepare metadata
metadata = {
"label_to_model": label_to_model,
"aggregate_rankings": aggregate_rankings
"aggregate_rankings": aggregate_rankings,
"tournament_rankings": tournament_rankings
}

return stage1_results, stage2_results, stage3_result, metadata