
Commit 620459a

Authored by hummuscience (Muad Abd El Hay) and Claude
Calibration plot for frame diagnostics streamlit (#308)
* Remove unexpected save_heatmaps parameter from export_predictions_and_labeled_video call

  The function doesn't accept this parameter, causing a TypeError.

* Add multi-model calibration plot feature to labeled frame diagnostics

  - Add calibration plot section to the labeled_frame_diagnostics.py Streamlit app
  - Support multi-model comparison with interactive plot controls
  - New plot_calibration_diagram_multi() function for comparing multiple models
  - Display Expected Calibration Error (ECE) for each model in the legend
  - Configurable error threshold and number of bins
  - Color-coded calibration curves for easy visual comparison
  - Robust error handling for missing data cases

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-authored-by: Muad Abd El Hay <[email protected]>
Co-authored-by: Claude <[email protected]>
1 parent fb3ae32 commit 620459a
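For orientation, here is a minimal sketch of how the new `plot_calibration_diagram_multi()` helper is intended to be called. The `models_data` layout and keyword arguments follow the diff below; the import path is inferred from the file this commit touches, and all values are made up for illustration only.

```python
import numpy as np

# Assumed import path, based on the file this commit adds the function to
# (lightning_pose/apps/plots.py); adjust if the module layout differs.
from lightning_pose.apps.plots import plot_calibration_diagram_multi

# Hypothetical values for illustration; in the app these come from each model's
# predicted likelihoods and per-keypoint pixel errors on a labeled data split.
error_threshold = 5.0  # pixels, same default as the app's number input
errors = np.array([2.1, 7.3, 1.4, 12.0, 3.3])
confidences = np.array([0.95, 0.80, 0.99, 0.40, 0.88])

models_data = [{
    "model_name": "example-model",
    "confidences": confidences,
    # a prediction counts as accurate if its pixel error is within the threshold
    "accuracies": (errors <= error_threshold).astype(float),
}]

fig = plot_calibration_diagram_multi(
    models_data=models_data,
    n_bins=10,
    keypoint_name="mean",
    data_type="test",
    error_threshold=error_threshold,
)
fig.show()  # inside the Streamlit app this is rendered via st.plotly_chart(fig)
```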

File tree

2 files changed: +354 −0 lines

lightning_pose/apps/labeled_frame_diagnostics.py

Lines changed: 86 additions & 0 deletions
@@ -21,6 +21,7 @@
     get_y_label,
     make_plotly_catplot,
     make_plotly_scatterplot,
+    plot_calibration_diagram_multi,
 )
 from lightning_pose.apps.utils import (
     build_precomputed_metrics_df,
@@ -282,6 +283,91 @@ def run():
 
     st.plotly_chart(fig_scatter)
 
+    # ---------------------------------------------------
+    # calibration plot
+    # ---------------------------------------------------
+    st.header("Model Calibration Analysis")
+
+    col9, col10, col11 = st.columns(3)
+
+    with col9:
+        models_for_calib = st.multiselect(
+            "Select models to compare:", new_names, key="models_calib"
+        )
+
+    with col10:
+        n_bins = st.slider("Number of bins:", min_value=5, max_value=20, value=10)
+
+    with col11:
+        error_threshold = st.number_input(
+            "Error threshold (pixels):",
+            min_value=1.0, max_value=50.0, value=5.0, step=1.0
+        )
+
+    # Process calibration data for all selected models
+    if models_for_calib and 'pixel error' in metric_options:
+        # Collect data for all selected models
+        models_data = []
+        for model_name in models_for_calib:
+            confidence_df = dframes_metrics[model_name]['confidence']
+            pixel_error_df = df_metrics['pixel error']
+            pixel_error_df_model = pixel_error_df[
+                (pixel_error_df.model_name == model_name) &
+                (pixel_error_df.set == data_type)
+            ]
+
+            if keypoint_to_plot != "mean":
+                # Get confidence and error for specific keypoint
+                conf_cols = [
+                    c for c in confidence_df.columns
+                    if c[0] == keypoint_to_plot and c[1] == 'likelihood'
+                ]
+                if conf_cols:
+                    confidences = confidence_df.loc[
+                        confidence_df.iloc[:, -1] == data_type, conf_cols[0]
+                    ].values
+                    errors = pixel_error_df_model[keypoint_to_plot].values
+                else:
+                    confidences = np.array([])
+                    errors = np.array([])
+            else:
+                # Calculate mean confidence and error across all keypoints
+                conf_cols = [c for c in confidence_df.columns if c[1] == 'likelihood']
+                confidences_all = confidence_df.loc[
+                    confidence_df.iloc[:, -1] == data_type, conf_cols
+                ].values
+                confidences = np.nanmean(confidences_all, axis=1)
+
+                error_cols = [kp for kp in keypoint_names]
+                errors_all = pixel_error_df_model[error_cols].values
+                errors = np.nanmean(errors_all, axis=1)
+
+            if len(confidences) > 0 and len(errors) > 0:
+                # Calculate accuracies based on error threshold
+                accuracies = (errors <= error_threshold).astype(float)
+                models_data.append({
+                    'model_name': model_name,
+                    'confidences': confidences,
+                    'accuracies': accuracies
+                })
+
+        if models_data:
+            # Create multi-model calibration plot
+            fig_calib = plot_calibration_diagram_multi(
+                models_data=models_data,
+                n_bins=n_bins,
+                keypoint_name=keypoint_to_plot,
+                data_type=data_type,
+                error_threshold=error_threshold
+            )
+            st.plotly_chart(fig_calib)
+        else:
+            st.warning("No data available for calibration plot.")
+    elif not models_for_calib:
+        st.info("Please select at least one model for calibration analysis.")
+    else:
+        st.warning("Pixel error metric not available for calibration analysis.")
+
 
 if __name__ == "__main__":

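The column-selection logic above implies that each model's confidence DataFrame carries a (keypoint, coordinate) column MultiIndex, with the train/val/test split stored in its last column. Below is a toy illustration of that selection pattern; the keypoint names, values, and "set" column are assumptions for this sketch only, not names taken from the repository.

```python
import numpy as np
import pandas as pd

# Toy stand-in for a confidence DataFrame (hypothetical data):
# (keypoint, coord) column MultiIndex, plus a final column holding the data split.
rng = np.random.default_rng(0)
cols = pd.MultiIndex.from_tuples([
    ("paw", "x"), ("paw", "y"), ("paw", "likelihood"),
    ("nose", "x"), ("nose", "y"), ("nose", "likelihood"),
])
confidence_df = pd.DataFrame(rng.random((4, 6)), columns=cols)
confidence_df["set"] = ["train", "test", "test", "val"]  # becomes the last column

keypoint_to_plot = "paw"
data_type = "test"

# Same selection pattern as the diff: the likelihood column for one keypoint,
# restricted to rows whose last column matches the chosen split.
conf_cols = [
    c for c in confidence_df.columns
    if c[0] == keypoint_to_plot and c[1] == "likelihood"
]
confidences = confidence_df.loc[
    confidence_df.iloc[:, -1] == data_type, conf_cols[0]
].values
print(confidences)  # likelihoods for the two "test" frames
```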
lightning_pose/apps/plots.py

Lines changed: 268 additions & 0 deletions
@@ -6,6 +6,7 @@
 import seaborn as sns
 from matplotlib import pyplot as plt
 from plotly.subplots import make_subplots
+from sklearn.calibration import calibration_curve
 
 pix_error_key = "pixel error"
 conf_error_key = "confidence"
@@ -267,3 +268,270 @@ def plot_precomputed_traces(df_metrics, df_traces, cols):
     )
 
     return fig_traces
+
+
+def plot_calibration_diagram(
+    confidences,
+    accuracies,
+    n_bins=10,
+    model_name="Model",
+    keypoint_name="",
+    data_type="",
+    error_threshold=5.0,
+):
+    """
+    Plot calibration diagram for pose estimation model using Plotly.
+
+    Args:
+        confidences: predicted confidence scores (0-1)
+        accuracies: binary array indicating if prediction was accurate (1) or not (0)
+        n_bins: number of bins for grouping confidences
+        model_name: name of the model for title
+        keypoint_name: name of the keypoint being analyzed
+        data_type: train/val/test data split
+        error_threshold: pixel error threshold used to determine accuracy
+
+    Returns:
+        Plotly figure object
+    """
+    # Calculate calibration curve
+    fraction_of_positives, mean_predicted_value = calibration_curve(
+        accuracies, confidences, n_bins=n_bins, strategy='uniform'
+    )
+
+    # Calculate expected calibration error (ECE) - simplified version
+    # ECE is the weighted average of the absolute differences between accuracy and confidence
+    if len(mean_predicted_value) > 0 and len(confidences) > 0:
+        # For each bin, calculate |accuracy - confidence| weighted by bin size
+        # We'll recompute bins to ensure consistency
+        bin_edges = np.linspace(0, 1, n_bins + 1)
+        ece = 0.0
+        total_count = 0
+
+        for i in range(n_bins):
+            # Find points in this bin
+            in_bin = (confidences >= bin_edges[i]) & (confidences < bin_edges[i + 1])
+            if i == n_bins - 1:  # Include right edge in last bin
+                in_bin = (confidences >= bin_edges[i]) & (confidences <= bin_edges[i + 1])
+
+            bin_count = np.sum(in_bin)
+            if bin_count > 0:
+                bin_accuracy = np.mean(accuracies[in_bin])
+                bin_confidence = np.mean(confidences[in_bin])
+                ece += bin_count * np.abs(bin_accuracy - bin_confidence)
+                total_count += bin_count
+
+        ece = ece / total_count if total_count > 0 else 0
+    else:
+        ece = 0
+
+    # Create Plotly figure
+    fig = go.Figure()
+
+    # Add perfect calibration line
+    fig.add_trace(go.Scatter(
+        x=[0, 1],
+        y=[0, 1],
+        mode='lines',
+        name='Perfect calibration',
+        line=dict(dash='dash', color='black'),
+        showlegend=True
+    ))
+
+    # Add model calibration curve
+    fig.add_trace(go.Scatter(
+        x=mean_predicted_value,
+        y=fraction_of_positives,
+        mode='markers+lines',
+        name=f'{model_name}',
+        marker=dict(size=10, color='blue'),
+        line=dict(color='blue'),
+        showlegend=True
+    ))
+
+    # Add confidence histogram as marginal
+    fig.add_trace(go.Histogram(
+        x=confidences,
+        name='Confidence distribution',
+        yaxis='y2',
+        opacity=0.3,
+        showlegend=False,
+        marker_color='gray',
+        nbinsx=20
+    ))
+
+    # Update layout
+    title_text = f'Calibration Plot - {model_name}'
+    if keypoint_name:
+        title_text += f' ({keypoint_name})'
+    if data_type:
+        title_text += f' - {data_type} set'
+    title_text += f'<br>Error threshold: {error_threshold:.1f} pixels | ECE: {ece:.3f}'
+
+    fig.update_layout(
+        title=title_text,
+        xaxis=dict(
+            title='Mean Predicted Confidence',
+            range=[0, 1],
+            tickmode='linear',
+            tick0=0,
+            dtick=0.1
+        ),
+        yaxis=dict(
+            title='Fraction of Accurate Predictions',
+            range=[0, 1],
+            tickmode='linear',
+            tick0=0,
+            dtick=0.1
+        ),
+        yaxis2=dict(
+            title='Count',
+            overlaying='y',
+            side='right',
+            showgrid=False
+        ),
+        width=700,
+        height=600,
+        showlegend=True,
+        legend=dict(
+            yanchor="top",
+            y=0.99,
+            xanchor="left",
+            x=0.01
+        ),
+        hovermode='x unified'
+    )
+
+    # Add grid
+    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
+    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
+
+    return fig
+
+
+def plot_calibration_diagram_multi(
+    models_data,
+    n_bins=10,
+    keypoint_name="",
+    data_type="",
+    error_threshold=5.0,
+):
+    """
+    Plot calibration diagram for multiple pose estimation models using Plotly.
+
+    Args:
+        models_data: list of dicts with keys 'model_name', 'confidences', 'accuracies'
+        n_bins: number of bins for grouping confidences
+        keypoint_name: name of the keypoint being analyzed
+        data_type: train/val/test data split
+        error_threshold: pixel error threshold used to determine accuracy
+
+    Returns:
+        Plotly figure object
+    """
+    # Define colors for different models
+    colors = ['blue', 'red', 'green', 'orange', 'purple', 'brown', 'pink', 'gray']
+
+    # Create Plotly figure
+    fig = go.Figure()
+
+    # Add perfect calibration line
+    fig.add_trace(go.Scatter(
+        x=[0, 1],
+        y=[0, 1],
+        mode='lines',
+        name='Perfect calibration',
+        line=dict(dash='dash', color='black', width=2),
+        showlegend=True
+    ))
+
+    # Add calibration curves for each model
+    ece_values = []
+    for i, model_data in enumerate(models_data):
+        model_name = model_data['model_name']
+        confidences = model_data['confidences']
+        accuracies = model_data['accuracies']
+        color = colors[i % len(colors)]
+
+        # Calculate calibration curve
+        fraction_of_positives, mean_predicted_value = calibration_curve(
+            accuracies, confidences, n_bins=n_bins, strategy='uniform'
+        )
+
+        # Calculate ECE for this model
+        if len(mean_predicted_value) > 0 and len(confidences) > 0:
+            bin_edges = np.linspace(0, 1, n_bins + 1)
+            ece = 0.0
+            total_count = 0
+
+            for j in range(n_bins):
+                in_bin = (confidences >= bin_edges[j]) & (confidences < bin_edges[j + 1])
+                if j == n_bins - 1:  # Include right edge in last bin
+                    in_bin = (confidences >= bin_edges[j]) & (confidences <= bin_edges[j + 1])
+
+                bin_count = np.sum(in_bin)
+                if bin_count > 0:
+                    bin_accuracy = np.mean(accuracies[in_bin])
+                    bin_confidence = np.mean(confidences[in_bin])
+                    ece += bin_count * np.abs(bin_accuracy - bin_confidence)
+                    total_count += bin_count
+
+            ece = ece / total_count if total_count > 0 else 0
+        else:
+            ece = 0
+
+        ece_values.append(ece)
+
+        # Add model calibration curve
+        fig.add_trace(go.Scatter(
+            x=mean_predicted_value,
+            y=fraction_of_positives,
+            mode='markers+lines',
+            name=f'{model_name} (ECE: {ece:.3f})',
+            marker=dict(size=8, color=color),
+            line=dict(color=color, width=2),
+            showlegend=True
+        ))
+
+    # Create title
+    title_text = 'Model Calibration Comparison'
+    if keypoint_name:
+        title_text += f' - {keypoint_name}'
+    if data_type:
+        title_text += f' ({data_type} set)'
+    title_text += f'<br>Error threshold: {error_threshold:.1f} pixels'
+
+    # Update layout
+    fig.update_layout(
+        title=title_text,
+        xaxis=dict(
+            title='Mean Predicted Confidence',
+            range=[0, 1],
+            tickmode='linear',
+            tick0=0,
+            dtick=0.1
+        ),
+        yaxis=dict(
+            title='Fraction of Accurate Predictions',
+            range=[0, 1],
+            tickmode='linear',
+            tick0=0,
+            dtick=0.1
+        ),
+        width=800,
+        height=600,
+        showlegend=True,
+        legend=dict(
+            yanchor="top",
+            y=0.99,
+            xanchor="left",
+            x=0.01
+        ),
+        hovermode='x unified'
+    )
+
+    # Add grid
+    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
+    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
+
+    return fig

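For reference, the per-bin loop used in both plotting functions computes the standard expected calibration error over $M$ confidence bins $B_m$ and $n$ predictions:

$$\mathrm{ECE} = \sum_{m=1}^{M} \frac{|B_m|}{n}\,\bigl|\operatorname{acc}(B_m) - \operatorname{conf}(B_m)\bigr|$$

where $\operatorname{acc}(B_m)$ is the fraction of predictions in bin $m$ whose pixel error falls within the threshold and $\operatorname{conf}(B_m)$ is the mean predicted confidence in that bin; a perfectly calibrated model has ECE = 0.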