
Commit

fix: made sure sound event evaluation works with nips example
mbsantiago committed Jan 5, 2024
1 parent 1f76da4 commit a1ca192
Showing 7 changed files with 79 additions and 25 deletions.
2 changes: 1 addition & 1 deletion pdm.lock

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions src/soundevent/evaluation/metrics.py
@@ -214,6 +214,11 @@ def mean_average_precision(
Note also that the y_score input is assumed to be an array of probabilities
for each class.
"""
+ y_true = np.array(y_true).astype(np.float32)
+ no_class = np.isnan(y_true)
+ y_true = y_true[~no_class]
+ y_score = y_score[~no_class]

return metrics.average_precision_score( # type: ignore
y_true=y_true,
y_score=y_score,
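The hunk above makes mean_average_precision drop entries whose ground truth is NaN (sound events with no class label) before handing the arrays to scikit-learn. Below is a minimal, self-contained sketch of that idea for the binary case; mean_average_precision_sketch is a hypothetical stand-in for illustration, not the library function, which (per its docstring) also works with per-class probability arrays.

import numpy as np
from sklearn import metrics


def mean_average_precision_sketch(y_true, y_score) -> float:
    """Binary sketch of the NaN-filtering added in the diff above."""
    y_true = np.asarray(y_true, dtype=np.float32)
    y_score = np.asarray(y_score, dtype=np.float32)
    no_class = np.isnan(y_true)  # entries with no ground-truth class
    return float(
        metrics.average_precision_score(
            y_true=y_true[~no_class],
            y_score=y_score[~no_class],
        )
    )


# The second entry has no label and is ignored instead of breaking the metric.
print(mean_average_precision_sketch([1, np.nan, 0, 1], [0.9, 0.4, 0.2, 0.7]))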
52 changes: 28 additions & 24 deletions src/soundevent/evaluation/tasks/sound_event_detection.py
@@ -16,6 +16,7 @@

__all__ = [
"sound_event_detection",
"evaluate_clip",
]

SOUNDEVENT_METRICS: Sequence[metrics.Metric] = (
@@ -25,6 +26,7 @@
EXAMPLE_METRICS: Sequence[metrics.Metric] = ()

RUN_METRICS: Sequence[metrics.Metric] = (
+ metrics.mean_average_precision,
metrics.balanced_accuracy,
metrics.accuracy,
metrics.top_3_accuracy,
@@ -44,18 +46,16 @@ def sound_event_detection(
predicted_classes_scores,
) = _evaluate_clips(clip_predictions, clip_annotations, encoder)

- evaluation_metrics = _compute_overall_metrics(
+ evaluation_metrics = compute_overall_metrics(
true_classes,
predicted_classes_scores,
)

- score = _compute_overall_score(evaluated_clips)

return data.Evaluation(
evaluation_task="sound_event_detection",
clip_evaluations=evaluated_clips,
metrics=evaluation_metrics,
- score=score,
+ score=_mean([c.score for c in evaluated_clips]),
)


@@ -70,22 +70,23 @@ def _evaluate_clips(
predicted_classes_scores = []

for annotations, predictions in iterate_over_valid_clips(
- clip_predictions=clip_predictions, clip_annotations=clip_annotations
+ clip_predictions=clip_predictions,
+ clip_annotations=clip_annotations,
):
- true_class, predicted_classes, evaluated_example = _evaluate_clip(
+ true_class, predicted_classes, evaluated_clip = evaluate_clip(
clip_annotations=annotations,
clip_predictions=predictions,
encoder=encoder,
)

true_classes.extend(true_class)
predicted_classes_scores.extend(predicted_classes)
- evaluated_clips.append(evaluated_example)
+ evaluated_clips.append(evaluated_clip)

return evaluated_clips, true_classes, np.array(predicted_classes_scores)


- def _compute_overall_metrics(true_classes, predicted_classes_scores):
+ def compute_overall_metrics(true_classes, predicted_classes_scores):
"""Compute evaluation metrics based on true classes and predicted
scores."""
evaluation_metrics = [
@@ -101,7 +102,7 @@ def _compute_overall_metrics(true_classes, predicted_classes_scores):
return evaluation_metrics


- def _evaluate_clip(
+ def evaluate_clip(
clip_annotations: data.ClipAnnotation,
clip_predictions: data.ClipPrediction,
encoder: Encoder,
@@ -111,7 +112,7 @@ def _evaluate_clip(
matches: List[data.Match] = []

# Iterate over all matches between predictions and annotations.
- for annotation_index, prediction_index, affinity in match_geometries(
+ for prediction_index, annotation_index, affinity in match_geometries(
source=[
prediction.sound_event.geometry
for prediction in clip_predictions.sound_events
@@ -168,7 +169,7 @@ def _evaluate_clip(
if annotation_index is not None and prediction_index is not None:
prediction = clip_predictions.sound_events[prediction_index]
annotation = clip_annotations.sound_events[annotation_index]
- true_class, predicted_class_scores, match = _evaluate_sound_event(
+ true_class, predicted_class_scores, match = evaluate_sound_event(
sound_event_prediction=prediction,
sound_event_annotation=annotation,
encoder=encoder,
@@ -194,15 +195,13 @@ def _evaluate_clip(
)
for metric in EXAMPLE_METRICS
],
- score=np.mean( # type: ignore
- [match.score for match in matches if match.score],
- ),
+ score=_mean([m.score for m in matches]),
matches=matches,
),
)


- def _evaluate_sound_event(
+ def evaluate_sound_event(
sound_event_prediction: data.SoundEventPrediction,
sound_event_annotation: data.SoundEventAnnotation,
encoder: Encoder,
@@ -215,11 +214,12 @@ def _evaluate_sound_event(
tags=sound_event_prediction.tags,
encoder=encoder,
)
+ score = metrics.classification_score(true_class, predicted_class_scores)
match = data.Match(
source=sound_event_prediction,
target=sound_event_annotation,
affinity=1,
- score=metrics.classification_score(true_class, predicted_class_scores),
+ score=score,
metrics=[
data.Feature(
name=metric.__name__,
@@ -231,12 +231,16 @@
return true_class, predicted_class_scores, match


- def _compute_overall_score(
- evaluated_examples: Sequence[data.ClipEvaluation],
+ def _mean(
+ scores: Sequence[float | None],
) -> float:
- non_none_scores = [
- example.score
- for example in evaluated_examples
- if example.score is not None
- ]
- return float(np.mean(non_none_scores)) if non_none_scores else 0.0
+ valid_scores = [score for score in scores if score is not None]
+
+ if not valid_scores:
+ return 0.0
+
+ score = float(np.mean(valid_scores))
+ if np.isnan(score):
+ return 0.0
+
+ return score
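For reference, here is a standalone sketch mirroring the new _mean helper above, so its edge cases are easy to see in isolation: None scores are skipped, and an empty input or a NaN mean falls back to 0.0 rather than propagating into the clip or evaluation score. _mean_sketch is a hypothetical copy for illustration, not an import from the package.

from typing import Optional, Sequence

import numpy as np


def _mean_sketch(scores: Sequence[Optional[float]]) -> float:
    """Average the non-None scores, falling back to 0.0 when nothing is usable."""
    valid_scores = [score for score in scores if score is not None]
    if not valid_scores:
        return 0.0
    score = float(np.mean(valid_scores))
    return 0.0 if np.isnan(score) else score


print(_mean_sketch([0.8, None, 0.4]))  # ~0.6: None entries are ignored
print(_mean_sketch([None, None]))      # 0.0: no valid scores to average
print(_mean_sketch([]))                # 0.0: empty input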
1 change: 1 addition & 0 deletions tests/data/nips4b_dataset.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/data/nips4b_evaluation_set.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/data/nips4b_model_run.json

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions tests/test_evaluation/test_sound_event_detection.py
@@ -0,0 +1,42 @@
"""Test suite for sound event detection evaluation."""
from pathlib import Path

from soundevent import data, io
from soundevent.evaluation import sound_event_detection

TEST_DATA = Path(__file__).parent.parent / "data"


def test_can_evaluate_nips_data():
"""Test that we can evaluate the NIPS data."""
evaluation_set = io.load(
TEST_DATA / "nips4b_evaluation_set.json",
type="evaluation_set",
)
model_run = io.load(
TEST_DATA / "nips4b_model_run.json",
type="model_run",
)

evaluation = sound_event_detection(
model_run.clip_predictions,
evaluation_set.clip_annotations,
tags=evaluation_set.evaluation_tags,
)

assert isinstance(evaluation, data.Evaluation)

# check that all clips have been evaluated
assert len(evaluation.clip_evaluations) == len(
evaluation_set.clip_annotations
)

# check that all metrics are present
assert len(evaluation.metrics) == 4
metric_names = {metric.name for metric in evaluation.metrics}
assert metric_names == {
"balanced_accuracy",
"accuracy",
"top_3_accuracy",
"mean_average_precision",
}
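Outside the test suite, the same evaluation flow could be driven from user code roughly as follows. This is a hedged sketch built from the test above: the file paths are placeholders, and it only touches attributes that appear in this commit (clip_predictions, clip_annotations, evaluation_tags, score, metrics, clip_evaluations, metric.name).

from soundevent import io
from soundevent.evaluation import sound_event_detection

# Placeholder paths; replace with real evaluation set and model run files.
evaluation_set = io.load("my_evaluation_set.json", type="evaluation_set")
model_run = io.load("my_model_run.json", type="model_run")

evaluation = sound_event_detection(
    model_run.clip_predictions,
    evaluation_set.clip_annotations,
    tags=evaluation_set.evaluation_tags,
)

# The overall score is the mean of the per-clip scores, and the run-level
# metrics now include mean_average_precision alongside the accuracy metrics.
print(evaluation.score)
print([metric.name for metric in evaluation.metrics])
print(len(evaluation.clip_evaluations))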
