Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# Changelog

## Unreleased

### Added

- New `attention` pooling mode in `eds.span_pooler`
- New `word_pooling_mode=False` in `eds.transformer` to allow returning the wordpiece embeddings directly, instead of the mean-pooled word embeddings. At the moment, this only works with `eds.span_pooler` which can pool over wordpieces or words seamlessly.
- New parameter `pruning_params` to `edsnlp.tune` in order to control pruning during tuning.

## v0.19.0 (2025-10-04)

📢 EDS-NLP will drop support for Python 3.7, 3.8 and 3.9 in the next major release (v0.20.0), in October 2025. Please upgrade to Python 3.10 or later.
Expand Down Expand Up @@ -28,6 +36,7 @@
- New `eds.explode` pipe that splits one document into multiple documents, one per span yielded by its `span_getter` parameter, each new document containing exactly that single span.
- New `Training a span classifier` tutorial, and reorganized deep-learning docs
- `ScheduledOptimizer` now warns when a parameter selector does not match any parameter.
- New `attention` pooling mode in `eds.span_pooler`

### Fixed

Expand Down
4 changes: 1 addition & 3 deletions docs/tutorials/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ We provide step-by-step guides to get you started. We cover the following use-ca

### Base tutorials

<!-- --8<-- [start:tutorials] -->
<!-- --8<-- [start:classic-tutorials] -->

=== card {: href=/tutorials/spacy101 }
Expand Down Expand Up @@ -133,8 +134,5 @@ We also provide tutorials on how to train deep-learning models with EDS-NLP. The
---
Learn how to tune hyperparameters of a model with `edsnlp.tune`.


<!-- --8<-- [end:deep-learning-tutorials] -->


<!-- --8<-- [end:tutorials] -->
40 changes: 34 additions & 6 deletions edsnlp/core/torch_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,14 @@ def compute_training_metrics(
This is useful to compute averages when doing multi-gpu training or mini-batch
accumulation since full denominators are not known during the forward pass.
"""
return batch_output
return (
{
**batch_output,
"loss": batch_output["loss"] / count,
}
if "loss" in batch_output
else batch_output
)

def module_forward(self, *args, **kwargs): # pragma: no cover
"""
Expand All @@ -350,6 +357,31 @@ def module_forward(self, *args, **kwargs): # pragma: no cover
"""
return torch.nn.Module.__call__(self, *args, **kwargs)

def preprocess_batch(self, docs: Sequence[Doc], supervision=False, **kwargs):
    """
    Convenience method to preprocess a batch of documents at once.
    Features that share the same path are grouped together in a list
    under a single key.

    Parameters
    ----------
    docs: Sequence[Doc]
        Batch of documents
    supervision: bool
        Whether to extract supervision features or not

    Returns
    -------
    Dict[str, Sequence[Any]]
        The batch of features
    """
    # Pick the per-document preprocessing function once, outside the loop
    extract = self.preprocess_supervised if supervision else self.preprocess
    features = [extract(doc) for doc in docs]
    # Compress then decompress to group same-path features under one key
    return decompress_dict(list(batch_compress_dict(features)))

def prepare_batch(
self,
docs: Sequence[Doc],
Expand All @@ -374,11 +406,7 @@ def prepare_batch(
-------
Dict[str, Sequence[Any]]
"""
batch = [
(self.preprocess_supervised(doc) if supervision else self.preprocess(doc))
for doc in docs
]
batch = decompress_dict(list(batch_compress_dict(batch)))
batch = self.preprocess_batch(docs, supervision=supervision)
batch = self.collate(batch)
batch = self.batch_to_device(batch, device=device)
return batch
Expand Down
35 changes: 32 additions & 3 deletions edsnlp/metrics/span_attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@

import warnings
from collections import defaultdict
from typing import Any, Dict, Optional
from typing import Any, Dict, Optional, Union, Sequence

from edsnlp import registry
from edsnlp.metrics import Examples, average_precision, make_examples, prf
Expand All @@ -57,6 +57,7 @@ def span_attribute_metric(
default_values: Dict = {},
micro_key: str = "micro",
filter_expr: Optional[str] = None,
split_by_values: Union[str, Sequence[str]] = None,
**kwargs: Any,
):
if "qualifiers" in kwargs:
Expand All @@ -80,6 +81,8 @@ def span_attribute_metric(
if filter_expr is not None:
filter_fn = eval(f"lambda doc: {filter_expr}")
examples = [eg for eg in examples if filter_fn(eg.reference)]
if isinstance(split_by_values, str):
split_by_values = [split_by_values]
labels = defaultdict(lambda: (set(), set(), dict()))
labels["micro"] = (set(), set(), dict())
total_pred_count = 0
Expand Down Expand Up @@ -108,9 +111,15 @@ def span_attribute_metric(
if (top_val or include_falsy) and default_values[attr] != top_val:
labels[attr][2][(eg_idx, beg, end, attr, top_val)] = top_p
labels[micro_key][2][(eg_idx, beg, end, attr, top_val)] = top_p
if split_by_values and attr in split_by_values:
key = f"{attr}:{top_val}"
labels[key][2][(eg_idx, beg, end, attr, top_val)] = top_p
if (value or include_falsy) and default_values[attr] != value:
labels[micro_key][0].add((eg_idx, beg, end, attr, value))
labels[attr][0].add((eg_idx, beg, end, attr, value))
if split_by_values and attr in split_by_values:
key = f"{attr}:{value}"
labels[key][0].add((eg_idx, beg, end, attr, value))

doc_spans = get_spans(eg.reference, span_getter)
for span in doc_spans:
Expand All @@ -124,6 +133,9 @@ def span_attribute_metric(
if (value or include_falsy) and default_values[attr] != value:
labels[micro_key][1].add((eg_idx, beg, end, attr, value))
labels[attr][1].add((eg_idx, beg, end, attr, value))
if split_by_values and attr in split_by_values:
key = f"{attr}:{value}"
labels[key][1].add((eg_idx, beg, end, attr, value))

if total_pred_count != total_gold_count:
raise ValueError(
Expand All @@ -133,14 +145,25 @@ def span_attribute_metric(
"predicted by another NER pipe in your model."
)

return {
metrics = {
name: {
**prf(pred, gold),
"ap": average_precision(pred_with_prob, gold),
}
for name, (pred, gold, pred_with_prob) in labels.items()
}

if split_by_values:
for attr in split_by_values:
submetrics = {"micro": metrics[attr]}
for key in list(metrics.keys()):
if key.startswith(f"{attr}:"):
val = key.split(":", 1)[1]
submetrics[val] = metrics.pop(key)
metrics[attr] = submetrics

return metrics


@registry.metrics.register(
"eds.span_attribute",
Expand Down Expand Up @@ -230,7 +253,10 @@ class SpanAttributeMetric:
Key under which to store the micro‐averaged results across all attributes.
filter_expr : Optional[str]
A Python expression (using `doc`) to filter which examples are scored.

split_by_values : Union[str, Sequence[str]] = None
One or more attributes for which metrics should be reported separately for each
attribute value. If `None` (default), metrics are computed on the global attribute-level.
Useful when attributes are multiclass.
Returns
-------
Dict[str, Dict[str, float]]
Expand Down Expand Up @@ -258,6 +284,7 @@ def __init__(
include_falsy: bool = False,
micro_key: str = "micro",
filter_expr: Optional[str] = None,
split_by_values: Union[str, Sequence[str]] = None,
):
if qualifiers is not None:
warnings.warn(
Expand All @@ -270,6 +297,7 @@ def __init__(
self.include_falsy = include_falsy
self.micro_key = micro_key
self.filter_expr = filter_expr
self.split_by_values = split_by_values

__init__.__doc__ = span_attribute_metric.__doc__

Expand All @@ -296,6 +324,7 @@ def __call__(self, *examples: Any):
include_falsy=self.include_falsy,
micro_key=self.micro_key,
filter_expr=self.filter_expr,
split_by_values=self.split_by_values,
)


Expand Down
Loading