Skip to content

Commit

Permalink
fix splitting of short seqs into chars
Browse files Browse the repository at this point in the history
  • Loading branch information
markus583 committed Sep 9, 2024
1 parent 0f675f7 commit 46f3d19
Show file tree
Hide file tree
Showing 3 changed files with 3 additions and 3 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name="wtpsplit",
version="2.0.7",
version="2.0.8",
packages=find_packages(),
description="Universal Robust, Efficient and Adaptable Sentence Segmentation",
author="Markus Frohmann, Igor Sterner, Benjamin Minixhofer",
Expand Down
2 changes: 1 addition & 1 deletion wtpsplit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from wtpsplit.extract import BertCharORTWrapper, PyTorchWrapper, extract
from wtpsplit.utils import Constants, indices_to_sentences, sigmoid, token_to_char_probs

__version__ = "2.0.7"
__version__ = "2.0.8"

warnings.simplefilter("default", DeprecationWarning) # show by default
warnings.simplefilter("ignore", category=FutureWarning)  # for transformers
Expand Down
2 changes: 1 addition & 1 deletion wtpsplit/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,7 @@ def get_token_spans(tokenizer, offsets_mapping, tokens):

def token_to_char_probs(text, tokens, token_logits, tokenizer, offsets_mapping):
    """Map from token probabilities to character probabilities"""
char_probs = np.full((len(text), token_logits.shape[1]), np.min(token_logits)) # Initialize with very low numbers
char_probs = np.full((len(text), token_logits.shape[1]), -np.inf) # Initialize with very low numbers

valid_indices, valid_offsets = get_token_spans(tokenizer, offsets_mapping, tokens)

Expand Down

0 comments on commit 46f3d19

Please sign in to comment.