Skip to content

Commit

Permalink
fix splitting of short seqs into chars
Browse files Browse the repository at this point in the history
  • Loading branch information
markus583 committed Sep 9, 2024
1 parent 0f675f7 commit 46f3d19
Show file tree
Hide file tree
Showing 3 changed files with 3 additions and 3 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name="wtpsplit",
version="2.0.7",
version="2.0.8",
packages=find_packages(),
description="Universal Robust, Efficient and Adaptable Sentence Segmentation",
author="Markus Frohmann, Igor Sterner, Benjamin Minixhofer",
Expand Down
2 changes: 1 addition & 1 deletion wtpsplit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from wtpsplit.extract import BertCharORTWrapper, PyTorchWrapper, extract
from wtpsplit.utils import Constants, indices_to_sentences, sigmoid, token_to_char_probs

__version__ = "2.0.7"
__version__ = "2.0.8"

warnings.simplefilter("default", DeprecationWarning) # show by default
warnings.simplefilter("ignore", category=FutureWarning)  # for transformers
Expand Down
2 changes: 1 addition & 1 deletion wtpsplit/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,7 @@ def get_token_spans(tokenizer, offsets_mapping, tokens):

def token_to_char_probs(text, tokens, token_logits, tokenizer, offsets_mapping):
    """Map from token probabilities to character probabilities"""
char_probs = np.full((len(text), token_logits.shape[1]), np.min(token_logits)) # Initialize with very low numbers
char_probs = np.full((len(text), token_logits.shape[1]), -np.inf) # Initialize with very low numbers

valid_indices, valid_offsets = get_token_spans(tokenizer, offsets_mapping, tokens)

Expand Down

0 comments on commit 46f3d19

Please sign in to comment.