Skip to content

Commit

Permalink
Fixing regression: strings.Split() behaves very differently from strings.Fields().
Browse files Browse the repository at this point in the history

Do not eat spaces when splitting sentences into words.
  • Loading branch information
rupor-github committed Jan 11, 2025
1 parent a926028 commit 4772b73
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 15 deletions.
4 changes: 2 additions & 2 deletions Taskfile.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -315,8 +315,8 @@ tasks:
cmds:
- rm -f *.txt
- echo "{{.TATN}}Downloading dictionary patterns from \"ctan.math.utah.edu\"{{.TOFF}}"
- wget -q -r -l1 --no-parent -nd -A.pat.txt http://ctan.math.utah.edu/ctan/tex-archive/language/hyph-utf8/tex/generic/hyph-utf8/patterns/txt
- wget -q -r -l1 --no-parent -nd -A.hyp.txt http://ctan.math.utah.edu/ctan/tex-archive/language/hyph-utf8/tex/generic/hyph-utf8/patterns/txt
- wget -q -r -l1 --no-parent -nd -A.pat.txt http://ctan.math.utah.edu/ctan/tex-archive/language/hyph-utf8/tex/patterns/txt
- wget -q -r -l1 --no-parent -nd -A.hyp.txt http://ctan.math.utah.edu/ctan/tex-archive/language/hyph-utf8/tex/patterns/txt
- gzip -q -f hyph-*.txt
status:
- find -type f -name 'hyph-*.txt.gz' | grep -q .
Expand Down
38 changes: 25 additions & 13 deletions processor/sentences.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,19 +108,31 @@ func splitSentences(t *tokenizer, in string) []string {

// splitWords returns slice of words in sentence.
func splitWords(_ *tokenizer, in string, ignoreNBSP bool) []string {
if ignoreNBSP {
// unicode.IsSpace will eat everything - for backward compatibility
return strings.Fields(in)
var (
result = []string{}
word strings.Builder
)
for _, sym := range in {
if isSep(sym, ignoreNBSP) {
result = append(result, word.String())
word.Reset()
continue
}
word.WriteRune(sym)
}
// exclude NBSP from the list of white space separators for latin1 symbols
return strings.FieldsFunc(in, func(r rune) bool {
if uint32(r) <= unicode.MaxLatin1 {
switch r {
case '\t', '\n', '\v', '\f', '\r', ' ', 0x85:
return true
}
return false
return append(result, word.String())
}

func isSep(r rune, ignoreNBSP bool) bool {
if uint32(r) <= unicode.MaxLatin1 {
switch r {
// exclude NBSP from the list of white space separators for latin1 symbols
case '\t', '\n', '\v', '\f', '\r', ' ', 0x85:
return true
case 0xA0: // NBSP
return ignoreNBSP
}
return unicode.IsSpace(r)
})
return false
}
return unicode.IsSpace(r)
}

0 comments on commit 4772b73

Please sign in to comment.