tokenizer.go
71 lines (63 loc) · 1.43 KB

package ling

import (
	"strings"
	"unicode"

	"github.com/liuzl/ling/util"
	"github.com/liuzl/tokenizer"
)

// Lower is the annotation key under which each token's lowercased form is stored.
const Lower = "lower"

func init() {
	// Register this tokenizer under the "_" key in the package's Processors table.
	Processors["_"] = &Tokenizer{}
}
// Type classifies text by inspecting all of its runes: strings made up
// entirely of spaces, symbols, numbers, punctuation, or ASCII letters get
// their own types; anything else (including mixed content) falls through
// to Word.
func Type(text string) TokenType {
	switch {
	case util.StringIs(text, unicode.IsSpace):
		return Space
	case util.StringIs(text, unicode.IsSymbol):
		return Symbol
	case util.StringIs(text, unicode.IsNumber):
		return Number
	case util.StringIs(text, unicode.IsPunct):
		return Punct
	//case util.StringIs(text, unicode.IsLetter):
	case util.StringIs(text, func(r rune) bool {
		// Only ASCII letters qualify as Letters; the broader
		// unicode.IsLetter check is the commented-out case above.
		return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
	}):
		return Letters
	}
	return Word
}
// Script returns the name of the Unicode script (e.g. "Latin", "Han") that
// every rune in text belongs to, or "Unknown" if no single script covers
// the whole string.
func Script(text string) string {
	for k, v := range unicode.Scripts {
		if util.StringIs(text, func(r rune) bool { return unicode.Is(v, r) }) {
			return k
		}
	}
	return "Unknown"
}
// Tokenizer is the default text processor: it splits a Document's text into
// typed, script-tagged tokens with byte offsets.
type Tokenizer struct{}
// Process tokenizes d.Text and fills d.Tokens. Each token records its byte
// offsets in the original text, its Type and Script, and a lowercased form
// under the Lower annotation key. Offsets are accumulated token by token,
// which relies on TokenizePro returning the text's segments in order and
// without gaps.
func (t *Tokenizer) Process(d *Document) error {
	if d == nil || len(d.Text) == 0 {
		return nil
	}
	var tokens []*Token
	var pos int
	for i, item := range tokenizer.TokenizePro(d.Text) {
		word := item.Text
		l := len(word) // byte length, so offsets are byte positions
		token := &Token{Doc: d, Text: word, Type: Type(word), Script: Script(word),
			I: i, StartByte: pos, EndByte: pos + l,
			Annotations: map[string]string{Lower: strings.ToLower(word)}}
		if item.Norm != "" {
			token.Annotations[Norm] = item.Norm
		}
		pos += l
		tokens = append(tokens, token)
	}
	d.Tokens = tokens
	return nil
}
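
A minimal usage sketch of Process (a hand-written illustration, not from this repository: the example text and the direct Document literal are assumptions; the package may provide a dedicated constructor not shown in this file, but only the exported fields that appear above are used here):

package main

import (
	"fmt"
	"log"

	"github.com/liuzl/ling"
)

func main() {
	// Hypothetical construction: set Text directly on a Document.
	d := &ling.Document{Text: "Go 1.22 发布了!"}
	t := &ling.Tokenizer{}
	if err := t.Process(d); err != nil {
		log.Fatal(err)
	}
	for _, tok := range d.Tokens {
		fmt.Printf("%q\ttype=%v\tscript=%s\tbytes=[%d,%d)\n",
			tok.Text, tok.Type, tok.Script, tok.StartByte, tok.EndByte)
	}
}

Because offsets are byte-based, multi-byte UTF-8 runes (such as the CJK characters above) advance StartByte and EndByte by more than one per character.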