-
Notifications
You must be signed in to change notification settings - Fork 4
/
dict_tagger.go
67 lines (61 loc) · 1.24 KB
/
dict_tagger.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
package ling
import (
"flag"
"fmt"
"github.com/liuzl/d"
)
// Command-line flags that configure the dictionary tagger.
var (
	// dictName is the name passed to d.Load by NewDictTagger
	// (flag -dict_name, default "dict").
	dictName = flag.String("dict_name", "dict", "dictionary name")

	// dictWeb, when set, makes NewDictTagger call RegisterWeb on the loaded
	// dictionary (flag -dict_web, default false).
	dictWeb = flag.Bool("dict_web", false, "dictionary web api flag")
)
// DictTagger annotates documents with dictionary matches. It embeds
// *d.Dictionary, so the dictionary's methods (such as PrefixMatch, which
// Process calls) are promoted onto the tagger.
type DictTagger struct {
	*d.Dictionary
}
// NewDictTagger builds a DictTagger around the dictionary named by the
// -dict_name flag. The error from d.Load is returned unchanged. When the
// -dict_web flag is set, RegisterWeb is called on the loaded dictionary
// before the tagger is returned (presumably exposing it over a web API, per
// the flag's usage text — confirm against the d package).
func NewDictTagger() (*DictTagger, error) {
	loaded, err := d.Load(*dictName)
	if err != nil {
		return nil, err
	}

	webEnabled := *dictWeb
	if webEnabled {
		loaded.RegisterWeb()
	}

	tagger := &DictTagger{Dictionary: loaded}
	return tagger, nil
}
// Process appends a Span to d.Spans for every dictionary match in d.Text
// whose boundaries line up with token boundaries.
//
// For each rune position in the text, the remaining text is passed to
// PrefixMatch. A match is kept only if some token starts at the match's start
// byte and some token ends at the match's end byte (start byte + len(key));
// the resulting span covers token indices [Start, End) and carries the
// annotations {"from": "dict", "value": <match value>}.
//
// A nil document or empty text is a no-op. A non-empty text with no tokens
// yields the error "tokenization required". Any error from PrefixMatch aborts
// processing and is returned unchanged; spans appended before the error stay
// in d.Spans.
//
// Note: inside this method the parameter d shadows the imported package d.
func (t *DictTagger) Process(d *Document) error {
	if d == nil || len(d.Text) == 0 {
		return nil
	}
	if len(d.Tokens) == 0 {
		return fmt.Errorf("tokenization required")
	}

	// Index token boundaries once instead of rescanning every token for each
	// dictionary hit. Later tokens overwrite earlier ones, matching the
	// "last matching token wins" result of a full linear scan.
	startAt := make(map[int]int, len(d.Tokens))
	endAt := make(map[int]int, len(d.Tokens))
	for _, token := range d.Tokens {
		startAt[token.StartByte] = token.I
		endAt[token.EndByte] = token.I + 1
	}

	// Round-trip through []rune once so that invalid UTF-8 bytes become
	// U+FFFD, exactly as if each suffix were rebuilt from a rune slice.
	// Ranging over the normalised string then yields the byte offset of every
	// rune, and suffixes are O(1) slices rather than fresh conversions.
	text := string([]rune(d.Text))
	for startByte := range text {
		ret, err := t.PrefixMatch(text[startByte:])
		if err != nil {
			return err
		}

		// No token begins here, so no match at this offset can be aligned.
		start, ok := startAt[startByte]
		if !ok {
			continue
		}

		// NOTE(review): if ret is a map, iteration order is unspecified, so
		// the relative order of spans sharing a start offset may vary
		// between runs.
		for k, v := range ret {
			if len(v) < 1 {
				continue
			}
			// len(k) is taken as the byte length of the matched prefix.
			end, ok := endAt[startByte+len(k)]
			if !ok {
				continue
			}
			span := &Span{Doc: d, Start: start, End: end,
				Annotations: map[string]interface{}{"from": "dict", "value": v}}
			d.Spans = append(d.Spans, span)
		}
	}
	return nil
}