-
Notifications
You must be signed in to change notification settings - Fork 4
/
regex_tagger.go
46 lines (42 loc) · 972 Bytes
/
regex_tagger.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
package ling
import (
"fmt"
)
func init() {
Processors["regex"] = &RegexTagger{}
}
// RegexTagger is a processor that annotates documents using regular
// expressions. It has no fields, so its zero value is ready to use.
type RegexTagger struct{}
// Process annotates d with one Span per regular-expression match that lines
// up with token boundaries.
//
// Every entry of the package-level Regexes is run against d.Text. A match is
// kept only when its first byte offset equals some token's StartByte and its
// end byte offset equals some token's EndByte; matches that begin or end in
// the middle of a token are dropped. A kept match is appended to d.Spans as a
// Span whose Start is the I of the token at the match start and whose End is
// one past the I of the token at the match end. The span's Annotations are
// {"from": "regex", "value": {typ: ""}}, where typ is the key under which the
// expression is stored in Regexes.
//
// A nil document or a document with empty Text is a no-op and returns nil.
// A document with text but no tokens returns a "tokenization required" error.
//
// NOTE(review): Regexes is ranged over directly; if it is a map, the order in
// which spans for different keys are appended is unspecified.
func (t *RegexTagger) Process(d *Document) error {
	if d == nil || len(d.Text) == 0 {
		return nil
	}
	if len(d.Tokens) == 0 {
		return fmt.Errorf("tokenization required")
	}

	// Byte offset -> token index lookups. They are built once, on the first
	// match, instead of rescanning every token for every match. When several
	// tokens share an offset the last one wins, exactly as a linear scan that
	// keeps overwriting its result would behave.
	var startAt, endAt map[int]int

	for typ, re := range Regexes {
		for _, match := range re.FindAllStringIndex(d.Text, -1) {
			if startAt == nil {
				startAt = make(map[int]int, len(d.Tokens))
				endAt = make(map[int]int, len(d.Tokens))
				for _, token := range d.Tokens {
					startAt[token.StartByte] = token.I
					// Span.End is exclusive, hence one past the token index.
					endAt[token.EndByte] = token.I + 1
				}
			}

			start, ok := startAt[match[0]]
			if !ok {
				// The match begins inside a token (or in a gap between tokens).
				continue
			}
			end, ok := endAt[match[1]]
			if !ok {
				// The match ends inside a token (or in a gap between tokens).
				continue
			}

			span := &Span{
				Doc:   d,
				Start: start,
				End:   end,
				Annotations: map[string]interface{}{
					"from":  "regex",
					"value": map[string]interface{}{typ: ""},
				},
			}
			d.Spans = append(d.Spans, span)
		}
	}
	return nil
}