-
Notifications
You must be signed in to change notification settings - Fork 0
/
corpus.py
30 lines (26 loc) · 1.05 KB
/
corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
class Token:
    """A single token of a sentence.

    Stores the word form, the lemma, the part of speech, the position of the
    token's head, and the type of the dependency relation to that head.
    """

    def __init__(self, form: str, head: int, relation: str, lemma: str = None, pos: str = None):
        """Store the given fields, substituting placeholders where needed.

        A missing or empty ``lemma`` is stored as "(none)"; a missing or
        empty ``pos`` is stored as "(NONE)".
        """
        self.form, self.head, self.relation = form, head, relation
        # Any falsy value (None or an empty string) gets the placeholder.
        self.lemma = lemma if lemma else "(none)"
        self.pos = pos if pos else "(NONE)"
class Sentence:
"""Represents a sentence as a sequence of tokens, and a mapping of potential
heads and dependents (as integer tuples) to features.
"""
def __init__(self, source = None):
if source:
self.tokens = [Token(t.form, None, None, t.lemma, t.pos) for t in source.tokens]
else:
self.tokens = [Token("ROOT", None, None, pos="(ROOT)")]
self.features = {}
def __len__(self) -> int:
return len(self.tokens)
def __getitem__(self, n: int) -> Token:
return self.tokens[n]
def append(self, token: Token) -> None:
self.tokens.append(token)