-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathNgrams.py
130 lines (108 loc) · 4.78 KB
/
Ngrams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# -*- coding: utf-8 -*-
import codecs
import collections
import Useful_functions as Use
from pymystem3 import Mystem
# Shared Mystem instance, created once when this module is imported.
# None of the functions visible in this part of the file reference it —
# presumably it is used by code elsewhere (TODO confirm before removing).
m = Mystem()
def next_token(text, current_token):
    """Return the token that directly follows ``current_token`` in ``text``.

    The position of ``current_token`` is resolved by object identity first.
    ``list.index`` alone returns the first element that compares *equal*, so
    when two tokens hold the same values it would report the earlier one and
    the wrong neighbour would be returned.

    Args:
        text: List of tokens.
        current_token: A token from ``text`` (or a value equal to one).

    Returns:
        The element of ``text`` located one position after ``current_token``.

    Raises:
        ValueError: If no element of ``text`` is or equals ``current_token``.
        IndexError: If ``current_token`` is the last element of ``text``.
    """
    for position, candidate in enumerate(text):
        if candidate is current_token:
            return text[position + 1]
    # Not present by identity (e.g. the caller passed a copy): keep the
    # original equality-based lookup so such callers still work.
    return text[text.index(current_token) + 1]
def previous_token(text, current_token):
    """Return the token that directly precedes ``current_token`` in ``text``.

    The position of ``current_token`` is resolved by object identity first.
    ``list.index`` alone returns the first element that compares *equal*, so
    when two tokens hold the same values it would report the earlier one and
    the wrong neighbour would be returned.

    NOTE(review): when ``current_token`` is the first element, the lookup
    uses index ``-1`` and therefore returns the *last* element of ``text``.
    That wrap-around is kept as-is because callers outside this file may
    rely on it — confirm before changing.

    Args:
        text: List of tokens.
        current_token: A token from ``text`` (or a value equal to one).

    Returns:
        The element of ``text`` located one position before ``current_token``.

    Raises:
        ValueError: If no element of ``text`` is or equals ``current_token``.
    """
    for position, candidate in enumerate(text):
        if candidate is current_token:
            return text[position - 1]
    # Not present by identity (e.g. the caller passed a copy): keep the
    # original equality-based lookup so such callers still work.
    return text[text.index(current_token) - 1]
def collocations(text, n=2):
    """Collect lemma n-grams that start at a word token.

    A window of ``n`` consecutive tokens is examined at every position where
    it fits inside ``text``. The window is kept only if its first token has
    ``token_type == 'word'`` and none of the following tokens in the window
    has ``token_type == 'punct'``. Following tokens with an empty lemma are
    consumed (they count towards ``n``) but contribute nothing to the result,
    so a returned collocation may hold fewer than ``n`` lemmas.

    Windows are addressed by position instead of ``text.index(token)``:
    the index lookup is linear per step and lands on the first *equal*
    token, which yields the wrong neighbours when the same token values
    occur more than once. Bounding the start positions by ``n`` (rather
    than a fixed two-token margin) means trailing n-grams are included and
    large ``n`` can no longer run past the end of ``text``.

    Args:
        text: List of token dicts with at least ``'token_type'`` and
            ``'lemma'`` keys.
        n: Number of tokens per window; values below 1 behave like 1.

    Returns:
        A list of collocations, each a list of lemma strings.
    """
    collocations_list = []
    span = max(n, 1)
    for start in range(len(text) - span + 1):
        head = text[start]
        if head['token_type'] != 'word':
            continue
        collocation = [head['lemma']]
        for follower in text[start + 1:start + span]:
            if follower['token_type'] == 'punct':
                # Punctuation inside the window invalidates the whole n-gram.
                collocation = None
                break
            if follower['lemma']:
                collocation.append(follower['lemma'])
        if collocation:
            collocations_list.append(collocation)
    return collocations_list
def search_collocations(text, n=2, dict_name="dict_ORG_2", label="ORG"):
    """Label every ``n``-token window whose joined lemmas occur in a dictionary.

    The dictionary is loaded with ``Use.open_dictionary(dict_name)``. For each
    window that starts at a token with ``token_type == 'word'`` and contains
    no ``'punct'`` token, the non-empty lemmas are joined with single spaces
    and looked up with ``in``. On a hit, ``label + '_B'`` is appended to the
    first token's ``'dict'`` list and ``label + '_I'`` to the ``'dict'`` list
    of each remaining token in the window (including tokens whose lemma was
    empty and therefore not part of the lookup key).

    Windows are addressed by position instead of ``text.index(token)``:
    the index lookup is linear per step and lands on the first *equal*
    token, so labels could be attached to an earlier look-alike token.
    Start positions cover every place the window fits, so a match that ends
    on the last token of ``text`` is labelled too.

    Args:
        text: List of token dicts with ``'token_type'``, ``'lemma'`` and a
            list-valued ``'dict'`` key. Mutated in place.
        n: Number of tokens per window. Values below 1 leave ``text``
            unlabelled.
        dict_name: Name passed to ``Use.open_dictionary``.
        label: Prefix for the ``_B`` / ``_I`` labels.

    Returns:
        The same ``text`` list that was passed in.
    """
    dictionary = Use.open_dictionary(dict_name)
    if n < 1:
        return text
    for start in range(len(text) - n + 1):
        head = text[start]
        if head['token_type'] != 'word':
            continue
        followers = text[start + 1:start + n]
        lemmas = [head['lemma']]
        for follower in followers:
            if follower['token_type'] == 'punct':
                # Punctuation inside the window invalidates the whole n-gram.
                lemmas = None
                break
            if follower['lemma']:
                lemmas.append(follower['lemma'])
        if lemmas and ' '.join(lemmas) in dictionary:
            head['dict'].append(label + '_B')
            for follower in followers:
                follower['dict'].append(label + '_I')
    return text
# Rule names that mark a token as already claimed; such tokens are never
# used as the start of a partial-search match.
_PARTIAL_SEARCH_BLOCKING_RULES = (
    "org_by_dict",
    "org_by_descr_rule",
    "person_by_dict",
    "popular_name_rule",
)


def _has_blocking_rule(token):
    """Return True if ``token['rules']`` contains any blocking rule name."""
    rules = token['rules']
    return any(name in rules for name in _PARTIAL_SEARCH_BLOCKING_RULES)


def partial_search(text, collocation, label="collocation", rule="partial_search_rule", c_type="list"):
    """Mark occurrences of ``collocation`` in ``text`` under the given rule.

    With ``c_type == "list"``, ``collocation`` is a sequence of lemmas. Every
    run of consecutive tokens whose lemmas equal that sequence is marked:
    the first token gets ``rules[rule] = label + "_B"`` and the remaining
    tokens get ``rules[rule] = label + "_I"``. Only the first token of a run
    is checked against the blocking rules.

    With ``c_type == "string"``, ``collocation`` is a single lemma. Every
    token with that lemma and ``shape == 'capitalized'`` gets
    ``rules[rule] = label + "_B"``.

    In both modes a token that already carries one of the blocking rules
    (``org_by_dict``, ``org_by_descr_rule``, ``person_by_dict``,
    ``popular_name_rule``) is skipped as a starting point. Any other
    ``c_type`` leaves ``text`` untouched.

    Runs are compared by position, so a collocation that ends on the last
    token of ``text`` is found, and tokens that merely compare equal to an
    earlier token are never confused with it.

    Args:
        text: List of token dicts with ``'lemma'`` and a dict-like
            ``'rules'`` key (plus ``'shape'`` in string mode). Mutated in
            place.
        collocation: Sequence of lemmas (list mode) or one lemma (string
            mode).
        label: Prefix for the ``_B`` / ``_I`` values.
        rule: Key written into each matched token's ``'rules'``.
        c_type: ``"list"`` or ``"string"``.

    Returns:
        The same ``text`` list that was passed in.
    """
    if c_type == "list":
        width = len(collocation)
        # An empty collocation matches nothing.
        last_start = len(text) - width if width else -1
        for start in range(last_start + 1):
            head = text[start]
            if _has_blocking_rule(head):
                continue
            window = text[start:start + width]
            if all(tok['lemma'] == lemma for tok, lemma in zip(window, collocation)):
                head["rules"].update({rule: label + "_B"})
                for tok in window[1:]:
                    tok["rules"].update({rule: label + "_I"})
    elif c_type == 'string':
        for token in text:
            if _has_blocking_rule(token):
                continue
            if token['lemma'] == collocation and token['shape'] == 'capitalized':
                token['rules'].update({rule: label + "_B"})
    return text
def search_all_collocations(text, dict_name="dict_ORG", label="ORG"):
    """Run ``search_collocations`` over ``text`` for windows of 1, 2 and 3 tokens.

    Each pass receives the result of the previous one, with the same
    ``dict_name`` and ``label``.

    Args:
        text: Token list handed to ``search_collocations``.
        dict_name: Dictionary name forwarded to every pass.
        label: Label prefix forwarded to every pass.

    Returns:
        The value returned by the final ``search_collocations`` pass.
    """
    # Wider windows (4, 5, 6, and 8/9 against dict_name + '_6') were
    # commented out in the original and stay disabled.
    for window_size in (1, 2, 3):
        text = search_collocations(text, n=window_size, dict_name=dict_name, label=label)
    return text
def check_references(text, word):
    """Report whether ``word`` never occurs in ``text`` as a lower-case token.

    Args:
        text: Iterable of token dicts with ``'lemma'`` and ``'shape'`` keys.
        word: Lemma to look for.

    Returns:
        ``False`` as soon as a token with ``lemma == word`` and
        ``shape == 'lower'`` is found; ``True`` if there is none.
    """
    return not any(
        entry['lemma'] == word and entry['shape'] == 'lower'
        for entry in text
    )