-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexample.py
executable file
·29 lines (26 loc) · 1.19 KB
/
example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import metapy
def tokens_lowercase(doc):
    """Return the stemmed, lowercased word trigrams found in *doc*.

    The token stream is assembled in this order:

    1. ``ICUTokenizer`` constructed with ``suppress_tags=True``.
    2. ``LowercaseFilter``.
    3. ``LengthFilter`` with ``min=2`` and ``max=5``.
    4. ``Porter2Filter`` (stemming).

    The finished stream is handed to an ``NGramWordAnalyzer`` of order 3,
    whose ``analyze`` result is bound to ``trigrams``.

    Args:
        doc: The document to analyze (the script entry point passes a
            ``metapy.index.Document``).

    Returns:
        A list holding every key of the mapping returned by
        ``ana.analyze(doc)``, in that mapping's iteration order.
    """
    tok = metapy.analyzers.ICUTokenizer(suppress_tags=True)
    tok = metapy.analyzers.LowercaseFilter(tok)
    tok = metapy.analyzers.LengthFilter(tok, min=2, max=5)
    tok = metapy.analyzers.Porter2Filter(tok)

    ana = metapy.analyzers.NGramWordAnalyzer(3, tok)
    trigrams = ana.analyze(doc)

    # Only the n-gram keys are returned; the per-trigram counts are not
    # needed.  ``tok`` is not reset afterwards because nothing reads from it
    # once the analyzer has produced ``trigrams``.
    return [token for token, _count in trigrams.items()]
if __name__ == '__main__':
    # Build a single in-memory document to exercise tokens_lowercase.
    doc = metapy.index.Document()
    doc.content("I said that I can't believe that it only costs $19.95! I could only find it for more than $30 before.")

    # Calling .content() with no argument reads the document string back.
    print(doc.content())

    # Show the trigram tokens extracted from the document.
    print(tokens_lowercase(doc))