tokenize.py
#------------------------------------------------------------------------
# Menu of tokenizers
#
# A tokenizer is a function that takes a raw string and returns an ordered
# list of tokens.
#------------------------------------------------------------------------
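# Illustrative example of that contract (assuming the tokenizers defined below):
#   tokenize_whitespace("Hi, I'm Bob.") -> ['Hi,', "I'm", 'Bob.']
#   tokenize_alpha("Hi, I'm Bob.")      -> ['Hi', "I'm", 'Bob']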
import re
def tokenize_whitespace(message):
    return message.split()
# tokenizer that removes all punctuation except apostrophes
def tokenize_alpha(message):
    # improves on the default by taking punctuation into account
    wordList = re.sub(r"[^\w']", " ", message).split()
    return wordList
# tokenizer that removes all punctuation and apostrophes
def tokenize_alpha2(message):
    wordList = re.sub(r"[^\w]", " ", message).split()
    return wordList
# tokenizer that removes all punctuation except "I'm"
def tokenize_alpha3(message):
    wordList = re.sub(r"[^\w]", " ", message).split()
    # the substitution splits "I'm" into "I" and "m"; rejoin such pairs in place
    result = []
    i = 0
    while i < len(wordList):
        if wordList[i] in ("I", "i") and i + 1 < len(wordList) and wordList[i + 1] in ("M", "m"):
            result.append("I'm")
            i += 2
        else:
            result.append(wordList[i])
            i += 1
    return result
# tokenizer that lower-cases the message and removes punctuation
def tokenize_alpha4(message):
    newmessage = message.lower()
    wordList = re.sub(r"[^\w]", " ", newmessage).split()
    return wordList
# map from short label to (function, description) tuple
tokenize_menu = {
    'default': (tokenize_whitespace, "Use original whitespace-separated tokens"),
    'alpha':   (tokenize_alpha, "Break on any nonalpha (other than apostrophe)"),
    'alpha2':  (tokenize_alpha2, "Break on any nonalpha character"),
    'alpha3':  (tokenize_alpha3, "Break on any nonalpha character, except in \"I'm\""),
    'alpha4':  (tokenize_alpha4, "Lower-case the string, then break on any nonalpha character")
}
tokenize_names = tuple(sorted(tokenize_menu.keys()))
tokenize_default = 'default'
# menu-key labels for the remaining tokenizers (given distinct names so the
# tokenizer functions defined above are not shadowed)
tokenize_alpha_label = 'alpha'
tokenize_alpha2_label = 'alpha2'
tokenize_alpha3_label = 'alpha3'
tokenize_alpha4_label = 'alpha4'
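
# Minimal usage sketch: look a tokenizer up by its menu label and apply it.
# (The sample message below is made up purely for illustration.)
if __name__ == '__main__':
    func, description = tokenize_menu[tokenize_default]
    print(description)
    print(func("Hello there, I'm a sample message!"))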