-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtokenizer.py
42 lines (39 loc) · 1.27 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from string import split
class Tokenizer(object):
    """Base class describing the tokenizer interface.

    Concrete tokenizers are expected to override ``tokenize``; the version
    defined here only raises ``NotImplementedError``.
    """

    def tokenize(self, string):
        """Tokenize ``string``; subclasses must provide the real behaviour.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        message = "Should have implemented a tokenize(self, string) method"
        raise NotImplementedError(message)
class PuncTokenizer(Tokenizer):
    """Split a string into tokens, using ASCII punctuation as delimiters.

    Each maximal run of non-punctuation characters becomes one token.
    The punctuation characters themselves are discarded, and consecutive
    delimiters never produce empty tokens. Whitespace is not a delimiter,
    so it stays inside the tokens.
    """

    # All 32 ASCII punctuation characters (the same characters as
    # ``string.punctuation``). Besides the code ranges 33-47 and 58-64
    # this includes the brackets, backslash, caret, underscore, backtick,
    # braces, pipe and tilde (codes 91-96 and 123-126).
    _DELIMITERS = frozenset('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')

    def __init__(self):
        pass

    def tokenize(self, string):
        """Return the punctuation-separated tokens found in ``string``.

        The returned list is also stored on the instance as
        ``self.tokens``; a fresh list is created on every call.
        """
        self.tokens = []
        pending = []  # characters of the token currently being built
        for char in string:
            if char in self._DELIMITERS:
                # A delimiter closes the current token, if there is one.
                if pending:
                    self.tokens.append(''.join(pending))
                    pending = []
            else:
                pending.append(char)
        # Flush a trailing token that was not followed by a delimiter.
        if pending:
            self.tokens.append(''.join(pending))
        return self.tokens
class NumTokenizer(Tokenizer):
    """Tokenizer that treats the ASCII digits 0-9 as separators.

    Every maximal run of non-digit characters becomes one token; the
    digits themselves are dropped and never yield empty tokens.
    """

    def __init__(self):
        """Nothing to set up; ``tokens`` is created by ``tokenize``."""

    def tokenize(self, string):
        """Return the digit-separated tokens found in ``string``.

        The returned list is the same object that is stored on the
        instance as ``self.tokens``; a new list is made on each call.
        """
        result = self.tokens = []
        word = ''
        for ch in string:
            # Code points 48..57 are the characters '0'..'9'.
            if not 48 <= ord(ch) <= 57:
                word = word + ch
                continue
            # A digit ends the token collected so far (if any).
            if word:
                result.append(word)
                word = ''
        # Emit whatever was collected after the last digit.
        if word:
            result.append(word)
        return result