tokenizer.py
import clang.cindex
import clang.enumerations
import csv
# set the config: point clang.cindex at the libclang install (path is machine-specific)
clang.cindex.Config.set_library_path("/usr/local/Cellar/llvm/11.1.0/lib")
class Tokenizer:
    # creates the object, does the initial parse
    def __init__(self, path=None, c_str=None):
        if not path and not c_str:
            raise Exception("Requires at least one argument among path or c_str")
        self.index = clang.cindex.Index.create()
        if path:
            self.tu = self.index.parse(path)
            self.path = self.extract_path(path)
        if c_str:
            self.tu = clang.cindex.Index.create().parse('temp.c', args=[], unsaved_files=[('temp.c', c_str)])
            self.path = None
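        # token_map.csv is not shown in this file; it is assumed to contain
        # rows of the form <token>,<codepoint>, e.g. a hypothetical row
        # "int,97" would map the token 'int' to the single character 'a'.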
        # read in and process the CSV file (once)
        self.token_map = {}
        with open("./token_map.csv", "r") as handle:
            csv_reader = csv.reader(handle)
            for row in csv_reader:
                self.token_map[row[0]] = chr(int(row[1]))
    # For output to split_functions, files must share the same path up to the last two folders
    def extract_path(self, path):
        return "".join(path.split("/")[:-2])
    # does further processing on a literal token
    def process_literal(self, literal):
        cursor_kind = clang.cindex.CursorKind
        kind = literal.cursor.kind
        if kind == cursor_kind.INTEGER_LITERAL:
            return ["NUM"]
        if kind == cursor_kind.FLOATING_LITERAL:
            return ["NUM"]
        if kind == cursor_kind.IMAGINARY_LITERAL:
            return ["NUM"]
        if kind == cursor_kind.STRING_LITERAL:
            return ["STRING"]
        if kind == cursor_kind.CHARACTER_LITERAL:
            return ["CHAR"]
        if kind == cursor_kind.CXX_BOOL_LITERAL_EXPR:
            return ["BOOL"]
        # catch all other literals
        return ["LITERAL"]
    # filters out unwanted punctuation
    def process_punctuation(self, punctuation):
        spelling = punctuation.spelling
        # ignore certain characters
        if spelling in ["{", "}", "(", ")", ";"]:
            return None
        return [spelling]
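    # Illustrative example (not from the original source): for `x = a + b;`
    # the `=` and `+` tokens are returned unchanged, while the trailing `;`
    # is filtered out by the check above.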
    # further processes an identifier token
    def process_ident(self, ident):
        # are we a "special" ident?
        if ident.spelling in ["std", "cout", "cin", "vector", "pair", "string", "NULL", "size_t"]:
            return [ident.spelling]
        # are we a declaration?
        if ident.cursor.kind.is_declaration():
            return ["DEC"]
        # are we a reference kind?
        if ident.cursor.kind.is_reference():
            return ["REF"]
        # are we a variable use?
        if ident.cursor.kind == clang.cindex.CursorKind.DECL_REF_EXPR:
            return ["USE"]
        # catch all others
        return ["IDENT"]
    # tokenizes the contents of a specific cursor
    def full_tokenize_cursor(self, cursor):
        tokens = cursor.get_tokens()
        # return final tokens as a list
        result = []
        for token in tokens:
            if token.kind.name == "COMMENT":
                # ignore all comments
                continue
            if token.kind.name == "PUNCTUATION":
                punct_or_none = self.process_punctuation(token)
                # add only if not ignored
                if punct_or_none is not None:
                    result += punct_or_none
                continue
            if token.kind.name == "LITERAL":
                result += self.process_literal(token)
                continue
            if token.kind.name == "IDENTIFIER":
                result += self.process_ident(token)
                continue
            if token.kind.name == "KEYWORD":
                result += [token.spelling]
        return result
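    # Illustrative example (not from the original source): a fragment such as
    # `int x = 5;` roughly tokenizes to ['int', 'DEC', '=', 'NUM']: the
    # semicolon is dropped as punctuation and the literal collapses to NUM;
    # exact output depends on how libclang maps tokens to cursors.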
    # tokenizes the entire document
    def full_tokenize(self):
        cursor = self.tu.cursor
        return self.full_tokenize_cursor(cursor)
    # attempts to reduce each token to a single character
    def full_tokenize_compressed(self):
        tokens = self.full_tokenize()
        result = []
        for token in tokens:
            if token in self.token_map:
                result.append(self.token_map[token])
            else:
                print("UNMAPPED TOKEN: {}".format(token))
                result.append(token)
        return "".join(result)
if __name__ == "__main__":
# testing function
import sys
if len(sys.argv) != 2:
print("please provide a file argument")
exit(1)
tok = Tokenizer(sys.argv[1]) # path to a C++ file
results = tok.split_functions(False)
for res in results:
print(res[0] + " (" + res[2] + "):")
print("Tokens: {}".format(res[1]))
print("Compressed Tokens: {}".format(compress_tokens(res[1])))
print("")