-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathscanner.mll
141 lines (125 loc) · 3.8 KB
/
scanner.mll
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
{
(* Header section: copied verbatim into the generated scanner module. *)
open Parser;;
open Utils;;
(* Raised on any lexical error; the payload is a human-readable message. *)
exception ScannerError of string;;
(* [scanner_err msg linenum] raises [ScannerError] with [msg] formatted
   against [linenum].  [msg] must be a format string containing exactly
   one [%d] hole (all call sites below follow this).  Never returns, so
   it can appear in any rule action regardless of the expected type. *)
let scanner_err msg linenum =
raise (ScannerError (Printf.sprintf msg linenum))
;;
}
(* Named character classes and regular expressions shared by the rules. *)
let alpha = ['a'-'z' 'A'-'Z']
let digit = ['0'-'9']
let squote = '\''
let bslash = '\\'
(* One octal digit; exactly three of them form a '\nnn' escape. *)
let octal_dig = ['0'-'7']
let octal_triplet = (octal_dig)(octal_dig)(octal_dig)
let integer = digit+
(* Exponent part of a float literal, e.g. [e-12]. *)
let exp = 'e'['-''+']?['0'-'9']+
(* NOTE(review): [sliterals] is defined but not referenced by any rule;
   its only would-be user is the commented-out string rule below. *)
let sliterals = ([^ '\\' '"' '\n'] | ('\\'[^ '\n' ]))*
(* Float forms: digits '.' digits?, digits? '.' digits, or digits+exp;
   the exponent is optional on the first two forms. *)
let cfloat = (
((digit)+'.'(digit)* (exp)?) |
((digit)* '.'(digit)+(exp)?) |
((digit)+exp)
)
(* Identifiers: a letter or underscore, then letters/digits/underscores. *)
let normal_id = (alpha | '_')(alpha | digit | '_')*
let whitespace = [' ' '\t' '\r' '\n']
(* Printable ASCII: space (0x20) through tilde (0x7E). *)
let print_char = [' '-'~']
(* [tokenize lexbuf] returns the next token from [lexbuf].
   Whitespace and comments are skipped; every '\n' is routed through
   [Lexing.new_line] so that [line_num] reports accurate line numbers
   in error messages.  Raises [ScannerError] (via [scanner_err]) on
   malformed input. *)
rule tokenize = parse
[' ' '\t' '\r'] { tokenize lexbuf }
| '\n' { Lexing.new_line lexbuf; tokenize lexbuf }
(* Punctuation *)
| '(' { LPAREN }
| ')' { RPAREN }
| '{' { LBRACE }
| '}' { RBRACE }
| '[' { LBRACKET }
| ']' { RBRACKET }
| ',' { COMMA }
| ';' { SEMI }
(* Operators *)
| '+' { PLUS }
| '-' { MINUS }
| '*' { TIMES }
| '/' { DIVIDE }
| '=' { ASSIGN }
| '%' { MOD }
| "+=" { PLUSEQ }
| "-=" { MINUSEQ }
| "==" { EQ }
| "!=" { NEQ }
| '<' { LT }
| "<=" { LEQ }
| ">" { GT }
| ">=" { GEQ }
| "&&" { AND }
| "||" { OR }
| "!" { NOT }
| '.' { DOT }
(* Control flow.  "else <ws> if" out-matches the plain "else" rule by
   longest-match; the captured whitespace is re-counted so newlines
   between "else" and "if" still advance the line tracker. *)
| "if" { IF }
| "else" { ELSE }
| "else" whitespace+ as ws "if" {count_new_lines ws lexbuf; ELIF }
| "for" { FOR }
| "while" { WHILE }
| "break" { BREAK }
| "continue" { CONTINUE }
(* Types.  Keywords win ties against [normal_id] because they appear
   first; longer identifiers like "iffy" still lex as ID. *)
| "int" { INT }
| "float" { FLOAT }
| "char" { CHAR }
| "string" { STRING }
| "void" { VOID }
| "struct" { STRUCT }
| "socket" { SOCKET }
| "file" { FILE }
(* Functions *)
| "return" { RETURN }
(* Memory *)
| "new" { NEW }
| "delete" { DELETE }
(* Tokens matched by regex *)
| "//" { scomment lexbuf }
| "/*" { mcomment lexbuf }
| normal_id as lxm {ID(lxm)}
| integer as lxm { INTLIT(int_of_string lxm) }
(* String literals are decoded by the [strlit] rule, which handles
   escapes and raises on an unmatched quote. *)
| '"' { STRLIT(strlit "" lexbuf) }
(* Octal char escape, e.g. '\101'. *)
| squote bslash ((octal_triplet) as oct_num) squote { CHARLIT(int_of_string ("0o" ^ oct_num)) }
| squote squote { scanner_err "empty char literal on line %d" (line_num lexbuf)}
| squote bslash (['f' 'n' 'r' 't' '\\' '0' '\''] as spec_char) squote {
match spec_char with
'f' -> CHARLIT(12)
| 'n' -> CHARLIT(10)
| 'r' -> CHARLIT(13)
| 't' -> CHARLIT(9)
| '\\' -> CHARLIT(92)
| '0' -> CHARLIT(0)
(* BUG FIX: was CHARLIT(47); ASCII 47 is '/', the apostrophe is 39. *)
| '\'' -> CHARLIT(39)
| _ -> scanner_err "[COMPILER BUG] unaccounted escape character line %d" (line_num lexbuf)
}
| squote print_char squote as lxm {CHARLIT(Char.code(lxm.[1]))} (*For chars like 'a'*)
| cfloat as flt { FLOATLIT(float_of_string flt) } (* TODO Optional negative sign *)
(* Error cases.  A lone '"' never reaches here (the STRLIT rule above
   matches it first and [strlit] reports the unmatched quote), so only
   the single-quote case is listed. *)
| squote { scanner_err "unmatched quote on line %d" (line_num lexbuf) }
| eof { EOF }
| _ { scanner_err "illegal character on line %d" (line_num lexbuf) }
(* [strlit str_so_far lexbuf] scans the body of a string literal after
   the opening quote and returns the decoded contents.  [str_so_far]
   accumulates the characters decoded so far ([tokenize] passes "").
   Raises [ScannerError] on EOF (unmatched quote) or a non-printable
   character inside the literal. *)
and strlit str_so_far = parse
'"' { str_so_far }
| "\\n" { strlit (str_so_far ^ "\n") lexbuf }
| "\\t" { strlit (str_so_far ^ "\t") lexbuf }
| "\\r" { strlit (str_so_far ^ "\r") lexbuf }
| "\\0" { strlit (str_so_far ^ (String.make 1 (Char.chr 0))) lexbuf }
| "\\\\" { strlit (str_so_far ^ "\\") lexbuf }
(* BUG FIX: this rule was ["\""] — a dead duplicate of the closing-quote
   rule above — so the escape [\"] fell through to [print_char], appended
   a literal backslash, and then terminated the string at the quote.
   An escaped quote now correctly decodes to ["]. *)
| "\\\"" { strlit (str_so_far ^ "\"") lexbuf }
(* Octal escape \nnn inside a string, e.g. \101. *)
| "\\" ((octal_triplet) as oct_num)
{ strlit (str_so_far ^ (String.make 1 (Char.chr (int_of_string ("0o" ^ oct_num))))) lexbuf }
| print_char as lxm { strlit (str_so_far ^ (String.make 1 lxm)) lexbuf }
| eof { scanner_err "unmatched quote on line %d" (line_num lexbuf) }
| _ { scanner_err "illegal character in string literal on line %d" (line_num lexbuf) }
(* Single-line ("//") comment: discard everything up to end of line,
   then hand control back to [tokenize].  Clause order matters only for
   the newline/wildcard tie, so EOF is listed first for clarity. *)
and scomment = parse
eof { tokenize lexbuf }
| '\n' { Lexing.new_line lexbuf; tokenize lexbuf }
| _ { scomment lexbuf }
(* Multi-line ("/* ... */") comment: discard until the closing "*/"
   (longest-match lets it win over the wildcard regardless of clause
   position), counting newlines as we go.  Not nestable.  Hitting EOF
   first is a hard error. *)
and mcomment = parse
eof { raise (ScannerError("reached end of file with an unclosed multiline comment"))}
| "*/" { tokenize lexbuf }
| '\n' { Lexing.new_line lexbuf; mcomment lexbuf }
| _ { mcomment lexbuf }