Skip to content

Commit cb19a9a

Browse files
committedSep 25, 2016
init project open sourced
1 parent a8f40a0 commit cb19a9a

12 files changed

+1725
-30
lines changed
 

‎.gitignore

+4-29
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,4 @@
1-
# Compiled Object files
2-
*.slo
3-
*.lo
4-
*.o
5-
*.obj
6-
7-
# Precompiled Headers
8-
*.gch
9-
*.pch
10-
11-
# Compiled Dynamic libraries
12-
*.so
13-
*.dylib
14-
*.dll
15-
16-
# Fortran module files
17-
*.mod
18-
*.smod
19-
20-
# Compiled Static libraries
21-
*.lai
22-
*.la
23-
*.a
24-
*.lib
25-
26-
# Executables
27-
*.exe
28-
*.out
29-
*.app
1+
.vscode/
2+
build/
3+
test/*.exe
4+
test/*.out

‎CMakeLists.txt

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
cmake_minimum_required (VERSION 3.1.0)
2+
project (spre_test)
3+
4+
include_directories(include)
5+
file(GLOB SOURCES "test/*.cpp")
6+
7+
add_executable(spre_test ${SOURCES})
8+
9+
set_property(TARGET spre_test PROPERTY CXX_STANDARD 14)
10+
set_property(TARGET spre_test PROPERTY CXX_STANDARD_REQUIRED ON)

‎README.md

+75-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,76 @@
11
# SRL-CPP
2-
C++ implementation of SRL.
2+
3+
This is a header-only C++ library for Simple Regex Language.
4+
5+
**Currently under development.**
6+
7+
## Usage
8+
9+
You only need to include the library directory and use C++ 14 standard to compile. The namespace used is `spre::`.
10+
11+
The library uses C++ 11 features heavily, and uses `make_unique` from C++ 14. You could use the latest Visual Studio, or `g++-4.9` or later, or `clang++-3.8` or later. The project uses `cmake` as the building system.
12+
13+
```cpp
14+
// test.cpp
15+
#include <string>
16+
#include <iostream>
17+
#include <spre/spre.hpp>
18+
int main()
19+
{
20+
std::string src = "literally \"something\"";
21+
spre::SRL srl(src);
22+
std::cout << srl.get_pattern() << std::endl;
23+
return 0;
24+
}
25+
```
26+
27+
```bash
28+
$ tree .
29+
.
30+
|-- test.cpp
31+
`-- include
32+
`-- spre
33+
|-- ast.hpp
34+
|-- ...
35+
...
36+
37+
$ g++ -I./include -std=c++14 -o test test.cpp
38+
$ ./test
39+
(?:something)
40+
```
41+
42+
The project uses `cmake` as the building system. It's especially useful for development in Visual Studio on Windows.
43+
44+
## License
45+
46+
MIT.
47+
48+
## Limitations and TODOs
49+
50+
- The `Builder` is yet to be implemented.
51+
- Errors are currently reported by writing to `stderr`.
52+
53+
## Technical Structures
54+
55+
First of all, it's designed to be a header-only library. Thus everything is written in the header files (`.hpp`). Ideally users only need to `#include <spre/spre.hpp>`.
56+
57+
The library is written as a light-weight compiler-like thing, although SRL is a DSL and does not have control flow (as a subset of Regex) and thus could not be considered Turing-complete. As a result, this library has a lexer, a parser and a code generator. This library has its own lexer instead of using `yacc`. The code is written following the tutorials from [llvm](http://llvm.org/docs/tutorial/LangImpl02.html) and [@](http://frozengene.github.io/blog/compiler/2014/08/10/compiler_tutorial_03/).
58+
59+
The structure:
60+
61+
```txt
62+
63+
token.hpp
64+
|
65+
V
66+
dictionary.hpp ast.hpp
67+
| |
68+
V V
69+
lexer.hpp ---------> parser.hpp --------> generator.hpp
70+
(get tokens) (get (vector of) asts) (get the compiled regex string)
71+
|
72+
V
73+
spre.hpp
74+
(`SRL` and `Builder`)
75+
```
76+

‎include/spre/ast.hpp

+207
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
/*
2+
* the ideas behind this file are heavily borrowed from
3+
* http://llvm.org/docs/tutorial/LangImpl02.html
4+
* that is UIUC (BSD-like) licensed
5+
*/
6+
7+
#ifndef SIMPLEREGEXLANGUAGE_AST_H_
8+
#define SIMPLEREGEXLANGUAGE_AST_H_
9+
10+
#include <string>
11+
#include <vector>
12+
#include <memory>
13+
14+
using std::string;
15+
using std::vector;
16+
using std::unique_ptr;
17+
18+
19+
namespace spre
{
    /**
     * Common interface of every AST node.
     *
     * Each node knows how to render itself as a fragment of the generated
     * regular expression via get_val().
     */
    class ExprAST
    {
    public:
        /** @return the regex fragment this node stands for. */
        virtual std::string get_val() const = 0;
        virtual ~ExprAST() = default;
    };


    /** Leaf node holding an already-rendered character(-class) fragment. */
    class CharacterExprAST : public ExprAST
    {
    public:
        CharacterExprAST(const std::string &val = "");
        std::string get_val() const override;
    private:
        const std::string val_;
    };

    // Out-of-class definitions in a header must be `inline`; otherwise every
    // translation unit including this header emits its own definition and
    // the program fails to link (one-definition rule).
    inline CharacterExprAST::CharacterExprAST(const std::string &val) : val_(val)
    {
    }

    inline std::string CharacterExprAST::get_val() const
    {
        return val_;
    }


    /** Leaf node holding an already-rendered quantifier fragment. */
    class QuantifierExprAST : public ExprAST
    {
    public:
        QuantifierExprAST(const std::string &val);
        std::string get_val() const override;
    private:
        const std::string val_;
    };

    inline QuantifierExprAST::QuantifierExprAST(const std::string &val) : val_(val)
    {
    }

    inline std::string QuantifierExprAST::get_val() const
    {
        return val_;
    }


    /**
     * Node that owns a sequence of child nodes and renders them wrapped in
     * parentheses, optionally prefixed with "<name>".
     */
    class GroupExprAST : public ExprAST
    {
    public:
        GroupExprAST(std::vector<std::unique_ptr<ExprAST>> cond);
        GroupExprAST(std::vector<std::unique_ptr<ExprAST>> cond,
            const std::string &name, std::vector<std::unique_ptr<ExprAST>> until_cond);
        void set_name(const std::string &name);
        void set_until_cond(std::vector<std::unique_ptr<ExprAST>> until_cond);
        std::string get_val() const override;
    private:
        std::vector<std::unique_ptr<ExprAST>> cond_;
        std::string name_;
        std::vector<std::unique_ptr<ExprAST>> until_cond_; // stored but not rendered by get_val(); maybe useless
    };

    inline GroupExprAST::GroupExprAST(std::vector<std::unique_ptr<ExprAST>> cond)
        : cond_(std::move(cond))
    {
    }

    inline GroupExprAST::GroupExprAST(std::vector<std::unique_ptr<ExprAST>> cond,
        const std::string &name, std::vector<std::unique_ptr<ExprAST>> until_cond)
        : cond_(std::move(cond)), name_(name), until_cond_(std::move(until_cond))
    {
    }

    inline void GroupExprAST::set_name(const std::string &name)
    {
        name_ = name;
    }

    inline void GroupExprAST::set_until_cond(std::vector<std::unique_ptr<ExprAST>> until_cond)
    {
        until_cond_ = std::move(until_cond);
    }

    /**
     * @return "(" + optional "<name>" + concatenated children + ")", or an
     *         empty string when the group has no children (error case).
     */
    inline std::string GroupExprAST::get_val() const
    {
        if (cond_.empty())
        {
            // error!
            return "";
        }

        std::string res = "(";

        if (!name_.empty())
        {
            res.append("<");
            res.append(name_);
            res.append(">");
        }

        for (auto const &child : cond_)
        {
            res.append(child->get_val());
        }
        res.append(")");
        return res;
    }


    /**
     * Node rendered either as a single fixed fragment (one entry in `vals`)
     * or as `vals[0] + children + vals[1]` (two entries plus children).
     */
    class LookAroundExprAST : public ExprAST
    {
    public:
        // `vals` is taken by non-const value so that it can really be moved
        // into the member; a top-level const here would turn the move into
        // a copy. The function signature seen by callers is unchanged.
        LookAroundExprAST(std::vector<std::string> vals,
            std::vector<std::unique_ptr<ExprAST>> cond = std::vector<std::unique_ptr<ExprAST>>());
        std::string get_val() const override;
    private:
        const std::vector<std::string> vals_;
        std::vector<std::unique_ptr<ExprAST>> cond_;
    };

    inline LookAroundExprAST::LookAroundExprAST(std::vector<std::string> vals,
        std::vector<std::unique_ptr<ExprAST>> cond)
        : vals_(std::move(vals)), cond_(std::move(cond))
    {
    }

    /**
     * @return the sole fragment, or the children wrapped between the two
     *         fragments; an empty string for any other combination (error).
     */
    inline std::string LookAroundExprAST::get_val() const
    {
        if (vals_.size() == 1)
        {
            return vals_[0];
        }
        if (vals_.size() == 2 && !cond_.empty())
        {
            std::string res = vals_[0];
            for (auto const &child : cond_)
            {
                res.append(child->get_val());
            }
            res.append(vals_[1]);
            return res;
        }

        // error!
        return "";
    }


    /** Leaf node holding an already-rendered flag fragment. */
    class FlagExprAST : public ExprAST
    {
    public:
        FlagExprAST(const std::string &val);
        std::string get_val() const override;
    private:
        const std::string val_;
    };

    inline FlagExprAST::FlagExprAST(const std::string &val) : val_(val)
    {
    }

    inline std::string FlagExprAST::get_val() const
    {
        return val_;
    }


    /** Leaf node holding an already-rendered anchor fragment. */
    class AnchorExprAST : public ExprAST
    {
    public:
        AnchorExprAST(const std::string &val);
        std::string get_val() const override;
    private:
        const std::string val_;
    };

    inline AnchorExprAST::AnchorExprAST(const std::string &val) : val_(val)
    {
    }

    inline std::string AnchorExprAST::get_val() const
    {
        return val_;
    }

}
206+
207+
#endif // !SIMPLEREGEXLANGUAGE_AST_H_

‎include/spre/builder.hpp

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#ifndef SIMPLEREGEXLANGUAGE_BUILDER_H_
2+
#define SIMPLEREGEXLANGUAGE_BUILDER_H_
3+
4+
namespace spre
{
    /**
     * Placeholder for the fluent builder API; it has no members yet.
     *
     * The special members are defaulted inside the class so that this header
     * can be included from several translation units without producing
     * duplicate (non-inline) definitions at link time.
     *
     * NOTE(review): spre.hpp declares a class with the same name in the same
     * namespace; including both headers in one translation unit would be a
     * redefinition - confirm which declaration is meant to survive.
     */
    class Builder
    {
    public:
        Builder() = default;
        ~Builder() = default;

    private:

    };

}
26+
27+
#endif // !SIMPLEREGEXLANGUAGE_BUILDER_H_

‎include/spre/dictionary.hpp

+294
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,294 @@
1+
/*
2+
* the ideas behind this file are heavily borrowed from
3+
* http://frozengene.github.io/blog/compiler/2014/08/10/compiler_tutorial_03/
4+
* that is BSD licensed
5+
*/
6+
7+
#ifndef SIMPLEREGEXLANGUAGE_DICTIONARY_H_
8+
#define SIMPLEREGEXLANGUAGE_DICTIONARY_H_
9+
10+
#include <tuple>
11+
#include <unordered_map>
12+
#include <string>
13+
#include "spre/token.hpp"
14+
15+
using std::tuple;
16+
using std::make_tuple;
17+
using std::unordered_map;
18+
using std::string;
19+
20+
namespace spre
21+
{
22+
using MetaType = tuple<TokenType, TokenValue>;
23+
24+
class Dictionary
25+
{
26+
public:
27+
Dictionary();
28+
~Dictionary();
29+
bool has_token(const string &name) const;
30+
size_t get_key_max_length() const;
31+
MetaType get(const string &name) const;
32+
TokenType get_token_type(const string &name) const;
33+
TokenValue get_token_value(const string &name) const;
34+
private:
35+
unordered_map<string, MetaType> dictionary_;
36+
size_t key_max_len_; // cache the max length of all the keys!
37+
};
38+
39+
Dictionary::Dictionary():
40+
dictionary_{
41+
{
42+
"literally",
43+
make_tuple(TokenType::CHARACTER, TokenValue::LITERALLY)
44+
},
45+
{
46+
"one of",
47+
make_tuple(TokenType::CHARACTER, TokenValue::ONE_OF)
48+
},
49+
{
50+
"letter",
51+
make_tuple(TokenType::CHARACTER, TokenValue::LETTER)
52+
},
53+
{
54+
"uppercase letter",
55+
make_tuple(TokenType::CHARACTER, TokenValue::UPPERCASE_LETTER)
56+
},
57+
{
58+
"any character",
59+
make_tuple(TokenType::CHARACTER, TokenValue::ANY_CHARACTER)
60+
},
61+
{
62+
"no character",
63+
make_tuple(TokenType::CHARACTER, TokenValue::NO_CHARACTER)
64+
},
65+
{
66+
"digit",
67+
make_tuple(TokenType::CHARACTER, TokenValue::DIGIT)
68+
},
69+
{
70+
"anything",
71+
make_tuple(TokenType::CHARACTER, TokenValue::ANYTHING)
72+
},
73+
{
74+
"new line",
75+
make_tuple(TokenType::CHARACTER, TokenValue::NEW_LINE)
76+
},
77+
{
78+
"whitespace",
79+
make_tuple(TokenType::CHARACTER, TokenValue::WHITESPACE)
80+
},
81+
{
82+
"no whitespace",
83+
make_tuple(TokenType::CHARACTER, TokenValue::NO_WHITESPACE)
84+
},
85+
{
86+
"tab",
87+
make_tuple(TokenType::CHARACTER, TokenValue::TAB)
88+
},
89+
{
90+
"raw",
91+
make_tuple(TokenType::CHARACTER, TokenValue::RAW)
92+
},
93+
{
94+
"from",
95+
make_tuple(TokenType::CHARACTER, TokenValue::FROM)
96+
},
97+
{
98+
"to",
99+
make_tuple(TokenType::CHARACTER, TokenValue::TO)
100+
},
101+
102+
103+
{
104+
"exactly",
105+
make_tuple(TokenType::QUANTIFIER, TokenValue::EXCATLY_X_TIMES)
106+
},
107+
{
108+
"exactly 1 time",
109+
make_tuple(TokenType::QUANTIFIER, TokenValue::EXACTLY_ONE_TIME)
110+
},
111+
{
112+
"once",
113+
make_tuple(TokenType::QUANTIFIER, TokenValue::ONCE)
114+
},
115+
{
116+
"twice",
117+
make_tuple(TokenType::QUANTIFIER, TokenValue::TWICE)
118+
},
119+
{
120+
"between",
121+
make_tuple(TokenType::QUANTIFIER, TokenValue::BETWEEN_X_AND_Y_TIMES)
122+
},
123+
{
124+
"optional",
125+
make_tuple(TokenType::QUANTIFIER, TokenValue::OPTIONAL)
126+
},
127+
{
128+
"once or more",
129+
make_tuple(TokenType::QUANTIFIER, TokenValue::ONCE_OR_MORE)
130+
},
131+
{
132+
"never or more",
133+
make_tuple(TokenType::QUANTIFIER, TokenValue::NEVER_OR_MORE)
134+
},
135+
{
136+
"at least",
137+
make_tuple(TokenType::QUANTIFIER, TokenValue::AT_LEAST_X_TIMES)
138+
},
139+
{
140+
"time",
141+
make_tuple(TokenType::QUANTIFIER, TokenValue::TIME)
142+
},
143+
{
144+
"times",
145+
make_tuple(TokenType::QUANTIFIER, TokenValue::TIMES)
146+
},
147+
{
148+
"and",
149+
make_tuple(TokenType::QUANTIFIER, TokenValue::AND)
150+
},
151+
152+
153+
{
154+
"capture",
155+
make_tuple(TokenType::GROUP, TokenValue::CAPTURE_AS)
156+
},
157+
{
158+
"any of",
159+
make_tuple(TokenType::GROUP, TokenValue::ANY_OF)
160+
},
161+
{
162+
"until",
163+
make_tuple(TokenType::GROUP, TokenValue::UNTIL)
164+
},
165+
{
166+
"as",
167+
make_tuple(TokenType::GROUP, TokenValue::AS)
168+
},
169+
170+
171+
{
172+
"if followed by",
173+
make_tuple(TokenType::LOOKAROUND, TokenValue::IF_FOLLOWED_BY)
174+
},
175+
{
176+
"if not followed by",
177+
make_tuple(TokenType::LOOKAROUND, TokenValue::IF_NOT_FOLLOWED_BY)
178+
},
179+
{
180+
"if already had",
181+
make_tuple(TokenType::LOOKAROUND, TokenValue::IF_ALREADY_HAD)
182+
},
183+
{
184+
"if not already had",
185+
make_tuple(TokenType::LOOKAROUND, TokenValue::IF_NOT_ALREADY_HAD)
186+
},
187+
188+
189+
{
190+
"case insensitive",
191+
make_tuple(TokenType::FLAG, TokenValue::CASE_INSENSITIVE)
192+
},
193+
{
194+
"multi line",
195+
make_tuple(TokenType::FLAG, TokenValue::MULTI_LINE)
196+
},
197+
{
198+
"all lazy",
199+
make_tuple(TokenType::FLAG, TokenValue::ALL_LAZY)
200+
},
201+
202+
203+
{
204+
"begin with",
205+
make_tuple(TokenType::ANCHOR, TokenValue::BEGIN_WITH)
206+
},
207+
{
208+
"starts with",
209+
make_tuple(TokenType::ANCHOR, TokenValue::STARTS_WITH)
210+
},
211+
{
212+
"must end",
213+
make_tuple(TokenType::ANCHOR, TokenValue::MUST_END)
214+
},
215+
216+
217+
{
218+
",",
219+
make_tuple(TokenType::SRC_WHITESPECE, TokenValue::SPACE)
220+
},
221+
{
222+
" ",
223+
make_tuple(TokenType::SRC_WHITESPECE, TokenValue::SPACE)
224+
},
225+
{
226+
"\n",
227+
make_tuple(TokenType::SRC_WHITESPECE, TokenValue::SPACE)
228+
},
229+
230+
231+
{
232+
"\"",
233+
make_tuple(TokenType::DELIMITER, TokenValue::STRING)
234+
},
235+
{
236+
"\'",
237+
make_tuple(TokenType::DELIMITER, TokenValue::STRING)
238+
},
239+
{
240+
"(",
241+
make_tuple(TokenType::DELIMITER, TokenValue::GROUP_START)
242+
},
243+
{
244+
")",
245+
make_tuple(TokenType::DELIMITER, TokenValue::GROUP_END)
246+
}
247+
},
248+
key_max_len_(0)
249+
{
250+
for (auto const &iter: dictionary_)
251+
{
252+
if (key_max_len_ < (iter.first).length())
253+
{
254+
key_max_len_ = (iter.first).length();
255+
}
256+
}
257+
}
258+
259+
Dictionary::~Dictionary()
260+
{
261+
}
262+
263+
inline size_t Dictionary::get_key_max_length() const
264+
{
265+
return key_max_len_;
266+
}
267+
268+
inline bool Dictionary::has_token(const string &name) const
269+
{
270+
auto iter = dictionary_.find(name);
271+
return iter != dictionary_.end();
272+
}
273+
274+
inline MetaType Dictionary::get(const string &name) const
275+
{
276+
if (!has_token(name))
277+
{
278+
return make_tuple(TokenType::UNDEFINED, TokenValue::UNDEFINED);
279+
}
280+
return dictionary_.at(name);
281+
}
282+
283+
inline TokenType Dictionary::get_token_type(const string &name) const
284+
{
285+
return std::get<0>(get(name));
286+
}
287+
288+
inline TokenValue Dictionary::get_token_value(const string &name) const
289+
{
290+
return std::get<1>(get(name));
291+
}
292+
}
293+
294+
#endif // !SIMPLEREGEXLANGUAGE_DICTIONARY_H_

‎include/spre/generator.hpp

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#ifndef SIMPLEREGEXLANGUAGE_GENERATOR_H_
2+
#define SIMPLEREGEXLANGUAGE_GENERATOR_H_
3+
4+
#include "spre/parser.hpp"
5+
6+
namespace spre
7+
{
8+
class Generator
9+
{
10+
public:
11+
Generator(Parser &parser);
12+
~Generator();
13+
14+
private:
15+
Parser parser_;
16+
};
17+
18+
Generator::Generator(Parser &parser): parser_(parser)
19+
{
20+
}
21+
22+
Generator::~Generator()
23+
{
24+
}
25+
}
26+
27+
#endif // !SIMPLEREGEXLANGUAGE_GENERATOR_H_

‎include/spre/lexer.hpp

+348
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,348 @@
1+
#ifndef SIMPLEREGEXLANGUAGE_LEXER_H_
2+
#define SIMPLEREGEXLANGUAGE_LEXER_H_
3+
4+
#include <string>
5+
#include <cctype>
6+
#include "spre/dictionary.hpp"
7+
#include "spre/token.hpp"
8+
9+
using std::string;
10+
11+
namespace spre
12+
{
13+
class Lexer
14+
{
15+
public:
16+
explicit Lexer(const string &src = "");
17+
~Lexer();
18+
Token get_token() const;
19+
Token get_next_token();
20+
bool has_ended() const;
21+
22+
enum class State
23+
{
24+
NONE,
25+
END_OF_FILE,
26+
IDENTIFIER,
27+
NUMBER,
28+
STRING
29+
};
30+
31+
private:
32+
const string src_;
33+
const size_t src_len_; // cache the length
34+
size_t src_cursor_; // cursor always points to next char
35+
char curr_char_; // always src_[src_cursor_ - 1] == curr_char_
36+
string buffer_; // one string object to eat the chars while necessary
37+
State state_;
38+
Token token_;
39+
Dictionary dictionary_;
40+
41+
void move_to_next_char();
42+
char peek_prev_char() const;
43+
char peek_next_char() const;
44+
void handle_eof_state();
45+
void handle_identifier_state();
46+
void handle_number_state();
47+
void handle_string_state(char string_state_delimiter = '\"');
48+
};
49+
50+
Lexer::Lexer(const string &src) :
51+
src_(src), src_len_(src.length()),
52+
src_cursor_(0), curr_char_(' '),
53+
token_(Token()),
54+
state_(State::NONE)
55+
{
56+
}
57+
58+
Lexer::~Lexer()
59+
{
60+
}
61+
62+
inline Token Lexer::get_token() const
63+
{
64+
return token_;
65+
}
66+
67+
inline bool Lexer::has_ended() const
68+
{
69+
// here is a concept issue, how to define ended?
70+
// for example,
71+
// "some string"
72+
// ^^
73+
// curr_char_ src_cursor <- is it ended?
74+
// "some string"
75+
// ^^
76+
// curr_char_ src_cursor <- or is it ended?
77+
return src_cursor_ >= src_len_;
78+
}
79+
80+
inline void Lexer::move_to_next_char()
81+
{
82+
curr_char_ = src_cursor_ < src_len_ ? src_[src_cursor_] : '\0';
83+
src_cursor_ += 1; // the position next to that of curr_char_
84+
}
85+
86+
inline char Lexer::peek_prev_char() const
87+
{
88+
// we pretend there are spaces before the beginning of the source code
89+
return src_cursor_ >= 2 ? src_[src_cursor_ - 2] : ' ';
90+
}
91+
92+
inline char Lexer::peek_next_char() const
93+
{
94+
// maybe special eof?
95+
// now '\0' is used
96+
return src_cursor_ < src_len_ ? src_[src_cursor_] : '\0';
97+
}
98+
99+
inline Token Lexer::get_next_token()
100+
{
101+
102+
bool is_matched = false;
103+
char string_state_delimiter = '\"';
104+
105+
do
106+
{
107+
if (state_ != State::NONE)
108+
{
109+
is_matched = true;
110+
}
111+
112+
switch (state_)
113+
{
114+
case State::NONE:
115+
move_to_next_char();
116+
break;
117+
case State::END_OF_FILE:
118+
handle_eof_state();
119+
break;
120+
case State::IDENTIFIER:
121+
handle_identifier_state();
122+
break;
123+
case State::NUMBER:
124+
handle_number_state();
125+
break;
126+
case State::STRING:
127+
handle_string_state(string_state_delimiter);
128+
break;
129+
default:
130+
break;
131+
}
132+
133+
if (state_ == State::NONE)
134+
{
135+
if (curr_char_ == '\0')
136+
{
137+
state_ = State::END_OF_FILE;
138+
}
139+
else if (std::isalpha(curr_char_) || curr_char_ == '(' || curr_char_ == ')')
140+
{
141+
state_ = State::IDENTIFIER;
142+
}
143+
else if (std::isdigit(curr_char_))
144+
{
145+
state_ = State::NUMBER;
146+
}
147+
else if (curr_char_ == '\"' || curr_char_ == '\'')
148+
{
149+
state_ = State::STRING;
150+
string_state_delimiter = curr_char_;
151+
}
152+
else if (std::isspace(curr_char_) || curr_char_ == ',')
153+
{
154+
//state_ = State::NONE;
155+
}
156+
}
157+
158+
} while (!is_matched);
159+
160+
return token_;
161+
}
162+
163+
inline void Lexer::handle_eof_state()
164+
{
165+
token_ = Token("eof", TokenType::END_OF_FILE, TokenValue::END_OF_FILE);
166+
buffer_.clear();
167+
}
168+
169+
inline void Lexer::handle_identifier_state()
170+
{
171+
// try to find the keywords inside the dictionary
172+
bool found = false;
173+
if (token_.get_token_value() == TokenValue::FROM)
174+
{
175+
// special case, from a to z
176+
// we treat "a to z" as a token as TokenValue::TO
177+
// pattern: a char + spaces + "to" + spaces + a char
178+
// tricky...
179+
char a = curr_char_;
180+
move_to_next_char();
181+
char s1 = curr_char_;
182+
while (std::isspace(curr_char_))
183+
{
184+
s1 = ' ';
185+
move_to_next_char();
186+
}
187+
char to_t = curr_char_;
188+
move_to_next_char();
189+
char to_o = curr_char_;
190+
move_to_next_char();
191+
char s2 = curr_char_;
192+
while (std::isspace(curr_char_))
193+
{
194+
s2 = ' ';
195+
move_to_next_char();
196+
}
197+
char z = curr_char_;
198+
if (
199+
(
200+
(std::isalpha(a) && std::isalpha(z))
201+
|| (std::isdigit(a) && std::isdigit(z))
202+
)
203+
&& s1 == ' '
204+
&& to_t == 't'
205+
&& to_o == 'o'
206+
&& s2 == ' '
207+
)
208+
{
209+
buffer_.push_back(a);
210+
buffer_.push_back(z);
211+
token_ = Token(buffer_, TokenType::CHARACTER, TokenValue::TO);
212+
found = true;
213+
}
214+
/*
215+
do
216+
{
217+
if (!std::isalpha(curr_char_) && !std::isdigit(curr_char_))
218+
{
219+
break;
220+
}
221+
buffer_.push_back(curr_char_);
222+
move_to_next_char();
223+
224+
if (!std::isspace(curr_char_))
225+
{
226+
break;
227+
}
228+
while (std::isspace(curr_char_))
229+
{
230+
move_to_next_char();
231+
}
232+
233+
if (curr_char_ != 't')
234+
{
235+
break;
236+
}
237+
move_to_next_char();
238+
if (curr_char_ != 'o')
239+
{
240+
break;
241+
}
242+
move_to_next_char();
243+
244+
if (!std::isspace(curr_char_))
245+
{
246+
break;
247+
}
248+
while (std::isspace(curr_char_))
249+
{
250+
move_to_next_char();
251+
}
252+
253+
if (!std::isalpha(curr_char_) && !std::isdigit(curr_char_))
254+
{
255+
break;
256+
}
257+
buffer_.push_back(curr_char_);
258+
token_ = Token(buffer_, TokenType::CHARACTER, TokenValue::TO);
259+
found = true;
260+
} while (false);
261+
*/
262+
}
263+
else
264+
{
265+
do
266+
{
267+
buffer_.push_back(std::tolower(curr_char_)); // because it's case insensitive
268+
move_to_next_char();
269+
270+
// TODO: something is the prefix of something
271+
// exactly, exactly 1 time, exactly 1 times
272+
// once, once or more
273+
found = dictionary_.has_token(buffer_);
274+
} while (
275+
!found
276+
&&
277+
(
278+
std::isalpha(curr_char_)
279+
|| (curr_char_ == ' ' && peek_next_char() != ' ') // or std::isspace()
280+
|| buffer_.length() <= dictionary_.get_key_max_length()
281+
)
282+
);
283+
284+
if (found)
285+
{
286+
token_ = Token(buffer_,
287+
dictionary_.get_token_type(buffer_),
288+
dictionary_.get_token_value(buffer_));
289+
}
290+
}
291+
// the identifiers / keys in srl
292+
// are all alpha or at most one ' ' between words
293+
// no line breaks allowed here
294+
// and they cannot exceed the max length of all the keys
295+
296+
if (!found)
297+
{
298+
// then we have trouble, we could not find any available identifier
299+
// TODO
300+
token_ = Token();
301+
}
302+
303+
buffer_.clear();
304+
state_ = State::NONE;
305+
}
306+
307+
inline void Lexer::handle_number_state()
308+
{
309+
do
310+
{
311+
buffer_.push_back(curr_char_); // eat the digits
312+
move_to_next_char();
313+
} while (std::isdigit(curr_char_)); // curr_char_ != '\0' &&
314+
315+
token_ = Token(buffer_, TokenType::SRC_NUMBER, TokenValue::NUMBER);
316+
buffer_.clear();
317+
state_ = State::NONE;
318+
}
319+
320+
inline void Lexer::handle_string_state(char string_state_delimiter)
321+
{
322+
move_to_next_char(); // eat the left '\"'
323+
while (curr_char_ != '\0'
324+
&& (curr_char_ != string_state_delimiter || peek_prev_char() == '\\'))
325+
{
326+
buffer_.push_back(curr_char_);
327+
move_to_next_char();
328+
}
329+
330+
if (curr_char_ == '\0')
331+
{
332+
// then we have a trouble, the string literal does not end correctly
333+
// TODO
334+
token_ = Token();
335+
}
336+
else
337+
{
338+
move_to_next_char(); // eat the right '\"'
339+
token_ = Token(buffer_, TokenType::SRC_STRING, TokenValue::STRING);
340+
}
341+
342+
buffer_.clear();
343+
state_ = State::NONE;
344+
}
345+
346+
}
347+
348+
#endif // !SIMPLEREGEXLANGUAGE_LEXER_H_

‎include/spre/parser.hpp

+533
Large diffs are not rendered by default.

‎include/spre/spre.hpp

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#ifndef SIMPLEREGEXLANGUAGE_SPRE_H_
2+
#define SIMPLEREGEXLANGUAGE_SPRE_H_
3+
4+
#include "spre/parser.hpp"
5+
#include "spre/generator.hpp"
6+
7+
namespace spre
{
    /**
     * Entry point for compiling an SRL source string; it has no members yet.
     *
     * Special members are defaulted in-class so that this header can be
     * included from several translation units without duplicate
     * (non-inline) definitions at link time.
     */
    class SRL
    {
    public:
        SRL() = default;
        ~SRL() = default;

    private:

    };


    /**
     * Placeholder for the fluent builder API; it has no members yet.
     *
     * NOTE(review): builder.hpp declares a class with the same name in the
     * same namespace; including both headers in one translation unit would
     * be a redefinition - confirm which declaration is meant to survive.
     */
    class Builder
    {
    public:
        Builder() = default;
        ~Builder() = default;

    private:

    };
}
46+
47+
#endif // !SIMPLEREGEXLANGUAGE_SPRE_H_

‎include/spre/token.hpp

+134
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
/*
2+
* the ideas behind this file are heavily borrowed from
3+
* http://frozengene.github.io/blog/compiler/2014/08/10/compiler_tutorial_03/
4+
* that is BSD licensed
5+
*/
6+
7+
#ifndef SIMPLEREGEXLANGUAGE_TOKEN_H_
8+
#define SIMPLEREGEXLANGUAGE_TOKEN_H_
9+
10+
#include <string>
#include <utility>
11+
12+
using std::string;
13+
14+
namespace spre
{
    /** Coarse category of a token. */
    enum class TokenType
    {
        CHARACTER,
        QUANTIFIER,
        GROUP,
        LOOKAROUND,
        FLAG,
        ANCHOR,
        SRC_WHITESPECE,
        SRC_NUMBER,
        SRC_STRING,
        //COMMENT,
        DELIMITER,
        END_OF_FILE,
        UNDEFINED
    };

    /** Exact keyword / literal kind of a token. */
    enum class TokenValue
    {
        // characters
        LITERALLY,
        ONE_OF,
        LETTER,
        UPPERCASE_LETTER,
        ANY_CHARACTER,
        NO_CHARACTER,
        DIGIT,
        ANYTHING,
        NEW_LINE,
        WHITESPACE,
        NO_WHITESPACE,
        TAB,
        RAW,
        FROM,
        TO,

        // quantifiers
        EXCATLY_X_TIMES,
        EXACTLY_ONE_TIME,
        ONCE,
        TWICE,
        BETWEEN_X_AND_Y_TIMES,
        OPTIONAL,
        ONCE_OR_MORE,
        NEVER_OR_MORE,
        AT_LEAST_X_TIMES,
        TIME,
        TIMES,
        AND,

        // groups
        CAPTURE_AS,
        ANY_OF,
        UNTIL,
        AS,

        // lookarounds
        IF_FOLLOWED_BY,
        IF_NOT_FOLLOWED_BY,
        IF_ALREADY_HAD,
        IF_NOT_ALREADY_HAD,

        // flags
        CASE_INSENSITIVE,
        MULTI_LINE,
        ALL_LAZY,

        // anchors
        BEGIN_WITH,
        STARTS_WITH,
        MUST_END,

        // source-level pieces
        SPACE,
        NUMBER,
        STRING,
        GROUP_START,
        GROUP_END,

        END_OF_FILE,
        UNDEFINED
    };

    /**
     * A lexed token: its text plus its type and value classification.
     *
     * A default-constructed Token is the "undefined" token
     * ("undefined", TokenType::UNDEFINED, TokenValue::UNDEFINED).
     *
     * No destructor is declared: the members manage themselves, and leaving
     * it out lets the compiler generate move operations as well as copies.
     */
    class Token
    {
    public:
        Token(std::string val = "undefined",
            TokenType token_type = TokenType::UNDEFINED,
            TokenValue token_value = TokenValue::UNDEFINED);
        /** @return the token text. */
        std::string get_value() const;
        TokenType get_token_type() const;
        TokenValue get_token_value() const;
    private:
        std::string val_;
        TokenType token_type_;
        TokenValue token_value_;
    };

    // `inline` because the definition lives in a header; `val` is a by-value
    // sink parameter, so it is moved (not copied again) into the member.
    inline Token::Token(std::string val, TokenType token_type, TokenValue token_value)
        : val_(std::move(val)), token_type_(token_type), token_value_(token_value)
    {
    }

    inline std::string Token::get_value() const
    {
        return val_;
    }

    inline TokenType Token::get_token_type() const
    {
        return token_type_;
    }

    inline TokenValue Token::get_token_value() const
    {
        return token_value_;
    }

}
133+
134+
#endif // !SIMPLEREGEXLANGUAGE_TOKEN_H_

‎test/spre_test.cpp

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#include <string>
2+
#include <iostream>
3+
#include <tuple>
4+
#include <vector>
5+
#include "spre/spre.hpp"
6+
#include "spre/token.hpp"
7+
#include "spre/dictionary.hpp"
8+
9+
int main() {
10+
spre::Lexer lexer("letter \"haha\"");
11+
12+
while (!lexer.has_ended())
13+
{
14+
std::cout << lexer.get_next_token().get_value() << std::endl;
15+
}
16+
std::cout << lexer.get_next_token().get_value() << std::endl;
17+
18+
return 0;
19+
}

0 commit comments

Comments
 (0)
Please sign in to comment.