#ifndef SIMPLEREGEXLANGUAGE_LEXER_H_
#define SIMPLEREGEXLANGUAGE_LEXER_H_

#include <string>
#include <cctype>
#include "spre/dictionary.hpp"
#include "spre/token.hpp"

using std::string;

namespace spre
{
    class Lexer
    {
    public:
        explicit Lexer(const string &src = "");
        ~Lexer();
        Token get_token() const;
        Token get_next_token();
        bool has_ended() const;

        enum class State
        {
            NONE,
            END_OF_FILE,
            IDENTIFIER,
            NUMBER,
            STRING
        };

    private:
        const string src_;
        const size_t src_len_;   // cached length of src_
        size_t src_cursor_;      // always points to the char after curr_char_
        char curr_char_;         // invariant: curr_char_ == src_[src_cursor_ - 1]
        string buffer_;          // scratch buffer collecting the chars of the current token
        State state_;
        Token token_;
        Dictionary dictionary_;

        void move_to_next_char();
        char peek_prev_char() const;
        char peek_next_char() const;
        void handle_eof_state();
        void handle_identifier_state();
        void handle_number_state();
        void handle_string_state(char string_state_delimiter = '\"');
    };

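    // A minimal usage sketch (illustrative, not from the original source): pull
    // tokens until the input is exhausted. get_token_value() is the same Token
    // accessor this header itself relies on below.
    //
    //     Lexer lexer("literally \"abc\"");
    //     while (!lexer.has_ended())
    //     {
    //         Token token = lexer.get_next_token();
    //         // ... dispatch on token.get_token_value()
    //     }
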
    inline Lexer::Lexer(const string &src) :
        src_(src), src_len_(src.length()),
        src_cursor_(0), curr_char_(' '),
        state_(State::NONE),
        token_(Token())
    {
    }

    inline Lexer::~Lexer()
    {
    }

    inline Token Lexer::get_token() const
    {
        return token_;
    }

    inline bool Lexer::has_ended() const
    {
        // Design note: "ended" is ambiguous here. src_cursor_ may already be past
        // the last char while curr_char_ (the last char) has not been turned into
        // a token yet; alternatively "ended" could mean the whole input has been
        // tokenized. The definition below only looks at the cursor position.
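        // For example (illustrative): with src_ == "ab", once 'b' has been loaded
        // into curr_char_ we have src_cursor_ == 2 == src_len_, so has_ended() is
        // already true even though the token built from 'b' has not been returned.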
        return src_cursor_ >= src_len_;
    }

    inline void Lexer::move_to_next_char()
    {
        curr_char_ = src_cursor_ < src_len_ ? src_[src_cursor_] : '\0';
        src_cursor_ += 1; // now one past the position of curr_char_
    }

    inline char Lexer::peek_prev_char() const
    {
        // we pretend there are spaces before the beginning of the source code
        return src_cursor_ >= 2 ? src_[src_cursor_ - 2] : ' ';
    }

    inline char Lexer::peek_next_char() const
    {
        // '\0' serves as the end-of-input sentinel
        return src_cursor_ < src_len_ ? src_[src_cursor_] : '\0';
    }

    inline Token Lexer::get_next_token()
    {
        bool is_matched = false;
        char string_state_delimiter = '\"';

        do
        {
            if (state_ != State::NONE)
            {
                is_matched = true;
            }

            switch (state_)
            {
            case State::NONE:
                move_to_next_char();
                break;
            case State::END_OF_FILE:
                handle_eof_state();
                break;
            case State::IDENTIFIER:
                handle_identifier_state();
                break;
            case State::NUMBER:
                handle_number_state();
                break;
            case State::STRING:
                handle_string_state(string_state_delimiter);
                break;
            default:
                break;
            }

            if (state_ == State::NONE)
            {
                if (curr_char_ == '\0')
                {
                    state_ = State::END_OF_FILE;
                }
                else if (std::isalpha(curr_char_) || curr_char_ == '(' || curr_char_ == ')')
                {
                    state_ = State::IDENTIFIER;
                }
                else if (std::isdigit(curr_char_))
                {
                    state_ = State::NUMBER;
                }
                else if (curr_char_ == '\"' || curr_char_ == '\'')
                {
                    state_ = State::STRING;
                    string_state_delimiter = curr_char_;
                }
                else if (std::isspace(curr_char_) || curr_char_ == ',')
                {
                    // separators: stay in State::NONE and keep scanning
                }
            }

        } while (!is_matched);

        return token_;
    }
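
    // Illustrative trace of get_next_token() (assuming "literally" is a Dictionary
    // key, as in SRL): for the input  literally "abc"  the first call goes
    // NONE -> IDENTIFIER and returns the keyword token, the second goes
    // NONE -> STRING and returns the literal "abc", and a further call goes
    // NONE -> END_OF_FILE.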

    inline void Lexer::handle_eof_state()
    {
        token_ = Token("eof", TokenType::END_OF_FILE, TokenValue::END_OF_FILE);
        buffer_.clear();
    }

    inline void Lexer::handle_identifier_state()
    {
        // try to find the keyword inside the dictionary
        bool found = false;
        if (token_.get_token_value() == TokenValue::FROM)
        {
            // special case: "from a to z"
            // we treat "a to z" as a single token with TokenValue::TO
            // pattern: a char + spaces + "to" + spaces + a char
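            // e.g. with input "from a to z": after the FROM keyword, curr_char_ is 'a';
            // the code below collects 'a' and 'z' into buffer_ ("az") and emits
            // Token("az", TokenType::CHARACTER, TokenValue::TO).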
            char a = curr_char_;
            move_to_next_char();
            char s1 = curr_char_;
            while (std::isspace(curr_char_))
            {
                s1 = ' ';
                move_to_next_char();
            }
            char to_t = curr_char_;
            move_to_next_char();
            char to_o = curr_char_;
            move_to_next_char();
            char s2 = curr_char_;
            while (std::isspace(curr_char_))
            {
                s2 = ' ';
                move_to_next_char();
            }
            char z = curr_char_;
            if (
                (
                    (std::isalpha(a) && std::isalpha(z))
                    || (std::isdigit(a) && std::isdigit(z))
                )
                && s1 == ' '
                && to_t == 't'
                && to_o == 'o'
                && s2 == ' '
                )
            {
                buffer_.push_back(a);
                buffer_.push_back(z);
                token_ = Token(buffer_, TokenType::CHARACTER, TokenValue::TO);
                found = true;
            }
            /*
            do
            {
                if (!std::isalpha(curr_char_) && !std::isdigit(curr_char_))
                {
                    break;
                }
                buffer_.push_back(curr_char_);
                move_to_next_char();

                if (!std::isspace(curr_char_))
                {
                    break;
                }
                while (std::isspace(curr_char_))
                {
                    move_to_next_char();
                }

                if (curr_char_ != 't')
                {
                    break;
                }
                move_to_next_char();
                if (curr_char_ != 'o')
                {
                    break;
                }
                move_to_next_char();

                if (!std::isspace(curr_char_))
                {
                    break;
                }
                while (std::isspace(curr_char_))
                {
                    move_to_next_char();
                }

                if (!std::isalpha(curr_char_) && !std::isdigit(curr_char_))
                {
                    break;
                }
                buffer_.push_back(curr_char_);
                token_ = Token(buffer_, TokenType::CHARACTER, TokenValue::TO);
                found = true;
            } while (false);
            */
        }
        else
        {
            do
            {
                buffer_.push_back(std::tolower(curr_char_)); // SRL keywords are case-insensitive
                move_to_next_char();

                // TODO: one keyword/phrase can be a prefix of another, e.g.
                // "exactly" vs "exactly 1 time" / "exactly 1 times",
                // "once" vs "once or more"
                found = dictionary_.has_token(buffer_);
            } while (
                !found
                && (
                    std::isalpha(curr_char_)
                    || (curr_char_ == ' ' && peek_next_char() != ' ') // or std::isspace()
                )
                && buffer_.length() <= dictionary_.get_key_max_length()
                );

            if (found)
            {
                token_ = Token(buffer_,
                               dictionary_.get_token_type(buffer_),
                               dictionary_.get_token_value(buffer_));
            }
        }
        // the identifiers / keys in SRL are all alphabetic, with at most one ' '
        // between words and no line breaks, and a key never exceeds the maximum
        // key length reported by the dictionary
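        // Illustrative example (assuming "starts with" is a dictionary key, as in SRL):
        // the loop above accumulates "s", "st", ..., "starts", "starts ", "starts w",
        // ... and stops as soon as dictionary_.has_token(buffer_) reports a match.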

        if (!found)
        {
            // no keyword matched the buffer; emit a default-constructed token for now
            // TODO: proper error reporting
            token_ = Token();
        }

        buffer_.clear();
        state_ = State::NONE;
    }

    inline void Lexer::handle_number_state()
    {
        do
        {
            buffer_.push_back(curr_char_); // eat the digits
            move_to_next_char();
        } while (std::isdigit(curr_char_)); // curr_char_ != '\0' &&
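        // e.g. for the fragment "12 times", the loop above leaves "12" in buffer_,
        // which is emitted below as Token("12", TokenType::SRC_NUMBER, TokenValue::NUMBER)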

        token_ = Token(buffer_, TokenType::SRC_NUMBER, TokenValue::NUMBER);
        buffer_.clear();
        state_ = State::NONE;
    }

    inline void Lexer::handle_string_state(char string_state_delimiter)
    {
        move_to_next_char(); // skip the opening delimiter
        while (curr_char_ != '\0'
               && (curr_char_ != string_state_delimiter || peek_prev_char() == '\\'))
        {
            buffer_.push_back(curr_char_);
            move_to_next_char();
        }
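        // e.g. the literal "a\"b" ends up in buffer_ as  a\"b : an escaped delimiter
        // is kept verbatim (backslash included); any unescaping is left to later stages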

        if (curr_char_ == '\0')
        {
            // the string literal is not terminated correctly; emit a default token for now
            // TODO: proper error reporting
            token_ = Token();
        }
        else
        {
            move_to_next_char(); // skip the closing delimiter
            token_ = Token(buffer_, TokenType::SRC_STRING, TokenValue::STRING);
        }

        buffer_.clear();
        state_ = State::NONE;
    }

} // namespace spre

#endif // !SIMPLEREGEXLANGUAGE_LEXER_H_