#ifndef SIMPLEREGEXLANGUAGE_LEXER_H_
#define SIMPLEREGEXLANGUAGE_LEXER_H_

#include <string>
#include <cctype>
#include "spre/dictionary.hpp"
#include "spre/token.hpp"

using std::string;

namespace spre
{
    class Lexer
    {
    public:
        explicit Lexer(const string &src = "");
        ~Lexer();
        Token get_token() const;
        Token get_next_token();
        bool has_ended() const;

        enum class State
        {
            NONE,
            END_OF_FILE,
            IDENTIFIER,
            NUMBER,
            STRING
        };

    private:
        const string src_;
        const size_t src_len_;   // cached length of src_
        size_t src_cursor_;      // always points to the char after curr_char_
        char curr_char_;         // invariant: curr_char_ == src_[src_cursor_ - 1]
        string buffer_;          // scratch buffer collecting the chars of the current token
        State state_;
        Token token_;
        Dictionary dictionary_;

        void move_to_next_char();
        char peek_prev_char() const;
        char peek_next_char() const;
        void handle_eof_state();
        void handle_identifier_state();
        void handle_number_state();
        void handle_string_state(char string_state_delimiter = '\"');
    };

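    // A minimal usage sketch (illustrative, not from the original source): pull
    // tokens until the input is exhausted. get_token_value() is the same Token
    // accessor this header itself relies on below.
    //
    //     Lexer lexer("literally \"abc\"");
    //     while (!lexer.has_ended())
    //     {
    //         Token token = lexer.get_next_token();
    //         // ... dispatch on token.get_token_value()
    //     }
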
    inline Lexer::Lexer(const string &src) :
        src_(src), src_len_(src.length()),
        src_cursor_(0), curr_char_(' '),
        state_(State::NONE),
        token_(Token())
    {
    }

    inline Lexer::~Lexer()
    {
    }

    inline Token Lexer::get_token() const
    {
        return token_;
    }

    inline bool Lexer::has_ended() const
    {
        // Design note: "ended" is ambiguous here. src_cursor_ may already be past
        // the last char while curr_char_ (the last char) has not been turned into
        // a token yet; alternatively "ended" could mean the whole input has been
        // tokenized. The definition below only looks at the cursor position.
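        // For example (illustrative): with src_ == "ab", once 'b' has been loaded
        // into curr_char_ we have src_cursor_ == 2 == src_len_, so has_ended() is
        // already true even though the token built from 'b' has not been returned.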
        return src_cursor_ >= src_len_;
    }

    inline void Lexer::move_to_next_char()
    {
        curr_char_ = src_cursor_ < src_len_ ? src_[src_cursor_] : '\0';
        src_cursor_ += 1; // now one past the position of curr_char_
    }

    inline char Lexer::peek_prev_char() const
    {
        // we pretend there are spaces before the beginning of the source code
        return src_cursor_ >= 2 ? src_[src_cursor_ - 2] : ' ';
    }

    inline char Lexer::peek_next_char() const
    {
        // '\0' serves as the end-of-input sentinel
        return src_cursor_ < src_len_ ? src_[src_cursor_] : '\0';
    }

    inline Token Lexer::get_next_token()
    {
        bool is_matched = false;
        char string_state_delimiter = '\"';

        do
        {
            if (state_ != State::NONE)
            {
                is_matched = true;
            }

            switch (state_)
            {
            case State::NONE:
                move_to_next_char();
                break;
            case State::END_OF_FILE:
                handle_eof_state();
                break;
            case State::IDENTIFIER:
                handle_identifier_state();
                break;
            case State::NUMBER:
                handle_number_state();
                break;
            case State::STRING:
                handle_string_state(string_state_delimiter);
                break;
            default:
                break;
            }

            if (state_ == State::NONE)
            {
                if (curr_char_ == '\0')
                {
                    state_ = State::END_OF_FILE;
                }
                else if (std::isalpha(curr_char_) || curr_char_ == '(' || curr_char_ == ')')
                {
                    state_ = State::IDENTIFIER;
                }
                else if (std::isdigit(curr_char_))
                {
                    state_ = State::NUMBER;
                }
                else if (curr_char_ == '\"' || curr_char_ == '\'')
                {
                    state_ = State::STRING;
                    string_state_delimiter = curr_char_;
                }
                else if (std::isspace(curr_char_) || curr_char_ == ',')
                {
                    // separators: stay in State::NONE and keep scanning
                }
            }

        } while (!is_matched);

        return token_;
    }
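
    // Illustrative trace of get_next_token() (assuming "literally" is a Dictionary
    // key, as in SRL): for the input  literally "abc"  the first call goes
    // NONE -> IDENTIFIER and returns the keyword token, the second goes
    // NONE -> STRING and returns the literal "abc", and a further call goes
    // NONE -> END_OF_FILE.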

    inline void Lexer::handle_eof_state()
    {
        token_ = Token("eof", TokenType::END_OF_FILE, TokenValue::END_OF_FILE);
        buffer_.clear();
    }

    inline void Lexer::handle_identifier_state()
    {
        // try to find the keyword inside the dictionary
        bool found = false;
        if (token_.get_token_value() == TokenValue::FROM)
        {
            // special case: "from a to z"
            // we treat "a to z" as a single token with TokenValue::TO
            // pattern: a char + spaces + "to" + spaces + a char
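            // e.g. with input "from a to z": after the FROM keyword, curr_char_ is 'a';
            // the code below collects 'a' and 'z' into buffer_ ("az") and emits
            // Token("az", TokenType::CHARACTER, TokenValue::TO).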
            char a = curr_char_;
            move_to_next_char();
            char s1 = curr_char_;
            while (std::isspace(curr_char_))
            {
                s1 = ' ';
                move_to_next_char();
            }
            char to_t = curr_char_;
            move_to_next_char();
            char to_o = curr_char_;
            move_to_next_char();
            char s2 = curr_char_;
            while (std::isspace(curr_char_))
            {
                s2 = ' ';
                move_to_next_char();
            }
            char z = curr_char_;
            if (
                (
                    (std::isalpha(a) && std::isalpha(z))
                    || (std::isdigit(a) && std::isdigit(z))
                )
                && s1 == ' '
                && to_t == 't'
                && to_o == 'o'
                && s2 == ' '
                )
            {
                buffer_.push_back(a);
                buffer_.push_back(z);
                token_ = Token(buffer_, TokenType::CHARACTER, TokenValue::TO);
                found = true;
            }
            /*
            do
            {
                if (!std::isalpha(curr_char_) && !std::isdigit(curr_char_))
                {
                    break;
                }
                buffer_.push_back(curr_char_);
                move_to_next_char();

                if (!std::isspace(curr_char_))
                {
                    break;
                }
                while (std::isspace(curr_char_))
                {
                    move_to_next_char();
                }

                if (curr_char_ != 't')
                {
                    break;
                }
                move_to_next_char();
                if (curr_char_ != 'o')
                {
                    break;
                }
                move_to_next_char();

                if (!std::isspace(curr_char_))
                {
                    break;
                }
                while (std::isspace(curr_char_))
                {
                    move_to_next_char();
                }

                if (!std::isalpha(curr_char_) && !std::isdigit(curr_char_))
                {
                    break;
                }
                buffer_.push_back(curr_char_);
                token_ = Token(buffer_, TokenType::CHARACTER, TokenValue::TO);
                found = true;
            } while (false);
            */
        }
        else
        {
            do
            {
                buffer_.push_back(std::tolower(curr_char_)); // SRL keywords are case-insensitive
                move_to_next_char();

                // TODO: one keyword/phrase can be a prefix of another, e.g.
                // "exactly" vs "exactly 1 time" / "exactly 1 times",
                // "once" vs "once or more"
                found = dictionary_.has_token(buffer_);
            } while (
                !found
                && (
                    std::isalpha(curr_char_)
                    || (curr_char_ == ' ' && peek_next_char() != ' ') // or std::isspace()
                )
                && buffer_.length() <= dictionary_.get_key_max_length()
                );

            if (found)
            {
                token_ = Token(buffer_,
                               dictionary_.get_token_type(buffer_),
                               dictionary_.get_token_value(buffer_));
            }
        }
        // the identifiers / keys in SRL are all alphabetic, with at most one ' '
        // between words and no line breaks, and a key never exceeds the maximum
        // key length reported by the dictionary
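        // Illustrative example (assuming "starts with" is a dictionary key, as in SRL):
        // the loop above accumulates "s", "st", ..., "starts", "starts ", "starts w",
        // ... and stops as soon as dictionary_.has_token(buffer_) reports a match.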

        if (!found)
        {
            // no keyword matched the buffer; emit a default-constructed token for now
            // TODO: proper error reporting
            token_ = Token();
        }

        buffer_.clear();
        state_ = State::NONE;
    }

    inline void Lexer::handle_number_state()
    {
        do
        {
            buffer_.push_back(curr_char_); // eat the digits
            move_to_next_char();
        } while (std::isdigit(curr_char_)); // curr_char_ != '\0' &&
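        // e.g. for the fragment "12 times", the loop above leaves "12" in buffer_,
        // which is emitted below as Token("12", TokenType::SRC_NUMBER, TokenValue::NUMBER)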

        token_ = Token(buffer_, TokenType::SRC_NUMBER, TokenValue::NUMBER);
        buffer_.clear();
        state_ = State::NONE;
    }

    inline void Lexer::handle_string_state(char string_state_delimiter)
    {
        move_to_next_char(); // skip the opening delimiter
        while (curr_char_ != '\0'
               && (curr_char_ != string_state_delimiter || peek_prev_char() == '\\'))
        {
            buffer_.push_back(curr_char_);
            move_to_next_char();
        }
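        // e.g. the literal "a\"b" ends up in buffer_ as  a\"b : an escaped delimiter
        // is kept verbatim (backslash included); any unescaping is left to later stages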

        if (curr_char_ == '\0')
        {
            // the string literal is not terminated correctly; emit a default token for now
            // TODO: proper error reporting
            token_ = Token();
        }
        else
        {
            move_to_next_char(); // skip the closing delimiter
            token_ = Token(buffer_, TokenType::SRC_STRING, TokenValue::STRING);
        }

        buffer_.clear();
        state_ = State::NONE;
    }

} // namespace spre

#endif // !SIMPLEREGEXLANGUAGE_LEXER_H_