Skip to content

Commit 0fcc594

Browse files
committed
capture working, fix bug in from to, group start end could omit spaces
1 parent a5881f9 commit 0fcc594

File tree

5 files changed

+146
-89
lines changed

5 files changed

+146
-89
lines changed

include/spre/ast.hpp

+21
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,27 @@ inline string AnchorExprAST::get_val() const
200200
{
201201
return val_;
202202
}
203+
204+
205+
class EOFExprAST : public ExprAST
206+
{
207+
public:
208+
EOFExprAST();
209+
string get_val() const override;
210+
211+
private:
212+
};
213+
214+
/// Constructs the end-of-file node; there is no state to initialise.
/// Declared inline because this definition lives in a header: without it,
/// every translation unit that includes the header emits its own definition
/// and the program fails to link.
inline EOFExprAST::EOFExprAST()
{
}
204217

218+
/// The end-of-file node has no textual value.
/// @return an empty string.
inline string EOFExprAST::get_val() const
{
    return string();
}
222+
223+
}
224+
225+
205226
#endif // !SIMPLEREGEXLANGUAGE_AST_H_

include/spre/generator.hpp

+25-4
Original file line numberDiff line numberDiff line change
@@ -16,30 +16,51 @@ namespace spre
1616
class Generator
1717
{
1818
public:
19-
explicit Generator(Parser &parser);
19+
explicit Generator(Parser &parser, bool show_error = true);
2020
~Generator();
21+
bool has_error() const;
22+
void report_error() const;
2123
string generate();
2224

2325
private:
2426
Parser parser_;
27+
bool error_flag_;
28+
string error_msg_;
29+
const bool show_error_;
2530
};
2631

27-
Generator::Generator(Parser &parser) : parser_(parser)
32+
/// Creates a generator working on a copy of `parser`.
///
/// error_flag_ is explicitly cleared here: it is a plain bool, so leaving it
/// out of the initialiser list would make has_error() read an indeterminate
/// value. error_msg_ default-constructs to an empty string.
///
/// Declared inline because the definition lives in a header.
///
/// @param parser      parser to copy into parser_.
/// @param show_error  stored in show_error_.
inline Generator::Generator(Parser &parser, bool show_error)
    : parser_(parser), error_flag_(false), show_error_(show_error)
{
}
3035

3136
/// Nothing to release by hand; the members clean up after themselves.
/// Declared inline because the definition lives in a header, so it may be
/// seen by more than one translation unit.
inline Generator::~Generator() = default;
3439

40+
inline bool Generator::has_error() const
41+
{
42+
return error_flag_;
43+
}
44+
45+
inline void Generator::report_error() const
46+
{
47+
if (!has_error())
48+
{
49+
return;
50+
}
51+
fprintf(stderr, "generator error: ");
52+
fprintf(stderr, "%s", error_msg_.c_str());
53+
fprintf(stderr, "\n");
54+
}
55+
3556
inline string Generator::generate()
3657
{
3758
string res;
3859
vector<unique_ptr<ExprAST>> h = parser_.parse();
39-
60+
std::cout << "asts length: " << h.size() << "\n";
4061
for (const auto &iter : h)
4162
{
42-
string k = iter->get_val();
63+
string k = iter == nullptr ? "nullptr" : iter->get_val();
4364
res.append(k);
4465
}
4566

include/spre/lexer.hpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -165,9 +165,9 @@ inline Token Lexer::get_next_token()
165165
return token_;
166166
}
167167

168-
if (curr_char_ == ')')
168+
if (curr_char_ == '(' || curr_char_ == ')')
169169
{
170-
// the char before ")" may not be whitespace, so try it here
170+
// the char before "(" and ")" may not be whitespace, so try it here
171171
state_ = State::IDENTIFIER;
172172
handle_identifier_state();
173173
return token_;
@@ -277,6 +277,7 @@ inline void Lexer::handle_identifier_state()
277277
token_ = Token(buffer_, TokenType::CHARACTER, TokenValue::TO);
278278
buffer_.clear();
279279
state_ = State::NONE;
280+
move_to_next_char();
280281
return;
281282
}
282283
else

include/spre/parser.hpp

+94-82
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ class Parser
3838
unique_ptr<LookAroundExprAST> parse_lookaround(const TokenValue &token_value);
3939
unique_ptr<FlagExprAST> parse_flag(const TokenValue &token_value);
4040
unique_ptr<AnchorExprAST> parse_anchor(const TokenValue &token_value);
41+
unique_ptr<EOFExprAST> parse_eof(const TokenValue &token_value);
4142
};
4243

4344
Parser::Parser(Lexer &lexer, bool show_error) : lexer_(lexer), error_flag_(false), show_error_(show_error)
@@ -108,8 +109,7 @@ inline unique_ptr<ExprAST> Parser::parse_token(const Token &token)
108109
ptr = std::move(parse_anchor(token.get_token_value()));
109110
break;
110111
case TokenType::END_OF_FILE:
111-
// we are good
112-
//eof = true;
112+
ptr = std::move(parse_eof(token.get_token_value()));
113113
break;
114114
case TokenType::UNDEFINED:
115115
error_flag_ = true;
@@ -130,6 +130,7 @@ inline unique_ptr<ExprAST> Parser::parse_token(const Token &token)
130130
inline unique_ptr<CharacterExprAST> Parser::parse_character(const TokenValue &token_value)
131131
{
132132
unique_ptr<CharacterExprAST> ptr = nullptr;
133+
133134
if (token_value == TokenValue::LITERALLY || token_value == TokenValue::ONE_OF || token_value == TokenValue::RAW)
134135
{
135136
// expect string literal following
@@ -138,56 +139,34 @@ inline unique_ptr<CharacterExprAST> Parser::parse_character(const TokenValue &to
138139
{
139140
error_flag_ = true;
140141
error_msg_ = "missing string literal";
142+
return ptr;
141143
}
142-
else
144+
145+
string val;
146+
switch (token_value)
143147
{
144-
string val;
145-
switch (token_value)
146-
{
147-
case TokenValue::LITERALLY:
148-
val = "(?:" + next_token.get_value() + ")";
149-
break;
150-
case TokenValue::ONE_OF:
151-
val = "[" + next_token.get_value() + "]";
152-
break;
153-
case TokenValue::RAW:
154-
val = next_token.get_value();
155-
break;
156-
default:
157-
break;
158-
}
159-
ptr = make_unique<CharacterExprAST>(val);
160-
lexer_.get_next_token(); // so we eat the leagal token
148+
case TokenValue::LITERALLY:
149+
val = "(?:" + next_token.get_value() + ")";
150+
break;
151+
case TokenValue::ONE_OF:
152+
val = "[" + next_token.get_value() + "]";
153+
break;
154+
case TokenValue::RAW:
155+
val = next_token.get_value();
156+
break;
157+
default:
158+
break;
161159
}
160+
ptr = make_unique<CharacterExprAST>(val);
161+
lexer_.get_next_token(); // so we eat the leagal token
162+
return ptr;
162163
}
163164

164-
else if (token_value == TokenValue::LETTER || token_value == TokenValue::UPPERCASE_LETTER || token_value == TokenValue::DIGIT)
165+
if (token_value == TokenValue::LETTER || token_value == TokenValue::UPPERCASE_LETTER || token_value == TokenValue::DIGIT)
165166
{
166-
Token next_token = lexer_.get_next_token();
167+
Token guess_from = lexer_.get_next_token();
167168

168-
if (next_token.get_token_value() == TokenValue::FROM)
169-
{
170-
Token next_next_token = lexer_.get_next_token();
171-
if (next_next_token.get_token_value() == TokenValue::TO)
172-
{
173-
// so we have the modifier
174-
string az = next_next_token.get_value();
175-
if (az.length() == 2)
176-
{
177-
az.insert(1, "-");
178-
az.insert(0, "[");
179-
az.append("]");
180-
ptr = make_unique<CharacterExprAST>(az);
181-
lexer_.get_next_token(); // so we eat the leagal tokens from and to
182-
}
183-
}
184-
else
185-
{
186-
error_flag_ = true;
187-
error_msg_ = "\"from\" found, but \"to\" not found";
188-
}
189-
}
190-
else
169+
if (guess_from.get_token_value() != TokenValue::FROM)
191170
{
192171
string val;
193172
switch (token_value)
@@ -205,50 +184,76 @@ inline unique_ptr<CharacterExprAST> Parser::parse_character(const TokenValue &to
205184
break;
206185
}
207186
ptr = make_unique<CharacterExprAST>(val);
208-
lexer_.get_next_token(); // so we eat the leagal token
187+
// now we already at the one after letter/digit/...
188+
// because we already move to here for guessing from
189+
return ptr;
209190
}
210-
}
211-
else
212-
{
213-
string val;
214-
switch (token_value)
215-
{
216-
case TokenValue::ANY_CHARACTER:
217-
val = "\\w";
218-
break;
219-
case TokenValue::NO_CHARACTER:
220-
val = "\\W";
221-
break;
222-
case TokenValue::ANYTHING:
223-
val = ".";
224-
break;
225-
case TokenValue::NEW_LINE:
226-
val = "\\n";
227-
break;
228-
case TokenValue::WHITESPACE:
229-
val = "\\s";
230-
break;
231-
case TokenValue::NO_WHITESPACE:
232-
val = "\\S";
233-
break;
234-
case TokenValue::TAB:
235-
val = "\\t";
236-
break;
237-
default:
238-
break;
239-
}
240-
if (val.length() != 0)
191+
192+
Token guess_to = lexer_.get_next_token();
193+
194+
// "from" has already been matched at this point, so the token read right
// after it must be "to". Re-testing guess_from here could never fail, which
// let a missing "to" go unreported.
if (guess_to.get_token_value() != TokenValue::TO)
{
    error_flag_ = true;
    error_msg_ = "\"from\" found, but \"to\" not found";
    return ptr;
}
245-
else
200+
201+
string az = guess_to.get_value();
202+
203+
if (az.length() != 2)
246204
{
247205
error_flag_ = true;
248-
error_msg_ = "unknown error";
206+
error_msg_ = "the range \"from\" and \"to\" is not well defined";
207+
return ptr;
249208
}
209+
210+
az.insert(1, "-");
211+
az.insert(0, "[");
212+
az.append("]");
213+
ptr = make_unique<CharacterExprAST>(az);
214+
lexer_.get_next_token(); // so we eat the leagal token to
215+
return ptr;
216+
}
217+
218+
string val;
219+
switch (token_value)
220+
{
221+
case TokenValue::ANY_CHARACTER:
222+
val = "\\w";
223+
break;
224+
case TokenValue::NO_CHARACTER:
225+
val = "\\W";
226+
break;
227+
case TokenValue::ANYTHING:
228+
val = ".";
229+
break;
230+
case TokenValue::NEW_LINE:
231+
val = "\\n";
232+
break;
233+
case TokenValue::WHITESPACE:
234+
val = "\\s";
235+
break;
236+
case TokenValue::NO_WHITESPACE:
237+
val = "\\S";
238+
break;
239+
case TokenValue::TAB:
240+
val = "\\t";
241+
break;
242+
default:
243+
break;
244+
}
245+
if (val.length() != 0)
246+
{
247+
ptr = make_unique<CharacterExprAST>(val);
248+
lexer_.get_next_token(); // so we eat the leagal token
249+
}
250+
else
251+
{
252+
error_flag_ = true;
253+
error_msg_ = "unknown error";
250254
}
251255

256+
252257
return std::move(ptr);
253258
}
254259

@@ -400,7 +405,7 @@ inline unique_ptr<GroupExprAST> Parser::parse_group(const TokenValue &token_valu
400405
&& lexer_.get_token().get_token_type() != TokenType::END_OF_FILE
401406
&& lexer_.get_token().get_token_type() != TokenType::UNDEFINED);
402407
// after parsing the sub_query_ptr_vec, current token should be ")"!!!
403-
std::cout << "now tokn []"<< lexer_.get_token().get_value() << "[]\n";
408+
404409
if (lexer_.get_token().get_token_value() != TokenValue::GROUP_END)
405410
{
406411
ptr = nullptr;
@@ -579,6 +584,13 @@ inline unique_ptr<AnchorExprAST> Parser::parse_anchor(const TokenValue &token_va
579584

580585
return std::move(ptr);
581586
}
587+
588+
/// Creates the AST node for an end-of-file token.
/// @param token_value  currently unused; kept so the value can be checked later.
/// @return a newly allocated EOFExprAST.
inline unique_ptr<EOFExprAST> Parser::parse_eof(const TokenValue &token_value)
{
    (void)token_value; // not inspected yet
    unique_ptr<EOFExprAST> eof_node = make_unique<EOFExprAST>();
    return eof_node;
}
593+
582594
}
583595

584596
#endif // !SIMPLEREGEXLANGUAGE_PARSER_H_

test/spre_test.cpp

+3-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
#include "spre/spre.hpp"
44

55
int main() {
6-
spre::SRL srl("whitespace, literally \"haha\", digit");
6+
string src = "literally \"haha\", capture(capture(digit from a to z whitespace) as \"inner\") as \"outer\"";
7+
std::cout << "original string:\n" << src << std::endl;
8+
spre::SRL srl(src);
79
std::cout << "final result:\n" << srl.get_pattern() << std::endl;
810

911
return 0;

0 commit comments

Comments
 (0)