Skip to content

Commit

Permalink
Enable parsing of input seeds with syntax errors (#261)
Browse files Browse the repository at this point in the history
Previously, only seed files that could be fully recognized by the
grammar were transformed into trees by grammarinator-parse. This
restriction excluded partially recognizable seeds, which can be
valuable for fuzzing.
With this change, `grammarinator-parse` now supports transforming
seeds containing syntax errors.
  • Loading branch information
renatahodovan authored Dec 14, 2024
1 parent 5746a82 commit eabe195
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 13 deletions.
45 changes: 32 additions & 13 deletions grammarinator/tool/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,19 @@ def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):
error.ErrorListener.ConsoleErrorListener.INSTANCE = ConsoleListener()


class ExtendedErrorListener(error.ErrorListener.ErrorListener):
    """
    Custom error listener for the ANTLR lexer that, instead of silently
    dropping unrecognized input, emits it as a token so that it still
    appears in the resulting parse tree.
    """
    def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):
        # ``recognizer`` is the Lexer instance that failed to match input.
        # Step past the character the lexer could not recognize so lexing
        # can continue instead of stalling on the same position.
        recognizer.inputStream.consume()
        # Force the unmatched text out as a token: mark it INVALID_TYPE on
        # the default channel so the parser (and hence the tree) sees it.
        recognizer.type = Token.INVALID_TYPE
        recognizer.channel = Token.DEFAULT_CHANNEL
        recognizer.emit()
        # NOTE(review): presumably this resets the lexer's pending token
        # type to a sane value after the forced emit — confirm against the
        # ANTLR Lexer API.
        recognizer.type = Token.MIN_USER_TOKEN_TYPE


class ParserTool:
"""
Tool to parse existing sources and create a tree pool from them. These
Expand Down Expand Up @@ -185,8 +198,9 @@ def _antlr_to_grammarinator_tree(self, antlr_node, parser, visited=None):
depth = max(depth, child_depth + 1)
else:
assert isinstance(antlr_node, TerminalNode), f'An ANTLR node must either be a ParserRuleContext or a TerminalNode but {antlr_node.__class__.__name__} was found.'
name, text = parser.symbolicNames[antlr_node.symbol.type] if len(parser.symbolicNames) >= antlr_node.symbol.type else '<INVALID>', antlr_node.symbol.text
name, text = parser.symbolicNames[antlr_node.symbol.type] if len(parser.symbolicNames) > antlr_node.symbol.type else '<INVALID>', antlr_node.symbol.text
assert name, f'{name} is None or empty'

if antlr_node.symbol.type == Token.EOF:
return None, 0, []

Expand Down Expand Up @@ -310,7 +324,9 @@ def _match_seq(grammar_vertices, tree_node_pos):
# They MUST match, since ANTLR has already parsed them
# During matching, quantifier and alternation structures are identified
rule_children, rule_tree_node_pos = _match_seq(self._graph.vertices[rule.name].out_neighbours + [None], 0)
assert rule_children is not None, f'Failed to match {rule.name} tree node to the related grammar rule at {rule_tree_node_pos}.'
if rule_children is None:
logger.warning('Failed to match %s tree node to the related grammar rule at %d.', rule.name, rule_tree_node_pos)
return

# Detach all children from the tree node so that they can be reattached
# in a structured way afterwards
Expand Down Expand Up @@ -368,21 +384,24 @@ def _reattach_children(rule, children):
# Create an ANTLR tree from the input stream and convert it to Grammarinator tree.
def _create_tree(self, input_stream, fn):
try:
parser = self._parser_cls(CommonTokenStream(self._lexer_cls(input_stream)))
lexer = self._lexer_cls(input_stream)
lexer.addErrorListener(ExtendedErrorListener())
parser = self._parser_cls(CommonTokenStream(lexer))
parse_tree_root = getattr(parser, self._rule)()
if not parser._syntaxErrors:
root, depth, rules = self._antlr_to_grammarinator_tree(parse_tree_root, parser)
if depth > self._max_depth:
logger.info('The tree representation of %s is %s, too deep. Skipping.', fn, depth)
return None
if parser._syntaxErrors:
logger.warning('%s syntax errors detected in %s.', parser._syntaxErrors, fn)

root, depth, rules = self._antlr_to_grammarinator_tree(parse_tree_root, parser)
if depth > self._max_depth:
logger.info('The tree representation of %s is %s, too deep. Skipping.', fn, depth)
return None

self._adjust_tree_to_generator(rules)
for transformer in self._transformers:
root = transformer(root)
self._adjust_tree_to_generator(rules)
for transformer in self._transformers:
root = transformer(root)

return root
return root

logger.warning('%s syntax errors detected in %s.', parser._syntaxErrors, fn)
except Exception as e:
logger.warning('Exception while parsing %s.', fn, exc_info=e)
return None
Expand Down
1 change: 1 addition & 0 deletions tests/parser/exp6.grtj
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"t": "p", "n": "start", "c": [{"t": "a", "ai": 0, "i": 1, "c": [{"t": "p", "n": "start_Quantifiers_test", "c": [{"t": "p", "n": "element", "c": [{"t": "l", "n": "<INVALID>", "s": "pass", "z": [1, 1], "i": false}, {"t": "q", "i": 0, "b": 0, "e": 1, "c": []}]}, {"t": "q", "i": 1, "b": 1, "e": -1, "c": [{"t": "qd", "c": [{"t": "l", "n": "<INVALID>", "s": " | ", "z": [1, 1], "i": false}, {"t": "p", "n": "element", "c": [{"t": "l", "n": "<INVALID>", "s": "|", "z": [1, 1], "i": false}, {"t": "l", "n": "<INVALID>", "s": "pass", "z": [1, 1], "i": false}]}]}]}]}]}]}
1 change: 1 addition & 0 deletions tests/parser/exp7.grtj
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"t": "p", "n": "start", "c": [{"t": "a", "ai": 0, "i": 0, "c": [{"t": "p", "n": "start_Quantifiers_test", "c": [{"t": "l", "n": "<INVALID>", "s": "*", "z": [1, 1], "i": false}, {"t": "p", "n": "element", "c": [{"t": "l", "n": "<INVALID>", "s": "pass", "z": [1, 1], "i": false}, {"t": "q", "i": 0, "b": 0, "e": 1, "c": []}]}]}]}]}
1 change: 1 addition & 0 deletions tests/parser/inp6.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pass | | pass
1 change: 1 addition & 0 deletions tests/parser/inp7.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* pass
2 changes: 2 additions & 0 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
(os.path.join(parser_dir, 'inp3.txt'), os.path.join(parser_dir, 'exp3.grtj')),
(os.path.join(parser_dir, 'inp4.txt'), os.path.join(parser_dir, 'exp4.grtj')),
(os.path.join(parser_dir, 'inp5.txt'), os.path.join(parser_dir, 'exp5.grtj')),
(os.path.join(parser_dir, 'inp6.txt'), os.path.join(parser_dir, 'exp6.grtj')),
(os.path.join(parser_dir, 'inp7.txt'), os.path.join(parser_dir, 'exp7.grtj')),
])
def test_parser(inp, expected, tmpdir):
with open(inp, 'r') as f:
Expand Down

0 comments on commit eabe195

Please sign in to comment.