Skip to content

Commit

Permalink
Merge pull request #38 from jmanuel1/handle-indent-err
Browse files Browse the repository at this point in the history
Display nice errors from Python tokenizer exceptions
  • Loading branch information
jmanuel1 authored Nov 9, 2024
2 parents 849f5af + 20e4ede commit 737c840
Show file tree
Hide file tree
Showing 9 changed files with 354 additions and 158 deletions.
75 changes: 57 additions & 18 deletions concat/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@

import argparse
from concat.transpile import parse, transpile_ast, typecheck
from concat.error_reporting import get_line_at, create_parsing_failure_message
from concat.error_reporting import (
get_line_at,
create_indentation_error_message,
create_lexical_error_message,
create_parsing_failure_message,
)
import concat.execute
import concat.lex
import concat.parser_combinators
Expand All @@ -11,7 +16,7 @@
import json
import os.path
import sys
from typing import Callable, IO, AnyStr
from typing import Callable, IO, AnyStr, assert_never


filename = '<stdin>'
Expand Down Expand Up @@ -52,28 +57,39 @@ def func(name: str) -> IO[AnyStr]:
'--tokenize',
action='store_true',
default=False,
help='tokenize input from the given file and print the tokens as a JSON array',
help=(
'tokenize input from the given file and print the tokens as a JSON '
'array'
),
)

# We should pass any unknown args onto the program we're about to run.
# FIXME: There might be a better way to go about this, but I think this is fine
# for now.
args, rest = arg_parser.parse_known_args()
sys.argv = [sys.argv[0], *rest]

def tokenize_printing_errors() -> list[concat.lex.Token]:
    """Tokenize the input file, printing a diagnostic for each error result.

    Successful tokens are collected and returned; indentation and lexical
    errors are reported on stdout as they are encountered.
    """
    results = concat.lex.tokenize(args.file.read())
    collected: list[concat.lex.Token] = []
    for result in results:
        if result.type == 'token':
            collected.append(result.token)
            continue
        if result.type == 'indent-err':
            # The tokenizer's IndentationError fields may be None; fall back
            # to the start of the file.
            where = (result.err.lineno or 1, result.err.offset or 0)
            print('Indentation error:')
            print(
                create_indentation_error_message(
                    args.file, where, result.err.msg
                )
            )
        elif result.type == 'token-err':
            print('Lexical error:')
            print(
                create_lexical_error_message(
                    args.file, result.location, str(result.err)
                )
            )
        else:
            # Exhaustiveness check: fails type checking if a new result
            # variant is added without handling it here.
            assert_never(result)
    return collected

if args.tokenize:
code = args.file.read()
tokens = concat.lex.tokenize(code, should_preserve_comments=True)
json.dump(tokens, sys.stdout, cls=concat.lex.TokenEncoder)
sys.exit()

# interactive mode
if args.file.isatty():
concat.stdlib.repl.repl([], [], args.debug)
else:
def batch_main():
try:
tokens = concat.lex.tokenize(args.file.read())
tokens = tokenize_printing_errors()
concat_ast = parse(tokens)
recovered_parsing_failures = concat_ast.parsing_failures
for failure in recovered_parsing_failures:
Expand Down Expand Up @@ -121,3 +137,26 @@ def func(name: str) -> IO[AnyStr]:
sys.exit(1)
finally:
args.file.close()


def main():
    """Entry point: start a REPL on a terminal, otherwise run in batch mode."""
    # A non-TTY input (file or pipe) is processed as a batch program.
    if not args.file.isatty():
        batch_main()
        return
    # interactive mode
    concat.stdlib.repl.repl([], [], args.debug)


# We should pass any unknown args onto the program we're about to run.
# FIXME: There might be a better way to go about this, but I think this is fine
# for now.
args, rest = arg_parser.parse_known_args()
# Rewrite argv so the executed Concat program sees only its own arguments.
sys.argv = [sys.argv[0], *rest]

# --tokenize mode: dump the token stream as JSON and exit without running
# the program. Handled before main() so it bypasses the REPL/batch dispatch.
if args.tokenize:
    code = args.file.read()
    tokens = concat.lex.tokenize(code, should_preserve_comments=True)
    json.dump(tokens, sys.stdout, cls=concat.lex.TokenEncoder)
    sys.exit()

main()
32 changes: 31 additions & 1 deletion concat/error_reporting.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@ def create_parsing_failure_message(
stream: Sequence[concat.lex.Token],
failure: concat.parser_combinators.FailureTree,
) -> str:
location = stream[failure.furthest_index].start
if failure.furthest_index < len(stream):
location = stream[failure.furthest_index].start
elif stream:
location = stream[-1].start
else:
location = (1, 0)
line = get_line_at(file, location)
message = f'Expected {failure.expected} at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n{" " * location[1] + "^"}'
if failure.children:
Expand All @@ -26,3 +31,28 @@ def create_parsing_failure_message(
create_parsing_failure_message(file, stream, f), ' '
)
return message


def create_lexical_error_message(
    file: TextIO, location: concat.astutils.Location, message: str
) -> str:
    """Format a lexical error for display.

    :param file: the source file, used to quote the offending line
    :param location: (1-based line, 0-based column) of the error
    :param message: the tokenizer's own description of the error
    :return: a multi-line message quoting the line, a caret marking the
        column, and the tokenizer's message
    """
    line = get_line_at(file, location)
    # NOTE(review): the original accepted ``message`` but never used it, so
    # the tokenizer's explanation was silently dropped; include it here.
    # Also avoid reusing the outer quote character inside the f-string
    # replacement field, which requires Python >= 3.12 (PEP 701).
    return (
        f'Cannot tokenize file at line {location[0]}, '
        f'column {location[1] + 1}:\n'
        f'{line.rstrip()}\n'
        f'{" " * location[1]}^\n'
        f'{message}\n'
    )


def create_indentation_error_message(
    file: TextIO, location: concat.astutils.Location, message: str
) -> str:
    """Format an indentation error for display.

    :param file: the source file, used to quote the offending line
    :param location: (1-based line, 0-based column) of the error
    :param message: the tokenizer's own description of the error
    :return: a multi-line message quoting the offending line followed by the
        tokenizer's message
    """
    line = get_line_at(file, location)
    # NOTE(review): the original accepted ``message`` but only reassigned it,
    # so the indentation diagnostic (e.g. from IndentationError.msg) never
    # reached the user; include it in the output.
    return (
        f'Malformed indentation at line {location[0]}, '
        f'column {location[1] + 1}:\n'
        f'{line.rstrip()}\n'
        f'{message}\n'
    )
Loading

0 comments on commit 737c840

Please sign in to comment.