Skip to content

Commit

Permalink
WIP - string literals with rect rule
Browse files Browse the repository at this point in the history
  • Loading branch information
cgay committed Sep 17, 2023
1 parent 089f6ef commit 056a146
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 8 deletions.
97 changes: 94 additions & 3 deletions sources/dfmc/reader/lexer.dylan
Original file line number Diff line number Diff line change
Expand Up @@ -861,8 +861,9 @@ end method hex-escape-character;
define method decode-string
(source-location :: <lexer-source-location>, bpos :: <integer>,
epos :: <integer>, escapes? :: <boolean>)
=> (string :: <byte-string>)
=> (string :: <byte-string>, multi-line? :: <boolean>)
let contents = source-location.source-location-record.contents;
let multi-line? = #f;
local
method skip-hex-escape (pos)
// TODO(cgay): signal better error if '>' not found.
Expand Down Expand Up @@ -903,9 +904,11 @@ define method decode-string
loop(new-position, len + 1, #f, string);
end if;
as(<integer>, '\r') =>
multi-line? := #t;
string & (string[len] := '\n');
loop(pos + 1, len + 1, #t, string);
as(<integer>, '\n') =>
multi-line? := #t;
let increment = if (prev-was-cr?)
0 // already stored a LF
else
Expand All @@ -922,9 +925,93 @@ define method decode-string
let length = loop(bpos, 0, #f, #f);
let string = make(<string>, size: length);
loop(bpos, 0, #f, string);
string
values(string, multi-line?)
end method decode-string;

// https://opendylan.org/proposals/dep-0012-string-literals.html#the-rectangle-rule
// When this is called `string` is known to contain at least one literal
// newline character and the EOL sequence has already been canonicalized to
// just '\n'.
// TEMP NOTES:
// * This doesn't allow ALL whitespace (e.g. \f) on the same line as the
// opening """. Could argue either way but having such whitespace seems
// pathalogical? In fact supporting ANY whitespace at all there is
// questionable.
define function trim-multi-line-prefix
(string :: <string>) => (maybe-trimmed :: <string>)
let len :: <integer> = string.size;
// Whitespace following """ is ignored until '\n'.
let internal-start
= iterate loop (i = 0)
if (i < len)
select (string[i])
' ', '\t' => loop(i + 1);
'\n' => i;
otherwise =>
error("invalid multi-line string literal - only whitespace may"
" follow the start delimiter \"\"\" on the same line");
end
end
end;
// The end delimiter may ONLY have whitespace preceding it, and that
// whitespace defines the prefix that must appear on all lines.
let internal-end
= iterate loop (i = len - 1)
if (i > 0)
select (string[i])
' ', '\t' => loop(i - 1);
'\n' => i;
otherwise =>
error("invalid multi-line string literal - only whitespace may"
" precede the end delimiter \"\"\" on the same line");
end
end
end;
let prefix :: <byte-string> = copy-sequence(string, start: internal-end + 1);
let new-string :: false-or(<string>) = #f;
local
// If new-string is false this counts, otherwise it copies.
method copy-or-count (i :: <integer>, new-index :: <integer>)
if (i >= internal-end)
new-index
else
let c = string[i];
let next-i
= if (c ~== '\n')
i + 1
else
// Expect the same prefix on each line, with one exception: a completely
// blank line is accepted as a blank line in the result because sometimes
// editors are configured to remove trailing whitespace.
iterate check-prefix (p = 0, j = i + 1)
if (p >= prefix.size | (p == 0 & string[j] == '\n'))
j
else
if (prefix[p] == string[j])
check-prefix(p + 1, j + 1)
else
error("invalid multi-line string literal - lines must begin"
" with the same whitespace that precedes the end"
" delimiter (got %=, want %=)",
string[j], prefix[p]);
end;
end
end iterate
end;
if (i > internal-start) // don't copy first \n
if (new-string) new-string[new-index] := c end;
copy-or-count(next-i, new-index + 1)
else
copy-or-count(next-i, new-index)
end
end
end method;
let new-size = copy-or-count(internal-start, 0);
new-string := make(<string>, size: new-size);
copy-or-count(internal-start, 0);
new-string
end function trim-multi-line-prefix;

// Make a <literal-token> when confronted with the #"foo" syntax.
// These are referred to as "unique strings" in the DRM Lexical Syntax.
//
Expand Down Expand Up @@ -1119,7 +1206,11 @@ define method %make-string-literal
=> (res :: <string-fragment>)
let bpos = source-location.start-posn + start-offset;
let epos = source-location.end-posn - end-offset;
let string = decode-string(source-location, bpos, epos, allow-escapes?);
let (string, multi-line?)
= decode-string(source-location, bpos, epos, allow-escapes?);
if (multi-line?)
string := trim-multi-line-prefix(string);
end;
make(<string-fragment>,
record: source-location.source-location-record,
source-position: source-location.source-location-source-position,
Expand Down
64 changes: 59 additions & 5 deletions sources/dfmc/reader/tests/literal-test-suite.dylan
Original file line number Diff line number Diff line change
Expand Up @@ -225,20 +225,73 @@ define test string-literal-test ()
end test;

define test string-literal-multi-line-test ()
/*
let f = read-fragment(#:string:{""""""});
verify-literal(f, "", <string-fragment>);
// Make sure the reader didn't stop at the first pair of double quotes...
let source = source-location-string(fragment-source-location(f));
assert-equal(#:string:{""""""}, source);
verify-literal(read-fragment(#:string:{"""abc"""}), "abc", <string-fragment>);
verify-literal(read-fragment(#:string:{"""a\nc"""}), "a\nc", <string-fragment>);
verify-literal(read-fragment(#:string:{""" abc """}), " abc ", <string-fragment>);
// Leading whitespace (relative to end delim) retained?
verify-literal(read-fragment(#:string:{"""
abc
def
"""}),
" abc\ndef", <string-fragment>);
// End delim to left of start delim, still good?
verify-literal(read-fragment(#:string:{"""
abc
def
"""}),
" abc\ndef", <string-fragment>);
// Whitespace on first line ignored? 0x20 = space
verify-literal(read-fragment(#:string:{"""\<20>\<20>
abc
def
"""}),
" abc\ndef", <string-fragment>);
// Are whitespace lines included? The first blank line below is truly empty
// and the second one has the prefix only (written as \<20> to avoid editors
// removing trailing whitespace).
verify-literal(read-fragment(#:string:{"""
def
\<20>\<20>\<20>
"""}),
"\ndef\n", <string-fragment>);
check-condition("junk on first line",
<error>,
read-fragment(#:string:{"""a (only whitespace allowed after start delim)
c"""}));
check-condition("junk on last line",
<error>,
read-fragment(#:string:{"""
a
c"""})); // end delim should be on new line
check-condition("prefix mismatch non-white",
<error>,
read-fragment(#:string:{"""
a
c (this line not indented enough)
"""}));
// Prefix should be " " but one line has a literal tab in prefix.
/* TODO: the literal tab is causing a weird crash that I don't understand
yet. Gets an odd backtrace in testworks. Needs investigation.
check-condition("prefix mismatch whitespace",
<error>,
read-fragment("\"\"\"\n aaa\n \t bbb\n \"\"\""));
*/
// EOL canonicalization
verify-literal(read-fragment("\"\"\"a\nc\"\"\""), "a\nc", <string-fragment>);
verify-literal(read-fragment("\"\"\"a\r\nc\"\"\""), "a\nc", <string-fragment>);
verify-literal(read-fragment("\"\"\"a\rc\"\"\""), "a\nc", <string-fragment>);
verify-literal(read-fragment("\"\"\"a\n\rc\"\"\""), "a\n\nc", <string-fragment>);
*/
verify-literal(read-fragment("\"\"\"\n a\n b\n x\n c\n \"\"\""), "a\nb\nx\nc", <string-fragment>);
/*
verify-literal(read-fragment("\"\"\"\na\r\nc\n\"\"\""), "a\nc", <string-fragment>);
verify-literal(read-fragment("\"\"\"\na\rc\n\"\"\""), "a\nc", <string-fragment>);
verify-literal(read-fragment("\"\"\"\r\na\n\rc\r\n\"\"\""), "a\n\nc", <string-fragment>);
let char = curry(as, <character>);
// One of every escape sequence. "\a\b\e\f\n\r\t\0\'\"\\"
Expand All @@ -257,6 +310,7 @@ define test string-literal-multi-line-test ()
<string-fragment>);
assert-signals(<invalid-token>, read-fragment(#:string:{"""\1<b>"""}));
*/
end test;

define test string-literal-raw-one-line-test ()
Expand Down

0 comments on commit 056a146

Please sign in to comment.