Better handling of internal tabs in list items.

mkende · Apr 6, 2024 · 52cf880 · 52cf880
1 parent 2bac801
commit 52cf880
Show file tree

Hide file tree

Showing 7 changed files with 76 additions and 46 deletions.
diff --git a/lib/Markdown/Perl/BlockParser.pm b/lib/Markdown/Perl/BlockParser.pm
@@ -41,7 +41,8 @@ sub new {
     last_pos => 0,
     line_ending => '',
     continuation_re => qr//,
-    linkrefs => {}
+    linkrefs => {},
+    matched_prefix_size => 0,
   }, $class;
   lock_keys_plus(%{$this}, qw(forced_line));
 
@@ -267,9 +268,12 @@ sub _test_lazy_continuation {
 
 sub _count_matching_blocks {
   my ($this, $lr) = @_;  # $lr is a scalar *reference* to the current line text.
+  $this->{matched_prefix_size} += 0;
   for my $i (0 .. $#{$this->{blocks_stack}}) {
     local *::_ = $lr;
-    return $i unless $this->{blocks_stack}[$i]{cond}();
+    my $r = $this->{blocks_stack}[$i]{cond}();
+    $this->{matched_prefix_size} += $r if $r;
+    return $i unless $r;
   }
   return @{$this->{blocks_stack}};
 }
@@ -426,7 +430,8 @@ sub _do_indented_code_block {
   if (@{$this->{paragraph}} || $l !~ m/${indented_code_re}/) {
     return;
   }
-  my @code_lines = remove_prefix_spaces(4, $l.$this->line_ending(), $this->get_preserve_tabs);
+  my $preserve_tabs = !$this->get_code_blocks_convert_tabs_to_spaces;
+  my @code_lines = scalar(remove_prefix_spaces(4, $l.$this->line_ending(), $preserve_tabs));
   my $count = 1;  # The number of lines we have read
   my $valid_count = 1;  # The number of lines we know are in the code block.
   my $valid_pos = $this->get_pos();
@@ -437,10 +442,10 @@ sub _do_indented_code_block {
         $valid_pos = $this->get_pos();
         $valid_count = $count;
         push @code_lines,
-            remove_prefix_spaces(4, $nl.$this->line_ending(), $this->get_preserve_tabs);
+            scalar(remove_prefix_spaces(4, $nl.$this->line_ending(), $preserve_tabs));
       } elsif ($nl eq '') {
         push @code_lines,
-            remove_prefix_spaces(4, $nl.$this->line_ending(), $this->get_preserve_tabs);
+            scalar(remove_prefix_spaces(4, $nl.$this->line_ending(), $preserve_tabs));
       } else {
         last;
       }
@@ -482,7 +487,7 @@ sub _do_fenced_code_block {
         last;
       } else {
         # We’re adding one line to the fenced code block
-        push @code_lines, remove_prefix_spaces($indent, $nl.$this->line_ending());
+        push @code_lines, scalar(remove_prefix_spaces($indent, $nl.$this->line_ending()));
       }
     } else {
       # We’re out of our enclosing block and we haven’t seen the end of the
@@ -544,11 +549,7 @@ sub _do_html_block {
     while (defined (my $nl = $this->next_line())) {
       if ($this->_all_blocks_match(\$nl)) {
         if ($nl !~ m/${html_end_condition}/) {
-          if ($this->get_preserve_tabs) {
-            push @html_lines, $nl.$this->line_ending();
-          } else {
-            push @html_lines, remove_prefix_spaces(0, $nl.$this->line_ending(), 0);
-          }
+          push @html_lines, $nl.$this->line_ending();
         } elsif ($nl eq '') {
           # This can only happen for rules 6 and 7 where the end condition
           # line is not part of the HTML block.
@@ -582,14 +583,16 @@ sub _do_block_quotes {
       # the case of a line like '>\t\tfoo' where we need to retain the 6
       # spaces of indentation, to produce a code block starting with two
       # spaces.
-      $_ = remove_prefix_spaces(length($1) + 1, $_);
-      return 1;
+      my $m;
+      ($_, $m) = remove_prefix_spaces(length($1) + 1, $_);
+      # Returns the matched horizontal size.
+      return $m;
     }
     return $this->_test_lazy_continuation($_);
   };
   {
     local *::_ = \$l;
-    $cond->();
+    $this->{matched_prefix_size} += $cond->();
   }
   $this->{skip_next_block_matching} = 1;
   $this->_enter_child_block({type => 'quotes'}, $cond, qr/ {0,3}(?:> ?)?/, $l);
@@ -609,7 +612,7 @@ sub _do_list_item {
   # compute the tab stops. This is better than nothing but won’t work inside
   # other container blocks. In all cases, using tabs instead of space should not
   # be encouraged.
-  my $text_indent = indent_size($text, $indent_marker);
+  my $text_indent = indent_size($text, $indent_marker + $this->{matched_prefix_size});
   # When interrupting a paragraph, the rules are stricter.
   my $mode = $this->get_lists_can_interrupt_paragraph;
   if (@{$this->{paragraph}}) {
@@ -637,7 +640,8 @@ sub _do_list_item {
     }
     if (indent_size($_) >= $indent) {
       $_ = remove_prefix_spaces($indent, $_);
-      return 1;
+      # Returns the matched horizontal size.
+      return $indent;
     }
     # TODO: we probably don’t need to test the list_item_re case here, just
     # the lazy continuation and the emptiness is enough.
@@ -650,6 +654,7 @@ sub _do_list_item {
     # processing the condition and to correctly handle the case where the
     # list marker was followed by tabs.
     $forced_next_line = remove_prefix_spaces($indent, (' ' x $indent_marker).$text);
+    $this->{matched_prefix_size} = $indent;
     $this->{skip_next_block_matching} = 1;
   }
   # Note that we are handling the creation of the lists themselves in the

diff --git a/lib/Markdown/Perl/Inlines.pm b/lib/Markdown/Perl/Inlines.pm
@@ -516,10 +516,6 @@ sub process_styles {
   my $delim = delim_characters($that);
   my %max_delim_run_length = %{$that->get_inline_delimiters_max_run_length};
   while (my @match = $tree->find_in_text(qr/([${delim}])\1*/, $current_child, 0)) {
-    # TODO: add an option to prevent some delimiters to be part of long run
-    # (e.g. max_delimiter_run_length), typically for ~ which can only be in run
-    # of lengths 2 according to GitHub spec (to not collide with code block
-    # probably).
     # We extract the delimiter run into a new node, that will be at $index.
     my ($delim_tree, $index) = $tree->extract($match[0], $match[1], $match[0], $match[2]);
     # We use the type literal so that if we do nothing with the delimiter it

diff --git a/lib/Markdown/Perl/Options.pm b/lib/Markdown/Perl/Options.pm
@@ -422,6 +422,18 @@ _make_option(code_blocks_info => 'language', _enum(qw(ignored language)));
 
 =pod
 
+=head3 B<code_blocks_convert_tabs_to_spaces> I<(boolean, default: false)>
+
+By default, tabs are preserved inside codeblocks. With this option, all tabs (at
+the beginning of the lines or inside) are turned into spaces, aligned with the
+tab stops (currently always a multiple of 4).
+
+=cut
+
+_make_option(code_blocks_convert_tabs_to_spaces => 0, _boolean, (markdown => 1));
+
+=pod
+
 =head3 B<table_blocks_have_cells_for_missing_data> I<(boolean, default: false)>
 
 Whether a table will have a cell in HTML for a missing cell in the markdown
@@ -633,18 +645,6 @@ _make_option(force_final_new_line => 0, _boolean, (markdown => 1));
 
 =pod
 
-=head3 B<preserve_tabs> I<(boolean, default: true)>
-
-When removing prefix spaces in front of some constructs (typically indented code
-blocks), pmarkdown will try to preserve tabs when they are used instead of
-space. If this option is set to false, prefix tabs will be turned into spaces.
-
-=cut
-
-_make_option(preserve_tabs => 1, _boolean, (markdown => 0));
-
-=pod
-
 =head3 B<preserve_white_lines> I<(boolean, default: true)>
 
 By default, pmarkdown will try to preserve lines that contains only whitespace

diff --git a/lib/Markdown/Perl/Util.pm b/lib/Markdown/Perl/Util.pm
@@ -5,9 +5,11 @@ use warnings;
 use utf8;
 use feature ':5.24';
 
+use Carp;
+use English;
 use Exporter 'import';
 use List::MoreUtils 'first_index';
-use List::Util 'max';
+use List::Util 'max', 'min';
 use Unicode::CaseFold 'fc';
 
 our $VERSION = 0.01;
@@ -30,35 +32,48 @@ sub split_while : prototype(&@) {  ## no critic (RequireArgUnpacking)
 # matched to a tab-stop of size 4.
 # Removes all the spaces if there is less than that.
 # If needed, tabs are converted into 4 spaces.
+# In list context, also returns how many spaces were actually matched.
 sub remove_prefix_spaces {
   my ($n, $text, $preserve_tabs) = @_;
   $preserve_tabs //= 1;  # when not specified we do preserve tabs
   if (!$preserve_tabs) {
     my $s = indent_size($text);  # this sets pos($text);
-    return (' ' x max(0, $s - $n)).(substr $text, pos($text));
+    my $ret = (' ' x max(0, $s - $n)).(substr $text, pos($text));
+    return $ret unless wantarray;
+    return ($ret, min($s, $n));
   }
   my $t = int($n / 4);
   my $s = $n % 4;
+  my $m = 0;  # How many spaces we have matched.
   for my $i (1 .. $t) {
     if ($text =~ m/^( {0,3}\t| {4})/) {
       # We remove one full tab-stop from the string.
       substr $text, 0, length($1), '';
+      $m += 4;
     } else {
       # We didn’t have a full tab-stop, so we remove as many spaces as we had.
-      $text =~ m/^( {0,3})/;
-      return substr $text, length($1);  ## no critic (ProhibitCaptureWithoutTest)
+      $text =~ m/^( {0,3})/ or confess 'Unexpected match failure';
+      $m += $LAST_MATCH_END[0] - $LAST_MATCH_START[0];
+      return substr $text, length($1) unless wantarray;
+      return ((substr $text, length($1)), $m);
     }
   }
-  return $text if $s == 0;
-  $text =~ m/^(?<p>\ {0,3}\t|\ {4})*?(?<l>\ {0,3}\t|\ {4})?(?<s>\ {0,3})(?<e>[^ \t].*|$)/xs;  ## no critic (ProhibitComplexRegexes)
-  my $ns = length $+{s};
-  if ($ns >= $s) {
-    return ($+{p} // '').($+{l} // '').(' ' x ($ns - $s)).$+{e};
-  } elsif (length($+{l})) {
-    return ($+{p} // '').(' ' x (4 + $ns - $s)).$+{e};
-  } else {
-    return $+{e};
+  if ($s != 0) {
+    $text =~ m/^(?<p>\ {0,3}\t|\ {4})*?(?<l>\ {0,3}\t|\ {4})?(?<s>\ {0,3})(?<e>[^ \t].*|$)/xs;  ## no critic (ProhibitComplexRegexes)
+    my $ns = length $+{s};
+    if ($ns >= $s) {
+      $text = ($+{p} // '').($+{l} // '').(' ' x ($ns - $s)).$+{e};
+      $m += $s;
+    } elsif (length($+{l})) {
+      $text = ($+{p} // '').(' ' x (4 + $ns - $s)).$+{e};
+      $m += $s;
+    } else {
+      $text = $+{e};
+      $m += $ns;
+    }
   }
+  return $text unless wantarray;
+  return ($text, $m);
 }
 
 # Return the indentation of the given text

diff --git a/t/303-lists.t b/t/303-lists.t
@@ -11,5 +11,6 @@ sub run {
 
 is(run("* a\n* b\n* c\n\n\nfoo"), "<ul>\n<li>a</li>\n<li>b</li>\n<li>c</li>\n</ul>\n<p>foo</p>\n", 'list is tight');
 is(run("1.\tfoo\n\n\tbar"), "<ol>\n<li><p>foo</p>\n<p>bar</p>\n</li>\n</ol>\n", 'indent_with_tabs_after_marker');
+is(run(">1.\tfoo\n>\n>    bar"), "<blockquote>\n<ol>\n<li><p>foo</p>\n<p>bar</p>\n</li>\n</ol>\n</blockquote>\n", 'indent_with_tabs_after_marker_inside_block');
 
 done_testing;
diff --git a/t/902-markdown-test-suite.t b/t/902-markdown-test-suite.t
@@ -10,7 +10,17 @@ use MmdTest;
 use Test2::V0;
 
 # TODO: remove these todos.
-my %opt = (todo => [16, 18, 21, 22]);
+my %opt = (
+  todo => [16, 18, 22],
+  # These are bugs in the Markdown "spec", not in our implementation. All of
+  # these have been tested to be buggy in the real Markdown.pl implementation.
+  bugs => [
+    # The original implementation will emit <strong><em> tag for ***foo***,
+    # however this does not extrapolate well to other cases. In particular:
+    # ***foo** bar* is rendered as the buggy <strong><em>foo</strong> bar</em>
+    21,
+  ],
+);
 
 while ($_ = shift) {
   $opt{test_num} = shift @ARGV if /^-n$/;

diff --git a/t/lib/MmdTest.pm b/t/lib/MmdTest.pm
@@ -42,6 +42,7 @@ sub test_suite {
   skip_all('MMD-Test-Suite must be checked out.') unless -d $test_dir;
   my $i = $opt{start_num} // 0;
   my %todo = map { $_ => 1 } @{$opt{todo} // []};
+  my %bugs = map { $_ => 1 } @{$opt{bugs} // []};
   my $ext = $opt{ext} // 'html';
   for my $md_file (glob "${test_dir}/*.text") {
     $i++;
@@ -52,6 +53,8 @@ sub test_suite {
       skip "Missing html file '${html_file}'" unless -f $html_file;
       if ($todo{$i}) {
         todo 'Not yet supported' => $test;
+      } elsif ($bugs{$i}) {
+        todo 'The spec is buggy' => $test;
       } else {
         $test->();
       }