feat: keep the original content spacing after the list marker (#196)

A list item such as ` - list item` consists of margin indentation (A), a list marker (B), 1-4 spaces (C), and the contents. The old code recorded the combined width of A through C as `ListItem.prepend` and recorded B as `ListItem.leader`, but the Markdown renderer always skipped the margin indentation and output exactly 1 space after every list marker, such as `- list item`. After this change, the width of A is recorded separately as `ListItem.indentation`, so that the Markdown renderer has the information needed to restore both A and C. However, because the formatting features of other tokens skip margin indentation (A), the Markdown renderer restores only C, for consistency across all formatting features, such as `- list item`.
miyuchina · Nov 28, 2023 · ee7ce94 · ee7ce94
1 parent 5f4c550
commit ee7ce94
Show file tree

Hide file tree

Showing 4 changed files with 73 additions and 19 deletions.
diff --git a/mistletoe/block_token.py b/mistletoe/block_token.py
@@ -486,7 +486,7 @@ def check_interrupts_paragraph(cls, lines):
         # and the list must either be unordered or start from 1.
         marker_tuple = ListItem.parse_marker(lines.peek())
         if (marker_tuple is not None):
-            _, leader, content = marker_tuple
+            _, _, leader, content = marker_tuple
             if not content.strip() == '':
                 return not leader[0].isdigit() or leader in ['1.', '1)']
         return False
@@ -499,7 +499,7 @@ def read(cls, lines):
         while True:
             anchor = lines.get_pos()
             output, next_marker = ListItem.read(lines, next_marker)
-            item_leader = output[2]
+            item_leader = output[3]
             if leader is None:
                 leader = item_leader
             elif not cls.same_marker_type(leader, item_leader):
@@ -532,16 +532,18 @@ class ListItem(BlockToken):
 
     Attributes:
         leader (string): a bullet list marker or an ordered list marker.
+        indentation (int): spaces before the leader.
         prepend (int): the start position of the content, i.e., the indentation required
                        for continuation lines.
         loose (bool): whether the list is loose.
     """
-    repr_attributes = ("leader", "prepend", "loose")
-    pattern = re.compile(r' {0,3}(\d{0,9}[.)]|[+\-*])($|\s+)')
+    repr_attributes = ("leader", "indentation", "prepend", "loose")
+    pattern = re.compile(r'( {0,3})(\d{0,9}[.)]|[+\-*])($|\s+)')
     continuation_pattern = re.compile(r'([ \t]*)(\S.*\n|\n)')
 
-    def __init__(self, parse_buffer, prepend, leader):
+    def __init__(self, parse_buffer, indentation, prepend, leader):
         self.leader = leader
+        self.indentation = indentation
         self.prepend = prepend
         self.children = tokenizer.make_tokens(parse_buffer)
         self.loose = parse_buffer.loose
@@ -574,22 +576,25 @@ def parse_marker(cls, line):
 
         The leader is a bullet list marker, or an ordered list marker.
 
+        The indentation is spaces before the leader.
+
         The prepend is the start position of the content, i.e., the indentation required
         for continuation lines.
         """
         match_obj = cls.pattern.match(line)
         if match_obj is None:
             return None
+        indentation = len(match_obj.group(1))
         prepend = len(match_obj.group(0).expandtabs(4))
-        leader = match_obj.group(1)
+        leader = match_obj.group(2)
         content = line[match_obj.end(0):]
-        n_spaces = prepend - match_obj.end(1)
+        n_spaces = prepend - match_obj.end(2)
         if n_spaces > 4:
             # if there are more than 4 spaces after the leader, we treat them as part of the content
             # with the exception of the first (marker separator) space.
             prepend -= n_spaces - 1
             content = ' ' * (n_spaces - 1) + content
-        return prepend, leader, content
+        return indentation, prepend, leader, content
 
     @classmethod
     def read(cls, lines, prev_marker=None):
@@ -599,10 +604,10 @@ def read(cls, lines, prev_marker=None):
         # first line
         line = next(lines)
         next_line = lines.peek()
-        prepend, leader, content = prev_marker if prev_marker else cls.parse_marker(line)
+        indentation, prepend, leader, content = prev_marker if prev_marker else cls.parse_marker(line)
         if content.strip() == '':
             # item starting with a blank line: look for the next non-blank line
-            prepend = len(leader) + 1
+            prepend = indentation + len(leader) + 1
             blanks = 1
             while next_line is not None and next_line.strip() == '':
                 blanks += 1
@@ -614,7 +619,7 @@ def read(cls, lines, prev_marker=None):
                 parse_buffer = tokenizer.ParseBuffer()
                 parse_buffer.loose = True
                 next_marker = cls.parse_marker(next_line) if next_line is not None else None
-                return (parse_buffer, prepend, leader), next_marker
+                return (parse_buffer, indentation, prepend, leader), next_marker
         else:
             line_buffer.append(content)
 
@@ -659,7 +664,7 @@ def read(cls, lines, prev_marker=None):
         # block-level tokens are parsed here, so that footnotes can be
         # recognized before span-level parsing.
         parse_buffer = tokenizer.tokenize_block(line_buffer, _token_types)
-        return (parse_buffer, prepend, leader), next_marker
+        return (parse_buffer, indentation, prepend, leader), next_marker
 
 
 class Table(BlockToken):

diff --git a/mistletoe/markdown_renderer.py b/mistletoe/markdown_renderer.py
@@ -302,15 +302,17 @@ def render_list(
     def render_list_item(
         self, token: block_token.ListItem, max_line_length: int
     ) -> Iterable[str]:
-        indentation = len(token.leader) + 1
+        indentation = token.prepend - token.indentation
         max_child_line_length = (
             max_line_length - indentation if max_line_length else None
         )
         lines = self.blocks_to_lines(
             token.children, max_line_length=max_child_line_length
         )
         return self.prefix_lines(
-            list(lines) or [""], token.leader + " ", " " * indentation
+            list(lines) or [""],
+            token.leader + " " * (indentation - len(token.leader)),
+            " " * indentation
         )
 
     def render_table(

diff --git a/test/test_markdown_renderer.py b/test/test_markdown_renderer.py
@@ -130,11 +130,11 @@ def test_setext_headings(self):
 
     def test_numbered_list(self):
         input = [
-            "  22)  *emphasized list item*\n",
-            "  96)\n",
+            "  22) *emphasized list item*\n",
+            "  96) \n",
             " 128) here begins a nested list.\n",
             "       + apples\n",
-            "       +  bananas\n",
+            "       + bananas\n",
         ]
         output = self.roundtrip(input)
         expected = [
@@ -157,6 +157,53 @@ def test_bulleted_list(self):
         output = self.roundtrip(input)
         self.assertEqual(output, "".join(input))
 
+    # we don't currently support keeping margin indentation:
+    def test_list_item_margin_indentation_not_preserved(self):
+        # 0 to 4 spaces of indentation from the margin
+        input = [
+            "- 0 space: ok.\n",
+            "  subsequent line.\n",
+            " - 1 space: ok.\n",
+            "   subsequent line.\n",
+            "  - 2 spaces: ok.\n",
+            "    subsequent line.\n",
+            "   - 3 spaces: ok.\n",
+            "     subsequent line.\n",
+            "    - 4 spaces: in the paragraph of the above list item.\n",
+            "      subsequent line.\n",
+        ]
+        output = self.roundtrip(input)
+        expected = [
+            "- 0 space: ok.\n",
+            "  subsequent line.\n",
+            "- 1 space: ok.\n",
+            "  subsequent line.\n",
+            "- 2 spaces: ok.\n",
+            "  subsequent line.\n",
+            "- 3 spaces: ok.\n",
+            "  subsequent line.\n",
+            "  - 4 spaces: in the paragraph of the above list item.\n",
+            "  subsequent line.\n",
+        ]
+        self.assertEqual(output, "".join(expected))
+
+    def test_list_item_indentation_after_leader_preserved(self):
+        # leaders followed by 1 to 5 spaces
+        input = [
+            "- 1 space: ok.\n",
+            "  subsequent line.\n",
+            "-  2 spaces: ok.\n",
+            "   subsequent line.\n",
+            "-   3 spaces: ok.\n",
+            "    subsequent line.\n",
+            "-    4 spaces: ok.\n",
+            "     subsequent line.\n",
+            "-     5 spaces: list item starting with indented code.\n",
+            "  subsequent line.\n",
+        ]
+        output = self.roundtrip(input)
+        self.assertEqual(output, "".join(input))
+
     def test_code_blocks(self):
         input = [
             "    this is an indented code block\n",

diff --git a/test/test_repr.py b/test/test_repr.py
@@ -45,12 +45,12 @@ def test_codefence(self):
     def test_unordered_list(self):
         doc = Document("* Foo\n* Bar\n* Baz")
         self._check_repr_matches(doc.children[0], "block_token.List with 3 children loose=False start=None")
-        self._check_repr_matches(doc.children[0].children[0], "block_token.ListItem with 1 child leader='*' prepend=2 loose=False")
+        self._check_repr_matches(doc.children[0].children[0], "block_token.ListItem with 1 child leader='*' indentation=0 prepend=2 loose=False")
 
     def test_ordered_list(self):
         doc = Document("1. Foo\n2. Bar\n3. Baz")
         self._check_repr_matches(doc.children[0], "block_token.List with 3 children loose=False start=1")
-        self._check_repr_matches(doc.children[0].children[0], "block_token.ListItem with 1 child leader='1.' prepend=3 loose=False")
+        self._check_repr_matches(doc.children[0].children[0], "block_token.ListItem with 1 child leader='1.' indentation=0 prepend=3 loose=False")
 
     def test_table(self):
         doc = Document("| Foo | Bar | Baz |\n|:--- |:---:| ---:|\n| Foo | Bar | Baz |\n")