fix: Cache some results of re.finditer (#969)

* fix: Cache some results of re.finditer

  If a file is formatted, the code attempts to extract the ignored blocks or unformatted blocks several times from the same HTML string. This commit adds a cache keyed on the unique combination sha256(html || config.REGEX). It stores the results as [(begin, end)], as it seems that accessing .start() or .end() is somewhat costly. (The same applies to the match that is given as an argument.)

* fix: Inline any

  It took a huge amount of time. With this patch, the time needed for https://github.com/openedx/edx-platform goes from 26s to 18s, i.e. about 30% less.
djlint · Oct 30, 2024 · c48ad92 · c48ad92
1 parent 2ee5fb7
commit c48ad92
Showing 1 changed file with 64 additions and 23 deletions.
diff --git a/djlint/helpers.py b/djlint/helpers.py
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import hashlib
 import itertools
 from typing import TYPE_CHECKING
 
@@ -11,6 +12,56 @@
 if TYPE_CHECKING:
     from .settings import Config
 
+# Memoized regex match spans, stored as (start, end) offsets into the HTML
+# string. Keys are the SHA-256 hex digests built by the *_cache functions
+# below; nothing visible in this hunk ever evicts an entry.
+child_of_unformatted_block_cache_: dict[str, list[tuple[int, int]]] = {}
+inside_ignored_block_cache_: dict[str, list[tuple[int, int]]] = {}
+
+
+def child_of_unformatted_block_cache(
+    config: Config, html: str
+) -> list[tuple[int, int]]:
+    """Return the ``(start, end)`` span of every unformatted block in ``html``.
+
+    Spans are memoized in ``child_of_unformatted_block_cache_`` under the
+    SHA-256 hex digest of ``html`` concatenated with
+    ``config.unformatted_blocks``, so the regex scan runs only once per
+    distinct HTML/pattern pair.
+    """
+    digest_input = (html + config.unformatted_blocks).encode("utf-8")
+    cache_key = hashlib.sha256(digest_input).hexdigest()
+    try:
+        return child_of_unformatted_block_cache_[cache_key]
+    except KeyError:
+        pass
+    regex_flags = re.DOTALL | re.IGNORECASE | re.VERBOSE | re.MULTILINE
+    spans = [
+        (found.start(0), found.end())
+        for found in re.finditer(
+            config.unformatted_blocks, html, flags=regex_flags
+        )
+    ]
+    child_of_unformatted_block_cache_[cache_key] = spans
+    return spans
+
+
+def inside_ignored_block_cache(
+    config: Config, html: str
+) -> list[tuple[int, int]]:
+    """Return the ``(start, end)`` span of every ignored block in ``html``.
+
+    Matches of ``config.ignored_blocks`` come first, followed by matches of
+    ``config.ignored_inline_blocks``. The result is memoized in
+    ``inside_ignored_block_cache_``.
+    """
+    # The key must cover every input the result depends on: the HTML and
+    # both patterns searched below. Keying on config.unformatted_blocks (a
+    # pattern this function never uses) could hand back spans computed for
+    # different ignore patterns. NUL separators keep the parts unambiguous.
+    key = hashlib.sha256(
+        "\0".join(
+            (html, config.ignored_blocks, config.ignored_inline_blocks)
+        ).encode("utf-8")
+    ).hexdigest()
+    cached = inside_ignored_block_cache_.get(key)
+    if cached is not None:
+        return cached
+    matches = [
+        (x.start(0), x.end())
+        for x in itertools.chain(
+            re.finditer(
+                config.ignored_blocks,
+                html,
+                flags=re.DOTALL | re.IGNORECASE | re.VERBOSE | re.MULTILINE,
+            ),
+            # Inline blocks are matched without DOTALL/MULTILINE.
+            re.finditer(
+                config.ignored_inline_blocks,
+                html,
+                flags=re.IGNORECASE | re.VERBOSE,
+            ),
+        )
+    ]
+    inside_ignored_block_cache_[key] = matches
+    return matches
+
 
 def is_ignored_block_opening(config: Config, item: str) -> bool:
     """Find ignored group opening.
@@ -272,37 +323,27 @@ def inside_ignored_block(
     config: Config, html: str, match: re.Match[str]
 ) -> bool:
     """Do not add whitespace if the tag is in a non indent block."""
+    match_start = match.start()
+    match_end = match.end(0)
     return any(
-        ignored_match.start(0) <= match.start()
-        and match.end(0) <= ignored_match.end()
-        for ignored_match in itertools.chain(
-            re.finditer(
-                config.ignored_blocks,
-                html,
-                flags=re.DOTALL | re.IGNORECASE | re.VERBOSE | re.MULTILINE,
-            ),
-            re.finditer(
-                config.ignored_inline_blocks,
-                html,
-                flags=re.IGNORECASE | re.VERBOSE,
-            ),
-        )
+        ignored_match[0] <= match_start and match_end <= ignored_match[1]
+        for ignored_match in inside_ignored_block_cache(config, html)
     )
 
 
 def child_of_unformatted_block(
     config: Config, html: str, match: re.Match[str]
 ) -> bool:
     """Do not add whitespace if the tag is in a non indent block."""
-    return any(
-        ignored_match.start(0) < match.start()
-        and match.end(0) <= ignored_match.end()
-        for ignored_match in re.finditer(
-            config.unformatted_blocks,
-            html,
-            flags=re.DOTALL | re.IGNORECASE | re.VERBOSE | re.MULTILINE,
-        )
-    )
+    # Read the bounds of ``match`` once, before the loop, rather than calling
+    # .start()/.end() again for every candidate span.
+    match_start = match.start()
+    match_end = match.end(0)
+    # Cached list of (start, end) spans of the unformatted blocks in ``html``.
+    ignored_matches = child_of_unformatted_block_cache(config, html)
+    if ignored_matches == []:
+        return False
+    # Explicit loop in place of the former any(...) generator; the commit
+    # message reports that any() was the slow part.
+    for ignored_match in ignored_matches:
+        # Strict "<" on the start: a match that begins exactly where the
+        # block begins is not counted as a child of that block.
+        if ignored_match[0] < match_start and match_end <= ignored_match[1]:
+            return True
+    return False
 
 
 def child_of_ignored_block(