Skip to content

Commit

Permalink
fix: Cache some results of re.finditer (#969)
Browse files Browse the repository at this point in the history
* fix: Cache some results of re.finditer

If a file is formatted, the code attempts to extract the ignored
blocks or unformatted blocks several times from the same HTML string.

This commit adds a cache for the unique combination sha256(html || config.REGEX).
It stores the results as [(begin, end)], because accessing .start()
or .end() on each match appears to be somewhat costly. (The same applies to the match that is given as an argument.)

* fix: Inline any

The any() call took a huge amount of time. With this patch, the time
needed for https://github.com/openedx/edx-platform drops from
26s to 18s, about 30% less.
  • Loading branch information
JCWasmx86 authored and monosans committed Oct 30, 2024
1 parent 2ee5fb7 commit c48ad92
Showing 1 changed file with 64 additions and 23 deletions.
87 changes: 64 additions & 23 deletions djlint/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from __future__ import annotations

import hashlib
import itertools
from typing import TYPE_CHECKING

Expand All @@ -11,6 +12,56 @@
if TYPE_CHECKING:
from .settings import Config

# Memoized regex match spans. Each dict is filled by the function of the same
# name without the trailing underscore; keys are the hex SHA-256 digests those
# functions compute, values are lists of (start, end) offsets into the HTML
# string that was searched.
# NOTE(review): nothing visible here evicts entries, so both dicts appear to
# grow for the lifetime of the process -- confirm that is acceptable.
child_of_unformatted_block_cache_: dict[str, list[tuple[int, int]]] = {}
inside_ignored_block_cache_: dict[str, list[tuple[int, int]]] = {}


def child_of_unformatted_block_cache(
    config: Config, html: str
) -> list[tuple[int, int]]:
    """Return the spans of every unformatted block in ``html``, memoized.

    The result is looked up in (and, on a miss, stored into) the
    module-level ``child_of_unformatted_block_cache_`` dict under the
    SHA-256 hex digest of ``html`` concatenated with
    ``config.unformatted_blocks``.

    Args:
        config: Supplies the ``unformatted_blocks`` regex pattern string.
        html: The text to scan.

    Returns:
        A list of ``(start, end)`` offsets, one per regex match, in match
        order. A cache hit returns the stored list object itself.
    """
    pattern = config.unformatted_blocks
    digest = hashlib.sha256((html + pattern).encode("utf-8")).hexdigest()

    # An empty list is a valid cached value, so test against None rather
    # than truthiness.
    spans = child_of_unformatted_block_cache_.get(digest)
    if spans is not None:
        return spans

    spans = []
    search_flags = re.DOTALL | re.IGNORECASE | re.VERBOSE | re.MULTILINE
    for found in re.finditer(pattern, html, flags=search_flags):
        spans.append((found.start(0), found.end()))

    child_of_unformatted_block_cache_[digest] = spans
    return spans


def inside_ignored_block_cache(
    config: Config, html: str
) -> list[tuple[int, int]]:
    """Return the spans of all ignored blocks in ``html``, memoized.

    The spans are the ``(start, end)`` offsets of every match of
    ``config.ignored_blocks`` followed by every match of
    ``config.ignored_inline_blocks``. Results are stored in the
    module-level ``inside_ignored_block_cache_`` dict.

    Args:
        config: Supplies the ``ignored_blocks`` and
            ``ignored_inline_blocks`` regex pattern strings.
        html: The text to scan.

    Returns:
        A list of ``(start, end)`` tuples. A cache hit returns the stored
        list object itself, so callers should not mutate it.
    """
    # Key on the two patterns this function actually searches with.  Keying
    # on an unrelated pattern would hand back stale spans to a config that
    # differs only in its ignore patterns.  The NUL separator keeps the
    # boundaries between the three parts from blurring together.
    hasher = hashlib.sha256()
    for part in (html, config.ignored_blocks, config.ignored_inline_blocks):
        hasher.update(part.encode("utf-8"))
        hasher.update(b"\x00")
    key = hasher.hexdigest()

    # An empty list is a valid cached value, so compare against None.
    cached = inside_ignored_block_cache_.get(key)
    if cached is not None:
        return cached

    matches = [
        (x.start(0), x.end())
        for x in itertools.chain(
            re.finditer(
                config.ignored_blocks,
                html,
                flags=re.DOTALL | re.IGNORECASE | re.VERBOSE | re.MULTILINE,
            ),
            re.finditer(
                config.ignored_inline_blocks,
                html,
                flags=re.IGNORECASE | re.VERBOSE,
            ),
        )
    ]
    inside_ignored_block_cache_[key] = matches
    return matches


def is_ignored_block_opening(config: Config, item: str) -> bool:
"""Find ignored group opening.
# (diff view: unchanged lines elided here -- hunk @@ -272,37 +323,27 @@)
def inside_ignored_block(
    config: Config, html: str, match: re.Match[str]
) -> bool:
    """Do not add whitespace if the tag is in a non indent block.

    Return True when ``match`` lies within one of the ignored-block spans
    of ``html``: the span must start at or before the match start and end
    at or after the match end.

    Args:
        config: Passed through to ``inside_ignored_block_cache``.
        html: The text that ``match`` was found in.
        match: The regex match whose position is being tested.
    """
    # Read the match bounds once instead of once per candidate span.
    match_start = match.start()
    match_end = match.end(0)
    return any(
        span_start <= match_start and match_end <= span_end
        for span_start, span_end in inside_ignored_block_cache(config, html)
    )


def child_of_unformatted_block(
    config: Config, html: str, match: re.Match[str]
) -> bool:
    """Do not add whitespace if the tag is in a non indent block.

    Return True when ``match`` lies inside one of the unformatted-block
    spans of ``html``: the span must start strictly before the match start
    and end at or after the match end.

    Args:
        config: Passed through to ``child_of_unformatted_block_cache``.
        html: The text that ``match`` was found in.
        match: The regex match whose position is being tested.
    """
    # Read the match bounds once instead of once per candidate span.
    match_start = match.start()
    match_end = match.end(0)
    spans = child_of_unformatted_block_cache(config, html)
    # NOTE: kept as a plain loop on purpose; the any()-with-generator form
    # was reported to be noticeably slower for this check.  An empty span
    # list simply falls through to the final return.
    for span_start, span_end in spans:
        if span_start < match_start and match_end <= span_end:
            return True
    return False


def child_of_ignored_block(
Expand Down

0 comments on commit c48ad92

Please sign in to comment.