Skip to content

Commit

Permalink
Remove beautiful soup dependency (#1453)
Browse files Browse the repository at this point in the history
* WIP replace beautiful soup with custom HTML util

* Format code with black

* Remove unused imports

* Remove beautifulsoup, bleach and htmlmin

* Bump version to 0.29

* Remove BS4 import

* Fix function call

* Tweak HTML parser

* Tweak parser
  • Loading branch information
rafalp authored Feb 10, 2023
1 parent 80b5398 commit 161da7d
Show file tree
Hide file tree
Showing 33 changed files with 711 additions and 367 deletions.
2 changes: 1 addition & 1 deletion misago/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .plugins.pluginlist import load_plugin_list_if_exists


__version__ = "0.28.2"
__version__ = "0.29.0"
__released__ = True
5 changes: 0 additions & 5 deletions misago/conf/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,6 @@
MISAGO_MARKUP_EXTENSIONS = []


# Bleach callbacks for linkifying paragraphs

MISAGO_BLEACH_CALLBACKS = []


# Custom post validators

MISAGO_POST_VALIDATORS = []
Expand Down
110 changes: 110 additions & 0 deletions misago/markup/htmlparser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import html
from dataclasses import dataclass

import html5lib

# Tag names that ElementNode.__str__ renders in self-closing form
# ("<tag ... />"); children of such elements are not included in the output.
SINGLETON_TAGS = (
    "area",
    "base",
    "br",
    "col",
    "command",
    "embed",
    "hr",
    "img",
    "input",
    "keygen",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
)


class Node:
    """Base class for nodes of the parsed HTML tree.

    Subclasses are expected to override ``__str__`` so that it returns the
    node's HTML representation.
    """

    def __str__(self):
        message = "Subclasses of 'Node' need to implement __str__"
        raise NotImplementedError(message)


@dataclass
class RootNode(Node):
    """Container for the top-level nodes of a parsed HTML fragment."""

    # Plain class attribute (no annotation, so not a dataclass field): lets
    # code inspect ``node.tag`` on root and element nodes alike.
    tag = None
    children: list

    def __str__(self):
        # The root has no markup of its own; it is just its children in order.
        return "".join(map(str, self.children))


@dataclass
class ElementNode(Node):
    """HTML element with a tag name, attributes and child nodes."""

    tag: str
    attrs: dict
    children: list

    def __str__(self):
        rendered_attrs = " ".join(self.attrs_str())
        # Only put a space after the tag name when there are attributes.
        separator = " " if self.attrs else ""
        opening = f"<{self.tag}{separator}{rendered_attrs}"

        if self.tag in SINGLETON_TAGS:
            # Singleton tags are self-closed; their children are not rendered.
            return f"{opening} />"

        inner_html = "".join(map(str, self.children))
        return f"{opening}>{inner_html}</{self.tag}>"

    def attrs_str(self):
        """Yield each attribute as an HTML-escaped ``name`` or ``name="value"``."""
        for name, value in self.attrs.items():
            escaped_name = html.escape(str(name))
            if value is not True and value:
                yield f'{escaped_name}="{html.escape(str(value))}"'
            else:
                # ``True`` and falsy values are rendered as a bare attribute.
                yield escaped_name


@dataclass
class TextNode(Node):
    """Leaf node holding unescaped text; escaping happens on rendering."""

    text: str

    def __str__(self):
        escaped_text = html.escape(self.text)
        return escaped_text


def parse_html_string(string: str) -> RootNode:
    """Parse an HTML string into a tree of nodes.

    The string is parsed with ``html5lib`` (HTML namespaces disabled) and the
    contents of the resulting ``body`` element are converted into a
    ``RootNode`` holding ``TextNode`` and ``ElementNode`` children.
    """
    document = html5lib.parse(string, namespaceHTMLElements=False)
    body = document.find("body")

    # Text placed directly in body before its first element is kept as the
    # first child of the root.
    leading_nodes = [TextNode(text=body.text)] if body.text else []
    root_node = RootNode(children=leading_nodes)

    for body_element in body:
        add_child_node(root_node, body_element)

    return root_node


def add_child_node(parent, element):
    """Convert a parsed element (and its subtree) and append it to ``parent``.

    ``element`` is an ElementTree-style element: ``text`` is the text before
    its first child and ``tail`` is the text that follows the element inside
    its parent. The element becomes an ``ElementNode`` appended to
    ``parent.children``, followed by a ``TextNode`` for its tail, if any.

    Elements whose ``tag`` is not a string (ElementTree represents comments
    and processing instructions this way) are not rendered as elements; only
    their tail text is kept so the surrounding text is not lost.
    """
    if not isinstance(element.tag, str):
        # A comment's tag is a factory function, not a name: building an
        # ElementNode from it would emit "<<function Comment ...>>" markup.
        if element.tail:
            parent.children.append(TextNode(text=element.tail))
        return

    node = ElementNode(
        tag=element.tag,
        attrs=element.attrib,
        children=[],
    )

    if element.text:
        node.children.append(TextNode(text=element.text))

    parent.children.append(node)

    # Tail text belongs to the parent and must directly follow the element.
    if element.tail:
        parent.children.append(TextNode(text=element.tail))

    for child in element:
        add_child_node(node, child)


def print_html_string(root_node: RootNode) -> str:
    """Return the HTML string for the node tree rooted at ``root_node``."""
    html_string = str(root_node)
    return html_string
199 changes: 199 additions & 0 deletions misago/markup/links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
import re
from typing import Union

from django.http import Http404
from django.urls import resolve

from .htmlparser import ElementNode, RootNode, TextNode

# Namespaced URL names of the attachment views. clean_attachment_link() only
# adds or removes the "?shva=1" suffix on links that resolve to one of these.
MISAGO_ATTACHMENT_VIEWS = ("misago:attachment", "misago:attachment-thumbnail")

# Pattern used to find URL-like substrings in plain text. It is compiled
# without flags, so the TLD part only matches lowercase letters.
URL_RE = re.compile(
    r"(https?://)?"  # optional http:// or https:// scheme
    r"(www\.)?"  # optional "www." prefix
    r"(\w+((-|_)\w+)?\.)?"  # optional subdomain label
    r"\w+((_|-|\w)+)?(\.[a-z][a-z]+)"  # domain name and TLD of 2+ letters
    r"(:[1-9][0-9]+)?"  # optional port: 2+ digits, no leading zero
    r"([^\s<>\[\]\(\);:]+)?"  # optional rest, up to whitespace or <>[]();:
)


def linkify_texts(node: Union[RootNode, ElementNode]):
    """Replace URLs found in text nodes below ``node`` with ``<a>`` elements.

    The tree is modified in place. Contents of ``pre``, ``code`` and ``a``
    elements are left untouched.
    """
    if node.tag in ("pre", "code", "a"):
        return

    rebuilt_children = []
    for child in node.children:
        if not isinstance(child, TextNode):
            # Keep the element where it is and process its own subtree.
            rebuilt_children.append(child)
            linkify_texts(child)
            continue

        if URL_RE.search(child.text):
            # One text node may expand into several text and link nodes.
            rebuilt_children.extend(replace_links_in_text(child.text))
        else:
            rebuilt_children.append(child)

    node.children = rebuilt_children


def replace_links_in_text(text: str) -> list:
    """Split ``text`` into a list of text nodes and link elements.

    Every substring matched by ``URL_RE`` becomes an ``a`` element whose
    ``href`` is the matched text and whose label is the same text without the
    protocol. Text between and around matches is kept as ``TextNode`` items.
    """
    nodes = []
    cursor = 0  # index of the first character not yet turned into a node

    for match in URL_RE.finditer(text):
        start, end = match.span()
        url = match.group(0)

        # Plain text between the previous match (or the beginning) and this one
        if start > cursor:
            nodes.append(TextNode(text=text[cursor:start]))

        link_label = TextNode(text=strip_link_protocol(url))
        nodes.append(ElementNode(tag="a", attrs={"href": url}, children=[link_label]))

        cursor = end

    remainder = text[cursor:]
    if remainder:
        nodes.append(TextNode(text=remainder))

    return nodes


def clean_links(
    request,
    result,
    node: Union[RootNode, ElementNode, TextNode],
    force_shva=False,
):
    """Walk the tree below ``node`` and clean every ``a`` and ``img`` element.

    ``request``, ``result`` and ``force_shva`` are passed through unchanged to
    ``clean_link_node`` and ``clean_image_node``. Text nodes are ignored.
    """
    if isinstance(node, TextNode):
        return

    for child in node.children:
        if not isinstance(child, ElementNode):
            continue

        if child.tag == "img":
            # Images are cleaned but never descended into.
            clean_image_node(request, result, child, force_shva)
            continue

        if child.tag == "a":
            clean_link_node(request, result, child, force_shva)

        # Links and all other elements may contain further links or images.
        clean_links(request, result, child, force_shva)


def clean_link_node(
    request,
    result: dict,
    node: ElementNode,
    force_shva: bool,
):
    """Normalize an ``a`` element in place and record its target in ``result``.

    Internal links (as decided by ``is_internal_link`` against
    ``request.get_host()``) are reduced to a host-less path, appended to
    ``result["internal_links"]`` and passed through ``clean_attachment_link``.
    Other links are appended to ``result["outgoing_links"]`` without their
    protocol, get a protocol prefix if they lack one, and are marked with
    ``rel="external nofollow noopener"``. Every link gets ``target="_blank"``.
    """
    host = request.get_host()
    href = node.attrs.get("href") or "/"

    if is_internal_link(href, host):
        href = clean_internal_link(href, host)
        result["internal_links"].append(href)
        href = clean_attachment_link(href, force_shva)
    else:
        result["outgoing_links"].append(strip_link_protocol(href))
        href = assert_link_prefix(href)
        node.attrs["rel"] = "external nofollow noopener"

    node.attrs["target"] = "_blank"
    node.attrs["href"] = href

    if not node.children:
        # Children must be nodes: a bare string would be written to the
        # output without HTML escaping, so wrap the label in a TextNode.
        node.children.append(TextNode(text=strip_link_protocol(href)))
    elif len(node.children) == 1 and isinstance(node.children[0], TextNode):
        # If the link's only label is itself a URL, display it without protocol.
        label = node.children[0]
        if URL_RE.match(label.text):
            label.text = strip_link_protocol(label.text)


def clean_image_node(
    request,
    result: dict,
    node: ElementNode,
    force_shva: bool,
):
    """Normalize an ``img`` element in place and record it in ``result``.

    The ``alt`` text, when present, is stripped of a leading protocol.
    Internal sources (as decided by ``is_internal_link`` against
    ``request.get_host()``) are reduced to a host-less path and passed through
    ``clean_attachment_link``; other sources get a protocol prefix if they
    lack one. The source is appended to ``result["images"]`` either way.
    """
    host = request.get_host()
    src = node.attrs.get("src") or "/"

    # Images without an "alt" attribute must not raise KeyError here.
    alt = node.attrs.get("alt")
    if alt:
        node.attrs["alt"] = strip_link_protocol(alt)

    if is_internal_link(src, host):
        src = clean_internal_link(src, host)
        result["images"].append(src)
        src = clean_attachment_link(src, force_shva)
    else:
        result["images"].append(strip_link_protocol(src))
        src = assert_link_prefix(src)

    node.attrs["src"] = src


def is_internal_link(link, host):
    """Return ``True`` if ``link`` points at ``host`` or is a site-relative path.

    Paths starting with a single ``/`` are always internal. Other links are
    compared with ``host`` case-insensitively, ignoring the protocol and one
    leading ``www.`` on either side.
    """
    if link.startswith("/") and not link.startswith("//"):
        return True

    # NOTE: str.lstrip("www.") removes any leading run of the characters
    # "w" and ".", not the "www." prefix, so the prefix is removed explicitly.
    link = strip_link_protocol(link).lower()
    if link.startswith("www."):
        link = link[4:]

    host = host.lower()
    if host.startswith("www."):
        host = host[4:]

    if not link.startswith(host):
        return False

    # The host must be followed by the end of the link or a URL delimiter;
    # otherwise "example.com.evil.org" would pass for host "example.com".
    remainder = link[len(host) :]
    return not remainder or remainder[0] in "/?#:"


def strip_link_protocol(link):
    """Return ``link`` without a leading ``https:``/``http:`` and ``//``.

    Scheme matching is case-insensitive; the rest of the link is unchanged.
    """
    # Checked in this order, each against the result of the previous step.
    for scheme in ("https:", "http:"):
        if link.lower().startswith(scheme):
            link = link[len(scheme) :]

    return link[2:] if link.startswith("//") else link


def assert_link_prefix(link):
    """Return ``link`` with an explicit protocol, defaulting to ``http``.

    Links already starting with ``http:`` or ``https:`` (in any letter case)
    are returned unchanged; protocol-relative links (``//host``) and bare
    links are prefixed with ``http:`` and ``http://`` respectively.
    """
    if link.lower().startswith(("https:", "http:")):
        return link

    template = "http:%s" if link.startswith("//") else "http://%s"
    return template % link


def clean_internal_link(link, host):
    """Reduce an internal ``link`` to the part that follows ``host``.

    The protocol and a leading ``www.`` are dropped from the link, a leading
    ``www.`` is dropped from the host, and the host is then removed from the
    beginning of the link. Returns ``"/"`` when nothing is left.
    """
    link = strip_link_protocol(link)

    if link.lower().startswith("www."):
        link = link[4:]
    if host.lower().startswith("www."):
        host = host[4:]

    # Both sides are lowercased: comparing a lowercased link with a host that
    # still has capital letters would never match.
    if link.lower().startswith(host.lower()):
        link = link[len(host) :]

    return link or "/"


def clean_attachment_link(link, force_shva=False):
    """Add or remove the ``?shva=1`` suffix on links to attachment views.

    ``link`` is resolved with Django's ``resolve``. Links that fail to
    resolve, have no URL namespace, or do not point at one of
    ``MISAGO_ATTACHMENT_VIEWS`` are returned unchanged. For attachment links
    the suffix is appended when ``force_shva`` is true, and removed otherwise.
    """
    try:
        match = resolve(link)
        if not match.namespaces:
            return link
        view_name = ":".join(match.namespaces + [match.url_name])
    except (Http404, ValueError):
        return link

    if view_name not in MISAGO_ATTACHMENT_VIEWS:
        return link

    if force_shva:
        return "%s?shva=1" % link
    if link.endswith("?shva=1"):
        return link[:-7]
    return link
Loading

0 comments on commit 161da7d

Please sign in to comment.