Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the issue where HTML elements cannot be dropped from the text selector returned by Selector.jmespath() #298

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 31 additions & 19 deletions parsel/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,7 @@ class Selector:
"_huge_tree",
"root",
"_text",
"_text_lazy_html_root",
"body",
"__weakref__",
]
Expand Down Expand Up @@ -507,6 +508,7 @@ def __init__(
self._expr = _expr
self._huge_tree = huge_tree
self._text = text
self._text_lazy_html_root: Optional[etree._Element] = None

def __getstate__(self) -> Any:
raise TypeError("can't pickle Selector objects")
Expand Down Expand Up @@ -606,7 +608,12 @@ def xpath(
)
else:
try:
xpathev = self._get_root(self._text or "", type="html").xpath
if self._text_lazy_html_root is None:
self._text_lazy_html_root = self._get_root(
self.root or "", type="html"
)
if self._text_lazy_html_root is not None:
xpathev = self._text_lazy_html_root.xpath
except AttributeError:
return typing.cast(
SelectorList[_SelectorType], self.selectorlist_cls([])
Expand Down Expand Up @@ -722,25 +729,30 @@ def get(self) -> Any:
For HTML and XML, the result is always a string, and percent-encoded
content is unquoted.
"""
if self.type in ("text", "json"):
if self.type in ("json", "text"):
if self.type == "text" and self._text_lazy_html_root is not None:
return etree.tostring(
self._text_lazy_html_root, encoding="unicode", with_tail=False
)
Comment on lines +734 to +736
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A problem with this approach is that we are assuming HTML, when it could be XML.

return self.root
try:
return typing.cast(
str,
etree.tostring(
self.root,
method=_ctgroup[self.type]["_tostring_method"],
encoding="unicode",
with_tail=False,
),
)
except (AttributeError, TypeError):
if self.root is True:
return "1"
elif self.root is False:
return "0"
else:
return str(self.root)
else:
try:
return typing.cast(
str,
etree.tostring(
self.root,
method=_ctgroup[self.type]["_tostring_method"],
encoding="unicode",
with_tail=False,
),
)
except (AttributeError, TypeError):
if self.root is True:
return "1"
elif self.root is False:
return "0"
else:
return str(self.root)

extract = get

Expand Down
29 changes: 29 additions & 0 deletions tests/test_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -1007,6 +1007,35 @@ def test_remove_selector(self) -> None:
self.assertIsSelectorList(sel.css("li"))
self.assertEqual(sel.css("li::text").getall(), ["2", "3"])

def test_remove_selector_from_html_in_text(self) -> None:
    """Dropping nodes selected from a ``type="text"`` selector updates ``get()``.

    The selector is built from an HTML string but explicitly typed as
    ``"text"``; after dropping the ``<style>`` element, ``get()`` is expected
    to return the markup without it, and the selector type must stay
    ``"text"``.
    """
    markup = (
        "<html><body><style>p{color:red;}</style><p>hello world</p></body></html>"
    )
    expected = "<html><body><p>hello world</p></body></html>"

    sel = self.sscls(text=markup, type="text")
    self.assertEqual(sel.type, "text")

    style_nodes = sel.css("style")
    style_nodes.drop()

    self.assertEqual(sel.get(), expected)
    # The type of the parent selector should not change
    self.assertEqual(sel.type, "text")

def test_remove_selector_from_html_in_json(self) -> None:
    """Dropping nodes from a text selector obtained via ``jmespath()``.

    The ``body`` value of the JSON document holds HTML; the selector returned
    for it is asserted to be of type ``"text"``.  After dropping the
    ``<style>`` element, ``get()`` is expected to return the markup without
    it, and the selector type must stay ``"text"``.
    """
    json_str = """{
"title": "hello world",
"body": "<html><body><style>p{color:red;}</style><p>hello world</p></body></html>"
}
"""
    expected = "<html><body><p>hello world</p></body></html>"

    document = self.sscls(text=json_str)
    html_sel = document.jmespath("body")[0]
    self.assertEqual(html_sel.type, "text")

    style_nodes = html_sel.css("style")
    style_nodes.drop()

    self.assertEqual(html_sel.get(), expected)
    # The type of the parent selector should not change
    self.assertEqual(html_sel.type, "text")

def test_remove_pseudo_element_selector_list(self) -> None:
sel = self.sscls(
text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>"
Expand Down