From 907f495d1439aaeaf036d7af21ffdf6191025258 Mon Sep 17 00:00:00 2001 From: dream2333 Date: Mon, 10 Jun 2024 02:47:31 +0800 Subject: [PATCH] fix: drop html elements from a text selector correctly --- parsel/selector.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/parsel/selector.py b/parsel/selector.py index 2027599..7e122c7 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -423,6 +423,7 @@ class Selector: "_huge_tree", "root", "_text", + "_text_lazy_html_root", "body", "__weakref__", ] @@ -507,6 +508,10 @@ def __init__( self._expr = _expr self._huge_tree = huge_tree self._text = text + # self._text_lazy_html_root is used to store a temporary root node when + # converting text to html for xpath queries. This is needed because + # the text may not be valid html and we need to convert it to html + self._text_lazy_html_root =None def __getstate__(self) -> Any: raise TypeError("can't pickle Selector objects") @@ -606,7 +611,9 @@ def xpath( ) else: try: - xpathev = self._get_root(self._text or "", type="html").xpath + if self._text_lazy_html_root is None: + self._text_lazy_html_root = self._get_root(self._text or "", type="html") + xpathev = self._text_lazy_html_root.xpath except AttributeError: return typing.cast( SelectorList[_SelectorType], self.selectorlist_cls([]) @@ -625,7 +632,7 @@ def xpath( except etree.XPathError as exc: raise ValueError(f"XPath error: {exc} in {query}") - if type(result) is not list: + if not isinstance(result, list): result = [result] result = [