diff --git a/.flake8 b/.flake8
index 7e5efc6..cffc2c6 100644
--- a/.flake8
+++ b/.flake8
@@ -1,5 +1,6 @@
 [flake8]
-ignore = E203,W503
+max-line-length = 88
+ignore = E203,W503,E701
 per-file-ignores =
     docs/conftest.py:E501
     parsel/csstranslator.py:E501
diff --git a/.gitignore b/.gitignore
index 9a1e3c0..20dec10 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,6 +29,7 @@ pip-log.txt
 nosetests.xml
 htmlcov
 .pytest_cache
+coverage.xml

 # Translations
 *.mo
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index db43480..42a15fc 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -4,14 +4,14 @@ repos:
   hooks:
   - id: bandit
     args: [-r, -c, .bandit.yml]
-- repo: https://github.com/PyCQA/flake8
-  rev: 7.0.0
-  hooks:
-  - id: flake8
 - repo: https://github.com/psf/black.git
   rev: 24.2.0
   hooks:
   - id: black
+- repo: https://github.com/PyCQA/flake8
+  rev: 7.0.0
+  hooks:
+  - id: flake8
 - repo: https://github.com/pycqa/isort
   rev: 5.13.2
   hooks:
diff --git a/docs/usage.rst b/docs/usage.rst
index e3eb91f..0b97d8b 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -140,6 +140,19 @@ pseudo-elements::
     >>> selector.css('title::text').get()
     'Example website'

+To extract all text of one or more elements and all their child elements,
+formatted as plain text taking into account HTML tags (e.g. ``<br>`` is
+translated as a line break), set ``text=True`` in your call to
+:meth:`~parsel.selector.Selector.get` or
+:meth:`~parsel.selector.Selector.getall` instead of including
+``::text`` (CSS) or ``/text()`` (XPath) in your query::
+
+    >>> selector.css('#images').get(text=True)
+    'Name: My image 1\nName: My image 2\nName: My image 3\nName: My image 4\nName: My image 5'
+
+See :meth:`Selector.get` for additional parameters that you can use to change
+how the extracted plain text is formatted.
+
 As you can see, ``.xpath()`` and ``.css()`` methods return a
 :class:`~parsel.selector.SelectorList` instance, which is a list of new
 selectors. This API can be used for quickly selecting nested data::
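For a quick side-by-side (an illustrative snippet mimicking the example document above, not part of the patch; it assumes a parsel build with this change applied), ``::text`` keeps one string per matched text node, while ``text=True`` produces a single plain-text rendering of the matched element::

    from parsel import Selector

    selector = Selector("<div id='images'><a href='image1.html'>Name: My image 1 <br/></a></div>")
    # One string per matched text node; tags are dropped, whitespace kept as-is.
    print(selector.css('#images a::text').getall())
    # A single plain-text string for the whole element; <br> becomes a line break
    # and the layout is guessed by html-text.
    print(selector.css('#images').get(text=True))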
`` is +translated as a line break), set ``text=True`` in your call to +:meth:`~parsel.selector.Selector.get` or +:meth:`~parsel.selector.Selector.getall` instead of including +``::text`` (CSS) or ``/text()`` (XPath) in your query:: + + >>> selector.css('#images').get(text=True) + 'Name: My image 1\nName: My image 2\nName: My image 3\nName: My image 4\nName: My image 5' + +See :meth:`Selector.get` for additional parameters that you can use to change +how the extracted plain text is formatted. + As you can see, ``.xpath()`` and ``.css()`` methods return a :class:`~parsel.selector.SelectorList` instance, which is a list of new selectors. This API can be used for quickly selecting nested data:: diff --git a/parsel/selector.py b/parsel/selector.py index 2027599..104db9d 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -22,8 +22,10 @@ ) from warnings import warn +import html_text # type: ignore[import-untyped] import jmespath from lxml import etree, html +from lxml.html.clean import Cleaner # pylint: disable=no-name-in-module from packaging.version import Version from .csstranslator import GenericTranslator, HTMLTranslator @@ -245,30 +247,68 @@ def re_first( return typing.cast(str, el) return default - def getall(self) -> List[str]: + def getall( + self, + *, + text: bool = False, + cleaner: Union[str, None, Cleaner] = "auto", + guess_punct_space: bool = True, + guess_layout: bool = True, + ) -> List[str]: """ Call the ``.get()`` method for each element is this list and return their results flattened, as a list of strings. - """ - return [x.get() for x in self] - - extract = getall - @typing.overload - def get(self, default: None = None) -> Optional[str]: - pass + ``text``, ``cleaner``, ``guess_punct_space`` and ``guess_layout`` + options are passed to :meth:`~.Selector.get`; see + :meth:`~.Selector.get` for more details. + + .. note:: + + When either text extraction or cleaning is requested, they're + performed on each element in the list individually. So, if you match + nested elements (i.e. both parent and descendant), cleaning or + text extraction could be run multiple times on the same part + of the tree. For example, ``selector.xpath("*").getall(text=True)`` + has O(N^2) complexity regarding the number of nodes in the tree, + not O(N). + """ + return [ + x.get( + text=text, + cleaner=cleaner, + guess_punct_space=guess_punct_space, + guess_layout=guess_layout, + ) + for x in self + ] - @typing.overload - def get(self, default: str) -> str: - pass + extract = getall - def get(self, default: Optional[str] = None) -> Any: + def get( + self, + default: Optional[str] = None, + *, + text: bool = False, + cleaner: Union[str, None, Cleaner] = "auto", + guess_punct_space: bool = True, + guess_layout: bool = True, + ) -> Any: """ Return the result of ``.get()`` for the first element in this list. - If the list is empty, return the default value. + If the list is empty, return the ``default`` value. + + ``text``, ``cleaner``, ``guess_punct_space`` and ``guess_layout`` + options are passed to :meth:`Selector.get`; see :meth:`~.Selector.get` + for more details. 
""" for x in self: - return x.get() + return x.get( + text=text, + cleaner=cleaner, + guess_punct_space=guess_punct_space, + guess_layout=guess_layout, + ) return default extract_first = get @@ -439,6 +479,8 @@ class Selector: } _lxml_smart_strings = False selectorlist_cls = SelectorList["Selector"] + _text_cleaner = html_text.cleaner + _html_cleaner = Cleaner() def __init__( self, @@ -715,40 +757,110 @@ def re_first( default, ) - def get(self) -> Any: + def get( + self, + *, + text: bool = False, + cleaner: Union[str, None, Cleaner] = "auto", + guess_punct_space: bool = True, + guess_layout: bool = True, + ) -> Any: """ Serialize and return the matched nodes. For HTML and XML, the result is always a string, and percent-encoded content is unquoted. + + When ``text`` is False (default), HTML or XML is extracted. Pass + ``text=True`` to extract text content (html-text library is used). + Text extraction algorithm assumes that the document is an HTML + document, and uses HTML-specific rules. + + ``cleaner`` argument allows cleaning HTML before extracting the + content. Allowed values: + + * "auto" (default) - don't clean when text=False, clean with + options tuned for text extraction when text=True; + * "text" - clean with options tuned for text extraction: elements + like ``
+def test_cleaned() -> None:
+    div_html = "<div><script>SCRIPT</script><style>STYLE</style><p>hello</p></div>"
+    sel = Selector(div_html)
+    assert sel.css("script").getall() == ["<script>SCRIPT</script>"]
+    assert sel.cleaned().css("script").getall() == []
+
+    assert len(sel.css("script")) == 1
+    assert len(sel.css("style")) == 1
+    assert len(sel.css("p")) == 1
+
+    assert len(sel.cleaned().css("script")) == 0
+    assert len(sel.cleaned().css("style")) == 1
+    assert len(sel.cleaned().css("p")) == 1
+
+
+def test_cleaned_options() -> None:
+    div_html = "<div><script>SCRIPT</script><style>STYLE</style><p>hello</p></div>"
+    sel = Selector(div_html)
+    assert len(sel.css("script")) == 1
+    assert len(sel.css("style")) == 1
+    assert len(sel.css("p")) == 1
+
+    assert len(sel.cleaned().css("script")) == 0
+    assert len(sel.cleaned().css("style")) == 1
+    assert len(sel.cleaned().css("p")) == 1
+
+    assert len(sel.cleaned("html").css("script")) == 0
+    assert len(sel.cleaned("html").css("style")) == 1
+    assert len(sel.cleaned("html").css("p")) == 1
+
+    assert len(sel.cleaned("text").css("script")) == 0
+    assert len(sel.cleaned("text").css("style")) == 0
+    assert len(sel.cleaned("text").css("p")) == 1
+
+    cleaner = Cleaner(kill_tags=["p"], scripts=False, style=False)
+    assert len(sel.cleaned(cleaner).css("script")) == 1
+    assert len(sel.cleaned(cleaner).css("style")) == 1
+    assert len(sel.cleaned(cleaner).css("p")) == 0
+
+
+def test_get_cleaner() -> None:
+    div_html = "<div><script>SCRIPT</script><style>STYLE</style><p>P</p></div>"
+    sel = Selector(div_html)
+    cleaner = Cleaner(kill_tags=["p"], scripts=False, style=False)
+
+    assert sel.get(text=True) == "P"
+    assert sel.get(text=True, cleaner=None) == "SCRIPT STYLE\n\nP"
+    assert sel.get(text=True, cleaner="html") == "STYLE\n\nP"
+    assert sel.get(text=True, cleaner="text") == "P"
+    assert sel.get(text=True, cleaner=cleaner) == "SCRIPT STYLE"
+
+    div = sel.css("div")
+    assert div.get() == div_html
+    assert div.get(cleaner=None) == div_html
+    assert div.get(cleaner="html") == "<div><style>STYLE</style><p>P</p></div>"
+    assert div.get(cleaner="text") == "<div><p>P</p></div>"
+    assert (
+        div.get(cleaner=cleaner)
+        == "<div><script>SCRIPT</script><style>STYLE</style></div>"
+    )
+
+
+def test_guess_punct_space() -> None:
+    sel = Selector('<div>hello<b>"Folks"</b></div>')
+    assert sel.get(text=True, guess_punct_space=False) == 'hello "Folks"'
+    assert sel.get(text=True, guess_punct_space=True) == 'hello"Folks"'
+
+    assert sel.getall(text=True, guess_punct_space=False) == ['hello "Folks"']
+    assert sel.getall(text=True, guess_punct_space=True) == ['hello"Folks"']
+
+
+def test_guess_layout() -> None:
+    sel = Selector("<ul><li>option1</li><li>option2</li></ul>")
+    assert sel.get(text=True, guess_layout=False) == "option1 option2"
+    assert sel.get(text=True, guess_layout=True) == "option1\noption2"
+
+    assert sel.getall(text=True, guess_layout=False) == ["option1 option2"]
+    assert sel.getall(text=True, guess_layout=True) == ["option1\noption2"]