[WIP] text extraction in Selector and SelectorList #127

Status: Open

Wants to merge 27 commits into the base branch (master).

Changes shown below are from 2 of the 27 commits.

Commits (27)
3c471b8
[tmp] Selector.text and SelectorList.text methods
kmike Nov 2, 2018
8dea4ce
[wip] move converting to text to .get method, add getall support, .cl…
kmike Nov 17, 2018
da7bb80
bump html-text required version number
kmike May 30, 2019
859044c
Merge branch 'master' into selector-text
kmike Feb 9, 2022
7bae279
selector text unit tests
shahidkarimi Mar 11, 2022
e4733ee
code formtting
shahidkarimi Mar 11, 2022
857ca72
code formatting improvements
shahidkarimi Mar 11, 2022
7941093
removed unwated tests
shahidkarimi Apr 4, 2022
102f2e3
Merge pull request #236 from shahidkarimi/selector-text-tests
kmike May 20, 2022
1f917bb
Merge branch 'master' into selector-text
kmike Jun 28, 2022
d87982d
apply black
kmike Jun 28, 2022
14dadbd
fixed failing test
kmike Jun 28, 2022
af0d28a
Make new arguments keyword-only
kmike Jun 28, 2022
1737f83
documentation for selector .get() text
shahidkarimi Aug 12, 2022
17ae5e0
suggested changes in the PR fixed
shahidkarimi Aug 26, 2022
f8f1c66
Merge branch 'master' into selector-text
kmike Nov 10, 2022
c6580cc
Update docs/usage.rst
kmike Nov 13, 2022
419af4b
Merge pull request #248 from shahidkarimi/selector-text-doc
kmike Nov 13, 2022
b8d0352
Merge branch 'master' into selector-text
kmike Apr 24, 2024
ee3e734
fixed typing
kmike May 1, 2024
69456c1
fixed a refactoring issue
kmike May 1, 2024
a492278
document O(N^2) gotcha
kmike May 8, 2024
8b4ae25
make flake8 config compatible with black
kmike May 8, 2024
ccaaa5b
refactor text and cleaning tests; add more of them
kmike May 8, 2024
4eea4fa
fixed default .cleaned cleaner value
kmike May 8, 2024
27c9919
fixed black formatting went wrong
kmike May 8, 2024
852bbef
fix docs references
kmike May 8, 2024
parsel/selector.py (122 additions, 16 deletions)

@@ -6,6 +6,8 @@

 import six
 from lxml import etree, html
+from lxml.html.clean import Cleaner
+import html_text

 from .utils import flatten, iflatten, extract_regex
 from .csstranslator import HTMLTranslator, GenericTranslator
@@ -121,21 +123,42 @@ def re_first(self, regex, default=None, replace_entities=True):
         else:
             return default

-    def getall(self):
+    def getall(self, text=False, cleaner='auto',
+               guess_punct_space=True, guess_layout=True):
         """
         Call the ``.get()`` method for each element is this list and return
         their results flattened, as a list of unicode strings.
-        """
-        return [x.get() for x in self]
+
+        ``text``, ``cleaner``, ``guess_punct_space`` and ``guess_layout``
+        options are passed to :meth:`~.Selector.get`; see
+        :meth:`~.Selector.get` for more details.
+        """
+        return [
+            x.get(
+                text=text,
+                cleaner=cleaner,
+                guess_punct_space=guess_punct_space,
+                guess_layout=guess_layout
+            )
+            for x in self
+        ]
     extract = getall

-    def get(self, default=None):
+    def get(self, default=None, text=False, cleaner='auto',
+            guess_punct_space=True, guess_layout=True):
         """
         Return the result of ``.get()`` for the first element in this list.
-        If the list is empty, return the default value.
+        If the list is empty, return the ``default`` value.
+
+        ``text``, ``cleaner``, ``guess_punct_space`` and ``guess_layout``
+        options are passed to :meth:`Selector.get`; see :meth:`~.Selector.get`
+        for more details.
         """
         for x in self:
-            return x.get()
+            return x.get(text=text,
+                         cleaner=cleaner,
+                         guess_punct_space=guess_punct_space,
+                         guess_layout=guess_layout)
         else:
             return default
     extract_first = get
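A short usage sketch of the SelectorList methods above. It is illustrative only: it assumes this branch is installed, and the text output shown in comments depends on html-text's layout rules.

from parsel import Selector

sel = Selector(text="<p>Hello, <b>world</b>!</p><p>Bye.</p>")

# Default behaviour is unchanged: serialized HTML is returned.
sel.css("p").getall()
# ['<p>Hello, <b>world</b>!</p>', '<p>Bye.</p>']

# text=True converts each selected element to text via html-text.
sel.css("p").getall(text=True)
# roughly ['Hello, world!', 'Bye.']

# .get() applies the same options to the first element only.
sel.css("p").get(text=True)
# roughly 'Hello, world!'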
@@ -162,7 +185,7 @@ class Selector(object):
     If ``type`` is ``None``, the selector defaults to ``"html"``.
     """

-    __slots__ = ['text', 'namespaces', 'type', '_expr', 'root',
+    __slots__ = ['namespaces', 'type', '_expr', 'root',
                  '__weakref__', '_parser', '_csstranslator', '_tostring_method']

     _default_type = None
Expand All @@ -179,6 +202,8 @@ class Selector(object):
}
_lxml_smart_strings = False
selectorlist_cls = SelectorList
_text_cleaner = html_text.cleaner
_html_cleaner = Cleaner()

def __init__(self, text=None, type=None, namespaces=None, root=None,
base_url=None, _expr=None):
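Because the cleaners are class attributes, cleaning behaviour can in principle be swapped per Selector subclass. A minimal sketch of that idea (hypothetical subclass, not part of this diff, and assuming the private attribute names stay as above):

from lxml.html.clean import Cleaner
import parsel

class MySelector(parsel.Selector):
    # Hypothetical: override the cleaner used by cleaner='html',
    # e.g. to also strip <script>, <style> and comments.
    _html_cleaner = Cleaner(scripts=True, style=True, comments=True)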
@@ -292,30 +317,87 @@ def re_first(self, regex, default=None, replace_entities=True):
         """
         return next(iflatten(self.re(regex, replace_entities=replace_entities)), default)

-    def get(self):
+    def get(self, text=False, cleaner='auto',
+            guess_punct_space=True, guess_layout=True):
         """
         Serialize and return the matched nodes in a single unicode string.
         Percent encoded content is unquoted.
-        """
+
+        When ``text`` is False (default), HTML or XML is extracted. Pass
+        ``text=True`` to extract text content (the html-text library is used).
+        The text extraction algorithm assumes that the document is an HTML
+        document, and uses HTML-specific rules.
+
+        The ``cleaner`` argument allows cleaning the HTML before extracting
+        the content. Allowed values:
+
+        * "auto" (default) - don't clean when text=False, clean with
+          options tuned for text extraction when text=True;
+        * "text" - clean with options tuned for text extraction: elements
+          like ``<script>`` and ``<style>`` are removed, cleaning options
+          are tuned for speed, assuming text extraction is the end goal;
+        * "html" - use the default ``lxml.html.clean.Cleaner``. This is useful
+          if you want to make .get() output more human-readable, but still
+          preserve HTML tags;
+        * None - don't clean, even when ``text=True``. Useful if you have
+          an already cleaned tree, e.g. after calling :meth:`Selector.cleaned`;
+        * custom ``lxml.html.clean.Cleaner`` objects are also supported.
+
+        The ``guess_punct_space`` and ``guess_layout`` options customize the
+        text extraction algorithm. By default, when ``text=True``,
+        parsel tries to insert newlines and blank lines as appropriate,
+        and to be smart about whitespace around inline tags,
+        so that the text output looks similar to a browser's.
+
+        Pass ``guess_punct_space=False`` to disable punctuation handling.
+        This option has no effect when ``text=False``.
+
+        Use ``guess_layout=False`` to avoid adding newlines - the content will
+        be a single line of text, with whitespace as separators.
+        This option has no effect when ``text=False``.
+        """
+        sel = self
+        if cleaner == 'auto':
+            if text:
+                sel = self.cleaned('text')
+        elif cleaner is not None:
+            sel = self.cleaned(cleaner)
+        tree = sel.root
+
+        if text:
+            return html_text.etree_to_text(tree,
+                                           guess_punct_space=guess_punct_space,
+                                           guess_layout=guess_layout)
+
         try:
-            return etree.tostring(self.root,
+            return etree.tostring(tree,
                                   method=self._tostring_method,
                                   encoding='unicode',
                                   with_tail=False)
         except (AttributeError, TypeError):
-            if self.root is True:
+            if tree is True:
                 return u'1'
-            elif self.root is False:
+            elif tree is False:
                 return u'0'
             else:
-                return six.text_type(self.root)
+                return six.text_type(tree)
     extract = get

-    def getall(self):
+    def getall(self, text=False, cleaner='auto',
+               guess_punct_space=True, guess_layout=True):
         """
-        Serialize and return the matched node in a 1-element list of unicode strings.
+        Serialize and return the matched node in a 1-element list of unicode
+        strings.
+
+        See :meth:`~.Selector.get` for options.
         """
-        return [self.get()]
+        return [self.get(
+            text=text,
+            cleaner=cleaner,
+            guess_punct_space=guess_punct_space,
+            guess_layout=guess_layout,
+        )]

     def register_namespace(self, prefix, uri):
         """
@@ -346,6 +428,30 @@ def attrib(self):
         """
         return dict(self.root.attrib)

+    def cleaned(self, cleaner='html'):
+        """
+        Return a copy of the Selector, with the underlying subtree cleaned.
+        Allowed values of the ``cleaner`` argument:
+
+        * "html" (default) - use the default ``lxml.html.clean.Cleaner``;
+        * "text" - clean with options tuned for text extraction: elements
+          like ``<script>`` and ``<style>`` are removed, cleaning options
+          are tuned for speed, assuming text extraction is the end goal;
+        * custom ``lxml.html.clean.Cleaner`` objects are also supported.
+        """
+        if isinstance(cleaner, six.string_types):
+            if cleaner not in {'html', 'text'}:
+                raise ValueError("cleaner must be 'html', 'text' or "

    [Inline review comment (Member, Author): There is one gotcha: this
    exception is raised in .get as well, but in .get there are two more
    accepted values: "auto" and None. Is it worth fixing?]

+                                 "an lxml.html.clean.Cleaner instance")
+            if cleaner == 'html':
+                cleaner = self._html_cleaner
+            elif cleaner == 'text':
+                cleaner = self._text_cleaner

    [Inline review comment (Member, Author): An alternative is to make these
    attributes public, and ask users to pass them: sel.cleaned(sel.TEXT_CLEANER)
    instead of sel.cleaned('text').]

+        root = cleaner.clean_html(self.root)
+        return self.__class__(root=root, _expr=self._expr,
+                              namespaces=self.namespaces,
+                              type=self.type)
+
     def __bool__(self):
         """
         Return ``True`` if there is any real content selected or ``False``
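Because cleaning happens inside .get() by default, calling .get(text=True) on many sub-selectors cleans each subtree separately, which can get expensive on large documents (this appears to be what the "document O(N^2) gotcha" commit above refers to). A hedged sketch of the pattern that ``cleaned()`` and ``cleaner=None`` enable, with a hypothetical input file name:

from lxml.html.clean import Cleaner
from parsel import Selector

with open("page.html") as f:          # hypothetical input file
    sel = Selector(text=f.read())

# Clean the whole tree once, with options tuned for text extraction...
cleaned = sel.cleaned('text')

# ...then extract text from many nodes without re-cleaning each subtree.
paragraphs = [p.get(text=True, cleaner=None) for p in cleaned.css('p')]

# A custom lxml Cleaner can be passed instead of 'html' or 'text'.
custom = Cleaner(scripts=True, style=True, comments=True)
no_cruft = sel.cleaned(custom)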
setup.py (2 additions, 1 deletion)

@@ -29,7 +29,8 @@ def has_environment_marker_platform_impl_support():
     'w3lib>=1.19.0',
     'lxml>=2.3',
     'six>=1.5.2',
-    'cssselect>=0.9'
+    'cssselect>=0.9',
+    'html-text>=0.4.1',
 ]
 extras_require = {}
