From 81b29b994a74067636af7de7d2eb25d3c0a97b46 Mon Sep 17 00:00:00 2001 From: Paulo Costa Date: Wed, 2 Mar 2016 20:03:45 -0300 Subject: [PATCH] Added text_content() method to selectors. It's useful to extract the text contents of HTML nodes as plain old strings, ignoring nested tags and extra spaces. --- parsel/selector.py | 19 +++++++++++++++ tests/test_selector.py | 55 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/parsel/selector.py b/parsel/selector.py index 282f2e13..be297bef 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -98,6 +98,18 @@ def extract_first(self, default=None): else: return default + def text_content(self): + """ + Call the ``.text_content()`` method for each element in this list and return + their results flattened, as a list of unicode strings. + """ + return [x.text_content() for x in self] + + def text_content_first(self, default=None): + for x in self: + return x.text_content() + else: + return default class Selector(object): """ @@ -222,6 +234,13 @@ def extract(self): else: return six.text_type(self.root) + def text_content(self): + """ + Returns the text content of the element, including the text content of + its children, with no markup. + """ + return six.text_type(self.root.xpath("normalize-space()")) + def register_namespace(self, prefix, uri): """ Register the given namespace to be used in this :class:`Selector`. 
diff --git a/tests/test_selector.py b/tests/test_selector.py index 29446d4e..3c0e717b 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -86,6 +86,61 @@ def test_extract_first_default(self): self.assertEqual(sel.xpath('//div/text()').extract_first(default='missing'), 'missing') + def test_text_content_first(self): + """Test if text_content_first() returns first element""" + body = u'' + sel = self.sscls(text=body) + + self.assertEqual(sel.xpath('//ul/li').text_content_first(), + sel.xpath('//ul/li').text_content()[0]) + + self.assertEqual(sel.xpath('//ul/li[@id="1"]').text_content_first(), + sel.xpath('//ul/li[@id="1"]').text_content()[0]) + + self.assertEqual(sel.xpath('//ul/li[2]').text_content_first(), + sel.xpath('//ul/li').text_content()[1]) + + self.assertEqual(sel.xpath('//ul/li[@id="doesnt-exist"]').text_content_first(), None) + + self.assertEqual(sel.xpath('//ul/li').text_content_first(), '1') + + self.assertEqual(sel.xpath('//ul/li[2]').text_content_first(), '2'), + + self.assertEqual(sel.xpath('//ul').text_content_first(), '12'), + + def test_text_content_first_default(self): + """Test if text_content_first() returns default value when no results found""" + body = u'' + sel = self.sscls(text=body) + + self.assertEqual(sel.xpath('//div').text_content_first(default='missing'), 'missing') + + def test_text_content(self): + """Test if text_content() returns the text of each matched element""" + body = u'' + sel = self.sscls(text=body) + + self.assertEqual(sel.xpath('//ul').text_content(), [u'12']) + self.assertEqual(sel.xpath('//ul/li').text_content(), [u'1', u'2']) + + def test_text_content_with_spaces(self): + """Test if text_content() collapses runs of whitespace in each element""" + body = u""" +

+ Mary had a little
+ lamb
+

+
meh meh
+

+ It's + fleece + was white as snow. +

+ """ + sel = self.sscls(text=body) + + self.assertEqual(sel.xpath('//p').text_content(), [u'Mary had a little lamb', u'It\'s fleece was white as snow.']) + def test_re_first(self): """Test if re_first() returns first matched element""" body = u''