From 81b29b994a74067636af7de7d2eb25d3c0a97b46 Mon Sep 17 00:00:00 2001
From: Paulo Costa <paulo.costa@geofusion.com.br>
Date: Wed, 2 Mar 2016 20:03:45 -0300
Subject: [PATCH] Added text_content() method to selectors.

It's useful to extract the text contents of HTML nodes as plain old strings, ignoring nested tags and extra spaces.
---
 parsel/selector.py     | 19 +++++++++++++++
 tests/test_selector.py | 55 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)
diff --git a/parsel/selector.py b/parsel/selector.py
index 282f2e13..be297bef 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -98,6 +98,18 @@ def extract_first(self, default=None):
         else:
             return default
 
+    def text_content(self):
+        """
+        Call the ``.text_content()`` method for each element in this list and
+        return the results, one per element, as a list of unicode strings.
+        """
+        return [element.text_content() for element in self]
+
+    def text_content_first(self, default=None):
+        """Return the first element's ``text_content()``, or ``default`` if empty."""
+        for element in self:
+            return element.text_content()
+        return default
 
 class Selector(object):
     """
@@ -222,6 +234,13 @@ def extract(self):
             else:
                 return six.text_type(self.root)
 
+    def text_content(self):
+        """Return the node's text, descendants included, with markup removed
+        and whitespace collapsed; non-element results are normalized too."""
+        if not hasattr(self.root, 'xpath'):  # root is not an element (e.g. text)
+            return u' '.join(self.extract().split())
+        return six.text_type(self.root.xpath("normalize-space()"))
+
     def register_namespace(self, prefix, uri):
         """
         Register the given namespace to be used in this :class:`Selector`.
diff --git a/tests/test_selector.py b/tests/test_selector.py
index 29446d4e..3c0e717b 100644
--- a/tests/test_selector.py
+++ b/tests/test_selector.py
@@ -86,6 +86,61 @@ def test_extract_first_default(self):
 
         self.assertEqual(sel.xpath('//div/text()').extract_first(default='missing'), 'missing')
 
+    def test_text_content_first(self):
+        """Test if text_content_first() returns the text of the first element"""
+        body = u'<ul><li id="1">1</li><li id="2">2</li></ul>'
+        sel = self.sscls(text=body)
+
+        self.assertEqual(sel.xpath('//ul/li').text_content_first(),
+                         sel.xpath('//ul/li').text_content()[0])
+
+        self.assertEqual(sel.xpath('//ul/li[@id="1"]').text_content_first(),
+                         sel.xpath('//ul/li[@id="1"]').text_content()[0])
+
+        self.assertEqual(sel.xpath('//ul/li[2]').text_content_first(),
+                         sel.xpath('//ul/li').text_content()[1])
+        # Nothing matches and no default is given, so None is expected.
+        self.assertEqual(sel.xpath('//ul/li[@id="doesnt-exist"]').text_content_first(), None)
+
+        self.assertEqual(sel.xpath('//ul/li').text_content_first(), '1')
+
+        self.assertEqual(sel.xpath('//ul/li[2]').text_content_first(), '2')
+        # The text of <ul> is the concatenated text of both <li> children.
+        self.assertEqual(sel.xpath('//ul').text_content_first(), '12')
+
+    def test_text_content_first_default(self):
+        """Test if text_content_first() returns the default when nothing matches"""
+        markup = u'<ul><li id="1">1</li><li id="2">2</li></ul>'
+        selector = self.sscls(text=markup)
+
+        self.assertEqual(selector.xpath('//div').text_content_first(default='missing'), 'missing')
+
+    def test_text_content(self):
+        """Test if text_content() returns the text of every selected element"""
+        markup = u'<ul><li id="1">1</li><li id="2">2</li></ul>'
+        selector = self.sscls(text=markup)
+
+        self.assertEqual(selector.xpath('//ul').text_content(), [u'12'])
+        self.assertEqual(selector.xpath('//ul/li').text_content(), [u'1', u'2'])
+        
+    def test_text_content_with_spaces(self):
+        """Test if text_content() drops nested tags and collapses whitespace"""
+        body = u"""
+            <p>
+              Mary <b>had    </b>   a little   <i> <br/>
+              lamb  </i>   
+            </p> 
+            <div>meh meh</div>
+            <p>    
+              It's
+              <txd>fleece</txd>
+              was w<em>hi</em>te as s<span>no</span>w.
+            </p> 
+        """
+        sel = self.sscls(text=body)
+
+        self.assertEqual(sel.xpath('//p').text_content(),  [u'Mary had a little lamb', u'It\'s fleece was white as snow.'])
+
     def test_re_first(self):
         """Test if re_first() returns first matched element"""
         body = u'<ul><li id="1">1</li><li id="2">2</li></ul>'