From 313e6cf8569897fbf4f03fd0b46796449adbd551 Mon Sep 17 00:00:00 2001 From: Valdir Stumm Junior Date: Fri, 31 Mar 2017 16:53:52 -0300 Subject: [PATCH] add image_url field --- books/spiders/books.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/books/spiders/books.py b/books/spiders/books.py index 99ab4346..264c77df 100644 --- a/books/spiders/books.py +++ b/books/spiders/books.py @@ -3,28 +3,30 @@ class BooksSpider(scrapy.Spider): - name = "books" - allowed_domains = ["books.toscrape.com"] + name = 'books' + allowed_domains = ['books.toscrape.com'] start_urls = [ 'http://books.toscrape.com/', ] def parse(self, response): - for book_url in response.css("article.product_pod > h3 > a ::attr(href)").extract(): + for book_url in response.css('article.product_pod > h3 > a ::attr(href)').extract(): yield scrapy.Request(response.urljoin(book_url), callback=self.parse_book_page) - next_page = response.css("li.next > a ::attr(href)").extract_first() + next_page = response.css('li.next > a ::attr(href)').extract_first() if next_page: yield scrapy.Request(response.urljoin(next_page), callback=self.parse) def parse_book_page(self, response): item = {} - product = response.css("div.product_main") - item["title"] = product.css("h1 ::text").extract_first() + product = response.css('div.product_main') + item['title'] = product.css('h1 ::text').extract_first() item['category'] = response.xpath( - "//ul[@class='breadcrumb']/li[@class='active']/preceding-sibling::li[1]/a/text()" + '//ul[@class="breadcrumb"]/li[@class="active"]/preceding-sibling::li[1]/a/text()' ).extract_first() item['description'] = response.xpath( - "//div[@id='product_description']/following-sibling::p/text()" + '//div[@id="product_description"]/following-sibling::p/text()' ).extract_first() item['price'] = response.css('p.price_color ::text').extract_first() + image_url = response.css('div.carousel-inner img ::attr(src)').extract_first() + item['image_url'] = response.urljoin(image_url) yield item