diff --git a/README.md b/README.md index 85ae5a6..f39192a 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,9 @@ # Search Engine Scraper - se-scraper -[![npm](https://img.shields.io/npm/v/se-scraper.svg?style=for-the-badge)](https://www.npmjs.com/package/se-scraper) -[![Donate](https://img.shields.io/badge/donate-paypal-blue.svg?style=for-the-badge)](https://www.paypal.me/incolumitas) -[![Known Vulnerabilities](https://snyk.io/test/github/NikolaiT/se-scraper/badge.svg)](https://snyk.io/test/github/NikolaiT/se-scraper) +THIS IS A CUSTOM FORK of se-scraper, our changes will or have been +upstreamed. + +The original package is [here](https://www.npmjs.com/package/se-scraper). This node module allows you to scrape search engines concurrently with different proxies. @@ -506,4 +507,4 @@ let scrape_config = { num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100. }, } -``` \ No newline at end of file +``` diff --git a/package.json b/package.json index 8533f13..1695d87 100644 --- a/package.json +++ b/package.json @@ -1,8 +1,8 @@ { - "name": "se-scraper", - "version": "1.5.7", + "name": "@monibrand/se-scraper", + "version": "1.6.0-rc.8", "description": "A module using puppeteer to scrape several search engines such as Google, Bing and Duckduckgo", - "homepage": "https://scrapeulous.com/", + "homepage": "https://monibrand.com/", "main": "index.js", "scripts": { "test": "mocha test test/modules" @@ -17,7 +17,7 @@ "author": "Nikolai Tschacher (https://incolumitas.com/)", "repository": { "type": "git", - "url": "https://github.com/NikolaiT/se-scraper" + "url": "https://github.com/Monibrand/se-scraper" }, "license": "ISC", "dependencies": { diff --git a/src/modules/bing.js b/src/modules/bing.js index 78f2d2a..817c9ad 100644 --- a/src/modules/bing.js +++ b/src/modules/bing.js @@ -2,6 +2,8 @@ const cheerio = require('cheerio'); const Scraper = require('./se_scraper'); class BingScraper extends Scraper { + + defaultStartUrl = 
this.build_start_url('https://www.bing.com/search?') || 'https://www.bing.com/'; async parse_async(html) { diff --git a/src/modules/duckduckgo.js b/src/modules/duckduckgo.js index 2a3a536..678023b 100644 --- a/src/modules/duckduckgo.js +++ b/src/modules/duckduckgo.js @@ -4,6 +4,8 @@ const debug = require('debug')('se-scraper:DuckduckgoScraper'); class DuckduckgoScraper extends Scraper { + defaultStartUrl = 'https://duckduckgo.com/'; + parse(html) { debug('parse'); // load the page source into cheerio @@ -46,11 +48,8 @@ class DuckduckgoScraper extends Scraper { async load_start_page() { debug('load_start_page'); - let startUrl = 'https://duckduckgo.com/'; - - this.last_response = await this.page.goto(startUrl); + this.last_response = await this.page.goto(this.startUrl); await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); - return true; } diff --git a/src/modules/google.js b/src/modules/google.js index 38ccad2..1e47533 100644 --- a/src/modules/google.js +++ b/src/modules/google.js @@ -5,6 +5,8 @@ const Scraper = require('./se_scraper'); class GoogleScraper extends Scraper { + defaultStartUrl = 'https://www.google.com'; + constructor(...args) { super(...args); } @@ -13,13 +15,13 @@ class GoogleScraper extends Scraper { const results = await this.page.evaluate(() => { - let _text = (el, s) => { + let _text = (el, s, onlyFirstTextNode) => { let n = el.querySelector(s); if (n) { - return n.innerText; + return (onlyFirstTextNode) ? 
n.childNodes[0].nodeValue : n.innerText; } else { - return ''; + return; } }; @@ -29,7 +31,7 @@ class GoogleScraper extends Scraper { if (n) { return n.getAttribute(attr); } else { - return null; + return; } }; @@ -111,14 +113,14 @@ class GoogleScraper extends Scraper { // parse right side product information results.right_info.review = _attr(document, '#rhs .cu-container g-review-stars span', 'aria-label'); - let title_el = document.querySelector('#rhs .cu-container g-review-stars'); + let title_el = document.querySelector('#rhs .cu-container .Q7Oxbd'); if (title_el) { - results.right_info.review.title = title_el.parentNode.querySelector('div:first-child').innerText; + results.right_info.title = title_el.innerText; } - let num_reviews_el = document.querySelector('#rhs .cu-container g-review-stars'); + let num_reviews_el = document.querySelector('#rhs .cu-container .PGDKUd'); if (num_reviews_el) { - results.right_info.num_reviews = num_reviews_el.parentNode.querySelector('div:nth-of-type(2)').innerText; + results.right_info.num_reviews = num_reviews_el.innerText; } results.right_info.vendors = []; @@ -127,20 +129,16 @@ class GoogleScraper extends Scraper { document.querySelectorAll('#rhs .cu-container .rhsvw > div > div:nth-child(4) > div > div:nth-child(3) > div').forEach((el) => { results.right_info.vendors.push({ price: _text(el, 'span:nth-of-type(1)'), - merchant_name: _text(el, 'span:nth-child(3) a:nth-child(2)'), + merchant_name: _text(el, '.doUe3s0oL2B__jackpot-merchant a'), merchant_ad_link: _attr(el, 'span:nth-child(3) a:first-child', 'href'), - merchant_link: _attr(el, 'span:nth-child(3) a:nth-child(2)', 'href'), + merchant_link: _attr(el, 'span:nth-child(3) a:nth-child(2)', 'href'), // TODO this is not working anymore source_name: _text(el, 'span:nth-child(4) a'), source_link: _attr(el, 'span:nth-child(4) a', 'href'), - info: _text(el, 'div span'), - shipping: _text(el, 'span:last-child > span'), + info: _text(el, '.SdBHnc.e2CF7c'), + shipping: _text(el, 
'.JfwJme'), }) }); - if (!results.right_info.title) { - results.right_info = {}; - } - let right_side_info_el = document.getElementById('rhs'); if (right_side_info_el) { @@ -151,26 +149,19 @@ class GoogleScraper extends Scraper { } } - // parse top main column product information - // #tvcap .pla-unit - document.querySelectorAll('#tvcap .pla-unit').forEach((el) => { + // Parse Google Shopping top or left + document.querySelectorAll('.pla-unit').forEach((el) => { let top_product = { tracking_link: _attr(el, '.pla-unit-title a:first-child', 'href'), link: _attr(el, '.pla-unit-title a:nth-child(2)', 'href'), title: _text(el, '.pla-unit-title a:nth-child(2) span'), - price: _text(el, '.pla-unit-title + div'), - shipping: _text(el, '.pla-extensions-container div:nth-of-type(1)'), - vendor_link: _attr(el,'.pla-extensions-container div > a', 'href'), + price: _text(el, '.pla-unit-title + div', true), + originalPrice: _text(el, '.pla-unit-title + div > span'), + shipping: _text(el, '.pla-extensions-container .cYBBsb'), + vendor_link: _attr(el,'.pla-extensions-container a.FfKHB', 'href'), + merchant_name: _text(el,'.LbUacb span:nth-child(1)'), }; - let merchant_node = el.querySelector('.pla-unit-title'); - if (merchant_node) { - let node = merchant_node.parentNode.querySelector('div > span'); - if (node) { - top_product.merchant_name = node.innerText; - } - } - results.top_products.push(top_product); }); @@ -224,29 +215,18 @@ class GoogleScraper extends Scraper { } async load_start_page() { - let startUrl = 'https://www.google.com'; + this.logger.info('Using startUrl: ' + this.startUrl); + this.last_response = await this.page.goto(this.startUrl); - if (this.config.google_settings) { - startUrl = `https://www.${this.config.google_settings.google_domain}/search?q=`; - if (this.config.google_settings.google_domain) { - startUrl = `https://www.${this.config.google_settings.google_domain}/search?`; - } else { - startUrl = `https://www.google.com/search?`; - } + await 
this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); - for (var key in this.config.google_settings) { - if (key !== 'google_domain') { - startUrl += `${key}=${this.config.google_settings[key]}&` - } - } + const buttonAccepted = await this.page.$('#L2AGLb'); + if (buttonAccepted) { + await this.page.evaluate(() => { + document.querySelector('#L2AGLb').click(); + }); } - this.logger.info('Using startUrl: ' + startUrl); - - this.last_response = await this.page.goto(startUrl); - - await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); - return true; } diff --git a/src/modules/infospace.js b/src/modules/infospace.js index c10c10c..6be7248 100644 --- a/src/modules/infospace.js +++ b/src/modules/infospace.js @@ -41,11 +41,8 @@ class InfospaceScraper extends Scraper { } async load_start_page() { - - let startUrl = this.build_start_url('http://search.infospace.com/search/web?') || 'http://infospace.com/index.html'; - try { - this.last_response = await this.page.goto(startUrl); + this.last_response = await this.page.goto(this.startUrl); await this.page.waitForSelector('input[name="q"]', { timeout: 5000 }); } catch (e) { return false; diff --git a/src/modules/se_scraper.js b/src/modules/se_scraper.js index 3a453ac..89c2b6d 100644 --- a/src/modules/se_scraper.js +++ b/src/modules/se_scraper.js @@ -31,8 +31,8 @@ module.exports = class Scraper { this.proxy = config.proxy; this.keywords = config.keywords; - this.STANDARD_TIMEOUT = 10000; - this.SOLVE_CAPTCHA_TIME = 45000; + this.STANDARD_TIMEOUT = config.standard_timeout; + this.SOLVE_CAPTCHA_TIME = config.solve_captcha_time; this.results = {}; this.result_rank = 1; @@ -272,6 +272,12 @@ module.exports = class Scraper { await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` }); } + if (this.config.keep_html_on_error){ + const html_error = await this.page.content(); + e.html_on_error = html_error; + e.lastUrl = await 
this.page.evaluate(() => {return window.location.href;}); + } + this.metadata.scraping_detected = await this.detected(); if (this.metadata.scraping_detected === true) { @@ -312,7 +318,6 @@ module.exports = class Scraper { for (var key in settings) { baseUrl += `${key}=${settings[key]}&` } - this.logger.info('Using startUrl: ' + baseUrl); return baseUrl; @@ -381,6 +386,10 @@ module.exports = class Scraper { } + get startUrl(){ + return this.build_start_url(this.config.startUrl || this.defaultStartUrl); + } + /** * * @returns true if startpage was loaded correctly. diff --git a/src/modules/yandex.js b/src/modules/yandex.js index 3666cc1..132ca8b 100644 --- a/src/modules/yandex.js +++ b/src/modules/yandex.js @@ -4,6 +4,8 @@ const Scraper = require('./se_scraper'); class YandexScraper extends Scraper { + defaultStartUrl = 'https://yandex.com'; + constructor(...args) { super(...args); } @@ -71,11 +73,10 @@ class YandexScraper extends Scraper { } async load_start_page() { - let startUrl = 'https://yandex.com'; + + this.logger.info('Using startUrl: ' + this.startUrl); - this.logger.info('Using startUrl: ' + startUrl); - - this.last_response = await this.page.goto(startUrl); + this.last_response = await this.page.goto(this.startUrl); await this.page.waitForSelector('input[name="text"]', { timeout: this.STANDARD_TIMEOUT }); diff --git a/src/node_scraper.js b/src/node_scraper.js index 2dec432..b71fe61 100644 --- a/src/node_scraper.js +++ b/src/node_scraper.js @@ -139,6 +139,9 @@ class ScrapeManager { //custom_func: resolve('examples/pluggable.js'), custom_func: null, throw_on_detection: false, + keep_html_on_error: false, + standard_timeout: 10000, + solve_captcha_time: 45000, // List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080'] proxies: null, // a file with one proxy per line. 
Example: diff --git a/test/keep_html_on_error.js b/test/keep_html_on_error.js new file mode 100644 index 0000000..e731a41 --- /dev/null +++ b/test/keep_html_on_error.js @@ -0,0 +1,108 @@ +'use strict'; +const express = require('express'); +const { createLogger, transports } = require('winston'); +const http = require('http'); +const https = require('https'); +const assert = require('assert'); +const path = require('path'); +const keyCert = require('key-cert'); +const Promise = require('bluebird'); +const Proxy = require('http-mitm-proxy'); + +const debug = require('debug')('se-scraper:test'); +const se_scraper = require('..'); + +const httpPort = 3012; +const httpsPort = httpPort + 1; +const proxyPort = httpPort + 2; + +const fakeSearchEngine = express(); +fakeSearchEngine.get('/search', (req, res) => { + debug('q=%s', req.query.q); + const pageNumber = ((req.query.start/10) || 0) + 1; + res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html')); +}); +fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']})); + +describe('Config', function(){ + + let httpServer, httpsServer, proxy; + before(async function(){ + // Here mount our fake engine in both http and https listen server + httpServer = http.createServer(fakeSearchEngine); + httpsServer = https.createServer(await keyCert(), fakeSearchEngine); + + proxy = Proxy(); + proxy.onRequest((ctx, callback) => { + ctx.proxyToServerRequestOptions.host = 'localhost'; + ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? 
httpsPort : httpPort; + ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine'; + debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host); + return callback(); + }); + + await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort}); + await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort); + await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); + debug('Fake http search engine servers started'); + }); + + after(function(){ + httpsServer.close(); + httpServer.close(); + proxy.close(); + }); + + describe('keep_html_on_error', function(){ + + const testLogger = createLogger({ + transports: [ + new transports.Console({ + level: 'error' + }) + ] + }); + + /** + * Test html_output option + */ + it('html_output single page single keyword', async function () { + + const scrape_job = { + search_engine: 'google', + /* TODO refactor start_url + google_settings: { + start_url: 'http://localhost:' + httpPort + }, + */ + keywords: ['test error'], + }; + + var scraper = new se_scraper.ScrapeManager({ + throw_on_detection: true, + keep_html_on_error: true, + logger: testLogger, + //clean_html_output: false, + //clean_data_images: false, + // TODO refactor start_url so we can use-it instead of depending of the proxy for this test + proxies: ['http://localhost:' + proxyPort], + use_proxies_only: true, + standard_timeout: 500, + }); + await scraper.start(); + await assert.rejects( + async () => { + await scraper.scrape(scrape_job); + }, + (error) => { + assert(error.html_on_error, 'Error is containing the html output'); + return /#fbar/.test(error.message); + } + ) + await scraper.quit(); + + }); + + }); + +}); \ No newline at end of file diff --git a/test/mocks/google/shopping 2_page1.html b/test/mocks/google/shopping 2_page1.html new file mode 100644 index 0000000..4342228 --- /dev/null +++ b/test/mocks/google/shopping 2_page1.html @@ -0,0 +1,209 @@ +cheap lacoste shoes - 
Recherche Google

Liens d'accessibilité

Passer directement au contenu principalAide sur l'accessibilité
Commentaires sur l'accessibilité
Environ 55 700 000 résultats (0,48 secondes) 
Rappel concernant les règles de confidentialité de Google
Me le rappeler plus tard
Lire
Annonce sponsorisée

Afficher les produits correspondants à cheap lacoste...

Annonces


  1. LACOSTE Shoes 2020 - LACOSTE® Site Officiel‎

    Annonce·www.lacoste.com/
    Annonce·www.lacoste.com/
    Commandez Vite sur le Site Officiel. Livraison Express 48h dès 180€ d'achat !
    Note associée à lacoste.com : 4,8 - Conditions de retour: 60 jours ou plus pour la plupart des articles
    • Polos L1212

      Chic et intemporel, vous porterez
      cet essentiel en toutes occasions.
    • Lacoste Paris Polo

      L'Elégance d'une coupe Chemise :
      Découvrez Le Nouveau Paris Polo.

    Recherches associées

Annonces

  1. Nouvelle Collection - Vêtements - Chaussures - Urban Sport - Accessoires Jusqu'à -70%. Choisissez les modèles qui conviennent le votre style, Vaste choix de tailles & modèles . Manches courtes. Coupe classique. Coupe slim. Types: Polo, T-Shirts, Sweatshirts.

  2. Déstockage Lacoste Chaussures - Lacoste Chaussures -40%?...‎

    Annonce·www.meilleurvendeur.com/déstockage/promos
    Annonce·www.meilleurvendeur.com/déstockage/promos
    Lacoste Chaussures 200 Modèles à Prix Promo. Déstockage - Lacoste Chaussures à prix web ! Modèles de Lacoste Chaussures à Prix Super Réduits. Vite: Lacoste Chaussures à Saisir ! Comparez plus de prix. Faites des économies. Enchères et prix fixe.
Applications Google
diff --git a/test/mocks/google/shopping right product review_page1.html b/test/mocks/google/shopping right product review_page1.html new file mode 100644 index 0000000..bca8e73 --- /dev/null +++ b/test/mocks/google/shopping right product review_page1.html @@ -0,0 +1,220 @@ + + +lacoste 317 - Recherche Google

Liens d'accessibilité

Passer directement au contenu principalAide sur l'accessibilité
Commentaires sur l'accessibilité
Environ 4 160 000 résultats (0,50 secondes) 
Rappel concernant les règles de confidentialité de Google
Me le rappeler plus tard
Lire
Produits similaires
Découvrir d'autres lieux

Résultats complémentaires

Lacoste Lunettes
146 avis
Acheter
Annonce sponsorisée
102,75 € · Edel-Optics FR ·
317 · 2807
Livraison gratuite
317 · 2805
Livraison gratuite
MarqueLacoste
TypeVerres correcteurs
GammeHomme
Types de monturesCerclées
Avis des utilisateurs
146 avis
il y a un an
N'ayant trouvé dans aucune boutique de ma ville des lunettes orange, j'ai effectué une recherche sur internet. +J'ai trouvé et choisi ce modèle sur visio net pour ses couleurs et ses dimensions. +Il correspond exactement à ce que je recherchais…
· Avis publié sur visio-net.fr
Applications Google
\ No newline at end of file diff --git a/test/mocks/google/shopping_page1.html b/test/mocks/google/shopping_page1.html new file mode 100644 index 0000000..55a9a9d --- /dev/null +++ b/test/mocks/google/shopping_page1.html @@ -0,0 +1,213 @@ +cheap lacoste shoes - Recherche Google

Liens d'accessibilité

Passer directement au contenu principalAide sur l'accessibilité
Commentaires sur l'accessibilité
Environ 54 900 000 résultats (0,37 secondes) 
Rappel concernant les règles de confidentialité de Google
Me le rappeler plus tard
Lire

Annonces

  1. Découvrez et Commandez la Nouvelle Collection de Chaussures Lacoste. Livraison standard offerte dès 80€ d'achat. SAV : Mail ou Téléphone. Paiement Sécurisé. Retour Facile et Gratuit. Models: Polos, Chaussures, Robes, Pullover, Sacs, Accessoires, Pantalons.
    Note associée à lacoste.com : 4,8 - Commandes correctes: 95 - 100 %
    C.Cial Forum Des Halles - Porte Rambuteau, Niveau -1, Paris01 42 86 90 65Horaires et services variables

    Recherches associées

Résultats complémentaires

Annonce sponsorisée
Acheter
Acheter
Résultats Shopping
Applications Google
diff --git a/test/mocks/google/test error_page1.html b/test/mocks/google/test error_page1.html new file mode 100644 index 0000000..19e35b5 --- /dev/null +++ b/test/mocks/google/test error_page1.html @@ -0,0 +1 @@ +THIS IS A EMPTY PAGE TO THROW SOME ERROR IN SE-SCRAPER diff --git a/test/modules/google.js b/test/modules/google.js index 83c2ae3..d008e7a 100644 --- a/test/modules/google.js +++ b/test/modules/google.js @@ -120,4 +120,150 @@ describe('Module Google', function(){ }); }); -}); \ No newline at end of file + it('extract google shopping on right', function () { + const googleScraper = new GoogleScraper({ + config: { + search_engine_name: 'google', + throw_on_detection: true, + keywords: ['shopping'], + logger: testLogger, + scrape_from_file: '', + num_pages: 1, + } + }); + googleScraper.STANDARD_TIMEOUT = 500; + return googleScraper.run({page}).then(({results, metadata, num_requests}) => { + assert.strictEqual(num_requests, 1, 'One request should be done'); + assert.strictEqual(results['shopping']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1'); + assert.deepEqual(results['shopping']['1'].top_products, [ + { + 'link': 'https://www.laboutiqueofficielle.com/achat-baskets-basses/classic-series-baskets-317-blanc-144046.html?referer=gshopping&LGWCODE=3010559970809;160079;7403', + 'merchant_name': 'LaBoutiqueOffi...', + 'price': '39,99 €', + 'rank': 1, + 'title': 'Classic Series - Baskets 317 Blanc', + 'tracking_link': '/aclk?sa=l&ai=DChcSEwjJqLX1v4bqAhXJlBgKHYRrDO4YABAEGgJsZQ&sig=AOD64_1OEdvZgHU2YEMPI4JNdeTqLJTVjw&ctype=5&q=&ved=2ahUKEwjPmK31v4bqAhXLxYUKHe8BByEQ9A56BAgOEFU&adurl=', + 'vendor_link': 'https://www.google.com/search?tbm=shop&q=cheap%20lacoste%20shoes', + }, + { + 'link': 'https://www.chausport.com/p/lacoste-carnaby-evo-noire-enfant-173257.html', + 'merchant_name': 'Chausport', + 'price': '45,00 €', + 'rank': 2, + 'title': 'Tennis Lacoste Carnaby Evo Noire Enfant 28', + 'tracking_link': 
'/aclk?sa=L&ai=DChcSEwjJqLX1v4bqAhXJlBgKHYRrDO4YABAFGgJsZQ&sig=AOD64_0lhZrLNYCENmxzquCMa5M4_D04ng&ctype=5&q=&ved=2ahUKEwjPmK31v4bqAhXLxYUKHe8BByEQ9A56BAgOEGA&adurl=', + 'vendor_link': 'http://www.choozen.fr/nf/gs-cheap%20lacoste%20shoes.htm?kpartnerid=96955353', + }, + { + 'link': 'https://www.getthelabel.com/fr/p/lacoste-baskets-lerond-418/138256', + 'merchant_name': 'GetTheLabel.c...', + 'price': '44,99 €', + 'rank': 3, + 'title': 'Lacoste Baskets Lerond 418 Size 9 in Blanc pour Homme', + 'tracking_link': '/aclk?sa=l&ai=DChcSEwjJqLX1v4bqAhXJlBgKHYRrDO4YABAIGgJsZQ&sig=AOD64_13MoA9It0w-yp3GqriMf13OPLI8w&ctype=5&q=&ved=2ahUKEwjPmK31v4bqAhXLxYUKHe8BByEQ9A56BAgOEG0&adurl=', + 'vendor_link': 'https://highstreetone.com/?search=cheap%20lacoste%20shoes', + }, + { + 'link': 'https://www.sarenza.com/lacoste-carnaby-evo-120-2-s834061-br918-t76-p0000227925#size=39-39', + 'merchant_name': 'Sarenza', + 'price': '45,50 €', + 'originalPrice': '65 €', + 'rank': 4, + 'title': 'Lacoste Carnaby Evo 120 2 Blanc - Baskets - Disponible en 39', + 'tracking_link': '/aclk?sa=l&ai=DChcSEwjJqLX1v4bqAhXJlBgKHYRrDO4YABANGgJsZQ&sig=AOD64_1Q6WUe8YXjhb-y_k0rErD2WUsTqQ&ctype=5&q=&ved=2ahUKEwjPmK31v4bqAhXLxYUKHe8BByEQ9A56BAgOEHk&adurl=', + 'vendor_link': 'https://www.feed-price.com/search/cheap%20lacoste%20shoes', + }, + { + 'link': 'https://www.spartoo.com/Lacoste-CARNABY-EVO-BL-1-x4736301.php?track_id=adwo_fgl&sx=B&utm_source=froogle&utm_medium=comparateurs&utm_content=4736301&utm_campaign=adwo_fgl&size_id=158&fcsize=1&sx=B', + 'merchant_name': 'Spartoo.com', + 'price': '58,00 €', + 'rank': 5, + 'title': 'Lacoste CARNABY EVO BL 1 Baskets basses enfant (garcons)', + 'tracking_link': '/aclk?sa=l&ai=DChcSEwjJqLX1v4bqAhXJlBgKHYRrDO4YABAMGgJsZQ&sig=AOD64_0NfyG0tH5Pc7kPfADKcQflx78H1g&ctype=5&q=&ved=2ahUKEwjPmK31v4bqAhXLxYUKHe8BByEQ9A56BQgOEIcB&adurl=', + 'vendor_link': 'https://www.google.com/search?tbm=shop&q=cheap%20lacoste%20shoes', + }, + { + 'link': 
'https://www.nike.com/fr/t/nikecourt-royale-shoe-KyTwJwgV/749747-111', + 'merchant_name': 'Nike Officiel', + 'price': '55,00 €', + 'rank': 6, + 'title': 'Chaussure Nike Court Royale pour Homme - Blanc', + 'tracking_link': '/aclk?sa=l&ai=DChcSEwjJqLX1v4bqAhXJlBgKHYRrDO4YABASGgJsZQ&sig=AOD64_2KQENuVGnvXutmSUufDSa4FnTYsw&ctype=5&q=&ved=2ahUKEwjPmK31v4bqAhXLxYUKHe8BByEQ9A56BQgOEJIB&adurl=', + 'vendor_link': 'https://www.pricesearcher.com/css/search/?p=1&q=cheap%20lacoste%20shoes&utm_source=google&utm_medium=css', + } + ]) + }); + }); + + it('extract google shopping on top', function () { + const googleScraper = new GoogleScraper({ + config: { + search_engine_name: 'google', + throw_on_detection: true, + keywords: ['shopping 2'], + logger: testLogger, + scrape_from_file: '', + num_pages: 1, + } + }); + googleScraper.STANDARD_TIMEOUT = 500; + return googleScraper.run({page}).then(({results, metadata, num_requests}) => { + assert.strictEqual(num_requests, 1, 'One request should be done'); + assert.strictEqual(results['shopping 2']['1'].results.length, 10, 'Must have 10 organic results parsed on page 1'); + assert.deepEqual(results['shopping 2']['1'].top_products[2], { + "link": "https://www.zalando.fr/lacoste-sideline-cub-chaussons-pour-bebe-whitegreen-la216f003-k11.html?size=17&allophones=0", + "merchant_name": "Zalando.fr", + "price": "31,95 €", + "rank": 3, + 'shipping': 'Livraison gratuite', + "title": "Lacoste Sideline CUB Cadeau de naissance white/green, gender.kids.unisex, Taille: 17, Blanc - Imitation cuir/textile", + "tracking_link": "/aclk?sa=l&ai=DChcSEwjt7o3yj4nqAhVZhdUKHbshBNwYABASGgJ3cw&sig=AOD64_0usikwrH4jD5vqtbS7vVoCrNxMOg&ctype=5&q=&ved=2ahUKEwj0w4fyj4nqAhWZDGMBHY7HAzAQww96BAgOEFI&adurl=", + "vendor_link": "https://fr.shoptail.eu/cheap%20lacoste%20shoes", + }) + }); + }); + + it('shopping extract right one product', function () { + const googleScraper = new GoogleScraper({ + config: { + search_engine_name: 'google', + throw_on_detection: true, + keywords: 
['shopping right product review'], + logger: testLogger, + scrape_from_file: '', + num_pages: 1, + } + }); + googleScraper.STANDARD_TIMEOUT = 500; + return googleScraper.run({page}).then(({results, metadata, num_requests}) => { + assert.strictEqual(num_requests, 1, 'One request should be done'); + assert.strictEqual(results['shopping right product review']['1'].results.length, 9, 'Must have 9 organic results parsed on page 1'); + assert.deepEqual(results['shopping right product review']['1'].right_info, { + title: 'Lacoste Lunettes', + 'num_reviews': '146 avis', + 'review': 'Note : 4,6 sur 5', + 'vendors': [ + { + 'info': '317 · 2807', + 'merchant_ad_link': 'https://www.googleadservices.com/pagead/aclk?sa=L&ai=DChcSEwihq9C82ojqAhUIyrIKHbIHAx8YABACGgJscg&ohost=www.google.com&cid=CAASE-Roz5UHMJg95vk99OwXQnKbUG0&sig=AOD64_0Wfsw3t3eO_yEtq8lWRIjiF6EqZw&ctype=5&q=&ved=2ahUKEwjsqsi82ojqAhVFPBoKHY38DAIQ9A56BAgNEH0&adurl=', + 'merchant_name': 'Edel-Optics FR', + 'price': '102,75 €', + 'shipping': 'Livraison gratuite', + 'source_link': 'https://www.google.com/search?tbm=shop&q=lacoste%20317', + 'source_name': 'Par Google', + }, + { + 'info': '317 · 2805', + 'merchant_ad_link': 'https://www.googleadservices.com/pagead/aclk?sa=L&ai=DChcSEwihq9C82ojqAhUIyrIKHbIHAx8YABADGgJscg&ohost=www.google.com&cid=CAASE-Roz5UHMJg95vk99OwXQnKbUG0&sig=AOD64_2R4Idoiqc783K8OLyv9W9YQTJfog&ctype=5&q=&ved=2ahUKEwjsqsi82ojqAhVFPBoKHY38DAIQ9A56BQgNEIEB&adurl=', + 'merchant_name': 'EasyLunettes.fr', + 'price': '75,00 €', + 'shipping': 'Livraison gratuite', + 'source_link': 'https://producthero.com/?utm_source=google&utm_medium=css&q=lacoste%20317', + 'source_name': 'Par Producthero', + } + ] + }); + }); + }); + +}); diff --git a/test/proxy.js b/test/proxy.js index c1092ea..209d782 100644 --- a/test/proxy.js +++ b/test/proxy.js @@ -21,7 +21,7 @@ fakeSearchEngine.set('trust proxy', 'loopback'); fakeSearchEngine.get('/test-proxy', (req, res) => { debug('fake-search-engine req.hostname=%s', req.hostname); 
//debug('req to', req.socket.localAddress, req.socket.localPort); - res.send(req.hostname); + setTimeout(() => res.send(req.hostname), 100); // Add timeout here because of a race condition in the first test }); describe('Config', function(){ diff --git a/test/scrape-manager.js b/test/scrape-manager.js new file mode 100644 index 0000000..f20ee3f --- /dev/null +++ b/test/scrape-manager.js @@ -0,0 +1,122 @@ +'use strict'; +const express = require('express'); +const { createLogger, transports } = require('winston'); +const http = require('http'); +const https = require('https'); +const assert = require('assert'); +const path = require('path'); +const keyCert = require('key-cert'); +const Promise = require('bluebird'); +const Proxy = require('http-mitm-proxy'); + +const debug = require('debug')('se-scraper:test'); +const se_scraper = require('../'); + +const httpPort = 3012; +const httpsPort = httpPort + 1; +const proxyPort = httpPort + 2; + +const fakeSearchEngine = express(); +fakeSearchEngine.get('/search', (req, res) => { + debug('q=%s', req.query.q); + const pageNumber = ((req.query.start/10) || 0) + 1; + res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html')); +}); +fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']})); + +describe('ScrapeManager', function(){ + + let httpServer, httpsServer, proxy; + before(async function(){ + // Here mount our fake engine in both http and https listen server + httpServer = http.createServer(fakeSearchEngine); + httpsServer = https.createServer(await keyCert(), fakeSearchEngine); + + proxy = Proxy(); + proxy.onRequest((ctx, callback) => { + ctx.proxyToServerRequestOptions.host = 'localhost'; + ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? 
httpsPort : httpPort; + ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine'; + debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host); + return callback(); + }); + + await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort}); + await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort); + await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort); + debug('Fake http search engine servers started'); + }); + + after(function(){ + httpsServer.close(); + httpServer.close(); + proxy.close(); + }); + + describe('.quit()', function(){ + + const testLogger = createLogger({ + transports: [ + new transports.Console({ + level: 'error' + }) + ] + }); + + /** + * Test if quit correctly close all opened chrome + */ + it('Ensure all chrome are closed after .quit() has been called', async function () { + + const scrape_job = { + search_engine: 'google', + /* TODO refactor start_url + google_settings: { + start_url: 'http://localhost:' + httpPort + }, + */ + keywords: ['test keyword'], + }; + + var scraper = new se_scraper.ScrapeManager({ + throw_on_detection: true, + logger: testLogger, + // TODO refactor start_url so we can use-it instead of depending of the proxy for this test + proxies: ['http://localhost:' + proxyPort], + use_proxies_only: true, + }); + await scraper.start(); + const { results } = await scraper.scrape(scrape_job); + await scraper.quit(); + + // TODO Check if all puppeteer chrome are stopped here + }); + + + it('Ensure all chrome are closed after .scrape() has been called on index module', async function () { + + const scrape_job = { + search_engine: 'google', + /* TODO refactor start_url + google_settings: { + start_url: 'http://localhost:' + httpPort + }, + */ + keywords: ['test keyword'], + }; + + var results = await se_scraper.scrape({ + throw_on_detection: true, + logger: testLogger, + // TODO refactor start_url so we can use-it instead of depending of 
the proxy for this test + proxies: ['http://localhost:' + proxyPort], + use_proxies_only: true, + }, scrape_job); + + // TODO Check if all puppeteer chrome are stopped here + + }); + + }); + +}); \ No newline at end of file