README.md (9 changes: 5 additions & 4 deletions)
@@ -1,8 +1,9 @@
 # Search Engine Scraper - se-scraper

-[![npm](https://img.shields.io/npm/v/se-scraper.svg?style=for-the-badge)](https://www.npmjs.com/package/se-scraper)
-[![Donate](https://img.shields.io/badge/donate-paypal-blue.svg?style=for-the-badge)](https://www.paypal.me/incolumitas)
-[![Known Vulnerabilities](https://snyk.io/test/github/NikolaiT/se-scraper/badge.svg)](https://snyk.io/test/github/NikolaiT/se-scraper)
+THIS IS A CUSTOM FORK of se-scraper; our changes have been or will be upstreamed.
+
+The original package is [here](https://www.npmjs.com/package/se-scraper).
+
 This node module allows you to scrape search engines concurrently with different proxies.
@@ -506,4 +507,4 @@ let scrape_config = {
         num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
     },
 }
-```
+```
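
For orientation (this example is not part of the diff): a minimal sketch of how a `scrape_config` like the one ending above is typically passed to the library, following the upstream README's `se_scraper.scrape(browser_config, scrape_config)` entry point. The package name, keywords, and option values are illustrative.

```js
// Minimal sketch, assuming the upstream scrape() API; not part of this PR.
const se_scraper = require('se-scraper'); // or '@monibrand/se-scraper' for this fork

(async () => {
    let scrape_config = {
        search_engine: 'google',
        keywords: ['news'],
        num_pages: 1,
        google_settings: {
            num: 100, // ask Google for up to 100 results per page, as documented above
        },
    };

    // First argument is the browser config; an empty object keeps the defaults.
    const results = await se_scraper.scrape({}, scrape_config);
    console.dir(results, { depth: null });
})();
```
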
package.json (8 changes: 4 additions & 4 deletions)
@@ -1,8 +1,8 @@
 {
-  "name": "se-scraper",
-  "version": "1.5.7",
+  "name": "@monibrand/se-scraper",
+  "version": "1.6.0-rc.8",
   "description": "A module using puppeteer to scrape several search engines such as Google, Bing and Duckduckgo",
-  "homepage": "https://scrapeulous.com/",
+  "homepage": "https://monibrand.com/",
   "main": "index.js",
   "scripts": {
     "test": "mocha test test/modules"
@@ -17,7 +17,7 @@
"author": "Nikolai Tschacher <[email protected]> (https://incolumitas.com/)",
"repository": {
"type": "git",
"url": "https://github.com/NikolaiT/se-scraper"
"url": "https://github.com/Monibrand/se-scraper"
},
"license": "ISC",
"dependencies": {
src/modules/bing.js (2 changes: 2 additions & 0 deletions)
@@ -2,6 +2,8 @@ const cheerio = require('cheerio');
 const Scraper = require('./se_scraper');

 class BingScraper extends Scraper {
+
+    defaultStartUrl = this.build_start_url('https://www.bing.com/search?') || 'https://www.bing.com/';

     async parse_async(html) {
src/modules/duckduckgo.js (7 changes: 3 additions & 4 deletions)
@@ -4,6 +4,8 @@ const debug = require('debug')('se-scraper:DuckduckgoScraper');

 class DuckduckgoScraper extends Scraper {

+    defaultStartUrl = 'https://duckduckgo.com/';
+
     parse(html) {
         debug('parse');
         // load the page source into cheerio
@@ -46,11 +48,8 @@ class DuckduckgoScraper extends Scraper {

     async load_start_page() {
         debug('load_start_page');
-        let startUrl = 'https://duckduckgo.com/';
-
-        this.last_response = await this.page.goto(startUrl);
+        this.last_response = await this.page.goto(this.startUrl);
         await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
-
         return true;
     }

src/modules/google.js (78 changes: 29 additions & 49 deletions)
@@ -5,6 +5,8 @@ const Scraper = require('./se_scraper');

 class GoogleScraper extends Scraper {

+    defaultStartUrl = 'https://www.google.com';
+
     constructor(...args) {
         super(...args);
     }
@@ -13,13 +15,13 @@ class GoogleScraper extends Scraper {

         const results = await this.page.evaluate(() => {

-            let _text = (el, s) => {
+            let _text = (el, s, onlyFirstTextNode) => {
                 let n = el.querySelector(s);

                 if (n) {
-                    return n.innerText;
+                    return (onlyFirstTextNode) ? n.childNodes[0].nodeValue : n.innerText;
                 } else {
-                    return '';
+                    return;
                 }
             };
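
An illustrative aside (not part of the diff) on why `onlyFirstTextNode` was added: on a shopping-ad price node, `innerText` flattens the discounted price and the struck-through original price into one string, while `childNodes[0].nodeValue` reads only the leading text node. The markup below is a hypothetical simplification of such a node:

```js
// Given markup like:  <div class="price">€19.99<span>€29.99</span></div>
// _text(el, '.price', true)   // -> '€19.99'        (first text node: current price)
// _text(el, '.price')         // -> '€19.99€29.99'  (innerText concatenates both)
// _text(el, '.price > span')  // -> '€29.99'        (what the new originalPrice field reads)
```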

@@ -29,7 +31,7 @@
                 if (n) {
                     return n.getAttribute(attr);
                 } else {
-                    return null;
+                    return;
                 }
             };

@@ -111,14 +113,14 @@ class GoogleScraper extends Scraper {
             // parse right side product information
             results.right_info.review = _attr(document, '#rhs .cu-container g-review-stars span', 'aria-label');

-            let title_el = document.querySelector('#rhs .cu-container g-review-stars');
+            let title_el = document.querySelector('#rhs .cu-container .Q7Oxbd');
             if (title_el) {
-                results.right_info.review.title = title_el.parentNode.querySelector('div:first-child').innerText;
+                results.right_info.title = title_el.innerText;
             }

-            let num_reviews_el = document.querySelector('#rhs .cu-container g-review-stars');
+            let num_reviews_el = document.querySelector('#rhs .cu-container .PGDKUd');
             if (num_reviews_el) {
-                results.right_info.num_reviews = num_reviews_el.parentNode.querySelector('div:nth-of-type(2)').innerText;
+                results.right_info.num_reviews = num_reviews_el.innerText;
             }

             results.right_info.vendors = [];
@@ -127,20 +129,16 @@
             document.querySelectorAll('#rhs .cu-container .rhsvw > div > div:nth-child(4) > div > div:nth-child(3) > div').forEach((el) => {
                 results.right_info.vendors.push({
                     price: _text(el, 'span:nth-of-type(1)'),
-                    merchant_name: _text(el, 'span:nth-child(3) a:nth-child(2)'),
+                    merchant_name: _text(el, '.doUe3s0oL2B__jackpot-merchant a'),
                     merchant_ad_link: _attr(el, 'span:nth-child(3) a:first-child', 'href'),
-                    merchant_link: _attr(el, 'span:nth-child(3) a:nth-child(2)', 'href'),
+                    merchant_link: _attr(el, 'span:nth-child(3) a:nth-child(2)', 'href'), // TODO: this selector no longer matches
                     source_name: _text(el, 'span:nth-child(4) a'),
                     source_link: _attr(el, 'span:nth-child(4) a', 'href'),
-                    info: _text(el, 'div span'),
-                    shipping: _text(el, 'span:last-child > span'),
+                    info: _text(el, '.SdBHnc.e2CF7c'),
+                    shipping: _text(el, '.JfwJme'),
                 })
             });

-            if (!results.right_info.title) {
-                results.right_info = {};
-            }
-
             let right_side_info_el = document.getElementById('rhs');

             if (right_side_info_el) {
@@ -151,26 +149,19 @@
                 }
             }

-            // parse top main column product information
-            // #tvcap .pla-unit
-            document.querySelectorAll('#tvcap .pla-unit').forEach((el) => {
+            // Parse Google Shopping ads (top or left column)
+            document.querySelectorAll('.pla-unit').forEach((el) => {
                 let top_product = {
                     tracking_link: _attr(el, '.pla-unit-title a:first-child', 'href'),
                     link: _attr(el, '.pla-unit-title a:nth-child(2)', 'href'),
                     title: _text(el, '.pla-unit-title a:nth-child(2) span'),
-                    price: _text(el, '.pla-unit-title + div'),
-                    shipping: _text(el, '.pla-extensions-container div:nth-of-type(1)'),
-                    vendor_link: _attr(el, '.pla-extensions-container div > a', 'href'),
+                    price: _text(el, '.pla-unit-title + div', true),
+                    originalPrice: _text(el, '.pla-unit-title + div > span'),
+                    shipping: _text(el, '.pla-extensions-container .cYBBsb'),
+                    vendor_link: _attr(el, '.pla-extensions-container a.FfKHB', 'href'),
+                    merchant_name: _text(el, '.LbUacb span:nth-child(1)'),
                 };

-                let merchant_node = el.querySelector('.pla-unit-title');
-                if (merchant_node) {
-                    let node = merchant_node.parentNode.querySelector('div > span');
-                    if (node) {
-                        top_product.merchant_name = node.innerText;
-                    }
-                }
-
                 results.top_products.push(top_product);
             });

@@ -224,29 +215,18 @@ class GoogleScraper extends Scraper {
     }

     async load_start_page() {
-        let startUrl = 'https://www.google.com';
-
-        if (this.config.google_settings) {
-            startUrl = `https://www.${this.config.google_settings.google_domain}/search?q=`;
-            if (this.config.google_settings.google_domain) {
-                startUrl = `https://www.${this.config.google_settings.google_domain}/search?`;
-            } else {
-                startUrl = `https://www.google.com/search?`;
-            }
-
-            for (var key in this.config.google_settings) {
-                if (key !== 'google_domain') {
-                    startUrl += `${key}=${this.config.google_settings[key]}&`
-                }
-            }
-        }
-
-        this.logger.info('Using startUrl: ' + startUrl);
-
-        this.last_response = await this.page.goto(startUrl);
-
-        await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
-
+        this.logger.info('Using startUrl: ' + this.startUrl);
+        this.last_response = await this.page.goto(this.startUrl);
+
+        await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
+
+        const buttonAccepted = await this.page.$('#L2AGLb');
+        if (buttonAccepted) {
+            await this.page.evaluate(() => {
+                document.querySelector('#L2AGLb').click();
+            });
+        }
+
         return true;
     }
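
Note (not part of the diff): `#L2AGLb` is the id of the "Accept all" button on Google's cookie-consent interstitial; clicking it keeps the consent page from blocking the `input[name="q"]` selector wait above.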

src/modules/infospace.js (5 changes: 1 addition & 4 deletions)
@@ -41,11 +41,8 @@ class InfospaceScraper extends Scraper {
     }

     async load_start_page() {
-
-        let startUrl = this.build_start_url('http://search.infospace.com/search/web?') || 'http://infospace.com/index.html';
-
         try {
-            this.last_response = await this.page.goto(startUrl);
+            this.last_response = await this.page.goto(this.startUrl);
             await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
         } catch (e) {
             return false;
src/modules/se_scraper.js (15 changes: 12 additions & 3 deletions)
@@ -31,8 +31,8 @@ module.exports = class Scraper {
         this.proxy = config.proxy;
         this.keywords = config.keywords;

-        this.STANDARD_TIMEOUT = 10000;
-        this.SOLVE_CAPTCHA_TIME = 45000;
+        this.STANDARD_TIMEOUT = config.standard_timeout;
+        this.SOLVE_CAPTCHA_TIME = config.solve_captcha_time;

        this.results = {};
        this.result_rank = 1;
@@ -272,6 +272,12 @@ module.exports = class Scraper {
             await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
         }

+        if (this.config.keep_html_on_error) {
+            const html_error = await this.page.content();
+            e.html_on_error = html_error;
+            e.lastUrl = await this.page.evaluate(() => { return window.location.href; });
+        }
+
         this.metadata.scraping_detected = await this.detected();

         if (this.metadata.scraping_detected === true) {
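
A sketch (not part of the diff) of how a caller might consume the two fields attached above; the surrounding try/catch, the `manager` instance, and the output file name are assumptions, while `html_on_error` and `lastUrl` are the properties set by this hunk:

```js
// Hypothetical caller-side handling of keep_html_on_error output.
// (Runs inside an async function with an existing ScrapeManager instance.)
const fs = require('fs');

try {
    await manager.scrape(scrape_config);
} catch (e) {
    if (e.html_on_error) {
        // Dump the page HTML captured at failure time for offline debugging.
        fs.writeFileSync(`error_${Date.now()}.html`, e.html_on_error);
        console.error('Scrape failed while on', e.lastUrl);
    }
    throw e;
}
```
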
@@ -312,7 +318,6 @@
         for (var key in settings) {
             baseUrl += `${key}=${settings[key]}&`
         }
-
         this.logger.info('Using startUrl: ' + baseUrl);

         return baseUrl;
@@ -381,6 +386,10 @@ module.exports = class Scraper {

     }

+    get startUrl() {
+        return this.build_start_url(this.config.startUrl || this.defaultStartUrl);
+    }
+
     /**
      *
      * @returns true if startpage was loaded correctly.
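
Taken together with the per-engine `defaultStartUrl` fields added in the other modules, this getter centralizes start-URL resolution: an explicit `config.startUrl` wins, otherwise the engine default is used, and either value is run through `build_start_url()` so configured query parameters are appended. A condensed sketch of the pattern (class and property names mirror the diff; `build_start_url` is stubbed and the override value is made up):

```js
// Condensed illustration of the resolution order introduced by this PR.
class Scraper {
    constructor(config = {}) { this.config = config; }
    build_start_url(baseUrl) { return baseUrl; } // real version appends configured query params
    get startUrl() {
        return this.build_start_url(this.config.startUrl || this.defaultStartUrl);
    }
}

class DuckduckgoScraper extends Scraper {
    defaultStartUrl = 'https://duckduckgo.com/';
}

console.log(new DuckduckgoScraper().startUrl);
// -> 'https://duckduckgo.com/' (engine default)
console.log(new DuckduckgoScraper({ startUrl: 'https://duckduckgo.com/?kl=fr-fr&' }).startUrl);
// -> the explicit config.startUrl wins
```
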
src/modules/yandex.js (9 changes: 5 additions & 4 deletions)
@@ -4,6 +4,8 @@ const Scraper = require('./se_scraper');

 class YandexScraper extends Scraper {

+    defaultStartUrl = 'https://yandex.com';
+
     constructor(...args) {
         super(...args);
     }
@@ -71,11 +73,10 @@ class YandexScraper extends Scraper {
     }

     async load_start_page() {
-        let startUrl = 'https://yandex.com';
-
-        this.logger.info('Using startUrl: ' + startUrl);

-        this.last_response = await this.page.goto(startUrl);
+        this.logger.info('Using startUrl: ' + this.startUrl);
+
+        this.last_response = await this.page.goto(this.startUrl);

         await this.page.waitForSelector('input[name="text"]', { timeout: this.STANDARD_TIMEOUT });

src/node_scraper.js (3 changes: 3 additions & 0 deletions)
@@ -139,6 +139,9 @@ class ScrapeManager {
             //custom_func: resolve('examples/pluggable.js'),
             custom_func: null,
             throw_on_detection: false,
+            keep_html_on_error: false,
+            standard_timeout: 10000,
+            solve_captcha_time: 45000,
             // List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
             proxies: null,
             // a file with one proxy per line. Example:
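
The three new defaults correspond one-to-one to values that were hard-coded before this PR (see the se_scraper.js hunk above). A sketch of overriding them through the browser config, assuming the `ScrapeManager` entry point exported by the package; the option values shown are illustrative:

```js
// Sketch only; the three option names are the ones added above.
const { ScrapeManager } = require('se-scraper'); // '@monibrand/se-scraper' in this fork

(async () => {
    const manager = new ScrapeManager({
        keep_html_on_error: true,  // attach page HTML and last URL to scrape errors
        standard_timeout: 20000,   // ms; previously hard-coded to 10000
        solve_captcha_time: 60000, // ms; previously hard-coded to 45000
    });

    await manager.start();
    const results = await manager.scrape({ search_engine: 'bing', keywords: ['test'], num_pages: 1 });
    console.dir(results, { depth: null });
    await manager.quit();
})();
```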