Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# Search Engine Scraper - se-scraper

[![npm](https://img.shields.io/npm/v/se-scraper.svg?style=for-the-badge)](https://www.npmjs.com/package/se-scraper)
[![Donate](https://img.shields.io/badge/donate-paypal-blue.svg?style=for-the-badge)](https://www.paypal.me/incolumitas)
[![Known Vulnerabilities](https://snyk.io/test/github/NikolaiT/se-scraper/badge.svg)](https://snyk.io/test/github/NikolaiT/se-scraper)
THIS IS A CUSTOM FORK of se-scraper; our changes have been, or will be,
upstreamed.

The original package is [here](https://www.npmjs.com/package/se-scraper).

This node module allows you to scrape search engines concurrently with different proxies.

Expand Down Expand Up @@ -506,4 +507,4 @@ let scrape_config = {
num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
},
}
```
```
8 changes: 4 additions & 4 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"name": "se-scraper",
"version": "1.5.7",
"name": "@monibrand/se-scraper",
"version": "1.6.0-rc.2",
"description": "A module using puppeteer to scrape several search engines such as Google, Bing and Duckduckgo",
"homepage": "https://scrapeulous.com/",
"homepage": "https://monibrand.com/",
"main": "index.js",
"scripts": {
"test": "mocha test test/modules"
Expand All @@ -17,7 +17,7 @@
"author": "Nikolai Tschacher <[email protected]> (https://incolumitas.com/)",
"repository": {
"type": "git",
"url": "https://github.com/NikolaiT/se-scraper"
"url": "https://github.com/Monibrand/se-scraper"
},
"license": "ISC",
"dependencies": {
Expand Down
10 changes: 8 additions & 2 deletions src/modules/se_scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ module.exports = class Scraper {
this.proxy = config.proxy;
this.keywords = config.keywords;

this.STANDARD_TIMEOUT = 10000;
this.SOLVE_CAPTCHA_TIME = 45000;
this.STANDARD_TIMEOUT = config.standard_timeout;
this.SOLVE_CAPTCHA_TIME = config.solve_captcha_time;

this.results = {};
this.result_rank = 1;
Expand Down Expand Up @@ -272,6 +272,12 @@ module.exports = class Scraper {
await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
}

if (this.config.keep_html_on_error){
const html_error = await this.page.content();
e.keep_html_on_error = html_error;
e.lastUrl = await this.page.evaluate(() => {return window.location.href;});
}

this.metadata.scraping_detected = await this.detected();

if (this.metadata.scraping_detected === true) {
Expand Down
3 changes: 3 additions & 0 deletions src/node_scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ class ScrapeManager {
//custom_func: resolve('examples/pluggable.js'),
custom_func: null,
throw_on_detection: false,
keep_html_on_error: false,
standard_timeout: 10000,
solve_captcha_time: 45000,
// List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
proxies: null,
// a file with one proxy per line. Example:
Expand Down
109 changes: 109 additions & 0 deletions test/keep_html_on_error.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');

const debug = require('debug')('se-scraper:test');
const se_scraper = require('..');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res) => {
debug('q=%s', req.query.q);
const pageNumber = ((req.query.start/10) || 0) + 1;
res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));

describe('Config', function(){

let httpServer, httpsServer, proxy;
before(async function(){
// Here mount our fake engine in both http and https listen server
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
return callback();
});

await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});

after(function(){
httpsServer.close();
httpServer.close();
proxy.close();
});

describe('keep_html_on_error', function(){

const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});

/**
* Test html_output option
*/
it('html_output single page single keyword', async function () {

const scrape_job = {
search_engine: 'google',
/* TODO refactor start_url
google_settings: {
start_url: 'http://localhost:' + httpPort
},
*/
keywords: ['test error'],
};

var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
keep_html_on_error: true,
logger: testLogger,
//clean_html_output: false,
//clean_data_images: false,
// TODO refactor start_url so we can use-it instead of depending of the proxy for this test
proxies: ['http://localhost:' + proxyPort],
use_proxies_only: true,
standard_timeout: 500,
});
await scraper.start();
await assert.rejects(
async () => {
await scraper.scrape(scrape_job);
},
(error) => {
console.log(error);
assert(error.html_on_error, 'Error is containing the html output');
return /#fbar/.test(error.name);
}
)
await scraper.quit();

});

});

});
1 change: 1 addition & 0 deletions test/mocks/google/test error_page1.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
dfjefiojifiefjoezji jiofjijifjeziojfioj jigjieozjfioejzij ijfijezifjoizejiofj