From f4b91b3cf635ad60218834afd9bb0376ea9cb6f2 Mon Sep 17 00:00:00 2001
From: HugoPoi
Date: Tue, 6 Aug 2019 12:44:41 +0200
Subject: [PATCH 1/2] watermark fork version in package.json and README.md

---
 README.md    | 9 +++++----
 package.json | 8 ++++----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 85ae5a6..f39192a 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,9 @@
 # Search Engine Scraper - se-scraper
 
-[![npm](https://img.shields.io/npm/v/se-scraper.svg?style=for-the-badge)](https://www.npmjs.com/package/se-scraper)
-[![Donate](https://img.shields.io/badge/donate-paypal-blue.svg?style=for-the-badge)](https://www.paypal.me/incolumitas)
-[![Known Vulnerabilities](https://snyk.io/test/github/NikolaiT/se-scraper/badge.svg)](https://snyk.io/test/github/NikolaiT/se-scraper)
+THIS IS A CUSTOM FORK of se-scraper, our changes will or have been
+upstreamed.
+
+The original package is [here](https://www.npmjs.com/package/se-scraper).
 
 This node module allows you to scrape search engines concurrently with different proxies.
 
@@ -506,4 +507,4 @@ let scrape_config = {
         num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
     },
 }
-```
\ No newline at end of file
+```

diff --git a/package.json b/package.json
index 8533f13..24a562c 100644
--- a/package.json
+++ b/package.json
@@ -1,8 +1,8 @@
 {
-  "name": "se-scraper",
-  "version": "1.5.7",
+  "name": "@monibrand/se-scraper",
+  "version": "1.6.0-rc.2",
   "description": "A module using puppeteer to scrape several search engines such as Google, Bing and Duckduckgo",
-  "homepage": "https://scrapeulous.com/",
+  "homepage": "https://monibrand.com/",
   "main": "index.js",
   "scripts": {
     "test": "mocha test test/modules"
@@ -17,7 +17,7 @@
   "author": "Nikolai Tschacher (https://incolumitas.com/)",
   "repository": {
     "type": "git",
-    "url": "https://github.com/NikolaiT/se-scraper"
+    "url": "https://github.com/Monibrand/se-scraper"
   },
   "license": "ISC",
   "dependencies": {

From 4c106c29147c82c3cfdcc2b77084286ef56dec55 Mon Sep 17 00:00:00 2001
From: Damien
Date: Fri, 24 Jan 2020 16:22:11 +0100
Subject: [PATCH 2/2] Add an option to retrieve the HTML in case of error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixing details for fbar-selector error

fix: removing console log
---
 src/modules/se_scraper.js               |  10 ++-
 src/node_scraper.js                     |   3 +
 test/keep_html_on_error.js              | 109 ++++++++++++++++++++++++
 test/mocks/google/test error_page1.html |   1 +
 4 files changed, 121 insertions(+), 2 deletions(-)
 create mode 100644 test/keep_html_on_error.js
 create mode 100644 test/mocks/google/test error_page1.html

diff --git a/src/modules/se_scraper.js b/src/modules/se_scraper.js
index 3a453ac..2ace925 100644
--- a/src/modules/se_scraper.js
+++ b/src/modules/se_scraper.js
@@ -31,8 +31,8 @@ module.exports = class Scraper {
         this.proxy = config.proxy;
         this.keywords = config.keywords;
 
-        this.STANDARD_TIMEOUT = 10000;
-        this.SOLVE_CAPTCHA_TIME = 45000;
+        this.STANDARD_TIMEOUT = config.standard_timeout;
+        this.SOLVE_CAPTCHA_TIME = config.solve_captcha_time;
 
         this.results = {};
         this.result_rank = 1;
@@ -272,6 +272,12 @@ module.exports = class Scraper {
                 await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
             }
 
+            if (this.config.keep_html_on_error){
+                const html_error = await this.page.content();
+                e.keep_html_on_error = html_error;
+                e.lastUrl = await this.page.evaluate(() => {return window.location.href;});
+            }
+
             this.metadata.scraping_detected = await this.detected();
 
             if (this.metadata.scraping_detected === true) {
diff --git a/src/node_scraper.js b/src/node_scraper.js
index 2dec432..0671efd 100644
--- a/src/node_scraper.js
+++ b/src/node_scraper.js
@@ -139,6 +139,9 @@ class ScrapeManager {
             //custom_func: resolve('examples/pluggable.js'),
             custom_func: null,
             throw_on_detection: false,
+            keep_html_on_error: false,
+            standard_timeout: 10000,
+            solve_captcha_time: 45000,
             // List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
             proxies: null,
             // a file with one proxy per line. Example:
diff --git a/test/keep_html_on_error.js b/test/keep_html_on_error.js
new file mode 100644
index 0000000..631f436
--- /dev/null
+++ b/test/keep_html_on_error.js
@@ -0,0 +1,109 @@
+'use strict';
+const express = require('express');
+const { createLogger, transports } = require('winston');
+const http = require('http');
+const https = require('https');
+const assert = require('assert');
+const path = require('path');
+const keyCert = require('key-cert');
+const Promise = require('bluebird');
+const Proxy = require('http-mitm-proxy');
+
+const debug = require('debug')('se-scraper:test');
+const se_scraper = require('..');
+
+const httpPort = 3012;
+const httpsPort = httpPort + 1;
+const proxyPort = httpPort + 2;
+
+const fakeSearchEngine = express();
+fakeSearchEngine.get('/search', (req, res) => {
+    debug('q=%s', req.query.q);
+    const pageNumber = ((req.query.start/10) || 0) + 1;
+    res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
+});
+fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));
+
+describe('Config', function(){
+
+    let httpServer, httpsServer, proxy;
+    before(async function(){
+        // Here mount our fake engine in both http and https listen server
+        httpServer = http.createServer(fakeSearchEngine);
+        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);
+
+        proxy = Proxy();
+        proxy.onRequest((ctx, callback) => {
+            ctx.proxyToServerRequestOptions.host = 'localhost';
+            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
+            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
+            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
+            return callback();
+        });
+
+        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
+        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
+        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
+        debug('Fake http search engine servers started');
+    });
+
+    after(function(){
+        httpsServer.close();
+        httpServer.close();
+        proxy.close();
+    });
+
+    describe('keep_html_on_error', function(){
+
+        const testLogger = createLogger({
+            transports: [
+                new transports.Console({
+                    level: 'error'
+                })
+            ]
+        });
+
+        /**
+         * Test html_output option
+         */
+        it('html_output single page single keyword', async function () {
+
+            const scrape_job = {
+                search_engine: 'google',
+                /* TODO refactor start_url
+                google_settings: {
+                    start_url: 'http://localhost:' + httpPort
+                },
+                */
+                keywords: ['test error'],
+            };
+
+            var scraper = new se_scraper.ScrapeManager({
+                throw_on_detection: true,
+                keep_html_on_error: true,
+                logger: testLogger,
+                //clean_html_output: false,
+                //clean_data_images: false,
+                // TODO refactor start_url so we can use-it instead of depending of the proxy for this test
+                proxies: ['http://localhost:' + proxyPort],
+                use_proxies_only: true,
+                standard_timeout: 500,
+            });
+            await scraper.start();
+            await assert.rejects(
+                async () => {
+                    await scraper.scrape(scrape_job);
+                },
+                (error) => {
+                    console.log(error);
+                    assert(error.html_on_error, 'Error is containing the html output');
+                    return /#fbar/.test(error.name);
+                }
+            )
+            await scraper.quit();
+
+        });
+
+    });
+
+});
\ No newline at end of file
diff --git a/test/mocks/google/test error_page1.html b/test/mocks/google/test error_page1.html
new file mode 100644
index 0000000..f8f0bd6
--- /dev/null
+++ b/test/mocks/google/test error_page1.html
@@ -0,0 +1 @@
+dfjefiojifiefjoezji jiofjijifjeziojfioj jigjieozjfioejzij ijfijezifjoizejiofj
\ No newline at end of file
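
For context, a minimal sketch of how a consumer script might enable the options introduced by this patch (`keep_html_on_error`, `standard_timeout`, `solve_captcha_time`). The scrape job shape mirrors the test above; the error fields read in the catch block are the `keep_html_on_error` and `lastUrl` properties attached in src/modules/se_scraper.js, and the package name assumes the fork is installed as `@monibrand/se-scraper`:

```js
'use strict';
const se_scraper = require('@monibrand/se-scraper');

(async () => {
    const scraper = new se_scraper.ScrapeManager({
        throw_on_detection: true,
        keep_html_on_error: true,   // new option: attach page HTML and last URL to thrown errors
        standard_timeout: 10000,    // ms, previously hard-coded in se_scraper.js
        solve_captcha_time: 45000,  // ms, previously hard-coded in se_scraper.js
    });

    await scraper.start();
    try {
        const results = await scraper.scrape({
            search_engine: 'google',
            keywords: ['test error'],
        });
        console.dir(results, { depth: null });
    } catch (err) {
        // With keep_html_on_error enabled, the raw page HTML and the final URL
        // are available on the error object for debugging failed selectors.
        console.error('last url:', err.lastUrl);
        console.error('page html length:', (err.keep_html_on_error || '').length);
    } finally {
        await scraper.quit();
    }
})();
```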