10 changes: 8 additions & 2 deletions src/modules/se_scraper.js
@@ -31,8 +31,8 @@ module.exports = class Scraper {
 		this.proxy = config.proxy;
 		this.keywords = config.keywords;
 
-		this.STANDARD_TIMEOUT = 10000;
-		this.SOLVE_CAPTCHA_TIME = 45000;
+		this.STANDARD_TIMEOUT = config.standard_timeout;
+		this.SOLVE_CAPTCHA_TIME = config.solve_captcha_time;
 
 		this.results = {};
 		this.result_rank = 1;
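For context, a minimal sketch of how the now-configurable timeouts could be set from calling code. The option names come from this diff (their defaults are added in src/node_scraper.js below); the values and the keyword are illustrative:

```js
const se_scraper = require('se-scraper');

(async () => {
	// Override the previously hard-coded timeouts (values are illustrative).
	const scraper = new se_scraper.ScrapeManager({
		standard_timeout: 20000,   // replaces the fixed 10000 ms
		solve_captcha_time: 60000, // replaces the fixed 45000 ms
	});
	await scraper.start();
	const results = await scraper.scrape({
		search_engine: 'google',
		keywords: ['example query'],
	});
	await scraper.quit();
})();
```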
@@ -272,6 +272,12 @@ module.exports = class Scraper {
 			await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
 		}
 
+		if (this.config.keep_html_on_error) {
+			const html_error = await this.page.content();
+			e.html_on_error = html_error;
+			e.lastUrl = await this.page.evaluate(() => window.location.href);
+		}
+
 		this.metadata.scraping_detected = await this.detected();
 
 		if (this.metadata.scraping_detected === true) {
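A sketch of how a caller might consume the HTML captured above. It assumes an async context with a started ScrapeManager, keep_html_on_error enabled, and an illustrative output path:

```js
const fs = require('fs');

// Assumes an async context, a started ScrapeManager, and keep_html_on_error: true.
try {
	await scraper.scrape(scrape_job);
} catch (error) {
	// html_on_error and lastUrl are attached by the block added above.
	if (error.html_on_error) {
		fs.writeFileSync('debug_last_page.html', error.html_on_error);
		console.error(`scrape failed on ${error.lastUrl}; HTML saved for inspection`);
	}
	throw error;
}
```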
3 changes: 3 additions & 0 deletions src/node_scraper.js
@@ -139,6 +139,9 @@ class ScrapeManager {
 			//custom_func: resolve('examples/pluggable.js'),
 			custom_func: null,
 			throw_on_detection: false,
+			keep_html_on_error: false,
+			standard_timeout: 10000,
+			solve_captcha_time: 45000,
 			// List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
 			proxies: null,
 			// a file with one proxy per line. Example:
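These defaults mirror the previously hard-coded constants, so configs that omit them keep the old behavior. An illustrative check, assuming the usual defaults-then-overrides merge of the config object:

```js
// Illustrative only: user-supplied options win, unspecified ones fall back.
const defaults = { standard_timeout: 10000, solve_captcha_time: 45000 };
const userConfig = { standard_timeout: 500 }; // e.g. the test below

const effective = Object.assign({}, defaults, userConfig);
console.log(effective.standard_timeout);   // 500
console.log(effective.solve_captcha_time); // 45000
```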
108 changes: 108 additions & 0 deletions test/keep_html_on_error.js
@@ -0,0 +1,108 @@
'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');

const debug = require('debug')('se-scraper:test');
const se_scraper = require('..');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res) => {
	debug('q=%s', req.query.q);
	const pageNumber = ((req.query.start/10) || 0) + 1;
	res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));

describe('Config', function(){

	let httpServer, httpsServer, proxy;
	before(async function(){
		// Mount our fake search engine on both an http and an https server
		httpServer = http.createServer(fakeSearchEngine);
		httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

		proxy = Proxy();
		proxy.onRequest((ctx, callback) => {
			ctx.proxyToServerRequestOptions.host = 'localhost';
			ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
			ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
			debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
			return callback();
		});

		await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
		await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
		await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
		debug('Fake http search engine servers started');
	});

	after(function(){
		httpsServer.close();
		httpServer.close();
		proxy.close();
	});

	describe('keep_html_on_error', function(){

		const testLogger = createLogger({
			transports: [
				new transports.Console({
					level: 'error'
				})
			]
		});

		/**
		 * Test the keep_html_on_error option
		 */
		it('keep_html_on_error single page single keyword', async function () {

			const scrape_job = {
				search_engine: 'google',
				/* TODO refactor start_url
				google_settings: {
					start_url: 'http://localhost:' + httpPort
				},
				*/
				keywords: ['test error'],
			};

			const scraper = new se_scraper.ScrapeManager({
				throw_on_detection: true,
				keep_html_on_error: true,
				logger: testLogger,
				//clean_html_output: false,
				//clean_data_images: false,
				// TODO refactor start_url so we can use it instead of depending on the proxy for this test
				proxies: ['http://localhost:' + proxyPort],
				use_proxies_only: true,
				// Short timeout: the mock page lacks the expected #fbar element, so the wait fails quickly
				standard_timeout: 500,
			});
			await scraper.start();
			await assert.rejects(
				async () => {
					await scraper.scrape(scrape_job);
				},
				(error) => {
					assert(error.html_on_error, 'Error should contain the HTML output');
					return /#fbar/.test(error.message);
				}
			);
			await scraper.quit();

		});

	});

});
1 change: 1 addition & 0 deletions test/mocks/google/test error_page1.html
@@ -0,0 +1 @@
THIS IS AN EMPTY PAGE TO THROW SOME ERROR IN SE-SCRAPER
2 changes: 1 addition & 1 deletion test/proxy.js
@@ -21,7 +21,7 @@ fakeSearchEngine.set('trust proxy', 'loopback');
 fakeSearchEngine.get('/test-proxy', (req, res) => {
 	debug('fake-search-engine req.hostname=%s', req.hostname);
 	//debug('req to', req.socket.localAddress, req.socket.localPort);
-	res.send(req.hostname);
+	setTimeout(() => res.send(req.hostname), 100); // Delay the response to avoid a race condition in the first test
 });
 
 describe('Config', function(){