Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# Search Engine Scraper - se-scraper

[![npm](https://img.shields.io/npm/v/se-scraper.svg?style=for-the-badge)](https://www.npmjs.com/package/se-scraper)
[![Donate](https://img.shields.io/badge/donate-paypal-blue.svg?style=for-the-badge)](https://www.paypal.me/incolumitas)
[![Known Vulnerabilities](https://snyk.io/test/github/NikolaiT/se-scraper/badge.svg)](https://snyk.io/test/github/NikolaiT/se-scraper)
THIS IS A CUSTOM FORK of se-scraper; our changes have been, or will be,
upstreamed.

The original package is [here](https://www.npmjs.com/package/se-scraper).

This node module allows you to scrape search engines concurrently with different proxies.

Expand Down Expand Up @@ -506,4 +507,4 @@ let scrape_config = {
num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
},
}
```
```
8 changes: 4 additions & 4 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"name": "se-scraper",
"version": "1.5.7",
"name": "@monibrand/se-scraper",
"version": "1.6.0-rc.2",
"description": "A module using puppeteer to scrape several search engines such as Google, Bing and Duckduckgo",
"homepage": "https://scrapeulous.com/",
"homepage": "https://monibrand.com/",
"main": "index.js",
"scripts": {
"test": "mocha test test/modules"
Expand All @@ -17,7 +17,7 @@
"author": "Nikolai Tschacher <[email protected]> (https://incolumitas.com/)",
"repository": {
"type": "git",
"url": "https://github.com/NikolaiT/se-scraper"
"url": "https://github.com/Monibrand/se-scraper"
},
"license": "ISC",
"dependencies": {
Expand Down
10 changes: 8 additions & 2 deletions src/modules/se_scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ module.exports = class Scraper {
this.proxy = config.proxy;
this.keywords = config.keywords;

this.STANDARD_TIMEOUT = 10000;
this.SOLVE_CAPTCHA_TIME = 45000;
this.STANDARD_TIMEOUT = config.standard_timeout;
this.SOLVE_CAPTCHA_TIME = config.solve_captcha_time;

this.results = {};
this.result_rank = 1;
Expand Down Expand Up @@ -272,6 +272,12 @@ module.exports = class Scraper {
await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
}

if (this.config.keep_html_on_error){
const html_error = await this.page.content();
e.keep_html_on_error = html_error;
e.lastUrl = await this.page.evaluate(() => {return window.location.href;});
}

this.metadata.scraping_detected = await this.detected();

if (this.metadata.scraping_detected === true) {
Expand Down
3 changes: 3 additions & 0 deletions src/node_scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ class ScrapeManager {
//custom_func: resolve('examples/pluggable.js'),
custom_func: null,
throw_on_detection: false,
keep_html_on_error: false,
standard_timeout: 10000,
solve_captcha_time: 45000,
// List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
proxies: null,
// a file with one proxy per line. Example:
Expand Down
109 changes: 109 additions & 0 deletions test/keep_html_on_error.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');

const debug = require('debug')('se-scraper:test');
const se_scraper = require('..');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res) => {
debug('q=%s', req.query.q);
const pageNumber = ((req.query.start/10) || 0) + 1;
res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));

describe('Config', function(){

let httpServer, httpsServer, proxy;
before(async function(){
// Here mount our fake engine in both http and https listen server
httpServer = http.createServer(fakeSearchEngine);
httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

proxy = Proxy();
proxy.onRequest((ctx, callback) => {
ctx.proxyToServerRequestOptions.host = 'localhost';
ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
return callback();
});

await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
debug('Fake http search engine servers started');
});

after(function(){
httpsServer.close();
httpServer.close();
proxy.close();
});

describe('keep_html_on_error', function(){

const testLogger = createLogger({
transports: [
new transports.Console({
level: 'error'
})
]
});

/**
* Test html_output option
*/
it('html_output single page single keyword', async function () {

const scrape_job = {
search_engine: 'google',
/* TODO refactor start_url
google_settings: {
start_url: 'http://localhost:' + httpPort
},
*/
keywords: ['test error'],
};

var scraper = new se_scraper.ScrapeManager({
throw_on_detection: true,
keep_html_on_error: true,
logger: testLogger,
//clean_html_output: false,
//clean_data_images: false,
// TODO refactor start_url so we can use-it instead of depending of the proxy for this test
proxies: ['http://localhost:' + proxyPort],
use_proxies_only: true,
standard_timeout: 500,
});
await scraper.start();
await assert.rejects(
async () => {
await scraper.scrape(scrape_job);
},
(error) => {
console.log(error);
assert(error.html_on_error, 'Error is containing the html output');
return /#fbar/.test(error.name);
}
)
await scraper.quit();

});

});

});
1 change: 1 addition & 0 deletions test/mocks/google/test error_page1.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
dfjefiojifiefjoezji jiofjijifjeziojfioj jigjieozjfioejzij ijfijezifjoizejiofj