README.md (9 changes: 5 additions & 4 deletions)
@@ -1,8 +1,9 @@
 # Search Engine Scraper - se-scraper

-[![npm](https://img.shields.io/npm/v/se-scraper.svg?style=for-the-badge)](https://www.npmjs.com/package/se-scraper)
-[![Donate](https://img.shields.io/badge/donate-paypal-blue.svg?style=for-the-badge)](https://www.paypal.me/incolumitas)
-[![Known Vulnerabilities](https://snyk.io/test/github/NikolaiT/se-scraper/badge.svg)](https://snyk.io/test/github/NikolaiT/se-scraper)
+THIS IS A CUSTOM FORK of se-scraper; our changes have been or will be upstreamed.
+
+The original package is [here](https://www.npmjs.com/package/se-scraper).
+
 This node module allows you to scrape search engines concurrently with different proxies.
@@ -506,4 +507,4 @@ let scrape_config = {
         num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
     },
 }
-```
+```
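
For orientation (this example is not part of the diff): a minimal sketch of how a `scrape_config` like the one ending above is typically passed to the library, following the upstream README's `se_scraper.scrape(browser_config, scrape_config)` entry point. The package name, keywords, and option values are illustrative.

```js
// Minimal sketch, assuming the upstream scrape() API; not part of this PR.
const se_scraper = require('se-scraper'); // or '@monibrand/se-scraper' for this fork

(async () => {
    let scrape_config = {
        search_engine: 'google',
        keywords: ['news'],
        num_pages: 1,
        google_settings: {
            num: 100, // ask Google for up to 100 results per page, as documented above
        },
    };

    // First argument is the browser config; an empty object keeps the defaults.
    const results = await se_scraper.scrape({}, scrape_config);
    console.dir(results, { depth: null });
})();
```
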
package.json (8 changes: 4 additions & 4 deletions)
@@ -1,8 +1,8 @@
 {
-  "name": "se-scraper",
-  "version": "1.5.7",
+  "name": "@monibrand/se-scraper",
+  "version": "1.6.0-rc.8",
   "description": "A module using puppeteer to scrape several search engines such as Google, Bing and Duckduckgo",
-  "homepage": "https://scrapeulous.com/",
+  "homepage": "https://monibrand.com/",
   "main": "index.js",
   "scripts": {
     "test": "mocha test test/modules"
@@ -17,7 +17,7 @@
"author": "Nikolai Tschacher <[email protected]> (https://incolumitas.com/)",
"repository": {
"type": "git",
"url": "https://github.com/NikolaiT/se-scraper"
"url": "https://github.com/Monibrand/se-scraper"
},
"license": "ISC",
"dependencies": {
src/modules/bing.js (2 changes: 2 additions & 0 deletions)
@@ -2,6 +2,8 @@ const cheerio = require('cheerio');
 const Scraper = require('./se_scraper');

 class BingScraper extends Scraper {
+
+    defaultStartUrl = this.build_start_url('https://www.bing.com/search?') || 'https://www.bing.com/';

     async parse_async(html) {
src/modules/duckduckgo.js (7 changes: 3 additions & 4 deletions)
@@ -4,6 +4,8 @@ const debug = require('debug')('se-scraper:DuckduckgoScraper');

 class DuckduckgoScraper extends Scraper {

+    defaultStartUrl = 'https://duckduckgo.com/';
+
     parse(html) {
         debug('parse');
         // load the page source into cheerio
@@ -46,11 +48,8 @@ class DuckduckgoScraper extends Scraper {

     async load_start_page() {
         debug('load_start_page');
-        let startUrl = 'https://duckduckgo.com/';
-
-        this.last_response = await this.page.goto(startUrl);
+        this.last_response = await this.page.goto(this.startUrl);
         await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
-
         return true;
     }

src/modules/google.js (78 changes: 29 additions & 49 deletions)
@@ -5,6 +5,8 @@ const Scraper = require('./se_scraper');

 class GoogleScraper extends Scraper {

+    defaultStartUrl = 'https://www.google.com';
+
     constructor(...args) {
         super(...args);
     }
@@ -13,13 +15,13 @@ class GoogleScraper extends Scraper {

         const results = await this.page.evaluate(() => {

-            let _text = (el, s) => {
+            let _text = (el, s, onlyFirstTextNode) => {
                 let n = el.querySelector(s);

                 if (n) {
-                    return n.innerText;
+                    return (onlyFirstTextNode) ? n.childNodes[0].nodeValue : n.innerText;
                 } else {
-                    return '';
+                    return;
                 }
             };
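
An illustrative aside (not part of the diff) on why `onlyFirstTextNode` was added: on a shopping-ad price node, `innerText` flattens the discounted price and the struck-through original price into one string, while `childNodes[0].nodeValue` reads only the leading text node. The markup below is a hypothetical simplification of such a node:

```js
// Given markup like:  <div class="price">€19.99<span>€29.99</span></div>
// _text(el, '.price', true)   // -> '€19.99'        (first text node: current price)
// _text(el, '.price')         // -> '€19.99€29.99'  (innerText concatenates both)
// _text(el, '.price > span')  // -> '€29.99'        (what the new originalPrice field reads)
```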

@@ -29,7 +31,7 @@
                 if (n) {
                     return n.getAttribute(attr);
                 } else {
-                    return null;
+                    return;
                 }
             };

@@ -111,14 +113,14 @@ class GoogleScraper extends Scraper {
             // parse right side product information
             results.right_info.review = _attr(document, '#rhs .cu-container g-review-stars span', 'aria-label');

-            let title_el = document.querySelector('#rhs .cu-container g-review-stars');
+            let title_el = document.querySelector('#rhs .cu-container .Q7Oxbd');
             if (title_el) {
-                results.right_info.review.title = title_el.parentNode.querySelector('div:first-child').innerText;
+                results.right_info.title = title_el.innerText;
             }

-            let num_reviews_el = document.querySelector('#rhs .cu-container g-review-stars');
+            let num_reviews_el = document.querySelector('#rhs .cu-container .PGDKUd');
             if (num_reviews_el) {
-                results.right_info.num_reviews = num_reviews_el.parentNode.querySelector('div:nth-of-type(2)').innerText;
+                results.right_info.num_reviews = num_reviews_el.innerText;
             }

             results.right_info.vendors = [];
@@ -127,20 +129,16 @@
             document.querySelectorAll('#rhs .cu-container .rhsvw > div > div:nth-child(4) > div > div:nth-child(3) > div').forEach((el) => {
                 results.right_info.vendors.push({
                     price: _text(el, 'span:nth-of-type(1)'),
-                    merchant_name: _text(el, 'span:nth-child(3) a:nth-child(2)'),
+                    merchant_name: _text(el, '.doUe3s0oL2B__jackpot-merchant a'),
                     merchant_ad_link: _attr(el, 'span:nth-child(3) a:first-child', 'href'),
-                    merchant_link: _attr(el, 'span:nth-child(3) a:nth-child(2)', 'href'),
+                    merchant_link: _attr(el, 'span:nth-child(3) a:nth-child(2)', 'href'), // TODO: this selector no longer matches
                     source_name: _text(el, 'span:nth-child(4) a'),
                     source_link: _attr(el, 'span:nth-child(4) a', 'href'),
-                    info: _text(el, 'div span'),
-                    shipping: _text(el, 'span:last-child > span'),
+                    info: _text(el, '.SdBHnc.e2CF7c'),
+                    shipping: _text(el, '.JfwJme'),
                 })
             });

-            if (!results.right_info.title) {
-                results.right_info = {};
-            }
-
             let right_side_info_el = document.getElementById('rhs');

             if (right_side_info_el) {
@@ -151,26 +149,19 @@
                 }
             }

-            // parse top main column product information
-            // #tvcap .pla-unit
-            document.querySelectorAll('#tvcap .pla-unit').forEach((el) => {
+            // Parse Google Shopping ads (top or left column)
+            document.querySelectorAll('.pla-unit').forEach((el) => {
                 let top_product = {
                     tracking_link: _attr(el, '.pla-unit-title a:first-child', 'href'),
                     link: _attr(el, '.pla-unit-title a:nth-child(2)', 'href'),
                     title: _text(el, '.pla-unit-title a:nth-child(2) span'),
-                    price: _text(el, '.pla-unit-title + div'),
-                    shipping: _text(el, '.pla-extensions-container div:nth-of-type(1)'),
-                    vendor_link: _attr(el, '.pla-extensions-container div > a', 'href'),
+                    price: _text(el, '.pla-unit-title + div', true),
+                    originalPrice: _text(el, '.pla-unit-title + div > span'),
+                    shipping: _text(el, '.pla-extensions-container .cYBBsb'),
+                    vendor_link: _attr(el, '.pla-extensions-container a.FfKHB', 'href'),
+                    merchant_name: _text(el, '.LbUacb span:nth-child(1)'),
                 };

-                let merchant_node = el.querySelector('.pla-unit-title');
-                if (merchant_node) {
-                    let node = merchant_node.parentNode.querySelector('div > span');
-                    if (node) {
-                        top_product.merchant_name = node.innerText;
-                    }
-                }
-
                 results.top_products.push(top_product);
             });

@@ -224,29 +215,18 @@ class GoogleScraper extends Scraper {
     }

     async load_start_page() {
-        let startUrl = 'https://www.google.com';
-
-        if (this.config.google_settings) {
-            startUrl = `https://www.${this.config.google_settings.google_domain}/search?q=`;
-            if (this.config.google_settings.google_domain) {
-                startUrl = `https://www.${this.config.google_settings.google_domain}/search?`;
-            } else {
-                startUrl = `https://www.google.com/search?`;
-            }
-
-            for (var key in this.config.google_settings) {
-                if (key !== 'google_domain') {
-                    startUrl += `${key}=${this.config.google_settings[key]}&`
-                }
-            }
-        }
-
-        this.logger.info('Using startUrl: ' + startUrl);
-
-        this.last_response = await this.page.goto(startUrl);
-
-        await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
-
+        this.logger.info('Using startUrl: ' + this.startUrl);
+        this.last_response = await this.page.goto(this.startUrl);
+
+        await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT });
+
+        const buttonAccepted = await this.page.$('#L2AGLb');
+        if (buttonAccepted) {
+            await this.page.evaluate(() => {
+                document.querySelector('#L2AGLb').click();
+            });
+        }
+
         return true;
     }
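
Note (not part of the diff): `#L2AGLb` is the id of the "Accept all" button on Google's cookie-consent interstitial; clicking it keeps the consent page from blocking the `input[name="q"]` selector wait above.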

src/modules/infospace.js (5 changes: 1 addition & 4 deletions)
@@ -41,11 +41,8 @@ class InfospaceScraper extends Scraper {
     }

     async load_start_page() {
-
-        let startUrl = this.build_start_url('http://search.infospace.com/search/web?') || 'http://infospace.com/index.html';
-
         try {
-            this.last_response = await this.page.goto(startUrl);
+            this.last_response = await this.page.goto(this.startUrl);
             await this.page.waitForSelector('input[name="q"]', { timeout: 5000 });
         } catch (e) {
             return false;
src/modules/se_scraper.js (15 changes: 12 additions & 3 deletions)
@@ -31,8 +31,8 @@ module.exports = class Scraper {
         this.proxy = config.proxy;
         this.keywords = config.keywords;

-        this.STANDARD_TIMEOUT = 10000;
-        this.SOLVE_CAPTCHA_TIME = 45000;
+        this.STANDARD_TIMEOUT = config.standard_timeout;
+        this.SOLVE_CAPTCHA_TIME = config.solve_captcha_time;

        this.results = {};
        this.result_rank = 1;
@@ -272,6 +272,12 @@ module.exports = class Scraper {
             await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
         }

+        if (this.config.keep_html_on_error) {
+            const html_error = await this.page.content();
+            e.html_on_error = html_error;
+            e.lastUrl = await this.page.evaluate(() => { return window.location.href; });
+        }
+
         this.metadata.scraping_detected = await this.detected();

         if (this.metadata.scraping_detected === true) {
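
A sketch (not part of the diff) of how a caller might consume the two fields attached above; the surrounding try/catch, the `manager` instance, and the output file name are assumptions, while `html_on_error` and `lastUrl` are the properties set by this hunk:

```js
// Hypothetical caller-side handling of keep_html_on_error output.
// (Runs inside an async function with an existing ScrapeManager instance.)
const fs = require('fs');

try {
    await manager.scrape(scrape_config);
} catch (e) {
    if (e.html_on_error) {
        // Dump the page HTML captured at failure time for offline debugging.
        fs.writeFileSync(`error_${Date.now()}.html`, e.html_on_error);
        console.error('Scrape failed while on', e.lastUrl);
    }
    throw e;
}
```
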
@@ -312,7 +318,6 @@
         for (var key in settings) {
             baseUrl += `${key}=${settings[key]}&`
         }
-
         this.logger.info('Using startUrl: ' + baseUrl);

         return baseUrl;
@@ -381,6 +386,10 @@ module.exports = class Scraper {

     }

+    get startUrl() {
+        return this.build_start_url(this.config.startUrl || this.defaultStartUrl);
+    }
+
     /**
      *
      * @returns true if startpage was loaded correctly.
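
Taken together with the per-engine `defaultStartUrl` fields added in the other modules, this getter centralizes start-URL resolution: an explicit `config.startUrl` wins, otherwise the engine default is used, and either value is run through `build_start_url()` so configured query parameters are appended. A condensed sketch of the pattern (class and property names mirror the diff; `build_start_url` is stubbed and the override value is made up):

```js
// Condensed illustration of the resolution order introduced by this PR.
class Scraper {
    constructor(config = {}) { this.config = config; }
    build_start_url(baseUrl) { return baseUrl; } // real version appends configured query params
    get startUrl() {
        return this.build_start_url(this.config.startUrl || this.defaultStartUrl);
    }
}

class DuckduckgoScraper extends Scraper {
    defaultStartUrl = 'https://duckduckgo.com/';
}

console.log(new DuckduckgoScraper().startUrl);
// -> 'https://duckduckgo.com/' (engine default)
console.log(new DuckduckgoScraper({ startUrl: 'https://duckduckgo.com/?kl=fr-fr&' }).startUrl);
// -> the explicit config.startUrl wins
```
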
src/modules/yandex.js (9 changes: 5 additions & 4 deletions)
@@ -4,6 +4,8 @@ const Scraper = require('./se_scraper');

 class YandexScraper extends Scraper {

+    defaultStartUrl = 'https://yandex.com';
+
     constructor(...args) {
         super(...args);
     }
@@ -71,11 +73,10 @@ class YandexScraper extends Scraper {
     }

     async load_start_page() {
-        let startUrl = 'https://yandex.com';
-
-        this.logger.info('Using startUrl: ' + startUrl);

-        this.last_response = await this.page.goto(startUrl);
+        this.logger.info('Using startUrl: ' + this.startUrl);
+
+        this.last_response = await this.page.goto(this.startUrl);

         await this.page.waitForSelector('input[name="text"]', { timeout: this.STANDARD_TIMEOUT });

src/node_scraper.js (3 changes: 3 additions & 0 deletions)
@@ -139,6 +139,9 @@ class ScrapeManager {
             //custom_func: resolve('examples/pluggable.js'),
             custom_func: null,
             throw_on_detection: false,
+            keep_html_on_error: false,
+            standard_timeout: 10000,
+            solve_captcha_time: 45000,
             // List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
             proxies: null,
             // a file with one proxy per line. Example:
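
The three new defaults correspond one-to-one to values that were hard-coded before this PR (see the se_scraper.js hunk above). A sketch of overriding them through the browser config, assuming the `ScrapeManager` entry point exported by the package; the option values shown are illustrative:

```js
// Sketch only; the three option names are the ones added above.
const { ScrapeManager } = require('se-scraper'); // '@monibrand/se-scraper' in this fork

(async () => {
    const manager = new ScrapeManager({
        keep_html_on_error: true,  // attach page HTML and last URL to scrape errors
        standard_timeout: 20000,   // ms; previously hard-coded to 10000
        solve_captcha_time: 60000, // ms; previously hard-coded to 45000
    });

    await manager.start();
    const results = await manager.scrape({ search_engine: 'bing', keywords: ['test'], num_pages: 1 });
    console.dir(results, { depth: null });
    await manager.quit();
})();
```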