Skip to content

Commit 9b27034

Browse files
authored
Update to latest Puppeteer (#26)
* update to pptr 19.11.1 * allow to explicitly specify selector, xpath or timeout to wait for in wait options * use pptr 20.1.2
1 parent f5fdc37 commit 9b27034

File tree

8 files changed

+438
-227
lines changed

8 files changed

+438
-227
lines changed

README.md

+32-10
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,32 @@ This method allows navigating to a page with a specific URL in Puppeteer.
5656
Params:
5757

5858
url - the url which puppeteer should navigate to.
59-
navigationOptions - [possible options to use for request.](https://github.com/GoogleChrome/puppeteer/blob/v1.20.0/docs/api.md#pagegotourl-options)
60-
waitOptions - [wait for selector or timeout](https://github.com/puppeteer/puppeteer/blob/v1.20.0/docs/api.md#pagewaitforselectororfunctionortimeout-options-args) after navigation completes, same as in click or scroll.
59+
navigationOptions - [possible options to use for request.](https://pptr.dev/api/puppeteer.page.goto#remarks)
60+
waitOptions - [wait for selector](https://pptr.dev/api/puppeteer.page.waitforselector), [xpath](https://pptr.dev/api/puppeteer.page.waitforxpath), or timeout after navigation completes.
61+
62+
Example request body
63+
```json5
64+
{
65+
"url": "https://example.com", // <string> URL to navigate page to. The url should include scheme, e.g. https://.
66+
"navigationOptions": { // Navigation parameters which might have the following properties:
67+
"timeout": 30000, // <number> Maximum navigation time in milliseconds, defaults to 30 seconds, pass 0 to disable timeout.
68+
// "waitUntil": <string|Array<string>> When to consider navigation succeeded, defaults to load. Given an array of event strings, navigation is considered to be successful after all events have been fired. Events can be either:
69+
// load - consider navigation to be finished when the load event is fired.
70+
// domcontentloaded - consider navigation to be finished when the DOMContentLoaded event is fired.
71+
// networkidle0 - consider navigation to be finished when there are no more than 0 network connections for at least 500 ms.
72+
// networkidle2 - consider navigation to be finished when there are no more than 2 network connections for at least 500 ms.
73+
// "referer": <string> Referer header value. If provided, it takes precedence over the referer header value set by page.setExtraHTTPHeaders().
74+
},
75+
"waitOptions": { // Wait for element or timeout after navigation completes
76+
// "timeout": <number> Wait for given timeout in milliseconds
77+
"selector": "span.target", // <string> Wait for element by selector (see https://pptr.dev/api/puppeteer.page.waitforselector)
78+
// "xpath": <string> Wait for element by xpath (see https://pptr.dev/api/puppeteer.page.waitforxpath)
79+
"options": { // <object> Options to wait for elements (see https://pptr.dev/api/puppeteer.waitforselectoroptions)
80+
"timeout": 10000
81+
}
82+
}
83+
}
84+
```
6185

6286
### **/back** and **/forward**
6387
These methods help navigate back and forward through previously seen pages.
@@ -77,9 +101,8 @@ Example request body:
77101
"delay": 0 //<number> Time to wait between mousedown and mouseup in milliseconds. Defaults to 0.
78102
},
79103
"waitOptions": {
80-
// if selectorOrTimeout is a string, then the first argument is treated as a selector or xpath, depending on whether or not it starts with '//', and the method is a shortcut for page.waitForSelector or page.waitForXPath
81-
// if selectorOrTimeout is a number, then the first argument is treated as a timeout in milliseconds and the method returns a promise which resolves after the timeout
82-
"selectorOrTimeout": 5, //default timeout is 1000ms
104+
// selector, xpath or timeout, same as in the goto method
105+
"timeout": 5000, //default timeout is 1000ms
83106
},
84107
"navigationOptions": { // use if click triggers navigation to other page; same as in goXXX methods
85108
"waitUntil": "domcontentloaded",
@@ -96,9 +119,8 @@ Example request body:
96119
{
97120
"selector": "", //<string> A selector to search for element to click. If there are multiple elements satisfying the selector, the first will be clicked.
98121
"waitOptions": {
99-
// if selectorOrTimeout is a string, then the first argument is treated as a selector or xpath, depending on whether or not it starts with '//', and the method is a shortcut for page.waitForSelector or page.waitForXPath
100-
// if selectorOrTimeout is a number, then the first argument is treated as a timeout in milliseconds and the method returns a promise which resolves after the timeout
101-
"selectorOrTimeout": 5, //default timeout is 1000ms
122+
// selector, xpath or timeout, same as in the goto method
123+
"timeout": 5000, //default timeout is 1000ms
102124
}
103125
}
104126
```
@@ -116,8 +138,8 @@ Simple example request body of goto:
116138
async function action(page, request) {
117139
await page.goto(request.query.uri);
118140
let response = { //return response that you want to see as result
119-
context_id: page.browserContext()._id,
120-
page_id: await page._target._targetId,
141+
context_id: page.browserContext().id,
142+
page_id: page.target()._targetId,
121143
html: await page.content(),
122144
cookies: await page.cookies()
123145
};

helpers/utils.js

+43-21
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,17 @@ const { proxyRequest } = require('puppeteer-proxy');
22
const PROXY_URL_KEY = 'puppeteer-service-proxy-url'
33

44
async function findContextInBrowser(browser, contextId) {
5-
6-
for (let context of browser.browserContexts()) {
7-
if (contextId === await context._id) {
5+
for (const context of browser.browserContexts()) {
6+
if (contextId === context.id) {
87
return context;
98
}
109
}
1110
throw "Context not found";
1211
}
1312

1413
async function findPageInContext(context, pageId) {
15-
for (let page of await context.pages()) {
16-
if (pageId === await page._target._targetId) {
14+
for (const page of await context.pages()) {
15+
if (pageId === page.target()._targetId) {
1716
return page;
1817
}
1918
}
@@ -22,31 +21,54 @@ async function findPageInContext(context, pageId) {
2221

2322
exports.closeContexts = async function closeContexts(browser, contextIds) {
2423
// TODO shared locks on contexts and exclusive on pages?
25-
let close_promises = [];
26-
for (let context of browser.browserContexts()) {
27-
if (contextIds.includes(context._id)) {
28-
close_promises.push(context.close());
24+
const closePromises = [];
25+
for (const context of browser.browserContexts()) {
26+
if (contextIds.includes(context.id)) {
27+
closePromises.push(context.close());
2928
}
3029
}
31-
await Promise.all(close_promises);
30+
await Promise.all(closePromises);
3231
};
3332

3433
async function wait(page, waitFor) {
35-
if (waitFor instanceof Object) {
36-
const { selectorOrTimeout, options } = waitFor;
37-
if (selectorOrTimeout) {
38-
await page.waitFor(selectorOrTimeout, options);
34+
let { selector, xpath, timeout, options } = waitFor;
35+
36+
// for compatibility with old waitFor interface
37+
const { selectorOrTimeout } = waitFor;
38+
if (selectorOrTimeout) {
39+
if (!isNaN(selectorOrTimeout)) {
40+
timeout = selectorOrTimeout;
41+
} else if (typeof selectorOrTimeout === 'string') {
42+
if (selectorOrTimeout.startsWith('//')) {
43+
xpath = selectorOrTimeout;
44+
} else {
45+
selector = selectorOrTimeout;
46+
}
3947
}
40-
} else if (waitFor) {
41-
await page.waitFor(waitFor);
48+
}
49+
50+
if ([selector, xpath, timeout].filter(Boolean).length > 1) {
51+
throw "Wait options must contain either a selector, an xpath or a timeout";
52+
}
53+
54+
if (selector) {
55+
return page.waitForSelector(selector, options);
56+
}
57+
if (xpath) {
58+
return page.waitForXPath(xpath, options);
59+
}
60+
if (timeout) {
61+
return new Promise(resolve => setTimeout(resolve, timeout));
4262
}
4363
}
4464

4565
exports.formResponse = async function formResponse(page, closePage, waitFor) {
46-
await wait(page, waitFor);
66+
if (waitFor) {
67+
await wait(page, waitFor);
68+
}
4769

48-
let response = {
49-
contextId: page.browserContext()._id,
70+
const response = {
71+
contextId: page.browserContext().id,
5072
html: await page.content(),
5173
cookies: await page.cookies(),
5274
};
@@ -56,7 +78,7 @@ exports.formResponse = async function formResponse(page, closePage, waitFor) {
5678
}
5779

5880
if (!page.isClosed()) {
59-
response.pageId = await page._target._targetId;
81+
response.pageId = page.target()._targetId;
6082
}
6183

6284
return response;
@@ -118,7 +140,7 @@ exports.getBrowserPage = async function getBrowserPage(browser, request) {
118140
exports.performAction = async function performAction(request, action) {
119141
const lock = request.app.get('lock');
120142
const page = await exports.getBrowserPage(request.app.get('browser'), request);
121-
return lock.acquire(await page._target._targetId, async () => {
143+
return lock.acquire(page.target()._targetId, async () => {
122144
let extraHeaders = {};
123145

124146
if ('body' in request && 'headers' in request.body) {

package.json

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "scrapy-puppeteer-service",
3-
"version": "0.0.10",
3+
"version": "0.1.0",
44
"private": true,
55
"scripts": {
66
"start": "node ./bin/www"
@@ -9,11 +9,11 @@
99
"async-lock": "^1.3.0",
1010
"body-parser": "^1.19.1",
1111
"cookie-parser": "~1.4.6",
12-
"debug": "~4.3.3",
12+
"debug": "~4.3.4",
1313
"express": "~4.17.3",
1414
"morgan": "~1.10.0",
1515
"npm-run-all": "^4.1.5",
16-
"puppeteer": "^13.1.2",
16+
"puppeteer": "^20.1.2",
1717
"puppeteer-proxy": "^2.1.2"
1818
}
1919
}

routes/action.js

+2-2
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ const router = express.Router();
1010
* some actions with page in puppeteer syntax
1111
* ...
1212
* return {
13-
* context_id: page.browserContext()._id,
14-
* page_id: await page._target._targetId,
13+
* context_id: page.browserContext().id,
14+
* page_id: page.target()._targetId,
1515
html: await page.content(),
1616
cookies: await page.cookies()
1717
* };

routes/click.js

+3-6
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ async function action(page, request) {
1414
} else {
1515
await page.click(request.body.selector, request.body.clickOptions);
1616
}
17-
return utils.formResponse(page, request.query.closePage, request.body.waitOptions || DEFAULT_TIMEOUT);
17+
const waitOptions = request.body.waitOptions || { timeout: DEFAULT_TIMEOUT };
18+
return utils.formResponse(page, request.query.closePage, waitOptions);
1819
}
1920

2021
/**
@@ -25,11 +26,7 @@ async function action(page, request) {
2526
"clickCount", //<number> defaults to 1.
2627
"delay" //<number> Time to wait between mousedown and mouseup in milliseconds. Defaults to 0.
2728
},
28-
"waitOptions": {
29-
// if selectorOrTimeout is a string, then the first argument is treated as a selector or xpath, depending on whether or not it starts with '//', and the method is a shortcut for page.waitForSelector or page.waitForXPath
30-
// if selectorOrTimeout is a number, then the first argument is treated as a timeout in milliseconds and the method returns a promise which resolves after the timeout
31-
"selectorOrTimeout":... default 1,
32-
},
29+
"waitOptions": {...}, // same as in goto action, defaults to 1s timeout
3330
"navigationOptions": {...} // same as in goto action
3431
}
3532
*/

routes/goto.js

+7-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,13 @@ async function action(page, request) {
1919
// networkidle2 - consider navigation to be finished when there are no more than 2 network connections for at least 500 ms.
2020
// "referer" <string> Referer header value. If provided, it takes precedence over the referer header value set by page.setExtraHTTPHeaders().
2121
// },
22-
// "waitOptions": {...} same as in click action
22+
// "waitOptions": {
23+
// "timeout": <number> Wait for given timeout in milliseconds
24+
// "selector": <string> Wait for element by selector (see https://pptr.dev/api/puppeteer.page.waitforselector)
25+
// "xpath": <string> Wait for element by xpath (see https://pptr.dev/api/puppeteer.page.waitforxpath)
26+
// "options": <object> Options to wait for elements (see https://pptr.dev/api/puppeteer.waitforselectoroptions)
27+
// }
28+
// }
2329
//
2430
router.post('/', async function (req, res, next) {
2531

routes/scroll.js

+3-6
Original file line numberDiff line numberDiff line change
@@ -13,18 +13,15 @@ async function action(page, request) {
1313
window.scrollBy(0, document.body.scrollHeight)
1414
});
1515
}
16-
return utils.formResponse(page, request.query.closePage, request.body.waitOptions || DEFAULT_TIMEOUT);
16+
const waitOptions = request.body.waitOptions || { timeout: DEFAULT_TIMEOUT};
17+
return utils.formResponse(page, request.query.closePage, waitOptions);
1718
}
1819

1920
// Method that scrolls page to a certain selector.
2021
// Example body:
2122
// body = {
2223
// "selector": "", //<string> A selector to search for element to scroll
23-
// "waitOptions": {
24-
// // if selectorOrFunctionOrTimeout is a string, then the first argument is treated as a selector or xpath, depending on whether or not it starts with '//', and the method is a shortcut for page.waitForSelector or page.waitForXPath
25-
// // if selectorOrFunctionOrTimeout is a number, then the first argument is treated as a timeout in milliseconds and the method returns a promise which resolves after the timeout
26-
// "selectorOrTimeout":...,
27-
// }
24+
// "waitOptions": {...} // same as in goto action, defaults to 1s timeout
2825
// }
2926
router.post('/', async function (req, res, next) {
3027

0 commit comments

Comments
 (0)