Skip to content

Commit 8139dee

Browse files
authored
Compose action (#53)
* proper handling of exceptions. * action-router model * Now we throw errors and action is more error-prone. * compose action * Fix everything... * Working service! * Docs * Fixes after review * Fixes after review
1 parent b37e0ae commit 8139dee

30 files changed

+406
-290
lines changed

README.md

+45
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,51 @@ Example request body:
149149
}
150150
```
151151

152+
### **/compose**
153+
154+
This POST method allows combining several puppeteer actions into a single request.
155+
Note that nested compose actions inside the request body are not supported.
156+
157+
Example request body:
158+
```json5
159+
{
160+
"actions": [
161+
{
162+
"endpoint": "goto",
163+
"body": {
164+
"url": "<URL>",
165+
"harRecording": false,
166+
},
167+
},
168+
{
169+
"endpoint": "click",
170+
"body": {
171+
"selector": "<SELECTOR>",
172+
},
173+
},
174+
{
175+
"endpoint": "click",
176+
"body": {
177+
"selector": "<SELECTOR>",
178+
},
179+
},
180+
{
181+
"endpoint": "scroll",
182+
"body": {},
183+
},
184+
{
185+
"endpoint": "screenshot",
186+
"body": {
187+
"options": {
188+
"full_page": true,
189+
"type": "jpeg",
190+
},
191+
},
192+
}
193+
],
194+
}
195+
```
196+
152197
### **/scroll**
153198

154199
This POST method scrolls the page to the first element matched by the selector and returns the page result.

actions/action.js

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
const exceptions = require('../helpers/exceptions');
2+
const utils = require('../helpers/utils'); // For usage inside user's action(page, request) function
3+
4+
/**
5+
* Content-Type: application/javascript
6+
* body = js function as pattern:
7+
* async function action(page, request) {
8+
* ...
9+
* some actions with page in puppeteer syntax
10+
* ...
11+
* return {
12+
* context_id: page.browserContext().id,
13+
* page_id: page.target()._targetId,
14+
* html: await page.content(),
15+
* cookies: await page.cookies()
16+
* };
17+
* };
18+
*/
19+
exports.action = async function action(page, request) {
20+
eval(request.body.toString());
21+
22+
// check action function existence
23+
if (!(typeof action === "function" && action.length >= 1)) {
24+
throw new exceptions.IncorrectArgumentError("Invalid action function.\n" +
25+
"Valid action function: \"async function action(page, request) " +
26+
"{ ... some actions with request and page in puppeteer " +
27+
"syntax};\"");
28+
}
29+
30+
return {
31+
data: await action(page, request)
32+
}
33+
}

actions/click.js

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
const utils = require('../helpers/utils');
2+
3+
const DEFAULT_TIMEOUT = 1000; // 1 second
4+
5+
/*
6+
* body = {
7+
* "selector": "", // <string> A selector to search for element to click. If there are multiple elements satisfying the selector, the first will be clicked.
8+
* "clickOptions": {
9+
* "button", // <"left"|"right"|"middle"> Defaults to left.
10+
* "clickCount", // <number> defaults to 1.
11+
* "delay" // <number> Time to wait between mousedown and mouseup in milliseconds. Defaults to 0.
12+
* },
13+
* "waitOptions": {...}, // same as in goto action, defaults to 1s timeout
14+
* "navigationOptions": {...} // same as in goto action
15+
* }
16+
*/
17+
exports.click = async function click(page, request) {
18+
await page.hover(request.body.selector);
19+
if (request.body.navigationOptions) {
20+
await Promise.all([
21+
page.waitForNavigation(request.body.navigationOptions),
22+
page.click(request.body.selector, request.body.clickOptions),
23+
]);
24+
} else {
25+
await page.click(request.body.selector, request.body.clickOptions);
26+
}
27+
const waitOptions = request.body.waitOptions || { timeout: DEFAULT_TIMEOUT };
28+
return await utils.getContents(page, waitOptions);
29+
}

actions/compose.js

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
const exceptions = require("../helpers/exceptions");

/*
 * Maps the "endpoint" name used inside a compose request to the function
 * implementing it. There is no "compose" entry, so composite actions cannot
 * be nested.
 */
const endpoint2action = {
    action: require("./action").action,
    click: require("./click").click,
    fill_form: require("./fill_form").fillForm,
    back: require("./goback").goBack,
    forward: require("./goforward").goForward,
    goto: require("./goto").goto,
    har: require("./har").har,
    mhtml: require("./mhtml").captureSnapshot,
    recaptcha_solver: require("./recaptcha_solver").recaptchaSolver,
    screenshot: require("./screenshot").screenshot,
    scroll: require("./scroll").scroll,
};

/*
 * Runs several actions one after another on the same page and returns the
 * result of the last one (undefined when the list is empty).
 *
 * body = {
 *     "actions": [
 *         {
 *             "endpoint": <string> One of the keys of endpoint2action.
 *             "body": <object> Body passed to that action, defaults to {}.
 *         },
 *         ...
 *     ]
 * }
 *
 * Throws exceptions.IncorrectArgumentError when "actions" is not an array or
 * when any entry names an unknown endpoint. All entries are validated before
 * the first action runs, so a malformed request does not leave the page
 * half-processed.
 */
async function compose(page, request) {
    const originalClosePage = request.query.closePage;
    const originalBody = request.body;
    const actions = originalBody?.actions;

    if (!Array.isArray(actions)) {
        throw new exceptions.IncorrectArgumentError("Compose body must contain an \"actions\" array");
    }
    for (const action of actions) {
        // Object.hasOwn keeps inherited keys such as "toString" from being
        // mistaken for an endpoint.
        if (!Object.hasOwn(endpoint2action, action?.endpoint)) {
            throw new exceptions.IncorrectArgumentError(
                `Unknown endpoint in compose action: ${action?.endpoint}`);
        }
    }

    // Individual actions receive closePage = false so that none of them treats
    // the flag as a request to close the page while the composition is running.
    request.query.closePage = false;

    let response;
    try {
        for (const action of actions) {
            request.body = action.body ?? {};
            response = await endpoint2action[action.endpoint](page, request);
        }
    } finally {
        // Hand the request back to the caller exactly as it was received.
        request.query.closePage = originalClosePage;
        request.body = originalBody;
    }

    return response;
}
exports.compose = compose;

actions/fill_form.js

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
const utils = require('../helpers/utils');
2+
3+
/*
4+
* body = {
5+
* "inputMapping": { A dictionary where each key is a CSS selector, and each value is another dictionary containing details about the input for that element:
6+
* "selector": <string> The CSS selector for the input element (used as the key).
7+
* "value": <string> The text to be inputted into the element.
8+
* "delay": <number> A delay (in milliseconds) between each keystroke when inputting the text. Defaults to 0 if not provided.
9+
* },
10+
* "submitButton": <string> The CSS selector for the form's submit button. If provided, the button will be clicked after filling in the form.
11+
* }
12+
*/
13+
exports.fillForm = async function fillForm(page, request) {
14+
const inputMapping = request.body.inputMapping;
15+
const submitButton = request.body.submitButton;
16+
17+
for (const [selector, params] of Object.entries(inputMapping)) {
18+
const value = params.value;
19+
const delay = params.delay || 0;
20+
await page.type(selector, value, { delay });
21+
}
22+
23+
if (submitButton) {
24+
await page.click(submitButton);
25+
}
26+
27+
return await utils.getContents(page);
28+
29+
}

actions/goback.js

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
const utils = require('../helpers/utils');
2+
3+
exports.goBack = async function goBack(page, request) {
4+
await page.goBack(request.body.navigationOptions);
5+
return await utils.getContents(page, request.body.waitOptions);
6+
}

actions/goforward.js

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
const utils = require('../helpers/utils');
2+
3+
exports.goForward = async function goForward(page, request) {
4+
await page.goForward(request.body.navigationOptions);
5+
return await utils.getContents(page, request.body.waitOptions);
6+
}

actions/goto.js

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
const utils = require('../helpers/utils');
2+
3+
/*
4+
* body = {
5+
* "url": <string> URL to navigate page to. The url should include scheme, e.g. https://.
6+
* "navigationOptions": { Navigation parameters which might have the following properties:
7+
* "timeout": <number> Maximum navigation time in milliseconds, defaults to 30 seconds, pass 0 to disable timeout. The default value can be changed by using the page.setDefaultNavigationTimeout(timeout) or page.setDefaultTimeout(timeout) methods.
8+
* "waitUntil": <string|Array<string>> When to consider navigation succeeded, defaults to load. Given an array of event strings, navigation is considered to be successful after all events have been fired. Events can be either:
9+
* load - consider navigation to be finished when the load event is fired.
10+
* domcontentloaded - consider navigation to be finished when the DOMContentLoaded event is fired.
11+
* networkidle0 - consider navigation to be finished when there are no more than 0 network connections for at least 500 ms.
12+
* networkidle2 - consider navigation to be finished when there are no more than 2 network connections for at least 500 ms.
13+
* "referer" <string> Referer header value. If provided it will take preference over the referer header value set by page.setExtraHTTPHeaders().
14+
* },
15+
* "waitOptions": {
16+
* "timeout": <number> Wait for given timeout in milliseconds
17+
* "selector": <string> Wait for element by selector (see https://pptr.dev/api/puppeteer.page.waitforselector)
18+
* "xpath": <string> Wait for element by xpath (see https://pptr.dev/api/puppeteer.page.waitforxpath)
19+
* "options": <object> Options to wait for elements (see https://pptr.dev/api/puppeteer.waitforselectoroptions)
20+
* },
21+
* "harRecording": true,
22+
* }
23+
*/
24+
exports.goto = async function goto(page, request) {
25+
await page.goto(request.body.url, request.body.navigationOptions);
26+
return await utils.getContents(page, request.body.waitOptions);
27+
}

actions/har.js

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
const exceptions = require("../helpers/exceptions");
2+
3+
exports.har = async function har(page, request) {
4+
if (!(page.harWriter)){
5+
throw new exceptions.NoHarWriterError();
6+
}
7+
8+
return {
9+
har: JSON.stringify(await page.harWriter.stop()) // TODO: do we really need JSON.stringify?
10+
};
11+
}

actions/mhtml.js

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
/*
2+
* Captures mhtml snapshot of a page
3+
*/
4+
exports.captureSnapshot = async function captureSnapshot(page, request) {
5+
const cdpSession = await page.target().createCDPSession();
6+
const { data } = await cdpSession.send('Page.captureSnapshot', { format: 'mhtml' });
7+
await cdpSession.detach()
8+
return {
9+
mhtml: data,
10+
};
11+
}

actions/recaptcha_solver.js

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
const utils = require('../helpers/utils')
2+
3+
const DEFAULT_TIMEOUT = 1000; // 1 second
4+
5+
/*
6+
* This module introduces new ability to puppeteer-service.
7+
* It is capable of solving recaptchas on a given web-page.
8+
* If there is no recaptcha on the page nothing bad will happen.
9+
* If there is recaptcha it solves it and then inserts the special code
10+
* into the page automatically.
11+
*
12+
* Returns useful information about recaptcha_solving.
13+
* For more information about return value visit
14+
* https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-recaptcha#result-object
15+
*/
16+
exports.recaptchaSolver = async function recaptchaSolver(page, request) {
17+
let recaptcha_data;
18+
19+
if (request.body.solve_recaptcha) {
20+
recaptcha_data = await page.solveRecaptchas();
21+
} else {
22+
recaptcha_data = await page.findRecaptchas();
23+
}
24+
25+
const waitOptions = request.body.waitOptions || { timeout: DEFAULT_TIMEOUT };
26+
const contents = await utils.getContents(page, waitOptions);
27+
28+
if (request.query.closePage ||
29+
(request.body.close_on_empty && recaptcha_data['captchas'].length === 0)) {
30+
await page.close();
31+
}
32+
33+
return {
34+
...contents,
35+
recaptcha_data: recaptcha_data,
36+
}
37+
}

actions/screenshot.js

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
/*
2+
* Method that returns screenshots of pages
3+
* more description of options you can see on GitHub:
4+
* https://github.com/GoogleChrome/puppeteer/blob/v1.19.0/docs/api.md#pagescreenshotoptions
5+
*/
6+
exports.screenshot = async function screenshot(page, request) {
7+
delete request.body.options.path; // no path for saving images
8+
request.body.options.encoding = "base64"; // return in base64
9+
let screenshot = await page.screenshot(request.body.options);
10+
return {
11+
screenshot: screenshot
12+
};
13+
}

actions/scroll.js

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
const utils = require('../helpers/utils');
2+
3+
const DEFAULT_TIMEOUT = 1000; // 1 second
4+
5+
/*
6+
* Method that scrolls page to a certain selector.
7+
* Example body:
8+
* body = {
9+
* "selector": "", // <string> A selector to search for element to scroll
10+
* "waitOptions": {...}, // same as in goto action, defaults to 1s timeout
11+
* }
12+
*/
13+
exports.scroll = async function scroll(page, request) {
14+
if (request.body.selector) {
15+
await page.hover(request.body.selector);
16+
} else {
17+
await page.evaluate(() => {
18+
// scroll down until the bottom of the page to trigger scroll event even at the bottom of a page
19+
window.scrollBy(0, document.body.scrollHeight)
20+
});
21+
}
22+
const waitOptions = request.body.waitOptions || { timeout: DEFAULT_TIMEOUT};
23+
return utils.getContents(page, waitOptions);
24+
}

app.js

+2
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ const bodyParser = require('body-parser');
1010
const AsyncLock = require('async-lock');
1111

1212
const indexRouter = require('./routes/index');
13+
const composeRouter = require('./routes/compose');
1314
const healthCheckRouter = require('./routes/health_check');
1415
const gotoRouter = require('./routes/goto');
1516
const backRouter = require('./routes/goback');
@@ -108,6 +109,7 @@ app.use(bodyParser.raw({ inflate: true, limit: '200kb', type: 'application/javas
108109
app.use(cookieParser());
109110

110111
app.use('/', indexRouter);
112+
app.use('/compose', composeRouter);
111113
app.use('/health_check', healthCheckRouter);
112114
app.use('/goto', gotoRouter);
113115
app.use('/back', backRouter);

helpers/exceptions.js

+10-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
exports.IncorrectArgumentError = class IncorrectArgumentError extends Error {
2+
constructor(message="Passed incorrect argument", ...args) {
3+
super(message, ...args);
4+
this.message = message;
5+
this.name = "IncorrectArgumentError";
6+
}
7+
}
8+
19
exports.PageNotFoundError = class PageNotFoundError extends Error {
210
constructor(message="Page not found", ...args) {
311
super(message, ...args);
@@ -23,9 +31,9 @@ exports.TooManyContextsError = class TooManyContextsError extends Error {
2331
}
2432

2533
exports.NoHarWriterError = class NoHarWriterError extends Error {
26-
constructor(message="There is no initialized Har Writer on the page to which the Har action was applied.", ...args) {
34+
constructor(message="There is no initialized Har Writer on the page to which the Har action was applied", ...args) {
2735
super(message, ...args);
2836
this.message = message;
2937
this.name = "NoHarWriterError";
3038
}
31-
}
39+
}

helpers/middlewares/process_exception.js

+5-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,11 @@ exports.processExceptionMiddleware = async function processExceptionMiddleware(e
1616
res.header('scrapy-puppeteer-service-context-id', contextId);
1717
}
1818

19-
if (err instanceof exceptions.TooManyContextsError) {
19+
if (err instanceof exceptions.IncorrectArgumentError) {
20+
res.status(400);
21+
} else if (err instanceof exceptions.NoHarWriterError) {
22+
res.status(400);
23+
}else if (err instanceof exceptions.TooManyContextsError) {
2024
res.status(429); // Too Many Requests
2125
} else if (err.contextId) { // there was a context, but something went wrong
2226
res.status(500);

helpers/utils.js

+1
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ async function getIds(page) {
8989
pageId: page.target()._targetId,
9090
}
9191
}
92+
exports.getIds = getIds;
9293

9394
exports.getContents = async function getContents(page, waitFor) {
9495
if (waitFor) {

package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "scrapy-puppeteer-service",
3-
"version": "0.3.7",
3+
"version": "0.3.8",
44
"private": true,
55
"scripts": {
66
"start": "node ./bin/www"

0 commit comments

Comments
 (0)