From 04709644eaf84f097273388fbc3c9ce2418caadd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Glawaty?= Date: Fri, 27 Oct 2023 02:53:33 +0200 Subject: [PATCH] Proxy and evaluate action - Added the scenario option `config.options.proxyUrls`. - Added action `evaluate`. - Added strategy `evaluate` in the action `collectData`. --- CHANGELOG.md | 5 ++ src/action/action-registry.mjs | 2 + src/action/collect-data.mjs | 23 +++++++++- src/action/evaluate.mjs | 46 +++++++++++++++++++ .../scenario/scenario-validator.mjs | 2 + src/crawler/crawler.mjs | 14 +++++- 6 files changed, 89 insertions(+), 3 deletions(-) create mode 100644 src/action/evaluate.mjs diff --git a/CHANGELOG.md b/CHANGELOG.md index d175c43..3f3eac7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Added +- Added the scenario option `config.options.proxyUrls`. +- Added action `evaluate`. +- Added strategy `evaluate` in the action `collectData`. + ## 0.5.3 - 2023-10-23 ### Changed - Database connections now always have UTC timezone. 
diff --git a/src/action/action-registry.mjs b/src/action/action-registry.mjs index 095f2e6..494fb1b 100644 --- a/src/action/action-registry.mjs +++ b/src/action/action-registry.mjs @@ -5,6 +5,7 @@ import { CollectData } from './collect-data.mjs'; import { Delay } from './delay.mjs'; import { EnqueueLinks } from './enqueue-links.mjs'; import { EnqueueLinksByClicking } from './enqueue-links-by-clicking.mjs'; +import { Evaluate } from './evaluate.mjs'; import { Focus } from './focus.mjs'; import { Hover } from './hover.mjs'; import { KeyboardPress } from './keyboard-press.mjs'; @@ -29,6 +30,7 @@ export class ActionRegistry { this.#addAction(new Delay()); this.#addAction(new EnqueueLinks()); this.#addAction(new EnqueueLinksByClicking()); + this.#addAction(new Evaluate()); this.#addAction(new Focus()); this.#addAction(new Hover()); this.#addAction(new KeyboardPress()); diff --git a/src/action/collect-data.mjs b/src/action/collect-data.mjs index 240459b..1bd7309 100644 --- a/src/action/collect-data.mjs +++ b/src/action/collect-data.mjs @@ -12,6 +12,7 @@ export class CollectData extends AbstractAction { 'static', 'selector.innerText', 'selector.attribute', + 'evaluate', ]; } @@ -56,6 +57,10 @@ export class CollectData extends AbstractAction { yield `the option "${dataKey}.attribute" is required for the selector.attribute strategy and must be a non empty string`; } + if ('evaluate' === dataDef.strategy && (!('script' in dataDef) || 'string' !== typeof dataDef.script || '' === dataDef.script)) { + yield `the option "${dataKey}.script" is required for the ${dataDef.strategy} strategy and must be a non empty string`; + } + if ('multiple' in dataDef && 'boolean' !== typeof dataDef.multiple) { yield `the optional option "${dataKey}.multiple" must be a bool`; } @@ -76,10 +81,24 @@ export class CollectData extends AbstractAction { for (let dataKey in options) { const dataDef = options[dataKey]; - const value = await this.#getDataValue(dataDef, request, page); + let value = 
undefined; + + try { + value = await this.#getDataValue(dataDef, request, page); + } catch (err) { + value = err; + } if (undefined === value) { await logger.warning(`Unable to collect value for "${request.userData.identity}"."${dataKey}"`); + + continue; + } else if (value instanceof Error) { + value.message = `Unable to collect value for "${request.userData.identity}"."${dataKey}": ${value.message}`; + + await logger.warning(value); + + continue; } data.values[dataKey] = value; @@ -121,6 +140,8 @@ export class CollectData extends AbstractAction { return dataDef.multiple ? values : (0 < values.length ? values.shift() : undefined); }, dataDef); + case 'evaluate': + return await page.evaluate(dataDef.script); default: return undefined; } diff --git a/src/action/evaluate.mjs b/src/action/evaluate.mjs new file mode 100644 index 0000000..6fe60ec --- /dev/null +++ b/src/action/evaluate.mjs @@ -0,0 +1,46 @@ +import { AbstractAction } from './abstract-action.mjs'; + +export class Evaluate extends AbstractAction { + constructor() { + super('evaluate'); + } + + *_doValidateOptions({ options }) { + if (!('script' in options) || 'string' !== typeof options.script || '' === options.script) { + yield 'the option "script" is required and must be a non empty string'; + } + + if ('failOnError' in options && 'boolean' !== typeof options.failOnError) { + yield 'the optional option "failOnError" must be a bool'; + } + + if ('failOnFalsyReturn' in options && 'boolean' !== typeof options.failOnFalsyReturn) { + yield 'the optional option "failOnFalsyReturn" must be a bool'; + } + } + + async execute(options, { page }) { + const failOnError = 'failOnError' in options ? options.failOnError : true; + const failOnFalsyReturn = 'failOnFalsyReturn' in options ? 
options.failOnFalsyReturn : false; + let hasError = false; + let result; + + try { + result = await page.evaluate(options.script); + } catch (err) { + if (failOnError) { + if (!err.message.startsWith('Evaluation failed:')) { + err.message = `Evaluation failed: ${err.message}`; + } + + throw err; + } + + hasError = true; + } + + if (failOnFalsyReturn && !hasError && !result) { + throw new Error(`Evaluation failed: Script (${options.script}) returned a falsy result.`); + } + } +} diff --git a/src/application/controller/scenario/scenario-validator.mjs b/src/application/controller/scenario/scenario-validator.mjs index c9b403c..2b4c24e 100644 --- a/src/application/controller/scenario/scenario-validator.mjs +++ b/src/application/controller/scenario/scenario-validator.mjs @@ -73,6 +73,8 @@ export class ScenarioValidator { body('config.options.session.transferredCookies', 'The value must be an array of cookie names.').optional().isArray(), body('config.options.session.transferredCookies.*', 'The value must be a cookie name.').optional().isString(), body('config.options.waitUntil', 'The value must be one of these: ["load", "domcontentloaded", "networkidle0", "networkidle2"].').optional().isString().isIn(['load', 'domcontentloaded', 'networkidle0', 'networkidle2']), + body('config.options.proxyUrls', 'The value must be an array of proxy URLs.').optional().isArray(), + body('config.options.proxyUrls.*', 'The value must be a valid proxy URL.').optional().isURL(), body('config.scenes', 'The value must be a non empty object with string keys.').isObject().bail().custom(scenes => { sceneNames = Object.keys(scenes); diff --git a/src/crawler/crawler.mjs b/src/crawler/crawler.mjs index f3faf70..ae5e803 100644 --- a/src/crawler/crawler.mjs +++ b/src/crawler/crawler.mjs @@ -1,4 +1,4 @@ -import { Configuration } from 'crawlee'; +import { Configuration, ProxyConfiguration } from 'crawlee'; import { PuppeteerCrawler } from './puppeteer-crawler.mjs'; import puppeteerExtra from 
'puppeteer-extra'; import StealthPlugin from 'puppeteer-extra-plugin-stealth'; @@ -36,7 +36,9 @@ export class Crawler { * }, * callbackUri?: string, * options?: { + * maxConcurrency?: number, * maxRequests?: number, + * maxRequestRetries?: number, * viewport?: { * width?: number, * height?: number, @@ -45,7 +47,9 @@ export class Crawler { * maxPoolSize?: number, * maxSessionUsageCount?: number, * transferredCookies?: array, - * } + * }, + * waitUntil?: string, + * proxyUrls?: array, * }, * scenes: Object. { await this.#scenarioRepository.addResult(scenarioId, group, identity, data, mergeOnConflict); };