Skip to content

Commit

Permalink
Proxy and evaluate action
Browse files Browse the repository at this point in the history
- Added the scenario option `config.options.proxyUrls`.
- Added action `evaluate`.
- Added strategy `evaluate` in the action `collectData`.
  • Loading branch information
tg666 committed Oct 27, 2023
1 parent 786e6d0 commit 0470964
Show file tree
Hide file tree
Showing 6 changed files with 89 additions and 3 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

### Added
- Added the scenario option `config.options.proxyUrls`.
- Added action `evaluate`.
- Added strategy `evaluate` in the action `collectData`.

## 0.5.3 - 2023-10-23
### Changed
- Database connections now always have UTC timezone.
Expand Down
2 changes: 2 additions & 0 deletions src/action/action-registry.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { CollectData } from './collect-data.mjs';
import { Delay } from './delay.mjs';
import { EnqueueLinks } from './enqueue-links.mjs';
import { EnqueueLinksByClicking } from './enqueue-links-by-clicking.mjs';
import { Evaluate } from './evaluate.mjs';
import { Focus } from './focus.mjs';
import { Hover } from './hover.mjs';
import { KeyboardPress } from './keyboard-press.mjs';
Expand All @@ -29,6 +30,7 @@ export class ActionRegistry {
this.#addAction(new Delay());
this.#addAction(new EnqueueLinks());
this.#addAction(new EnqueueLinksByClicking());
this.#addAction(new Evaluate());
this.#addAction(new Focus());
this.#addAction(new Hover());
this.#addAction(new KeyboardPress());
Expand Down
23 changes: 22 additions & 1 deletion src/action/collect-data.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ export class CollectData extends AbstractAction {
'static',
'selector.innerText',
'selector.attribute',
'evaluate',
];
}

Expand Down Expand Up @@ -56,6 +57,10 @@ export class CollectData extends AbstractAction {
yield `the option "${dataKey}.attribute" is required for the selector.attribute strategy and must be a non empty string`;
}

if ('evaluate' === dataDef.strategy && (!('script' in dataDef) || 'string' !== typeof dataDef.script || '' === dataDef.script)) {
yield `the option "${dataKey}.script" is required for the ${dataDef.strategy} strategy and must be a non empty string`;
}

if ('multiple' in dataDef && 'boolean' !== typeof dataDef.multiple) {
yield `the optional option "${dataKey}.multiple" must be a bool`;
}
Expand All @@ -76,10 +81,24 @@ export class CollectData extends AbstractAction {

for (let dataKey in options) {
const dataDef = options[dataKey];
const value = await this.#getDataValue(dataDef, request, page);
let value = undefined;

try {
value = await this.#getDataValue(dataDef, request, page);
} catch (err) {
value = err;
}

if (undefined === value) {
await logger.warning(`Unable to collect value for "${request.userData.identity}"."${dataKey}"`);

continue;
} else if (value instanceof Error) {
value.message = `Unable to collect value for "${request.userData.identity}"."${dataKey}": ${value.message}`;

await logger.warning(value);

continue;
}

data.values[dataKey] = value;
Expand Down Expand Up @@ -121,6 +140,8 @@ export class CollectData extends AbstractAction {

return dataDef.multiple ? values : (0 < values.length ? values.shift() : undefined);
}, dataDef);
case 'evaluate':
return await page.evaluate(dataDef.script);
default:
return undefined;
}
Expand Down
46 changes: 46 additions & 0 deletions src/action/evaluate.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import { AbstractAction } from './abstract-action.mjs';

/**
 * Action `evaluate`: runs a user supplied script through `page.evaluate()`.
 *
 * Supported options:
 * - `script` (string, required): passed unchanged to `page.evaluate()`.
 * - `failOnError` (boolean, optional, default `true`): when `true`, an error raised
 *   by the evaluation is rethrown; when `false`, it is swallowed.
 * - `failOnFalsyReturn` (boolean, optional, default `false`): when `true`, a falsy
 *   result of a successful evaluation is treated as a failure.
 */
export class Evaluate extends AbstractAction {
    constructor() {
        super('evaluate');
    }

    /**
     * Validates the options of the action.
     *
     * @param {{ options: Object }} args
     * @yields {string} one message per invalid option
     */
    *_doValidateOptions({ options }) {
        if (!('script' in options) || 'string' !== typeof options.script || '' === options.script) {
            yield 'the option "script" is required and must be a non empty string';
        }

        if ('failOnError' in options && 'boolean' !== typeof options.failOnError) {
            yield 'the optional option "failOnError" must be a bool';
        }

        if ('failOnFalsyReturn' in options && 'boolean' !== typeof options.failOnFalsyReturn) {
            // The message must name the option exactly as the user has to write it.
            yield 'the optional option "failOnFalsyReturn" must be a bool';
        }
    }

    /**
     * Evaluates `options.script` via `page.evaluate()`.
     *
     * @param {{ script: string, failOnError?: boolean, failOnFalsyReturn?: boolean }} options
     * @param {{ page: Object }} context `page` must expose an async `evaluate()` method
     *     (presumably a Puppeteer page - confirm against the caller)
     * @returns {Promise<void>}
     * @throws {Error} when the evaluation fails and `failOnError` is enabled, or when the
     *     script returns a falsy value and `failOnFalsyReturn` is enabled
     */
    async execute(options, { page }) {
        const failOnError = 'failOnError' in options ? options.failOnError : true;
        const failOnFalsyReturn = 'failOnFalsyReturn' in options ? options.failOnFalsyReturn : false;
        const messagePrefix = 'Evaluation failed:';
        let result;

        try {
            result = await page.evaluate(options.script);
        } catch (err) {
            if (!failOnError) {
                // Errors are tolerated, and the falsy-return check only applies
                // to evaluations that actually produced a result.
                return;
            }

            if (!(err instanceof Error)) {
                // Anything can be thrown in JavaScript; normalise it so the caller
                // always receives an Error carrying the usual prefix.
                throw new Error(`${messagePrefix} ${String(err)}`);
            }

            if (!err.message.startsWith(messagePrefix)) {
                err.message = `${messagePrefix} ${err.message}`;
            }

            throw err;
        }

        if (failOnFalsyReturn && !result) {
            throw new Error(`Evaluation failed: Script (${options.script}) returned a falsy result.`);
        }
    }
}
2 changes: 2 additions & 0 deletions src/application/controller/scenario/scenario-validator.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ export class ScenarioValidator {
body('config.options.session.transferredCookies', 'The value must be an array of cookie names.').optional().isArray(),
body('config.options.session.transferredCookies.*', 'The value must be a cookie name.').optional().isString(),
body('config.options.waitUntil', 'The value must be one of these: ["load", "domcontentloaded", "networkidle0", "networkidle2"].').optional().isString().isIn(['load', 'domcontentloaded', 'networkidle0', 'networkidle2']),
body('config.options.proxyUrls', 'The value must be an array of proxy URLs.').optional().isArray(),
body('config.options.proxyUrls.*', 'The value must be a valid proxy URL.').optional().isURL(),
body('config.scenes', 'The value must be a non empty object with string keys.').isObject().bail().custom(scenes => {
sceneNames = Object.keys(scenes);

Expand Down
14 changes: 12 additions & 2 deletions src/crawler/crawler.mjs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { Configuration } from 'crawlee';
import { Configuration, ProxyConfiguration } from 'crawlee';
import { PuppeteerCrawler } from './puppeteer-crawler.mjs';
import puppeteerExtra from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
Expand Down Expand Up @@ -36,7 +36,9 @@ export class Crawler {
* },
* callbackUri?: string,
* options?: {
* maxConcurrency?: number,
* maxRequests?: number,
* maxRequestRetries?: number,
* viewport?: {
* width?: number,
* height?: number,
Expand All @@ -45,7 +47,9 @@ export class Crawler {
* maxPoolSize?: number,
* maxSessionUsageCount?: number,
* transferredCookies?: array<string>,
* }
* },
* waitUntil?: string,
* proxyUrls?: array<string>,
* },
* scenes: Object.<string, array<{
* action: string,
Expand Down Expand Up @@ -190,6 +194,12 @@ export class Crawler {
crawlerOptions.sessionPoolOptions.sessionOptions.maxUsageCount = scenarioSessionOptions.maxSessionUsageCount;
}

if ('proxyUrls' in scenarioOptions && Array.isArray(scenarioOptions.proxyUrls)) {
crawlerOptions.proxyConfiguration = new ProxyConfiguration({
proxyUrls: scenarioOptions.proxyUrls,
});
}

const saveResult = async (group, identity, data, mergeOnConflict = true) => {
await this.#scenarioRepository.addResult(scenarioId, group, identity, data, mergeOnConflict);
};
Expand Down

0 comments on commit 0470964

Please sign in to comment.