From c318980ec11d211b1a5c9e6bdbe76198c5d895be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Thu, 2 Mar 2023 11:01:58 +0100 Subject: [PATCH] feat: add basic support for `setStatusMessage` (#1790) --- .../src/internals/basic-crawler.ts | 47 +++++++++++++++++++ packages/memory-storage/src/memory-storage.ts | 12 +++++ packages/types/src/storages.ts | 5 ++ 3 files changed, 64 insertions(+) diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 779f88ed199e..5c5d88d66c74 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -254,6 +254,11 @@ export interface BasicCrawlerOptions(); @@ -400,6 +406,7 @@ export class BasicCrawler { + const { requestsFailed } = this.stats.state; + const { requestsFailed: previousRequestsFailed } = previousState; + + previousState = { ...this.stats.state }; + + if (requestsFailed - previousRequestsFailed > 0) { + return 'ERROR'; + } + + return 'REGULAR'; + }; + + const log = async () => { + const operationMode = getOperationMode(); + if (operationMode === 'ERROR') { + // eslint-disable-next-line max-len + await client.setStatusMessage?.(`Experiencing problems, ${this.stats.state.requestsFailed - previousState.requestsFailed || this.stats.state.requestsFailed} errors in the past ${this.loggingInterval} seconds.`); + } else { + // eslint-disable-next-line max-len + await client.setStatusMessage?.(`Crawled ${this.stats.state.requestsFinished}/${this.requestQueue?.assumedTotalCount || this.requestList?.length()} pages, ${this.stats.state.requestsFailed} errors.`); + } + }; + + const interval = setInterval(log, this.loggingInterval * 1e3); + return { log, stop: () => clearInterval(interval) }; + } + /** * Runs the crawler. Returns a promise that gets resolved once all the requests are processed. * We can use the `requests` parameter to enqueue the initial requests - it is a shortcut for @@ -594,6 +636,8 @@ export class BasicCrawler { this.log.warning('Pausing... Press CTRL+C again to force exit. To resume, do: CRAWLEE_PURGE_ON_START=0 yarnstart'); @@ -651,6 +695,9 @@ export class BasicCrawler { + s.string.parse(message); + s.object({ + isStatusMessageTerminal: s.boolean.optional, + }).parse(options); + + log.info(`Setting${options.isStatusMessageTerminal ? ' terminal' : ''} status message: ${message}`); + + return Promise.resolve(); + } + /** * Cleans up the default storage directories before the run starts: * - local directory containing the default dataset; diff --git a/packages/types/src/storages.ts b/packages/types/src/storages.ts index f95bb445311d..91282b378286 100644 --- a/packages/types/src/storages.ts +++ b/packages/types/src/storages.ts @@ -281,6 +281,10 @@ export interface RequestQueueOptions { timeoutSecs?: number; } +export interface SetStatusMessageOptions { + isStatusMessageTerminal?: boolean; +} + /** * Represents a storage capable of working with datasets, KV stores and request queues. */ @@ -293,5 +297,6 @@ export interface StorageClient { requestQueue(id: string, options?: RequestQueueOptions): RequestQueueClient; purge?(): Promise; teardown?(): Promise; + setStatusMessage?(message: string, options?: SetStatusMessageOptions): Promise; stats?: { rateLimitErrors: number[] }; }