Skip to content

Commit

Permalink
feat: don't close page before calling errorHandler (#1548)
Browse files Browse the repository at this point in the history
  • Loading branch information
szmarczak authored Sep 19, 2022
1 parent a4ceef0 commit 1c8cd82
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 23 deletions.
10 changes: 10 additions & 0 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -798,6 +798,14 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
return this.requestQueue.fetchNextRequest();
}

/**
 * Hook executed after a request finishes — both when the request handler
 * succeeds and after `errorHandler` has completed for a failed attempt.
 * It is awaited inside the `finally` clause of the request-handling
 * wrapper, so it always runs before the crawling context is discarded.
 *
 * The base implementation is a no-op; subclasses (e.g. browser crawlers)
 * override it to release per-request resources such as orphaned browser
 * pages.
 *
 * @param _crawlingContext The context of the request that just finished.
 */
protected async _cleanupContext(
_crawlingContext: Context,
) {}

/**
* Wrapper around requestHandler that fetches requests from RequestList/RequestQueue
* then retries them in a case of an error, etc.
Expand Down Expand Up @@ -916,6 +924,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
throw secondaryError;
}
} finally {
await this._cleanupContext(crawlingContext);

this.crawlingContexts.delete(crawlingContext.id);
}
}
Expand Down
47 changes: 24 additions & 23 deletions packages/browser-crawler/src/internals/browser-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,11 @@ export abstract class BrowserCrawler<
});
}

/**
 * Releases the browser page owned by a finished crawling context.
 * Invoked by the base crawler after the request handler succeeds or
 * after `errorHandler` has run, so handlers still see an open page.
 * A failure to close the page is logged at debug level and ignored.
 */
protected override async _cleanupContext(crawlingContext: Context): Promise<void> {
    const logCloseFailure = (error: Error) => this.log.debug('Error while closing page', { error });
    await crawlingContext.page.close().catch(logCloseFailure);
}

/**
* Wrapper around requestHandler that opens and closes pages etc.
*/
Expand Down Expand Up @@ -466,34 +471,30 @@ export abstract class BrowserCrawler<
// So we must not save the session prior to making sure it was used only once, otherwise we would use it twice.
const { request, session } = crawlingContext;

try {
if (!request.skipNavigation) {
await this._handleNavigation(crawlingContext);
tryCancel();
if (!request.skipNavigation) {
await this._handleNavigation(crawlingContext);
tryCancel();

await this._responseHandler(crawlingContext);
tryCancel();
await this._responseHandler(crawlingContext);
tryCancel();

// save cookies
// TODO: Should we save the cookies also after/only the handle page?
if (this.persistCookiesPerSession) {
const cookies = await crawlingContext.browserController.getCookies(page);
tryCancel();
session?.setCookies(cookies, request.loadedUrl!);
}
// save cookies
// TODO: Should we save the cookies also after/only the handle page?
if (this.persistCookiesPerSession) {
const cookies = await crawlingContext.browserController.getCookies(page);
tryCancel();
session?.setCookies(cookies, request.loadedUrl!);
}
}

await addTimeoutToPromise(
() => Promise.resolve(this.userProvidedRequestHandler(crawlingContext)),
this.requestHandlerTimeoutMillis,
`requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds.`,
);
tryCancel();
await addTimeoutToPromise(
() => Promise.resolve(this.userProvidedRequestHandler(crawlingContext)),
this.requestHandlerTimeoutMillis,
`requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds.`,
);
tryCancel();

if (session) session.markGood();
} finally {
await page.close().catch((error: Error) => this.log.debug('Error while closing page', { error }));
}
if (session) session.markGood();
}

protected _enhanceCrawlingContextWithPageInfo(crawlingContext: Context, page: CommonPage, createNewSession?: boolean): void {
Expand Down
29 changes: 29 additions & 0 deletions test/core/crawlers/browser_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,35 @@ describe('BrowserCrawler', () => {
expect(isEvaluated).toBeTruthy();
});

test('errorHandler has open page', async () => {
const requestList = await RequestList.open({
sources: [
{ url: 'http://example.com/?q=1' },
],
});

const result: string[] = [];

const browserCrawler = new BrowserCrawlerTest({
browserPoolOptions: {
browserPlugins: [puppeteerPlugin],
},
requestList,
requestHandler: async (ctx) => {
throw new Error('Test error');
},
maxRequestRetries: 1,
errorHandler: async (ctx, error) => {
result.push(await ctx.page.evaluate(() => window.location.origin));
},
});

await browserCrawler.run();

expect(result.length).toBe(1);
expect(result[0]).toBe('http://example.com');
});

test('should allow modifying gotoOptions by pre navigation hooks', async () => {
const requestList = await RequestList.open({
sources: [
Expand Down

0 comments on commit 1c8cd82

Please sign in to comment.