From 2f7fc7cc1d27553d94a915667f0e6d2af599a80c Mon Sep 17 00:00:00 2001 From: Szymon Marczak <36894700+szmarczak@users.noreply.github.com> Date: Wed, 7 Sep 2022 14:37:34 +0200 Subject: [PATCH] fix: handle redirect cookies (#1521) --- .../src/internals/http-crawler.ts | 22 ++++- test/core/crawlers/http_crawler.test.ts | 80 +++++++++++++++++++ 2 files changed, 98 insertions(+), 4 deletions(-) diff --git a/packages/http-crawler/src/internals/http-crawler.ts b/packages/http-crawler/src/internals/http-crawler.ts index a386d9afb752..b84d35b0fa59 100644 --- a/packages/http-crawler/src/internals/http-crawler.ts +++ b/packages/http-crawler/src/internals/http-crawler.ts @@ -23,7 +23,7 @@ import type { Awaitable, Dictionary } from '@crawlee/types'; import type { RequestLike, ResponseLike } from 'content-type'; import contentTypeParser from 'content-type'; import mime from 'mime-types'; -import type { OptionsInit, Method, Request as GotRequest, Response as GotResponse, GotOptionsInit } from 'got-scraping'; +import type { OptionsInit, Method, Request as GotRequest, Response as GotResponse, GotOptionsInit, Options } from 'got-scraping'; import { gotScraping, TimeoutError } from 'got-scraping'; import type { JsonValue } from 'type-fest'; import { extname } from 'node:path'; @@ -539,7 +539,10 @@ export class HttpCrawler { + private _requestAsBrowser = (options: OptionsInit & { isStream: true }, session?: Session) => { return new Promise((resolve, reject) => { const stream = gotScraping(options); + stream.on('redirect', (updatedOptions: Options, redirectResponse: IncomingMessage) => { + if (this.persistCookiesPerSession) { + session!.setCookiesFromResponse(redirectResponse); + + const cookieString = session!.getCookieString(updatedOptions.url!.toString()); + if (cookieString !== '') { + updatedOptions.headers.Cookie = cookieString; + } + } + }); + stream.on('error', reject); stream.on('response', () => { resolve(addResponsePropertiesToStream(stream)); diff --git a/test/core/crawlers/http_crawler.test.ts b/test/core/crawlers/http_crawler.test.ts index 2fed6b46c7f4..39d89e4519dc 100644 --- a/test/core/crawlers/http_crawler.test.ts +++ b/test/core/crawlers/http_crawler.test.ts @@ -22,6 +22,25 @@ router.set('/invalidContentType', (req, res) => { res.end(`Example Domain`); }); +router.set('/redirectAndCookies', (req, res) => { + res.setHeader('content-type', 'text/html'); + res.setHeader('set-cookie', 'foo=bar'); + res.setHeader('location', '/cookies'); + res.statusCode = 302; + res.end(); +}); + +router.set('/cookies', (req, res) => { + res.setHeader('content-type', 'text/html'); + res.end(JSON.stringify(req.headers.cookie)); +}); + +router.set('/redirectWithoutCookies', (req, res) => { + res.setHeader('location', '/cookies'); + res.statusCode = 302; + res.end(); +}); + let server: http.Server; let url: string; @@ -151,3 +170,64 @@ test('invalid content type defaults to octet-stream', async () => { }, ]); }); + +test('handles cookies from redirects', async () => { + const results: string[] = []; + + const crawler = new HttpCrawler({ + sessionPoolOptions: { + maxPoolSize: 1, + }, + handlePageFunction: async ({ body }) => { + results.push(JSON.parse(body.toString())); + }, + }); + + await crawler.run([`${url}/redirectAndCookies`]); + + expect(results).toStrictEqual([ + 'foo=bar', + ]); +}); + +test('handles cookies from redirects - no empty cookie header', async () => { + const results: string[] = []; + + const crawler = new HttpCrawler({ + sessionPoolOptions: { + maxPoolSize: 1, + }, + handlePageFunction: async ({ body }) => { + const str = body.toString(); + + if (str !== '') { + results.push(JSON.parse(str)); + } + }, + }); + + await crawler.run([`${url}/redirectWithoutCookies`]); + + expect(results).toStrictEqual([]); +}); + +test('no empty cookie header', async () => { + const results: string[] = []; + + const crawler = new HttpCrawler({ + sessionPoolOptions: { + maxPoolSize: 1, + }, + handlePageFunction: async ({ body }) => { + const str = body.toString(); + + if (str !== '') { + results.push(JSON.parse(str)); + } + }, + }); + + await crawler.run([`${url}/cookies`]); + + expect(results).toStrictEqual([]); +});