Skip to content

Commit

Permalink
fix: handle redirect cookies (#1521)
Browse files: browse the repository at this point in the history
  • Loading branch information
szmarczak authored Sep 7, 2022
1 parent 6bfe1ce commit 2f7fc7c
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 4 deletions.
22 changes: 18 additions & 4 deletions packages/http-crawler/src/internals/http-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import type { Awaitable, Dictionary } from '@crawlee/types';
import type { RequestLike, ResponseLike } from 'content-type';
import contentTypeParser from 'content-type';
import mime from 'mime-types';
import type { OptionsInit, Method, Request as GotRequest, Response as GotResponse, GotOptionsInit } from 'got-scraping';
import type { OptionsInit, Method, Request as GotRequest, Response as GotResponse, GotOptionsInit, Options } from 'got-scraping';
import { gotScraping, TimeoutError } from 'got-scraping';
import type { JsonValue } from 'type-fest';
import { extname } from 'node:path';
Expand Down Expand Up @@ -539,7 +539,10 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
gotOptions.headers ??= {};
Reflect.deleteProperty(gotOptions.headers, 'Cookie');
Reflect.deleteProperty(gotOptions.headers, 'cookie');
gotOptions.headers.Cookie = mergedCookie;

if (mergedCookie !== '') {
gotOptions.headers.Cookie = mergedCookie;
}
}

/**
Expand All @@ -551,7 +554,7 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
const opts = this._getRequestOptions(request, session, proxyUrl, gotOptions);

try {
return await this._requestAsBrowser(opts);
return await this._requestAsBrowser(opts, session);
} catch (e) {
if (e instanceof TimeoutError) {
this._handleRequestTimeout(session);
Expand Down Expand Up @@ -729,10 +732,21 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
/**
* @internal wraps public utility for mocking purposes
*/
private _requestAsBrowser = (options: OptionsInit & { isStream: true }) => {
private _requestAsBrowser = (options: OptionsInit & { isStream: true }, session?: Session) => {
return new Promise<IncomingMessage>((resolve, reject) => {
const stream = gotScraping(options);

stream.on('redirect', (updatedOptions: Options, redirectResponse: IncomingMessage) => {
if (this.persistCookiesPerSession) {
session!.setCookiesFromResponse(redirectResponse);

const cookieString = session!.getCookieString(updatedOptions.url!.toString());
if (cookieString !== '') {
updatedOptions.headers.Cookie = cookieString;
}
}
});

stream.on('error', reject);
stream.on('response', () => {
resolve(addResponsePropertiesToStream(stream));
Expand Down
80 changes: 80 additions & 0 deletions test/core/crawlers/http_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,25 @@ router.set('/invalidContentType', (req, res) => {
res.end(`<html><head><title>Example Domain</title></head></html>`);
});

// Test route: answers with a 302 redirect to `/cookies` while setting the
// `foo=bar` cookie, so a client has to carry that cookie across the redirect.
router.set('/redirectAndCookies', (_request, response) => {
    response.statusCode = 302;

    response.setHeader('content-type', 'text/html');
    response.setHeader('set-cookie', 'foo=bar');
    response.setHeader('location', '/cookies');

    response.end();
});

// Test route: echoes the incoming `cookie` request header back as a JSON string.
// When no cookie header was sent, `JSON.stringify(undefined)` yields `undefined`,
// so the response body ends up empty.
router.set('/cookies', (request, response) => {
    const receivedCookie = request.headers.cookie;

    response.setHeader('content-type', 'text/html');
    response.end(JSON.stringify(receivedCookie));
});

// Test route: answers with a 302 redirect to `/cookies` without setting any cookie.
router.set('/redirectWithoutCookies', (_request, response) => {
    response.statusCode = 302;
    response.setHeader('location', '/cookies');
    response.end();
});

// HTTP server instance used by this test file.
// NOTE(review): presumably created and closed in setup/teardown hooks outside this view — confirm.
let server: http.Server;
// Base URL that tests prepend to route paths when building start URLs (e.g. `${url}/cookies`).
let url: string;

Expand Down Expand Up @@ -151,3 +170,64 @@ test('invalid content type defaults to octet-stream', async () => {
},
]);
});

// A cookie set by a redirect response must be sent along with the follow-up
// request: `/redirectAndCookies` sets `foo=bar` and redirects to `/cookies`,
// which echoes back whatever cookie header it received.
test('handles cookies from redirects', async () => {
    const echoedCookies: string[] = [];

    const crawler = new HttpCrawler({
        sessionPoolOptions: { maxPoolSize: 1 },
        handlePageFunction: async ({ body }) => {
            const parsedBody = JSON.parse(body.toString());
            echoedCookies.push(parsedBody);
        },
    });

    const startUrl = `${url}/redirectAndCookies`;
    await crawler.run([startUrl]);

    expect(echoedCookies).toStrictEqual(['foo=bar']);
});

// Following a redirect that sets no cookie must not add an empty `Cookie`
// header: the `/cookies` route replies with an empty body when it received no
// cookie header, and with a JSON string (possibly `""`) when it received one.
test('handles cookies from redirects - no empty cookie header', async () => {
    const results: string[] = [];
    // Counts handler invocations so the "no results" assertion below cannot
    // pass vacuously when the request fails and the handler never runs.
    let handledPages = 0;

    const crawler = new HttpCrawler({
        sessionPoolOptions: {
            maxPoolSize: 1,
        },
        handlePageFunction: async ({ body }) => {
            handledPages++;

            const str = body.toString();

            // An empty body means the server saw no cookie header at all.
            if (str !== '') {
                results.push(JSON.parse(str));
            }
        },
    });

    await crawler.run([`${url}/redirectWithoutCookies`]);

    expect(handledPages).toBe(1);
    expect(results).toStrictEqual([]);
});

// A plain request with no cookies stored must not carry an empty `Cookie`
// header: the `/cookies` route replies with an empty body when it received no
// cookie header, and with a JSON string (possibly `""`) when it received one.
test('no empty cookie header', async () => {
    const results: string[] = [];
    // Counts handler invocations so the "no results" assertion below cannot
    // pass vacuously when the request fails and the handler never runs.
    let handledPages = 0;

    const crawler = new HttpCrawler({
        sessionPoolOptions: {
            maxPoolSize: 1,
        },
        handlePageFunction: async ({ body }) => {
            handledPages++;

            const str = body.toString();

            // An empty body means the server saw no cookie header at all.
            if (str !== '') {
                results.push(JSON.parse(str));
            }
        },
    });

    await crawler.run([`${url}/cookies`]);

    expect(handledPages).toBe(1);
    expect(results).toStrictEqual([]);
});

0 comments on commit 2f7fc7c

Please sign in to comment.