Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: cookie monster is no more #1414

Merged
merged 5 commits into from
Jul 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 7 additions & 1 deletion packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import type {
SessionPoolOptions,
} from '@crawlee/core';
import {
mergeCookies,
AutoscaledPool,
Configuration,
type CrawlingContext,
Expand Down Expand Up @@ -1062,7 +1063,12 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
}

protected _getCookieHeaderFromRequest(request: Request) {
return request.headers?.Cookie ?? request.headers?.cookie ?? '';
if (request.headers?.Cookie && request.headers?.cookie) {
this.log.warning(`Encountered mixed casing for the cookie headers for request ${request.url} (${request.id}). Their values will be merged.`);
return mergeCookies(request.url, [request.headers.cookie, request.headers.Cookie]);
}

return request.headers?.Cookie || request.headers?.cookie || '';
}
}

Expand Down
3 changes: 1 addition & 2 deletions packages/browser-crawler/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@
"@crawlee/core": "^3.0.1",
"@crawlee/types": "^3.0.1",
"@crawlee/utils": "^3.0.1",
"ow": "^0.28.1",
"tough-cookie": "^4.0.0"
"ow": "^0.28.1"
}
}
10 changes: 6 additions & 4 deletions packages/browser-crawler/src/internals/browser-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import type {
Session,
} from '@crawlee/core';
import {
cookieStringToToughCookie,
enqueueLinks,
EVENT_SESSION_RETIRED,
handleRequestTimeout,
Expand All @@ -29,7 +30,6 @@ import type {
import { BROWSER_CONTROLLER_EVENTS, BrowserPool } from '@crawlee/browser-pool';
import type { GotOptionsInit, Response as GotResponse } from 'got-scraping';
import ow from 'ow';
import { Cookie } from 'tough-cookie';
import type { BatchAddRequestsResult, Cookie as CookieObject } from '@crawlee/types';
import type { BrowserLaunchContext } from './browser-launcher';

Expand Down Expand Up @@ -544,16 +544,18 @@ export abstract class BrowserCrawler<

protected async _applyCookies({ session, request, page, browserController }: Context, preHooksCookies: string, postHooksCookies: string) {
const sessionCookie = session?.getCookies(request.url) ?? [];
const parsedPreHooksCookies = preHooksCookies.split(/ *; */).map((c) => Cookie.parse(c)?.toJSON());
const parsedPostHooksCookies = postHooksCookies.split(/ *; */).map((c) => Cookie.parse(c)?.toJSON());
const parsedPreHooksCookies = preHooksCookies.split(/ *; */).map((c) => cookieStringToToughCookie(c));
const parsedPostHooksCookies = postHooksCookies.split(/ *; */).map((c) => cookieStringToToughCookie(c));

await browserController.setCookies(
page,
[
...sessionCookie,
...parsedPreHooksCookies,
...parsedPostHooksCookies,
].filter((c): c is CookieObject => typeof c !== 'undefined'),
]
.filter((c): c is CookieObject => typeof c !== 'undefined' && c !== null)
.map((c) => ({ ...c, url: c.domain ? undefined : request.url })),
);
}

Expand Down
48 changes: 46 additions & 2 deletions packages/cheerio-crawler/src/internals/cheerio-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -611,11 +611,52 @@ export class CheerioCrawler extends BasicCrawler<CheerioCrawlingContext> {
*/
private _applyCookies({ session, request }: CrawlingContext, gotOptions: OptionsInit, preHookCookies: string, postHookCookies: string) {
const sessionCookie = session?.getCookieString(request.url) ?? '';
const alteredGotOptionsCookies = (gotOptions.headers?.Cookie ?? gotOptions.headers?.cookie ?? '') as string;
let alteredGotOptionsCookies = (gotOptions.headers?.Cookie || gotOptions.headers?.cookie || '');

const mergedCookie = mergeCookies(request.url, [sessionCookie, preHookCookies, alteredGotOptionsCookies, postHookCookies]);
if (gotOptions.headers?.Cookie && gotOptions.headers?.cookie) {
const {
Cookie: upperCaseHeader,
cookie: lowerCaseHeader,
} = gotOptions.headers;

// eslint-disable-next-line max-len
this.log.warning(`Encountered mixed casing for the cookie headers in the got options for request ${request.url} (${request.id}). Their values will be merged`);

const sourceCookies = [];

if (Array.isArray(lowerCaseHeader)) {
sourceCookies.push(...lowerCaseHeader);
} else {
sourceCookies.push(lowerCaseHeader);
}

if (Array.isArray(upperCaseHeader)) {
sourceCookies.push(...upperCaseHeader);
} else {
sourceCookies.push(upperCaseHeader);
}

alteredGotOptionsCookies = mergeCookies(request.url, sourceCookies);
}

const sourceCookies = [
sessionCookie,
preHookCookies,
];

if (Array.isArray(alteredGotOptionsCookies)) {
sourceCookies.push(...alteredGotOptionsCookies);
} else {
sourceCookies.push(alteredGotOptionsCookies);
}

sourceCookies.push(postHookCookies);

const mergedCookie = mergeCookies(request.url, sourceCookies);

gotOptions.headers ??= {};
Reflect.deleteProperty(gotOptions.headers, 'Cookie');
Reflect.deleteProperty(gotOptions.headers, 'cookie');
gotOptions.headers.Cookie = mergedCookie;
}

Expand Down Expand Up @@ -690,6 +731,9 @@ export class CheerioCrawler extends BasicCrawler<CheerioCrawlingContext> {
isStream: true,
};

// Delete any possible lowercased header for cookie as they are merged in _applyCookies under the uppercase Cookie header
Reflect.deleteProperty(requestOptions.headers!, 'cookie');

// TODO this is incorrect, the check for man in the middle needs to be done
// on individual proxy level, not on the `proxyConfiguration` level,
// because users can use normal + MITM proxies in a single configuration.
Expand Down
124 changes: 124 additions & 0 deletions packages/core/src/cookie_utils.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import type { BrowserLikeResponse, Dictionary, Cookie as CookieObject } from '@crawlee/types';
import type { IncomingMessage } from 'node:http';
import { Cookie, CookieJar } from 'tough-cookie';
import { CookieParseError } from './session_pool/errors';
import { log } from './log';

/**
* @internal
*/
export function getCookiesFromResponse(response: IncomingMessage | BrowserLikeResponse | { headers: Dictionary<string | string[]> }): Cookie[] {
const headers = typeof response.headers === 'function' ? response.headers() : response.headers;
const cookieHeader = headers['set-cookie'] || '';

try {
return Array.isArray(cookieHeader)
? cookieHeader.map((cookie) => Cookie.parse(cookie)!)
: [Cookie.parse(cookieHeader)!];
} catch (e) {
throw new CookieParseError(cookieHeader);
}
}

/**
* Calculate cookie expiration date
* @param maxAgeSecs
* @returns Calculated date by session max age seconds.
* @internal
*/
export function getDefaultCookieExpirationDate(maxAgeSecs: number) {
return new Date(Date.now() + (maxAgeSecs * 1000));
}

/**
* Transforms tough-cookie to puppeteer cookie.
* @param toughCookie Cookie from CookieJar
* @return Cookie compatible with browser pool
* @internal
*/
export function toughCookieToBrowserPoolCookie(toughCookie: Cookie): CookieObject {
return {
name: toughCookie.key,
value: toughCookie.value,
// Puppeteer and Playwright expect 'expires' to be 'Unix time in seconds', not ms
// If there is no expires date (so defaults to Infinity), we don't provide it to the browsers
expires: toughCookie.expires === 'Infinity' ? undefined : new Date(toughCookie.expires).getTime() / 1000,
domain: toughCookie.domain ?? undefined,
path: toughCookie.path ?? undefined,
secure: toughCookie.secure,
httpOnly: toughCookie.httpOnly,
};
}

/**
* Transforms browser-pool cookie to tough-cookie.
* @param cookieObject Cookie object (for instance from the `page.cookies` method).
* @internal
*/
export function browserPoolCookieToToughCookie(cookieObject: CookieObject, maxAgeSecs: number) {
const isExpiresValid = cookieObject.expires && typeof cookieObject.expires === 'number' && cookieObject.expires > 0;
const expires = isExpiresValid ? new Date(cookieObject.expires! * 1000) : getDefaultCookieExpirationDate(maxAgeSecs);
const domain = typeof cookieObject.domain === 'string' && cookieObject.domain.startsWith('.')
? cookieObject.domain.slice(1)
: cookieObject.domain;

return new Cookie({
key: cookieObject.name,
value: cookieObject.value,
expires,
domain,
path: cookieObject.path,
secure: cookieObject.secure,
httpOnly: cookieObject.httpOnly,
});
}

/**
* @internal
* @param cookieString The cookie string to attempt parsing
* @returns Browser pool compatible cookie, or null if cookie cannot be parsed
*/
export function cookieStringToToughCookie(cookieString: string) {
const parsed = Cookie.parse(cookieString);

if (parsed) {
return toughCookieToBrowserPoolCookie(parsed);
}

return null;
}

/**
* Merges multiple cookie strings. Keys are compared case-sensitively, warning will be logged
* if we see two cookies with same keys but different casing.
* @internal
*/
export function mergeCookies(url: string, sourceCookies: string[]): string {
const jar = new CookieJar();

// ignore empty cookies
for (const sourceCookieString of sourceCookies) {
// ignore empty cookies
if (!sourceCookieString) continue;

const cookies = sourceCookieString.split(/ *; */);

for (const cookieString of cookies) {
// ignore extra spaces
if (!cookieString) continue;

const cookie = Cookie.parse(cookieString)!;
const similarKeyCookie = jar.getCookiesSync(url).find((c) => {
return cookie.key !== c.key && cookie.key.toLowerCase() === c.key.toLowerCase();
});

if (similarKeyCookie) {
log.deprecated(`Found cookies with similar name during cookie merging: '${cookie.key}' and '${similarKeyCookie.key}'`);
}

jar.setCookieSync(cookie, url);
}
}

return jar.getCookieStringSync(url);
}
55 changes: 0 additions & 55 deletions packages/core/src/crawlers/crawler_utils.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import { TimeoutError } from '@apify/timeout';
import { Cookie, CookieJar } from 'tough-cookie';
import { log } from '../log';
import type { Session } from '../session_pool/session';

/**
Expand All @@ -13,56 +11,3 @@ export function handleRequestTimeout({ session, errorMessage }: { session?: Sess
const timeoutSecs = Number(timeoutMillis) / 1000;
throw new TimeoutError(`Navigation timed out after ${timeoutSecs} seconds.`);
}

/**
* Merges multiple cookie strings. Keys are compared case-sensitively, warning will be logged
* if we see two cookies with same keys but different casing.
* @internal
*/
export function mergeCookies(url: string, sourceCookies: string[]): string {
const jar = new CookieJar();

// ignore empty cookies
for (const sourceCookieString of sourceCookies.filter((c) => c)) {
const cookies = sourceCookieString.split(/ *; */).filter((c) => c); // ignore extra spaces

for (const cookieString of cookies) {
const cookie = Cookie.parse(cookieString)!;
const similarKeyCookie = jar.getCookiesSync(url).find((c) => {
return cookie.key !== c.key && cookie.key.toLowerCase() === c.key.toLowerCase();
});

if (similarKeyCookie) {
log.deprecated(`Found cookies with similar name during cookie merging: '${cookie.key}' and '${similarKeyCookie.key}'`);
}

jar.setCookieSync(cookie, url);
}
}

return jar.getCookieStringSync(url);
}

/**
* @internal
*/
export function diffCookies(url: string, cookieString1 = '', cookieString2 = ''): string {
if (cookieString1 === cookieString2 || !cookieString2) {
return '';
}

if (!cookieString1) {
return cookieString2;
}

const cookies1 = cookieString1.split(/ *; */).filter((item) => Boolean(item)).map((cookie) => Cookie.parse(cookie)!);
const cookies2 = cookieString2.split(/ *; */).filter((item) => Boolean(item)).map((cookie) => Cookie.parse(cookie)!);

const added = cookies2.filter((newCookie) => {
return !cookies1.find((oldCookie) => newCookie.toString() === oldCookie.toString());
});
const jar = new CookieJar();
added.forEach((cookie) => jar.setCookieSync(cookie, url));

return jar.getCookieStringSync(url);
}
1 change: 1 addition & 0 deletions packages/core/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,6 @@ export * from './serialization';
export * from './session_pool';
export * from './storages';
export * from './validators';
export * from './cookie_utils';
export { PseudoUrl } from '@apify/pseudo_url';
export { Dictionary, Awaitable, Constructor, StorageClient } from '@crawlee/types';
1 change: 0 additions & 1 deletion packages/core/src/session_pool/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,3 @@ export * from './errors';
export * from './events';
export * from './session';
export * from './session_pool';
export * from './session_utils';
Loading