feat: add parseWithCheerio for puppeteer & playwright #1418

Merged 4 commits on Jul 23, 2022
1 change: 1 addition & 0 deletions packages/playwright-crawler/package.json
@@ -59,6 +59,7 @@
"@crawlee/browser-pool": "^3.0.1",
"@crawlee/core": "^3.0.1",
"@crawlee/utils": "^3.0.1",
"cheerio": "1.0.0-rc.12",
"jquery": "^3.6.0",
"ow": "^0.28.1"
},
@@ -25,7 +25,8 @@ import { LruCache } from '@apify/datastructures';
import log_ from '@apify/log';
import type { Request } from '@crawlee/core';
import { validators } from '@crawlee/core';
import type { Dictionary } from '@crawlee/utils';
import type { CheerioRoot, Dictionary } from '@crawlee/utils';
import * as cheerio from 'cheerio';
import type { PlaywrightCrawlingContext } from '../playwright-crawler';

const log = log_.child({ prefix: 'Playwright Utils' });
@@ -191,19 +192,38 @@ export async function gotoExtended(page: Page, request: Request, gotoOptions: Dictionary
return page.goto(url, gotoOptions);
}

/**
* Returns a Cheerio handle for `page.content()`, allowing you to work with the data the same way as with {@link CheerioCrawler}.
*
* **Example usage:**
* ```javascript
* const $ = await playwrightUtils.parseWithCheerio(page);
* const title = $('title').text();
* ```
*
* @param page Playwright [`Page`](https://playwright.dev/docs/api/class-page) object.
*/
export async function parseWithCheerio(page: Page): Promise<CheerioRoot> {
const pageContent = await page.content();
return cheerio.load(pageContent);
}

export interface PlaywrightContextUtils {
injectFile(filePath: string, options?: InjectFileOptions): Promise<unknown>;
injectJQuery(): Promise<unknown>;
parseWithCheerio(): Promise<CheerioRoot>;
}

export function registerUtilsToContext(context: PlaywrightCrawlingContext): void {
context.injectFile = (filePath: string, options?: InjectFileOptions) => injectFile(context.page, filePath, options);
context.injectJQuery = () => injectJQuery(context.page);
context.parseWithCheerio = () => parseWithCheerio(context.page);
}

/** @internal */
export const playwrightUtils = {
injectFile,
injectJQuery,
gotoExtended,
parseWithCheerio,
};
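
Since `registerUtilsToContext` puts the new helper on the crawling context, a request handler can call `parseWithCheerio()` directly instead of going through `playwrightUtils`. Below is a minimal sketch of how that might look in a `PlaywrightCrawler`; the `crawlee` import specifier, start URL, and selectors are illustrative assumptions rather than part of this diff.

```ts
import { PlaywrightCrawler } from 'crawlee';

const crawler = new PlaywrightCrawler({
    async requestHandler({ request, parseWithCheerio, log }) {
        // Parse the rendered page.content() with Cheerio,
        // the same way CheerioCrawler exposes its $ handle.
        const $ = await parseWithCheerio();

        const title = $('title').text();
        const links = $('a[href]')
            .map((_, el) => $(el).attr('href'))
            .get();

        log.info(`${request.url}: "${title}" (${links.length} links)`);
    },
});

await crawler.run(['https://crawlee.dev']);
```

Because the handle is built from `page.content()`, it reflects the DOM after client-side rendering, which is the main difference from fetching raw HTML with `CheerioCrawler`.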
1 change: 1 addition & 0 deletions packages/puppeteer-crawler/package.json
@@ -59,6 +59,7 @@
"@crawlee/browser-pool": "^3.0.1",
"@crawlee/types": "^3.0.1",
"@crawlee/utils": "^3.0.1",
"cheerio": "1.0.0-rc.12",
"jquery": "^3.6.0",
"ow": "^0.28.1"
},
39 changes: 30 additions & 9 deletions packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts
@@ -27,6 +27,8 @@ import log_ from '@apify/log';
import type { Request } from '@crawlee/core';
import { KeyValueStore, validators } from '@crawlee/core';
import type { Dictionary, BatchAddRequestsResult } from '@crawlee/types';
import type { CheerioRoot } from '@crawlee/utils';
import * as cheerio from 'cheerio';
import type { EnqueueLinksByClickingElementsOptions } from '../enqueue-links/click-elements';
import { enqueueLinksByClickingElements } from '../enqueue-links/click-elements';
import type { InterceptHandler } from './puppeteer_request_interception';
@@ -106,7 +108,7 @@ const injectedFilesCache = new LruCache({ maxLength: MAX_INJECT_FILE_CACHE_SIZE
*
* File contents are cached for up to 10 files to limit file system access.
*
* @param page Puppeteer [`Page`](https://pptr.dev/#?product=Puppeteer&show=api-class-page) object.
* @param page Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object.
* @param filePath File path
* @param [options]
*/
@@ -152,16 +154,32 @@ export async function injectFile(page: Page, filePath: string, options: InjectFileOptions
* ```
*
Note that `injectJQuery()` does not affect Puppeteer's
* [`page.$()`](https://pptr.dev/#?product=Puppeteer&show=api-pageselector)
* [`page.$()`](https://pptr.dev/api/puppeteer.page._/)
* function in any way.
*
* @param page Puppeteer [`Page`](https://pptr.dev/#?product=Puppeteer&show=api-class-page) object.
* @param page Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object.
*/
export function injectJQuery(page: Page): Promise<unknown> {
ow(page, ow.object.validate(validators.browserPage));
return injectFile(page, jqueryPath, { surviveNavigations: true });
}

/**
* Returns a Cheerio handle for `page.content()`, allowing you to work with the data the same way as with {@link CheerioCrawler}.
*
* **Example usage:**
* ```javascript
* const $ = await puppeteerUtils.parseWithCheerio(page);
* const title = $('title').text();
* ```
*
* @param page Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object.
*/
export async function parseWithCheerio(page: Page): Promise<CheerioRoot> {
const pageContent = await page.content();
return cheerio.load(pageContent);
}

/**
* Forces the Puppeteer browser tab to block loading URLs that match a provided pattern.
* This is useful to speed up crawling of websites, since it reduces the amount
@@ -201,7 +219,7 @@ export function injectJQuery(page: Page): Promise<unknown> {
* await page.goto('https://cnn.com');
* ```
*
* @param page Puppeteer [`Page`](https://pptr.dev/#?product=Puppeteer&show=api-class-page) object.
* @param page Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object.
* @param [options]
*/
export async function blockRequests(page: Page, options: BlockRequestsOptions = {}): Promise<void> {
@@ -249,7 +267,7 @@ export const blockResources = async (page: Page, resourceTypes = ['stylesheet',
*IMPORTANT*: Caching responses stores them in memory, so overly loose rules could cause memory leaks for longer-running crawlers.
This issue should be resolved or at least mitigated in future iterations of this feature.
* @param page
* Puppeteer [`Page`](https://pptr.dev/#?product=Puppeteer&show=api-class-page) object.
* Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object.
* @param cache
* Object in which responses are stored
* @param responseUrlRules
@@ -312,7 +330,7 @@ export async function cacheResponses(page: Page, cache: Dictionary<Partial<Respo
* request: Request,
* }
* ```
* Where `page` is a Puppeteer [`Page`](https://pptr.dev/#?product=Puppeteer&show=api-class-page)
* Where `page` is a Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page)
* and `request` is a {@link Request}.
*
* The function is compiled by using the `scriptString` parameter as the function's body,
@@ -354,7 +372,7 @@ export function compileScript(scriptString: string, context: Dictionary = Object
* *NOTE:* In recent versions of Puppeteer using requests other than GET, overriding headers and adding payloads disables
* browser cache which degrades performance.
*
* @param page Puppeteer [`Page`](https://pptr.dev/#?product=Puppeteer&show=api-class-page) object.
* @param page Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object.
* @param request
* @param [gotoOptions] Custom options for `page.goto()`.
*/
@@ -432,7 +450,7 @@ export interface InfiniteScrollOptions {
/**
* Scrolls to the bottom of a page, or until it times out.
* Loads dynamic content when it hits the bottom of a page, and then continues scrolling.
* @param page Puppeteer [`Page`](https://pptr.dev/#?product=Puppeteer&show=api-class-page) object.
* @param page Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object.
* @param [options]
*/
export async function infiniteScroll(page: Page, options: InfiniteScrollOptions = {}): Promise<void> {
@@ -572,7 +590,7 @@ export interface SaveSnapshotOptions {

/**
* Saves a full screenshot and HTML of the current page into a Key-Value store.
* @param page Puppeteer [`Page`](https://pptr.dev/#?product=Puppeteer&show=api-class-page) object.
* @param page Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object.
* @param [options]
*/
export async function saveSnapshot(page: Page, options: SaveSnapshotOptions = {}): Promise<void> {
@@ -615,6 +633,7 @@ export async function saveSnapshot(page: Page, options: SaveSnapshotOptions = {}
export interface PuppeteerContextUtils {
injectFile(filePath: string, options?: InjectFileOptions): Promise<unknown>;
injectJQuery(): Promise<unknown>;
parseWithCheerio(): Promise<CheerioRoot>;
enqueueLinksByClickingElements(options: Omit<EnqueueLinksByClickingElementsOptions, 'page' | 'requestQueue'>): Promise<BatchAddRequestsResult>;
blockRequests(options?: BlockRequestsOptions): Promise<void>;
blockResources(resourceTypes?: string[]): Promise<void>;
Expand All @@ -630,6 +649,7 @@ export interface PuppeteerContextUtils {
export function registerUtilsToContext(context: PuppeteerCrawlingContext): void {
context.injectFile = (filePath: string, options?: InjectFileOptions) => injectFile(context.page, filePath, options);
context.injectJQuery = () => injectJQuery(context.page);
context.parseWithCheerio = () => parseWithCheerio(context.page);
context.enqueueLinksByClickingElements = (options: Omit<EnqueueLinksByClickingElementsOptions, 'page' | 'requestQueue'>) => enqueueLinksByClickingElements({
page: context.page,
requestQueue: context.crawler.requestQueue!,
@@ -667,4 +687,5 @@ export const puppeteerUtils = {
removeInterceptRequestHandler,
infiniteScroll,
saveSnapshot,
parseWithCheerio,
};
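
The Puppeteer counterpart can also be used standalone against any `Page` you manage yourself, much like the JSDoc example above. A rough sketch in an ES module, assuming `puppeteer` is installed and `puppeteerUtils` is re-exported by the installed `crawlee` package; the URL and selector are illustrative only.

```ts
import puppeteer from 'puppeteer';
import { puppeteerUtils } from 'crawlee';

const browser = await puppeteer.launch({ headless: true });

try {
    const page = await browser.newPage();
    await page.goto('https://www.example.com');

    // Load the rendered HTML into Cheerio for jQuery-like querying,
    // without evaluating any extra script inside the browser.
    const $ = await puppeteerUtils.parseWithCheerio(page);

    console.log($('h1').text().trim());
} finally {
    await browser.close();
}
```

This mirrors what the new tests below exercise, just outside the test harness.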
18 changes: 17 additions & 1 deletion test/core/playwright_utils.test.ts
@@ -36,7 +36,7 @@ afterAll(() => {
server.close();
});

describe('Apify.utils.playwright', () => {
describe('playwrightUtils', () => {
let ll: number;
const localStorageEmulator = new MemoryStorageEmulator();

@@ -158,6 +158,22 @@ describe('Apify.utils.playwright', () => {
}
});

test('parseWithCheerio() works', async () => {
const browser = await launchName(launchContext);

try {
const page = await browser.newPage();
await page.goto('https://www.example.com');

const $ = await playwrightUtils.parseWithCheerio(page);

const title = $('h1').text().trim();
expect(title).toBe('Example Domain');
} finally {
await browser.close();
}
});

test('gotoExtended() works', async () => {
const browser = await chromium.launch({ headless: true });

18 changes: 17 additions & 1 deletion test/core/puppeteer_utils.test.ts
@@ -36,7 +36,7 @@ afterAll(() => {
server.close();
});

describe('utils.puppeteer', () => {
describe('puppeteerUtils', () => {
let ll: number;
const localStorageEmulator = new MemoryStorageEmulator();

@@ -157,6 +157,22 @@ describe('utils.puppeteer', () => {
}
});

test('parseWithCheerio() works', async () => {
const browser = await method(launchContext);

try {
const page = await browser.newPage();
await page.goto('https://www.example.com');

const $ = await puppeteerUtils.parseWithCheerio(page);

const title = $('h1').text().trim();
expect(title).toBe('Example Domain');
} finally {
await browser.close();
}
});

describe('blockRequests()', () => {
let browser: Browser = null;
beforeAll(async () => {