Skip to content

Commit

Permalink
fix(workers): Don't block connection to chrome when failing to download adblock list. hoarder-app#674
Browse files Browse the repository at this point in the history
  • Loading branch information
MohamedBassem committed Nov 21, 2024
1 parent 393d097 commit 378ad9b
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 117 deletions.
28 changes: 22 additions & 6 deletions apps/workers/crawlerWorker.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import assert from "assert";
import * as dns from "dns";
import { promises as fs } from "fs";
import * as path from "node:path";
import * as os from "os";
import type { Browser } from "puppeteer";
import { PuppeteerBlocker } from "@ghostery/adblocker-puppeteer";
import { Readability } from "@mozilla/readability";
import { Mutex } from "async-mutex";
import DOMPurify from "dompurify";
Expand All @@ -19,8 +22,8 @@ import metascraperReadability from "metascraper-readability";
import metascraperTitle from "metascraper-title";
import metascraperTwitter from "metascraper-twitter";
import metascraperUrl from "metascraper-url";
import fetch from "node-fetch";
import puppeteer from "puppeteer-extra";
import AdblockerPlugin from "puppeteer-extra-plugin-adblocker";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import { withTimeout } from "utils";
import { getBookmarkDetails, updateAsset } from "workerUtils";
Expand Down Expand Up @@ -67,6 +70,7 @@ const metascraperParser = metascraper([
]);

let globalBrowser: Browser | undefined;
let globalBlocker: PuppeteerBlocker | undefined;
// Guards the interactions with the browser instance.
// This is needed given that most of the browser APIs are async.
const browserMutex = new Mutex();
Expand Down Expand Up @@ -144,11 +148,20 @@ async function launchBrowser() {
export class CrawlerWorker {
static async build() {
puppeteer.use(StealthPlugin());
puppeteer.use(
AdblockerPlugin({
blockTrackersAndAnnoyances: true,
}),
);
if (serverConfig.crawler.enableAdblocker) {
try {
logger.info("[crawler] Loading adblocker ...");
globalBlocker = await PuppeteerBlocker.fromPrebuiltFull(fetch, {
path: path.join(os.tmpdir(), "hoarder_adblocker.bin"),
read: fs.readFile,
write: fs.writeFile,
});
} catch (e) {
logger.error(
`[crawler] Failed to load adblocker. Will not be blocking ads: ${e}`,
);
}
}
if (!serverConfig.crawler.browserConnectOnDemand) {
await launchBrowser();
} else {
Expand Down Expand Up @@ -238,6 +251,9 @@ async function crawlPage(jobId: string, url: string) {

try {
const page = await context.newPage();
if (globalBlocker) {
await globalBlocker.enableBlockingInPage(page);
}
await page.setUserAgent(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
);
Expand Down
3 changes: 2 additions & 1 deletion apps/workers/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"version": "0.1.0",
"private": true,
"dependencies": {
"@ghostery/adblocker-puppeteer": "^2.1.1",
"@hoarder/db": "workspace:^0.1.0",
"@hoarder/shared": "workspace:^0.1.0",
"@hoarder/trpc": "workspace:^0.1.0",
Expand All @@ -28,11 +29,11 @@
"metascraper-twitter": "^5.45.6",
"metascraper-url": "^5.45.22",
"node-cron": "^3.0.3",
"node-fetch": "^3.3.2",
"pdf2json": "^3.0.5",
"pdfjs-dist": "^4.0.379",
"puppeteer": "^22.0.0",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-adblocker": "^2.13.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"rss-parser": "^3.13.0",
"tesseract.js": "^5.1.1",
Expand Down
1 change: 1 addition & 0 deletions docs/docs/03-configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin
| CRAWLER_VIDEO_DOWNLOAD | No | false | Whether to download videos from the page or not (using yt-dlp) |
| CRAWLER_VIDEO_DOWNLOAD_MAX_SIZE | No | 50 | The maximum file size for the downloaded video. The quality will be chosen accordingly. Use -1 to disable the limit. |
| CRAWLER_VIDEO_DOWNLOAD_TIMEOUT_SEC | No | 600 | How long to wait for the video download to finish |
| CRAWLER_ENABLE_ADBLOCKER | No | true | Whether to enable an adblocker in the crawler or not. If you're facing troubles downloading the adblocking lists on worker startup, you can disable this. |

## OCR Configs

Expand Down
2 changes: 2 additions & 0 deletions packages/shared/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ const allEnv = z.object({
CRAWLER_VIDEO_DOWNLOAD: stringBool("false"),
CRAWLER_VIDEO_DOWNLOAD_MAX_SIZE: z.coerce.number().default(50),
CRAWLER_VIDEO_DOWNLOAD_TIMEOUT_SEC: z.coerce.number().default(10 * 60),
CRAWLER_ENABLE_ADBLOCKER: stringBool("true"),
MEILI_ADDR: z.string().optional(),
MEILI_MASTER_KEY: z.string().default(""),
LOG_LEVEL: z.string().default("debug"),
Expand Down Expand Up @@ -104,6 +105,7 @@ const serverConfigSchema = allEnv.transform((val) => {
downloadVideo: val.CRAWLER_VIDEO_DOWNLOAD,
maxVideoDownloadSize: val.CRAWLER_VIDEO_DOWNLOAD_MAX_SIZE,
downloadVideoTimeout: val.CRAWLER_VIDEO_DOWNLOAD_TIMEOUT_SEC,
enableAdblocker: val.CRAWLER_ENABLE_ADBLOCKER,
},
ocr: {
langs: val.OCR_LANGS,
Expand Down
Loading

0 comments on commit 378ad9b

Please sign in to comment.