Skip to content

Commit

Permalink
feat(scrapeURL): handle PDFs behind anti-bot
Browse files Browse the repository at this point in the history
  • Loading branch information
mogery committed Feb 18, 2025
1 parent e28a444 commit abee9ea
Show file tree
Hide file tree
Showing 8 changed files with 94 additions and 15 deletions.
17 changes: 14 additions & 3 deletions apps/api/src/__tests__/snips/scrape.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ describe("Scrape tests", () => {

expectScrapeToSucceed(response);
expect(response.body.data.markdown).not.toContain(".g.doubleclick.net/");
}, 10000);
}, 30000);

it.concurrent("doesn't block ads if explicitly disabled", async () => {
const response = await scrape({
Expand All @@ -54,7 +54,7 @@ describe("Scrape tests", () => {

expectScrapeToSucceed(response);
expect(response.body.data.markdown).toContain(".g.doubleclick.net/");
}, 10000);
}, 30000);
});

describe("Location API (f-e dependant)", () => {
Expand Down Expand Up @@ -110,5 +110,16 @@ describe("Scrape tests", () => {
expectScrapeToSucceed(response);
expect(typeof response.body.data.screenshot).toBe("string");
}, 15000);
})
});

// E2E test for the new anti-bot PDF path: the target URL serves a PDF behind
// bot protection, so a plain download returns a challenge page instead of the
// file. "f-e dependant" marks suites that require fire-engine to be available.
describe("PDF (f-e dependant)", () => {
  it.concurrent("works for PDFs behind anti-bot", async () => {
    const response = await scrape({
      url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf"
    });

    expectScrapeToSucceed(response);
    // The paper title only appears in the markdown if the actual PDF was
    // fetched and parsed, i.e. the anti-bot wall was bypassed.
    expect(response.body.data.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix");
  }, 60000); // generous timeout: browser-based prefetch plus PDF parsing
});
});
2 changes: 1 addition & 1 deletion apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ export async function scrapeURLWithFetch(
}
}

specialtyScrapeCheck(
await specialtyScrapeCheck(
meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }),
Object.fromEntries(response.headers as any),
);
Expand Down
3 changes: 2 additions & 1 deletion apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -120,11 +120,12 @@ async function performFireEngineScrape<
await new Promise((resolve) => setTimeout(resolve, 250));
}

specialtyScrapeCheck(
await specialtyScrapeCheck(
logger.child({
method: "performFireEngineScrape/specialtyScrapeCheck",
}),
status.responseHeaders,
status,
);

const contentType = (Object.entries(status.responseHeaders ?? {}).find(
Expand Down
21 changes: 16 additions & 5 deletions apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ import * as Sentry from "@sentry/node";
import escapeHtml from "escape-html";
import PdfParse from "pdf-parse";
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
import { RemoveFeatureError, UnsupportedFileError } from "../../error";
import { PDFAntibotError, RemoveFeatureError, UnsupportedFileError } from "../../error";
import { readFile, unlink } from "node:fs/promises";
import path from "node:path";
import type { Response } from "undici";

type PDFProcessorResult = { html: string; markdown?: string };

Expand Down Expand Up @@ -88,9 +89,19 @@ export async function scrapePDF(
};
}

const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
headers: meta.options.headers,
});
const { response, tempFilePath } = (meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null)
? { response: meta.pdfPrefetch, tempFilePath: meta.pdfPrefetch.filePath }
: await downloadFile(meta.id, meta.url, {
headers: meta.options.headers,
});

if ((response as any).headers) { // if downloadFile was used
const r: Response = response as any;
const ct = r.headers.get("Content-Type");
if (ct && !ct.includes("application/pdf")) { // if downloaded file wasn't a PDF
throw new PDFAntibotError();
}
}

let result: PDFProcessorResult | null = null;

Expand Down Expand Up @@ -142,7 +153,7 @@ export async function scrapePDF(
await unlink(tempFilePath);

return {
url: response.url,
url: response.url ?? meta.url,
statusCode: response.status,
html: result?.html ?? "",
markdown: result?.markdown ?? "",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ export function scrapeURLWithScrapingBee(
});
}

specialtyScrapeCheck(
await specialtyScrapeCheck(
meta.logger.child({
method: "scrapeURLWithScrapingBee/specialtyScrapeCheck",
}),
Expand Down
25 changes: 23 additions & 2 deletions apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,30 @@
import { Logger } from "winston";
import { AddFeatureError } from "../../error";
import { FireEngineCheckStatusSuccess } from "../fire-engine/checkStatus";
import path from "path";
import os from "os";
import { writeFile } from "fs/promises";
import { Meta } from "../..";

export function specialtyScrapeCheck(
/**
 * Converts a fire-engine status response into a `Meta["pdfPrefetch"]` value.
 *
 * When the response carried a downloaded file, its (base64-encoded) content
 * is persisted to a uniquely-named temp file and the landing spot is
 * returned alongside the page status and final URL. When no file came back,
 * `null` signals "prefetch attempted, nothing usable".
 */
async function feResToPdfPrefetch(feRes: FireEngineCheckStatusSuccess | undefined): Promise<Meta["pdfPrefetch"]> {
  if (!feRes?.file) return null;

  const tempPath = path.join(os.tmpdir(), `tempFile-${crypto.randomUUID()}.pdf`);
  await writeFile(tempPath, Buffer.from(feRes.file.content, "base64"));

  return {
    status: feRes.pageStatusCode,
    url: feRes.url,
    filePath: tempPath,
  };
}

export async function specialtyScrapeCheck(
logger: Logger,
headers: Record<string, string> | undefined,
feRes?: FireEngineCheckStatusSuccess,
) {
const contentType = (Object.entries(headers ?? {}).find(
(x) => x[0].toLowerCase() === "content-type",
Expand All @@ -18,7 +39,7 @@ export function specialtyScrapeCheck(
contentType.startsWith("application/pdf;")
) {
// .pdf
throw new AddFeatureError(["pdf"]);
throw new AddFeatureError(["pdf"], await feResToPdfPrefetch(feRes));
} else if (
contentType ===
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" ||
Expand Down
12 changes: 10 additions & 2 deletions apps/api/src/scraper/scrapeURL/error.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { EngineResultsTracker } from ".";
import { EngineResultsTracker, Meta } from ".";
import { Engine, FeatureFlag } from "./engines";

export class EngineError extends Error {
Expand Down Expand Up @@ -28,10 +28,12 @@ export class NoEnginesLeftError extends Error {

export class AddFeatureError extends Error {
public featureFlags: FeatureFlag[];
public pdfPrefetch: Meta["pdfPrefetch"];

constructor(featureFlags: FeatureFlag[]) {
constructor(featureFlags: FeatureFlag[], pdfPrefetch?: Meta["pdfPrefetch"]) {
super("New feature flags have been discovered: " + featureFlags.join(", "));
this.featureFlags = featureFlags;
this.pdfPrefetch = pdfPrefetch;
}
}

Expand Down Expand Up @@ -72,3 +74,9 @@ export class UnsupportedFileError extends Error {
this.reason = reason;
}
}

/**
 * Thrown when a PDF download came back as non-PDF content (anti-bot
 * challenge page); the scrape loop catches it and retries via a
 * browser-based prefetch.
 */
export class PDFAntibotError extends Error {
  constructor() {
    super("PDF scrape was prevented by anti-bot");
    // Set name explicitly so logs and stack traces identify the subclass
    // instead of the generic "Error".
    this.name = "PDFAntibotError";
  }
}
27 changes: 27 additions & 0 deletions apps/api/src/scraper/scrapeURL/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import {
AddFeatureError,
EngineError,
NoEnginesLeftError,
PDFAntibotError,
RemoveFeatureError,
SiteError,
TimeoutError,
Expand Down Expand Up @@ -49,6 +50,11 @@ export type Meta = {
logs: any[];
featureFlags: Set<FeatureFlag>;
mock: MockState | null;
pdfPrefetch: {
filePath: string;
url?: string;
status: number;
} | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty
};

function buildFeatureFlags(
Expand Down Expand Up @@ -147,6 +153,7 @@ async function buildMetaObject(
options.useMock !== undefined
? await loadMock(options.useMock, _logger)
: null,
pdfPrefetch: undefined,
};
}

Expand Down Expand Up @@ -303,6 +310,8 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
throw error;
} else if (error instanceof UnsupportedFileError) {
throw error;
} else if (error instanceof PDFAntibotError) {
throw error;
} else {
Sentry.captureException(error);
meta.logger.warn(
Expand Down Expand Up @@ -386,6 +395,9 @@ export async function scrapeURL(
meta.featureFlags = new Set(
[...meta.featureFlags].concat(error.featureFlags),
);
if (error.pdfPrefetch) {
meta.pdfPrefetch = error.pdfPrefetch;
}
} else if (
error instanceof RemoveFeatureError &&
meta.internalOptions.forceEngine === undefined
Expand All @@ -400,6 +412,21 @@ export async function scrapeURL(
(x) => !error.featureFlags.includes(x),
),
);
} else if (
error instanceof PDFAntibotError &&
meta.internalOptions.forceEngine === undefined
) {
if (meta.pdfPrefetch !== undefined) {
meta.logger.error("PDF was prefetched and still blocked by antibot, failing");
throw error;
} else {
meta.logger.debug("PDF was blocked by anti-bot, prefetching with chrome-cdp");
meta.featureFlags = new Set(
[...meta.featureFlags].filter(
(x) => x !== "pdf",
),
);
}
} else {
throw error;
}
Expand Down

0 comments on commit abee9ea

Please sign in to comment.