Skip to content

Commit

Permalink
full working base with zod #35
Browse files (browse the repository at this point in the history)
  • Loading branch information
qdequele committed Nov 30, 2024
1 parent ec4dd44 commit a836dd8
Show file tree
Hide file tree
Showing 16 changed files with 313 additions and 131 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,4 @@ COPY --chown=myuser . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod -- -c $CRAWLER_CONFIG -b /usr/bin/google-chrome --silent
CMD ./start_xvfb_and_run_cmd.sh && npm run start:server -- -c $CRAWLER_CONFIG -b /usr/bin/google-chrome --silent
13 changes: 13 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,17 @@ services:
- MEILI_MASTER_KEY=masterKey
ports:
- "7700:7700"
networks:
- crawler-network

playground:
build:
context: ./playground
dockerfile: Dockerfile
ports:
- "3000:3000"
networks:
- crawler-network

scraper:
build:
Expand All @@ -26,12 +30,21 @@ services:
depends_on:
- meilisearch
- playground
networks:
- crawler-network

redis:
image: redis:latest
restart: always
ports:
- 6379:6379
networks:
- crawler-network

volumes:
meili_data:

networks:
crawler-network:
driver: bridge
name: crawler-network
7 changes: 4 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,12 @@
},
"scripts": {
"build": "yarn tsc",
"start": "yarn tsc && node dist/src/bin/index.js",
"serve": "yarn tsc && node dist/src/server.js",
"start": "node dist/src/bin/index.js",
"start:server": "node dist/src/server/index.js",
"dev": "yarn tsc & node dist/src/bin/index.js",
"dev:server": "yarn tsc & node dist/src/server/index.js",
"lint": "eslint .",
"lint:fix": "eslint . --fix",
"tests": "yarn tsc && node dist/tests/index.js",
"test": "ts-node tests/index.js",
"test:integration": "jest --config jest.integration.config.js",
"test:watch": "jest --watch"
Expand Down
12 changes: 11 additions & 1 deletion playground/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion playground/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
"react": "19.0.0-rc-02c0e824-20241028",
"react-dom": "19.0.0-rc-02c0e824-20241028",
"remark": "^15.0.1",
"remark-html": "^16.0.1"
"remark-html": "^16.0.1",
"zod": "^3.23.8"
},
"devDependencies": {
"@types/node": "^20.14.8",
Expand Down
21 changes: 13 additions & 8 deletions src/bin/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import yargs from "yargs";
import { hideBin } from "yargs/helpers";
import { Sender } from "../sender";
import { Crawler } from "../crawlers";
import { Config } from "../types";
import { Config, ConfigSchema } from "../types";

function getConfig({
configPath,
Expand All @@ -15,15 +15,21 @@ function getConfig({
configPath?: string;
config?: string;
}): Config {
let parsedConfig: unknown;

if (configPath) {
return JSON.parse(
parsedConfig = JSON.parse(
fs.readFileSync(configPath, { encoding: "utf-8" })
) as Config;
);
} else if (config) {
return JSON.parse(config) as Config;
parsedConfig = JSON.parse(config);
} else {
throw new Error("Please provide either --config or --configPath");
}

throw new Error("Please provide either --config or --configPath");
// Validate config against schema
const validatedConfig = ConfigSchema.parse(parsedConfig);
return validatedConfig;
}

// eslint-disable-next-line @typescript-eslint/no-floating-promises
Expand Down Expand Up @@ -57,9 +63,8 @@ function getConfig({
}).argv;

const config = getConfig(argv);
const launchOptions = argv.browserPath
? { executablePath: argv.browserPath }
: {};
const launchOptions =
argv.browserPath ? { executablePath: argv.browserPath } : {};

const sender = new Sender(config);
await sender.init();
Expand Down
79 changes: 48 additions & 31 deletions src/crawlers/base.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,17 @@ export abstract class BaseCrawler {
this.crawlerType = config.crawler_type || "cheerio";

this.scraper =
this.config.strategy === "docssearch"
? new DocsearchScraper(this.sender, this.config)
: this.config.strategy === "schema"
? new SchemaScraper(this.sender, this.config)
: this.config.strategy === "markdown"
? new MarkdownScraper(this.sender, this.config)
: this.config.strategy === "custom"
? new CustomScraper(this.sender, this.config)
: this.config.strategy === "pdf"
? new PDFScraper(this.sender, this.config)
: new DefaultScraper(this.sender, this.config);
this.config.strategy === "docssearch" ?
new DocsearchScraper(this.sender, this.config)
: this.config.strategy === "schema" ?
new SchemaScraper(this.sender, this.config)
: this.config.strategy === "markdown" ?
new MarkdownScraper(this.sender, this.config)
: this.config.strategy === "custom" ?
new CustomScraper(this.sender, this.config)
: this.config.strategy === "pdf" ?
new PDFScraper(this.sender, this.config)
: new DefaultScraper(this.sender, this.config);
}

abstract createRouter(): Router<any>;
Expand All @@ -65,39 +65,44 @@ export abstract class BaseCrawler {
const excluded_crawled_globs = this.__generate_globs(
this.config.urls_to_exclude || []
);
console.log("crawled_globs", crawled_globs);
const indexed_globs = this.__generate_globs(
this.config.urls_to_index || this.urls
);
console.log("indexed_globs", indexed_globs);
const excluded_indexed_globs = this.__generate_globs(
this.config.urls_to_not_index || []
);
console.log("excluded_indexed_globs", excluded_indexed_globs);
log.debug("URL matching check", {
url: request.loadedUrl,
shouldIndex: this.__match_globs(request.loadedUrl, indexed_globs),
isExcluded: this.__match_globs(request.loadedUrl, excluded_indexed_globs),
isPaginated: this.__is_paginated_url(request.loadedUrl),
});

if (request.loadedUrl && !this.__is_paginated_url(request.loadedUrl)) {
if (
this.__match_globs(request.loadedUrl, indexed_globs) &&
!this.__match_globs(request.loadedUrl, excluded_indexed_globs)
) {
// Convert Puppeteer page to Cheerio instance
let $: cheerio.CheerioAPI;
// TODO: Add Playwright support
if (this.crawlerType === "puppeteer") {
const pageContent = await context.page.content();
$ = cheerio.load(pageContent);
} else {
$ = context.$;
}

if (this.config.strategy == "pdf") {
// Check if URL is a PDF
if (request.loadedUrl.toLowerCase().endsWith(".pdf")) {
this.nb_page_indexed++;
const emptyCheerio = cheerio.load("");
await this.scraper.get(request.loadedUrl, emptyCheerio);
try {
if (this.crawlerType === "puppeteer") {
const pageContent = await context.page.content();
$ = cheerio.load(pageContent);
} else {
$ = context.$;
}

if (!$) {
log.error("Cheerio instance is undefined", {
url: request.loadedUrl,
});
return;
}
return;
}

if ($) {
// Check for 404 before incrementing counter and scraping
if (this.__is404Page($)) {
log.debug("404 page detected, skipping", {
Expand All @@ -107,9 +112,14 @@ export abstract class BaseCrawler {
}

this.nb_page_indexed++;
log.debug("Starting scraper.get", { url: request.loadedUrl });
await this.scraper.get(request.loadedUrl, $);
} else {
log.warning("Cheerio context is undefined, skipping scraper.get");
log.debug("Completed scraper.get", { url: request.loadedUrl });
} catch (error) {
log.error("Error processing page", {
url: request.loadedUrl,
error: error instanceof Error ? error.message : String(error),
});
}
}
}
Expand Down Expand Up @@ -253,7 +263,14 @@ export abstract class BaseCrawler {
);

// Check text content
const bodyText = $("body").text().toLowerCase();
const bodyText = $("body")
.clone()
.find("script")
.remove()
.end()
.text()
.toLowerCase();

const hasErrorText = commonErrorTexts.some((text) =>
bodyText.includes(text)
);
Expand Down
10 changes: 7 additions & 3 deletions src/crawlers/cheerio.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ export class CheerioCrawler extends BaseCrawler {
requestQueue: RequestQueue,
router: Router<CheerioCrawlingContext>
): CheerioCrawlerOptions {
const preNavigationHooks: CheerioHook[] = this.config
.additional_request_headers
? [
const preNavigationHooks: CheerioHook[] =
this.config.additional_request_headers ?
[
(crawlingContext) => {
const { request } = crawlingContext;
request.headers = {
Expand All @@ -51,6 +51,10 @@ export class CheerioCrawler extends BaseCrawler {
}

/**
 * Builds the underlying Crawlee Cheerio crawler from the prepared options.
 *
 * When the configured strategy is `"pdf"`, `"application/pdf"` is added to
 * `additionalMimeTypes`. The caller's `options` object is left untouched and
 * any MIME types it already lists are kept rather than overwritten.
 *
 * @param options - Crawler options to pass to the Crawlee constructor.
 * @returns A new `CrawleeCheerioCrawler` instance.
 */
createCrawlerInstance(options: CheerioCrawlerOptions): CrawleeCheerioCrawler {
  if (this.config.strategy !== "pdf") {
    return new CrawleeCheerioCrawler(options);
  }

  const pdfMimeType = "application/pdf";
  const configuredMimeTypes = options.additionalMimeTypes ?? [];

  // Copy instead of assigning onto `options`, so the argument is not mutated
  // and previously configured MIME types are not discarded.
  return new CrawleeCheerioCrawler({
    ...options,
    additionalMimeTypes:
      configuredMimeTypes.includes(pdfMimeType) ?
        configuredMimeTypes
      : [...configuredMimeTypes, pdfMimeType],
  });
}


Expand Down
5 changes: 5 additions & 0 deletions src/crawlers/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@ export class Crawler {
private static async setupRequestQueue(
urls: string[]
): Promise<RequestQueue> {
if (!urls || !Array.isArray(urls)) {
log.error("Invalid or missing start_urls", { urls });
throw new Error("start_urls must be an array of strings");
}

const requestQueue = await RequestQueue.open(JSON.stringify(urls));

if (this.config?.use_sitemap == true) {
Expand Down
5 changes: 2 additions & 3 deletions src/index.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
export { Crawler } from './crawlers'
export { Sender } from './sender'
export { TaskQueue } from './taskQueue'
export { Crawler } from "./crawlers";
export { Sender } from "./sender";
3 changes: 2 additions & 1 deletion src/scrapers/custom.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ export default class CustomScraper {
};

for (const [key, selector] of Object.entries(this.selectors || {})) {
const elements = $(selector);
const elements =
typeof selector === "string" ? $(selector) : $(selector.join(", "));
if (elements.length > 0) {
data[key] = elements
.map((_, el) => this._clean_text($(el).text()))
Expand Down
Loading

0 comments on commit a836dd8

Please sign in to comment.