Commit

Transform js files to ts files
bidoubiwa committed Jun 21, 2023
1 parent 0320af5 commit 6df4811
Showing 12 changed files with 248 additions and 99 deletions.
39 changes: 28 additions & 11 deletions src/crawler.js → src/crawler.ts
@@ -1,25 +1,39 @@
import { createPuppeteerRouter, PuppeteerCrawler } from "crawlee";
import { createPuppeteerRouter, PuppeteerCrawler, Router, PuppeteerCrawlingContext } from "crawlee";
import { minimatch } from "minimatch";
import DefaultScraper from "./scrapers/default.js";
import DocsearchScraper from "./scrapers/docsearch.js";
import CustomScraper from "./scrapers/custom.js";
import SchemaScraper from "./scrapers/schema.js";
import { Sender } from "./sender.js";
import { Config, Scraper } from "./types.js";


type DefaultHandler = Parameters<Parameters<Router<PuppeteerCrawlingContext>['addDefaultHandler']>[0]>[0]

// Crawler class
// This class is responsible for crawling the urls and extracting content to send to Meilisearch
// It uses the createPuppeteerRouter method to create a router that will be used by the PuppeteerCrawler.
// The constructor takes a Sender object and a Config object as parameters
export default class Crawler {
constructor(sender, config) {
sender: Sender
config: Config
urls: string[]
custom_crawler: string // TODO: remove
scraper: Scraper
crawler: PuppeteerCrawler



constructor(sender: Sender, config: Config) {
console.info("Crawler::constructor");
this.sender = sender;
this.config = config;
this.urls = config.crawled_urls;
this.custom_crawler = config.custom_crawler;
this.custom_crawler = config.custom_crawler; // TODO: remove
// init the custom scraper depending on whether config.strategy is docsearch, custom, schema, or default
this.scraper =
config.strategy == "docsearch"
? new DocsearchScraper(this.sender, config)
config.strategy == "docsearch" // TODO: rename to docssearch
? new DocsearchScraper(this.sender)
: config.strategy == "custom"
? new CustomScraper(this.sender, config)
: config.strategy == "schema"
@@ -28,6 +42,8 @@ export default class Crawler {

//Create the router
let router = createPuppeteerRouter();

// type DefaultHandler = Parameters<typeof router.addDefaultHandler>[0];
router.addDefaultHandler(this.defaultHandler.bind(this));

// create the crawler
@@ -48,7 +64,8 @@ export default class Crawler {
await this.crawler.run(this.urls);
}

async defaultHandler({ request, enqueueLinks, page, log }) {
// Should we use `log`
async defaultHandler({ request , enqueueLinks, page }: DefaultHandler ) {
const title = await page.title();
console.log(`${title}`, { url: request.loadedUrl });
const crawled_globs = this.__generate_globs(this.urls);
@@ -62,7 +79,7 @@ export default class Crawler {
this.config.exclude_indexed_urls || []
);

if (!this.__is_paginated_url(request.loadedUrl)) {
if (request.loadedUrl && !this.__is_paginated_url(request.loadedUrl)) {
// check if the url is in the list of urls to scrape
if (
this.__match_globs(request.loadedUrl, indexed_globs) &&
@@ -90,7 +107,7 @@ export default class Crawler {
});
}

__generate_globs(urls) {
__generate_globs(urls: string[]) {
return urls.map((url) => {
if (url.endsWith("/")) {
return url + "**";
@@ -99,11 +116,11 @@ export default class Crawler {
});
}

__match_globs(url, globs) {
__match_globs(url: string, globs: string[]) {
return globs.some((glob) => minimatch(url, glob));
}

__is_file_url(url) {
__is_file_url(url: string) {
const fileExtensions = [
".zip",
".pdf",
@@ -147,7 +164,7 @@ export default class Crawler {
return fileExtensions.some((extension) => url.endsWith(extension));
}

__is_paginated_url(url) {
__is_paginated_url(url: string) {
const urlObject = new URL(url);
const pathname = urlObject.pathname;
return /\/\d+\//.test(pathname);
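Note on the DefaultHandler type introduced above: rather than being written by hand, it is derived from crawlee's own Router signature, so it stays in sync with whatever context crawlee passes to the default handler. A minimal sketch of the same utility-type trick, split into two steps for readability (the intermediate alias names are illustrative only):

    import { Router, PuppeteerCrawlingContext } from "crawlee";

    // addDefaultHandler accepts a single handler callback; the outer Parameters<...>[0]
    // extracts that callback's type, and the inner Parameters<...>[0] extracts the
    // context object the callback receives (request, page, enqueueLinks, ...).
    type AddDefaultHandler = Router<PuppeteerCrawlingContext>["addDefaultHandler"];
    type DefaultHandlerContext = Parameters<Parameters<AddDefaultHandler>[0]>[0];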
23 changes: 0 additions & 23 deletions src/crawler_process.js

This file was deleted.

21 changes: 21 additions & 0 deletions src/crawler_process.ts
@@ -0,0 +1,21 @@
import { Sender } from "./sender.js";
import Crawler from "./crawler.js";
import { Config } from "./types.js";

async function startCrawling(config: Config) {
const sender = new Sender(config);
await sender.init();

const crawler = new Crawler(sender, config);

await crawler.run();
await sender.finish();
}

// Listen for messages from the parent thread
process.on("message", async (message: Config) => {
await startCrawling(message);
if (process.send) {
process.send("Crawling finished");
}
});
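Since crawler_process.ts now only reacts to IPC messages, it is meant to be run as a forked child process. A minimal sketch of how a parent process could drive it; the compiled file path, the config fields, and the surrounding code are assumptions and not part of this commit:

    import { fork } from "node:child_process";

    // The config shape is defined in src/types.ts (not expanded on this page),
    // so the fields below are only an assumption based on how the diff uses them.
    const config = {
      crawled_urls: ["https://www.example.com/docs/"],
      strategy: "default",
    };

    // Fork the compiled crawler process and hand it the configuration over IPC.
    const child = fork("./dist/crawler_process.js");
    child.send(config);

    // crawler_process.ts replies with "Crawling finished" once the crawl completes.
    child.on("message", (message) => {
      if (message === "Crawling finished") child.disconnect();
    });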
6 changes: 3 additions & 3 deletions src/index.js → src/index.ts
@@ -4,18 +4,18 @@ dotenv.config();
import fs from "fs";
import yargs from "yargs";
import { hideBin } from "yargs/helpers";
import Sender from "./sender.js";
import { Sender } from "./sender.js";
import Crawler from "./crawler.js";

// Parse command line arguments and get a configuration file path
const argv = yargs(hideBin(process.argv)).option("config", {
const argv = await yargs(hideBin(process.argv)).option("config", {
alias: "c",
describe: "Path to configuration file",
demandOption: true,
type: "string",
}).argv;

const config = JSON.parse(fs.readFileSync(argv.config));
const config = JSON.parse(fs.readFileSync(argv.config, {encoding: 'utf-8'}));

const sender = new Sender(config);
await sender.init();
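The Config type imported throughout this commit lives in src/types.ts, which is not expanded on this page. The fields this diff actually touches are crawled_urls, custom_crawler, strategy, custom_settings and exclude_indexed_urls, so a plausible reconstruction, offered only as an assumption, would be:

    // Hypothetical sketch of Config in src/types.ts, inferred from call sites in this diff.
    export interface Config {
      crawled_urls: string[]; // start URLs passed to crawler.run()
      custom_crawler: string; // marked TODO: remove in crawler.ts
      strategy: "docsearch" | "custom" | "schema" | "default";
      custom_settings?: Record<string, unknown>; // forwarded to sender.updateSettings()
      exclude_indexed_urls?: string[]; // URLs expanded into globs and excluded from indexing
    }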
1 change: 1 addition & 0 deletions src/scrapers/custom.js
@@ -1,3 +1,4 @@
// TODO: file should be removed
import prettier from "prettier";
import { v4 as uuidv4 } from "uuid";

50 changes: 31 additions & 19 deletions src/scrapers/default.js → src/scrapers/default.ts
@@ -1,8 +1,16 @@
import prettier from "prettier";
import { v4 as uuidv4 } from "uuid";
import { Sender } from "../sender";
import { Config, Meta } from "../types";
import { Page } from "puppeteer";
import { DefaultData } from "../types";


export default class DefaultScraper {
constructor(sender, config) {
sender: Sender;
settings: Config["custom_settings"];

constructor(sender: Sender, config: Config) {
console.info("DefaultScraper::constructor");
this.sender = sender;
this.settings = config.custom_settings || {
@@ -23,13 +31,13 @@ export default class DefaultScraper {
this.sender.updateSettings(this.settings);
}

async get(url, page) {
async get(url: string, page: Page) {
const title = await page.title();
//get the meta of the page
const meta = await this._extract_metadata_from_page(page);

// for each page, create a dataset of consecutive h1, h2, h3, p elements; at each header after a paragraph, create a new dataset
let data = {};
let data: DefaultData = {} as DefaultData;
let elems = await page.$$(
"main h1, main h2, main h3, main h4, main h5, main h6, main p, main td, main li, main span"
);
@@ -40,47 +48,47 @@
for (let i = 0; i < elems.length; i++) {
let elem = elems[i];
let tag = await elem.evaluate((el) => el.tagName);
let text = await elem.evaluate((el) => el.textContent);
let text = await elem.evaluate((el) => el.textContent) || '';
text = this._clean_text(text);
data.uid = uuidv4();
data.url = url;
data.title = title;
data.meta = meta;
data.image_url = this._get_image_url_from_meta(meta);
data.page_block = page_block;
let urls_tags = new URL(url).pathname.split("/");
let urls_tags = new URL(url).pathname.split("/"); // TODO: Rename to path_segments
data.urls_tags = urls_tags.slice(1, urls_tags.length - 1);

let id = await elem.evaluate((el) => el.id);
if (tag === "H1") {
if (data["h1"]) {
await this.sender.add(data);
page_block++;
data = {};
data = {} as DefaultData;
}
data["h1"] = text;
data.anchor = "#" + id;
} else if (tag === "H2") {
if (data["h2"]) {
await this.sender.add(data);
page_block++;
data = { h1: data["h1"] };
data = { h1: data["h1"] } as DefaultData;
}
data.anchor = "#" + id;
data["h2"] = text;
} else if (tag === "H3") {
if (data["h3"]) {
await this.sender.add(data);
page_block++;
data = { h1: data["h1"], h2: data["h2"] };
data = { h1: data["h1"], h2: data["h2"] } as DefaultData;
}
data.anchor = "#" + id;
data["h3"] = text;
} else if (tag === "H4") {
if (data["h4"]) {
await this.sender.add(data);
page_block++;
data = { h1: data["h1"], h2: data["h2"], h3: data["h3"] };
data = { h1: data["h1"], h2: data["h2"], h3: data["h3"] } as DefaultData;
}
data.anchor = "#" + id;
data["h4"] = text;
@@ -93,7 +101,7 @@
h2: data["h2"],
h3: data["h3"],
h4: data["h4"],
};
} as DefaultData;
}
data.anchor = "#" + id;
data["h5"] = text;
@@ -107,7 +115,7 @@
h3: data["h3"],
h4: data["h4"],
h5: data["h5"],
};
} as DefaultData;
}
data.anchor = "#" + id;
data["h6"] = text;
@@ -120,7 +128,8 @@
if (!data["p"]) {
data["p"] = [];
}
if (!data["p"].includes(text)) {
// TODO: should we leave `null` values in the `p` array?
if (text && !data["p"].includes(text)) {
data["p"].push(text);
}
}
@@ -132,7 +141,7 @@

// Remove from a text all multiple spaces, new lines, and leading and trailing spaces, and
// remove '# ' from the beginning of the text
_clean_text(text) {
_clean_text(text: string) {
text = text.replace(/[\r\n]+/gm, " ");
///remove multiple spaces
text = text.replace(/\s+/g, " ");
@@ -143,11 +152,12 @@
return text;
}


// Extract the meta of a page
async _extract_metadata_from_page(page) {
async _extract_metadata_from_page(page: Page) {
return await page.evaluate(() => {
const metas = document.getElementsByTagName("meta");
const meta = {};
const meta: Meta = {} as Meta;
for (let i = 0; i < metas.length; i++) {
const name = metas[i].getAttribute("name");
const content = metas[i].getAttribute("content");
@@ -160,18 +170,20 @@
}

// Extract the image url from the meta of a page
_get_image_url_from_meta(meta) {
_get_image_url_from_meta(meta: Meta) {
if (meta["og:image"]) {
return meta["og:image"];
} else if (meta["twitter:image"]) {
return meta["twitter:image"];
} else if (meta["image"]) {
return meta["image"];
}
return;
}

// A function that reverse-engineers the hljs-generated html to extract the code
async _extract_code_from_page(page) {
// TODO: Does it work?
async _extract_code_from_page(page: Page) {
let code = await page.evaluate(() => {
let code = "";
let pre = document.getElementsByTagName("pre");
@@ -183,11 +195,11 @@
}
return code;
});
return format_code(code);
return this._format_code(code);
}
// A function that uses prettier to format the code that has been extracted from an html page.
// Format only if the language is supported by prettier
_format_code(code) {
_format_code(code: string) {
let formatted_code = "";
try {
formatted_code = prettier.format(code, {
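DefaultScraper also relies on the Meta and DefaultData types from src/types.ts, which this page does not expand either. Reconstructed from how they are used above (meta name/content pairs, plus one record per heading block), a plausible shape, again only as an assumption, would be:

    // Hypothetical sketch of Meta and DefaultData in src/types.ts, inferred from usage above.
    export type Meta = Record<string, string>; // e.g. "og:image", "twitter:image", "image"

    export interface DefaultData {
      uid: string;
      url: string;
      title: string;
      meta: Meta;
      image_url?: string;
      page_block: number;
      urls_tags: string[]; // path segments; marked TODO: rename to path_segments
      anchor: string;
      h1?: string;
      h2?: string;
      h3?: string;
      h4?: string;
      h5?: string;
      h6?: string;
      p?: string[];
    }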