diff --git a/src/crawler.js b/src/crawler.ts
similarity index 78%
rename from src/crawler.js
rename to src/crawler.ts
index c23dd60..142c98a 100644
--- a/src/crawler.js
+++ b/src/crawler.ts
@@ -1,25 +1,39 @@
-import { createPuppeteerRouter, PuppeteerCrawler } from "crawlee";
+import { createPuppeteerRouter, PuppeteerCrawler, Router, PuppeteerCrawlingContext } from "crawlee";
 import { minimatch } from "minimatch";
 import DefaultScraper from "./scrapers/default.js";
 import DocsearchScraper from "./scrapers/docsearch.js";
 import CustomScraper from "./scrapers/custom.js";
 import SchemaScraper from "./scrapers/schema.js";
+import { Sender } from "./sender.js";
+import { Config, Scraper } from "./types.js";
+
+
+type DefaultHandler = Parameters<Parameters<Router<PuppeteerCrawlingContext>['addDefaultHandler']>[0]>[0]
 
 // Crawler class
 // This class is responsible for crawling the urls and extract content to send to Meilisearch
 // It uses the createPuppeteerRouter method to create a router that will be used by the PuppeteerCrawler.
 // The constructor take a Sender object as a parameter
 export default class Crawler {
-  constructor(sender, config) {
+  sender: Sender
+  config: Config
+  urls: string[]
+  custom_crawler: string // TODO: remove
+  scraper: Scraper
+  crawler: PuppeteerCrawler
+
+
+
+  constructor(sender: Sender, config: Config) {
     console.info("Crawler::constructor");
     this.sender = sender;
     this.config = config;
     this.urls = config.crawled_urls;
-    this.custom_crawler = config.custom_crawler;
+    this.custom_crawler = config.custom_crawler; // TODO: remove
     // init the custome scraper depending on if config.strategy is docsearch, custom or default
     this.scraper =
-      config.strategy == "docsearch"
-        ? new DocsearchScraper(this.sender, config)
+      config.strategy == "docsearch" // TODO: rename to docssearch
+        ? new DocsearchScraper(this.sender)
         : config.strategy == "custom"
         ? new CustomScraper(this.sender, config)
         : config.strategy == "schema"
@@ -28,6 +42,8 @@ export default class Crawler {
 
     //Create the router
     let router = createPuppeteerRouter();
+
+    // type DefaultHandler = Parameters[0];
     router.addDefaultHandler(this.defaultHandler.bind(this));
 
     // create the crawler
@@ -48,7 +64,8 @@ export default class Crawler {
     await this.crawler.run(this.urls);
   }
 
-  async defaultHandler({ request, enqueueLinks, page, log }) {
+  // Should we use `log`
+  async defaultHandler({ request , enqueueLinks, page }: DefaultHandler ) {
     const title = await page.title();
     console.log(`${title}`, { url: request.loadedUrl });
     const crawled_globs = this.__generate_globs(this.urls);
@@ -62,7 +79,7 @@
       this.config.exclude_indexed_urls || []
     );
 
-    if (!this.__is_paginated_url(request.loadedUrl)) {
+    if (request.loadedUrl && !this.__is_paginated_url(request.loadedUrl)) {
       //check if the url is in the list of urls to scrap
       if (
         this.__match_globs(request.loadedUrl, indexed_globs) &&
@@ -90,7 +107,7 @@
     });
   }
 
-  __generate_globs(urls) {
+  __generate_globs(urls: string[]) {
     return urls.map((url) => {
       if (url.endsWith("/")) {
         return url + "**";
@@ -99,11 +116,11 @@
     });
   }
 
-  __match_globs(url, globs) {
+  __match_globs(url: string, globs: string[]) {
     return globs.some((glob) => minimatch(url, glob));
   }
 
-  __is_file_url(url) {
+  __is_file_url(url: string) {
     const fileExtensions = [
       ".zip",
       ".pdf",
@@ -147,7 +164,7 @@
     return fileExtensions.some((extension) => url.endsWith(extension));
   }
 
-  __is_paginated_url(url) {
+  __is_paginated_url(url: string) {
     const urlObject = new URL(url);
     const pathname = urlObject.pathname;
     return /\/\d+\//.test(pathname);
diff --git a/src/crawler_process.js b/src/crawler_process.js
deleted file mode 100644
index fac5fb0..0000000
--- a/src/crawler_process.js
+++ /dev/null
@@ -1,23 +0,0 @@
-import Sender from "./sender.js";
-import Crawler from "./crawler.js";
-
-async function startCrawling(config) {
-  const sender = new Sender({
-    meilisearch_host: config.meilisearch_host,
-    meilisearch_api_key: config.meilisearch_api_key,
-    meilisearch_index_name: config.meilisearch_index_name,
-  });
-  await sender.init();
-
-  const urls = config.urls;
-  const crawler = new Crawler(sender, config);
-
-  await crawler.run();
-  await sender.finish();
-}
-
-// Listen for messages from the parent thread
-process.on("message", async (message) => {
-  await startCrawling(message);
-  process.send("Crawling finished");
-});
diff --git a/src/crawler_process.ts b/src/crawler_process.ts
new file mode 100644
index 0000000..5075c1b
--- /dev/null
+++ b/src/crawler_process.ts
@@ -0,0 +1,21 @@
+import { Sender } from "./sender.js";
+import Crawler from "./crawler.js";
+import { Config } from "./types.js";
+
+async function startCrawling(config: Config) {
+  const sender = new Sender(config);
+  await sender.init();
+
+  const crawler = new Crawler(sender, config);
+
+  await crawler.run();
+  await sender.finish();
+}
+
+// Listen for messages from the parent thread
+process.on("message", async (message: Config) => {
+  await startCrawling(message);
+  if (process.send) {
+    process.send("Crawling finished");
+  }
+});
diff --git a/src/index.js b/src/index.ts
similarity index 73%
rename from src/index.js
rename to src/index.ts
index 7aefb24..b674b0f 100644
--- a/src/index.js
+++ b/src/index.ts
@@ -4,18 +4,18 @@ dotenv.config();
 import fs from "fs";
 import yargs from "yargs";
 import { hideBin } from "yargs/helpers";
-import Sender from "./sender.js";
+import { Sender } from "./sender.js";
 import Crawler from "./crawler.js";
 
 // Parse command line arguments and get a configuration file path
-const argv = yargs(hideBin(process.argv)).option("config", {
+const argv = await yargs(hideBin(process.argv)).option("config", {
   alias: "c",
   describe: "Path to configuration file",
   demandOption: true,
   type: "string",
 }).argv;
 
-const config = JSON.parse(fs.readFileSync(argv.config));
+const config = JSON.parse(fs.readFileSync(argv.config, {encoding: 'utf-8'}));
 
 const sender = new Sender(config);
 await sender.init();
diff --git a/src/scrapers/custom.js b/src/scrapers/custom.js
index 71099fe..a19bbe8 100644
--- a/src/scrapers/custom.js
+++ b/src/scrapers/custom.js
@@ -1,3 +1,4 @@
+// TODO: file should be removed
 import prettier from "prettier";
 import { v4 as uuidv4 } from "uuid";
 
diff --git a/src/scrapers/default.js b/src/scrapers/default.ts
similarity index 81%
rename from src/scrapers/default.js
rename to src/scrapers/default.ts
index f4539af..6d37883 100644
--- a/src/scrapers/default.js
+++ b/src/scrapers/default.ts
@@ -1,8 +1,16 @@
 import prettier from "prettier";
 import { v4 as uuidv4 } from "uuid";
+import { Sender } from "../sender";
+import { Config, Meta } from "../types";
+import { Page } from "puppeteer";
+import { DefaultData } from "../types";
+
 
 export default class DefaultScraper {
-  constructor(sender, config) {
+  sender: Sender;
+  settings: Config["custom_settings"];
+
+  constructor(sender: Sender, config: Config) {
     console.info("DefaultScraper::constructor");
     this.sender = sender;
     this.settings = config.custom_settings || {
@@ -23,13 +31,13 @@ export default class DefaultScraper {
     this.sender.updateSettings(this.settings);
   }
 
-  async get(url, page) {
+  async get(url: string, page: Page) {
     const title = await page.title();
     //get the meta of the page
     const meta = await this._extract_metadata_from_page(page);
     //for each page create dataset of consecutive h1, h2, h3, p. at each header after a paragraph, create a new dataset
-    let data = {};
+    let data: DefaultData = {} as DefaultData;
     let elems = await page.$$(
       "main h1, main h2, main h3, main h4, main h5, main h6, main p, main td, main li, main span"
     );
 
@@ -40,7 +48,7 @@
     for (let i = 0; i < elems.length; i++) {
       let elem = elems[i];
       let tag = await elem.evaluate((el) => el.tagName);
-      let text = await elem.evaluate((el) => el.textContent);
+      let text = await elem.evaluate((el) => el.textContent) || '';
       text = this._clean_text(text);
       data.uid = uuidv4();
       data.url = url;
@@ -48,7 +56,7 @@
       data.meta = meta;
       data.image_url = this._get_image_url_from_meta(meta);
       data.page_block = page_block;
-      let urls_tags = new URL(url).pathname.split("/");
+      let urls_tags = new URL(url).pathname.split("/"); // TODO: Rename to path_segments
       data.urls_tags = urls_tags.slice(1, urls_tags.length - 1);
 
       let id = await elem.evaluate((el) => el.id);
@@ -56,7 +64,7 @@
         if (data["h1"]) {
           await this.sender.add(data);
           page_block++;
-          data = {};
+          data = {} as DefaultData;
         }
         data["h1"] = text;
         data.anchor = "#" + id;
@@ -64,7 +72,7 @@
         if (data["h2"]) {
           await this.sender.add(data);
           page_block++;
-          data = { h1: data["h1"] };
+          data = { h1: data["h1"] } as DefaultData;
         }
         data.anchor = "#" + id;
         data["h2"] = text;
@@ -72,7 +80,7 @@
         if (data["h3"]) {
           await this.sender.add(data);
           page_block++;
-          data = { h1: data["h1"], h2: data["h2"] };
+          data = { h1: data["h1"], h2: data["h2"] } as DefaultData;
         }
         data.anchor = "#" + id;
         data["h3"] = text;
@@ -80,7 +88,7 @@
         if (data["h4"]) {
           await this.sender.add(data);
           page_block++;
-          data = { h1: data["h1"], h2: data["h2"], h3: data["h3"] };
+          data = { h1: data["h1"], h2: data["h2"], h3: data["h3"] } as DefaultData;
         }
         data.anchor = "#" + id;
         data["h4"] = text;
@@ -93,7 +101,7 @@
             h2: data["h2"],
             h3: data["h3"],
             h4: data["h4"],
-          };
+          } as DefaultData;
         }
         data.anchor = "#" + id;
         data["h5"] = text;
@@ -107,7 +115,7 @@
             h3: data["h3"],
             h4: data["h4"],
             h5: data["h5"],
-          };
+          } as DefaultData;
         }
         data.anchor = "#" + id;
         data["h6"] = text;
@@ -120,7 +128,8 @@
         if (!data["p"]) {
           data["p"] = [];
         }
-        if (!data["p"].includes(text)) {
+        // TODO: should we leave `null` values in the `p` array?
+        if (text && !data["p"].includes(text)) {
           data["p"].push(text);
         }
       }
@@ -132,7 +141,7 @@
 
   // Remove from a text all multiple spaces, new lines, and leading and trailing spaces, and
   // remove '# ' from the beginning of the text
-  _clean_text(text) {
+  _clean_text(text: string) {
     text = text.replace(/[\r\n]+/gm, " ");
     ///remove multiple spaces
     text = text.replace(/\s+/g, " ");
@@ -143,11 +152,12 @@
     return text;
   }
 
+
   // Extract the meta of a page
-  async _extract_metadata_from_page(page) {
+  async _extract_metadata_from_page(page: Page) {
     return await page.evaluate(() => {
       const metas = document.getElementsByTagName("meta");
-      const meta = {};
+      const meta: Meta = {} as Meta;
       for (let i = 0; i < metas.length; i++) {
         const name = metas[i].getAttribute("name");
         const content = metas[i].getAttribute("content");
@@ -160,7 +170,7 @@
   }
 
   // Extract the image url from the meta of a page
-  _get_image_url_from_meta(meta) {
+  _get_image_url_from_meta(meta: Meta) {
     if (meta["og:image"]) {
       return meta["og:image"];
     } else if (meta["twitter:image"]) {
@@ -168,10 +178,12 @@
     } else if (meta["image"]) {
       return meta["image"];
     }
+    return;
   }
 
   // A function that retro-engineer the hljs generated html to extract the code
-  async _extract_code_from_page(page) {
+  // TODO: Does it work?
+  async _extract_code_from_page(page: Page) {
     let code = await page.evaluate(() => {
       let code = "";
       let pre = document.getElementsByTagName("pre");
@@ -183,11 +195,11 @@
       }
       return code;
     });
-    return format_code(code);
+    return this._format_code(code);
   }
   // A function that use prettier to format the code that has been extracted in a html page.
   // Format only if the language is supported by prettier
-  _format_code(code) {
+  _format_code(code: string) {
     let formatted_code = "";
     try {
       formatted_code = prettier.format(code, {
diff --git a/src/scrapers/docsearch.js b/src/scrapers/docsearch.ts
similarity index 86%
rename from src/scrapers/docsearch.js
rename to src/scrapers/docsearch.ts
index 19fa52d..f18c95d 100644
--- a/src/scrapers/docsearch.js
+++ b/src/scrapers/docsearch.ts
@@ -1,8 +1,13 @@
-import prettier from "prettier";
 import { v4 as uuidv4 } from "uuid";
+import { Sender } from "../sender";
+import { Page } from "puppeteer";
+import { DocsSearchData } from "../types";
 
 export default class DocsearchScaper {
-  constructor(sender, config) {
+  sender: Sender;
+
+  //
+  constructor(sender: Sender) {
     console.info("DocsearchScaper::constructor");
     this.sender = sender;
 
@@ -19,9 +24,9 @@
     });
   }
 
-  async get(url, page) {
+  async get(url: string, page: Page) {
     //for each page create dataset of consecutive h1, h2, h3, p. at each header after a paragraph, create a new dataset
-    let data = {};
+    let data = {} as DocsSearchData;
     let elems = await page.$$(
       "main h1, main h2, main h3, main h4, main h5, main p, main td, main li, main span"
     );
@@ -32,7 +37,7 @@
     for (let i = 0; i < elems.length; i++) {
       let elem = elems[i];
       let tag = await elem.evaluate((el) => el.tagName);
-      let text = await elem.evaluate((el) => el.textContent);
+      let text = await elem.evaluate((el) => el.textContent) || '';
      text = this._clean_text(text);
      data.uid = uuidv4();
      data.url = url;
@@ -45,7 +50,7 @@
         if (data["hierarchy_lvl1"]) {
           await this.sender.add(data);
           page_block++;
-          data = {};
+          data = {} as DocsSearchData;
         }
         data["hierarchy_lvl1"] = text;
         data.anchor = "#" + id;
@@ -53,7 +58,7 @@
         if (data["hierarchy_lvl2"]) {
           await this.sender.add(data);
           page_block++;
-          data = { hierarchy_lvl1: data["hierarchy_lvl1"] };
+          data = { hierarchy_lvl1: data["hierarchy_lvl1"] } as DocsSearchData;
         }
         data.anchor = "#" + id;
         data["hierarchy_lvl2"] = text;
@@ -64,7 +69,7 @@
           data = {
             hierarchy_lvl1: data["hierarchy_lvl1"],
             hierarchy_lvl2: data["hierarchy_lvl2"],
-          };
+          } as DocsSearchData;
         }
         data.anchor = "#" + id;
         data["hierarchy_lvl3"] = text;
@@ -76,7 +81,7 @@
             hierarchy_lvl1: data["hierarchy_lvl1"],
             hierarchy_lvl2: data["hierarchy_lvl2"],
             hierarchy_lvl3: data["hierarchy_lvl3"],
-          };
+          } as DocsSearchData;
         }
         data.anchor = "#" + id;
         data["hierarchy_lvl4"] = text;
@@ -89,7 +94,7 @@
             hierarchy_lvl2: data["hierarchy_lvl2"],
             hierarchy_lvl3: data["hierarchy_lvl3"],
             hierarchy_lvl4: data["hierarchy_lvl4"],
-          };
+          } as DocsSearchData;
         }
         data.anchor = "#" + id;
         data["hierarchy_lvl5"] = text;
@@ -102,7 +107,7 @@
         if (!data["content"]) {
           data["content"] = [];
         }
-        if (!data["content"].includes(text)) {
+        if (text !== null && !data["content"].includes(text)) {
           data["content"].push(text);
         }
       }
@@ -122,7 +127,7 @@
 
   // Remove from a text all multiple spaces, new lines, and leading and trailing spaces, and
   // remove '# ' from the beginning of the text
-  _clean_text(text) {
+  _clean_text(text: string) {
     text = text.replace(/[\r\n]+/gm, " ");
     ///remove multiple spaces
     text = text.replace(/\s+/g, " ");
diff --git a/src/scrapers/schema.js b/src/scrapers/schema.ts
similarity index 66%
rename from src/scrapers/schema.js
rename to src/scrapers/schema.ts
index c6c26a7..57b5ee2 100644
--- a/src/scrapers/schema.js
+++ b/src/scrapers/schema.ts
@@ -1,8 +1,14 @@
-import prettier from "prettier";
 import { v4 as uuidv4 } from "uuid";
+import { Page } from "puppeteer";
+import { Sender } from "../sender";
+import { Config, SchemaData } from "../types";
 
 export default class SchemaScaper {
-  constructor(sender, config) {
+  sender: Sender;
+  config: Config;
+  settings_sent: boolean; // TODO: Where is this used?
+
+  constructor(sender: Sender, config: Config) {
     console.info("SchemaScaper::constructor");
     this.sender = sender;
     this.config = config;
@@ -14,11 +20,11 @@
     }
   }
 
-  async get(url, page) {
+  async get(url: string, page: Page) {
     console.log("__extractContent", url);
     // Get the schema.org data
     const data = await page.evaluate(() => {
-      const schema = document.querySelector(
+      const schema = document.querySelector(
         "script[type='application/ld+json']"
       );
       if (schema) {
@@ -30,7 +36,10 @@
     if (data.length === 0) return;
 
     if (this.config.schema?.only_type) {
-      if (data["@type"] !== this.config.schema_config?.only_type) return;
+      // TODO: Might be an error, should be `schema.only_type` and not `schema_config.only_type`
+      // Leaving old code in comment in case it is not an error
+      // if (data["@type"] !== this.config.schema_config?.only_type) return;
+      if (data["@type"] !== this.config.schema?.only_type) return;
     }
 
     this._clean_schema(data);
@@ -52,7 +61,7 @@
     await this.sender.add(data);
   }
 
-  _clean_schema(data) {
+  _clean_schema(data: SchemaData) {
     if (data["@context"]) {
       delete data["@context"];
     }
diff --git a/src/sender.js b/src/sender.ts
similarity index 82%
rename from src/sender.js
rename to src/sender.ts
index aef2d7e..a6cfca0 100644
--- a/src/sender.js
+++ b/src/sender.ts
@@ -1,8 +1,16 @@
-import { MeiliSearch } from "meilisearch";
+import { MeiliSearch, Settings } from "meilisearch";
+import { Config, DocsSearchData, DefaultData } from "./types";
 
 //Create a class called Sender that will queue the json data and batch it to a Meilisearch instance
-export default class Sender {
-  constructor(config) {
+export class Sender {
+  config: Config
+  queue: Array<DocsSearchData | DefaultData>
+  origin_index_name: string
+  index_name: string
+  batch_size: number
+  client: MeiliSearch
+
+  constructor(config: Config) {
     console.info("Sender::constructor");
     this.queue = [];
     this.config = config;
@@ -32,8 +40,9 @@
         }
       }
     } catch (e) {
+      // TODO: better console.log
       console.log("try to delete a tmp index if it exists");
-      // console.error(e);
+      console.error(e);
     }
 
     if (this.config.primary_key) {
@@ -43,7 +52,8 @@
         .update({ primaryKey: this.config.primary_key });
     } catch (e) {
       console.log("try to create or update the index with the primary key");
-      // console.error(e);
+
+      // TODO: why?
       await this.client.createIndex(this.index_name, {
         primaryKey: this.config.primary_key,
       });
@@ -52,7 +62,10 @@
   }
 
   //Add a json object to the queue
-  async add(data) {
+
+
+  // TODO: should be better specified
+  async add(data: DocsSearchData | DefaultData) {
     console.log("Sender::add");
     if (this.config.primary_key) {
       delete data["uid"];
@@ -64,11 +77,11 @@
         await this.__batchSend();
       }
     } else {
-      const task = await this.client.index(this.index_name).addDocuments(data);
+      await this.client.index(this.index_name).addDocuments([data]);
     }
   }
 
-  async updateSettings(settings) {
+  async updateSettings(settings: Settings) {
     console.log("Sender::updateSettings");
     let task = await this.client
       .index(this.index_name)
@@ -105,6 +118,7 @@
       { indexes: [this.origin_index_name, this.index_name] },
     ]);
     await this.client.index(this.index_name).waitForTask(task.taskUid);
+    // TODO: should we wait for task?
     // await this.client.deleteIndex(this.index_name);
   }
 }
diff --git a/src/server.js b/src/server.ts
similarity index 78%
rename from src/server.js
rename to src/server.ts
index 10a8608..4eadbec 100644
--- a/src/server.js
+++ b/src/server.ts
@@ -3,33 +3,36 @@ dotenv.config();
 
 import express from "express";
 import TaskQueue from "./taskQueue.js";
-import Sender from "./sender.js";
+import { Sender } from "./sender.js";
 import Crawler from "./crawler.js";
 
 const port = process.env.PORT || 3000;
 
 class Server {
+  taskQueue: TaskQueue;
+  app: express.Application;
+
   constructor() {
     this.taskQueue = new TaskQueue();
-
     this.app = express();
     this.app.use(express.json());
     this.app.post("/crawl", this.__crawl.bind(this));
     this.app.post("/crawl/async", this.__crawl.bind(this));
     this.app.post("/crawl/sync", this.__syncCrawl.bind(this));
+    // TODO: create route to empty taskQueue
     this.app.listen(port, () =>
       console.log(`Example app listening on port ${port}!`)
     );
   }
 
-  async __crawl(req, res) {
+  async __crawl(req: express.Request, res: express.Response) {
     this.taskQueue.add(req.body);
     console.log("Crawling started");
     res.send("Crawling started");
   }
 
-  async __syncCrawl(req, res) {
+  async __syncCrawl(req: express.Request, res: express.Response) {
     const sender = new Sender(req.body);
     await sender.init();
diff --git a/src/taskQueue.js b/src/taskQueue.ts
similarity index 68%
rename from src/taskQueue.js
rename to src/taskQueue.ts
index 2004d7e..6041994 100644
--- a/src/taskQueue.js
+++ b/src/taskQueue.ts
@@ -1,15 +1,24 @@
-import Queue from "bull";
+import Queue, { Job, DoneCallback } from "bull";
 import { MeiliSearch } from "meilisearch";
-import Sender from "./sender.js";
-import Crawler from "./crawler.js";
 import { fork } from "child_process";
+import { Config } from "./types";
 
+// TODO: is not used
 const redis_url = process.env.REDIS_URL;
 
 export default class TaskQueue {
+  queue: Queue.Queue;
+
   constructor() {
     console.info("TaskQueue::constructor");
-    this.queue = new Queue("crawling", redis_url);
+    console.log(redis_url)
+    if (redis_url ) {
+      this.queue = new Queue("crawling", redis_url);
+    }
+    else {
+      this.queue = new Queue("crawling");
+    }
+    // this.queue.obliterate({ force: true }); // TODO: add route to obliterate queue
     this.queue.process(this.__process.bind(this));
     this.queue.on("added", this.__jobAdded.bind(this));
     this.queue.on("completed", this.__jobCompleted.bind(this));
@@ -19,13 +28,13 @@
     this.queue.on("delayed", this.__jobDelayed.bind(this));
   }
 
-  add(data) {
+  add(data: Config) {
     this.queue.add(data);
   }
 
-  async __process(job, done) {
+  async __process(job: Job, done: DoneCallback) {
     console.log("Job process", job.id);
-    const childProcess = fork("./src/crawler_process.js");
+    const childProcess = fork("./dist/src/crawler_process.js");
     childProcess.send(job.data);
     childProcess.on("message", (message) => {
       console.log(message);
@@ -33,15 +42,15 @@
     });
   }
 
-  async __jobAdded(job) {
+  async __jobAdded(job: Job) {
     console.log("Job added", job.id);
   }
 
-  async __jobCompleted(job) {
+  async __jobCompleted(job: Job) {
     console.log("Job completed", job.id);
   }
 
-  async __jobFailed(job) {
+  async __jobFailed(job: Job) {
     console.log("Job failed", job.id);
     let client = new MeiliSearch({
       host: job.data.meilisearch_host,
@@ -61,15 +70,16 @@
     }
   }
 
-  async __jobActive(job) {
+  async __jobActive(job: Job) {
+    console.log({ job })
     console.log("Job active", job.id);
   }
 
-  async __jobWaiting(job) {
+  async __jobWaiting(job: Job) {
     console.log("Job waiting", job.id);
   }
 
-  async __jobDelayed(job) {
+  async __jobDelayed(job: Job) {
     console.log("Job delayed", job.id);
   }
 }
diff --git a/src/types.ts b/src/types.ts
new file mode 100644
index 0000000..81bb19d
--- /dev/null
+++ b/src/types.ts
@@ -0,0 +1,80 @@
+import { Settings } from "meilisearch";
+import DocsearchScraper from "./scrapers/docsearch";
+import DefaultScraper from "./scrapers/default";
+import CustomScraper from "./scrapers/custom";
+import SchemaScraper from "./scrapers/schema";
+
+// TODO: find out which are mandatory or not
+export type Config = {
+  meilisearch_index_name: string // TODO: rename to meilisearch_index_uid ?
+  meilisearch_host: string // TODO: rename to meilisearch_url
+  meilisearch_api_key: string
+  crawled_urls: string[] // TODO: rename to start_urls, as it is conventional
+  custom_crawler: string // TODO: Remove
+  queue?: string[]
+  primary_key?: string
+  batch_size?: number
+  custom_settings?: Settings // TODO: rename to meilisearch_settings
+  strategy?: 'docsearch' | 'default' | 'custom' | 'schema' // TODO: rename docsearch to docssearch
+  headless?: boolean // TODO: rename to wait_js ?
+  exclude_crawled_urls?: string[] // TODO: rename to `ignored_urls` ?
+  indexed_urls?: string[] // TODO: Rename, not sure what it does
+  exclude_indexed_urls?: string[] // TODO: rename
+  schema?: SchemaConfig
+
+}
+
+export type SchemaConfig = {
+  convert_dates: boolean
+  only_type: string
+}
+
+export type Scraper = DocsearchScraper | DefaultScraper | CustomScraper | SchemaScraper
+
+export type DocsSearchData = {
+  url: string
+  uid?: string
+  anchor: string
+  hierarchy_lvl0: string | null
+  hierarchy_lvl1: string | null
+  hierarchy_lvl2: string | null
+  hierarchy_lvl3: string | null
+  hierarchy_lvl4: string | null
+  hierarchy_lvl5: string | null
+  content: string[]
+  hierarchy_radio_lvl0: string | null
+  hierarchy_radio_lvl1: string | null
+  hierarchy_radio_lvl2: string | null
+  hierarchy_radio_lvl3: string | null
+  hierarchy_radio_lvl4: string | null
+  hierarchy_radio_lvl5: string | null
+
+}
+
+export type DefaultData = {
+  url: string
+  uid?: string
+  anchor: string
+  title: string
+  meta: Meta
+  image_url?: string
+  page_block: number
+  urls_tags: string[]
+  h1?: string | null
+  h2?: string | null
+  h3?: string | null
+  h4?: string | null
+  h5?: string | null
+  h6?: string | null
+  p: string[]
+}
+
+export type SchemaData = {
+  uid: string
+  [key: string]: any
+
+}
+
+export type Meta = {
+  [name: string]: string;
+}
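
Note: for readers of this diff, a minimal sketch of a configuration object that satisfies the new `Config` type is shown below. All values (URLs, index name, API key) are hypothetical placeholders, and the import path assumes a file living at the repository root; only the fields typed as required in `src/types.ts` are strictly needed.

```ts
import { Config } from "./src/types";

// Hypothetical example values — point these at your own Meilisearch instance and site.
const config: Config = {
  meilisearch_index_name: "docs",              // TODO in types.ts suggests meilisearch_index_uid
  meilisearch_host: "http://localhost:7700",   // TODO suggests meilisearch_url
  meilisearch_api_key: "masterKey",
  crawled_urls: ["https://example.com/docs/"], // TODO suggests start_urls
  custom_crawler: "",                          // slated for removal per the TODOs above
  strategy: "docsearch",                       // 'default' | 'docsearch' | 'custom' | 'schema'
  batch_size: 1000,
  primary_key: "uid",
};
```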
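To make the URL-filtering behaviour concrete, here is a small stand-alone sketch that mirrors what `Crawler.__generate_globs` and `Crawler.__match_globs` do in the diff above (append `**` to each start URL, then keep a page if any glob matches). The function names and URLs are illustrative, not part of the codebase.

```ts
import { minimatch } from "minimatch";

// Mirrors Crawler.__generate_globs: turn every start URL into a glob pattern.
function generateGlobs(urls: string[]): string[] {
  return urls.map((url) => (url.endsWith("/") ? url + "**" : url + "/**"));
}

const globs = generateGlobs(["https://example.com/docs/"]);
// => ["https://example.com/docs/**"]

// Mirrors Crawler.__match_globs: a crawled URL is kept if any glob matches it.
console.log(globs.some((g) => minimatch("https://example.com/docs/guide/install", g))); // true
console.log(globs.some((g) => minimatch("https://example.com/blog/post", g)));          // false
```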