Skip to content

Commit

Permalink
Merge pull request #72 from meilisearch/remove_requests_list_caching
Browse files Browse the repository at this point in the history
Empty enqueued URLs after crawling to avoid caching
  • Loading branch information
bidoubiwa authored Aug 16, 2023
2 parents 22756a6 + f1c4734 commit 8b9a28e
Showing 1 changed file with 24 additions and 14 deletions.
38 changes: 24 additions & 14 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ import {
Router,
PuppeteerCrawlingContext,
PuppeteerCrawlerOptions,
RequestQueue,
} from 'crawlee'

import { minimatch } from 'minimatch'
import DefaultScraper from './scrapers/default'
import DocsearchScraper from './scrapers/docssearch'
Expand All @@ -27,27 +29,35 @@ export class Crawler {
config: Config
urls: string[]
scraper: Scraper
crawler: PuppeteerCrawler
nb_page_crawled = 0
nb_page_indexed = 0
launchOptions: Record<string, any> = {}
launcher?: PuppeteerNode

constructor(
sender: Sender,
config: Config,
launchOptions: Record<string, any> = {},
launcher?: PuppeteerNode
) {
console.info('Crawler::constructor')
this.sender = sender
this.config = config
this.urls = config.start_urls
this.launchOptions = launchOptions
this.launcher = launcher

this.scraper =
config.strategy == 'docssearch'
this.config.strategy == 'docssearch'
? new DocsearchScraper(this.sender)
: config.strategy == 'schema'
? new SchemaScraper(this.sender, config)
: new DefaultScraper(this.sender, config)
: this.config.strategy == 'schema'
? new SchemaScraper(this.sender, this.config)
: new DefaultScraper(this.sender, this.config)
}

async run() {
const requestQueue = await RequestQueue.open(JSON.stringify(this.urls))
// Enqueue the initial requests
await requestQueue.addRequests(this.urls.map((url) => ({ url })))

//Create the router
const router = createPuppeteerRouter()
Expand All @@ -56,25 +66,24 @@ export class Crawler {
router.addDefaultHandler(this.defaultHandler.bind(this))

const puppeteerCrawlerOptions: PuppeteerCrawlerOptions = {
requestQueue,
requestHandler: router,
launchContext: {
launchOptions: {
headless: config.headless || true,
headless: this.config.headless || true,
args: ['--no-sandbox', '--disable-setuid-sandbox'],
ignoreDefaultArgs: ['--disable-extensions'],
...launchOptions,
...this.launchOptions,
},
},
}

if (puppeteerCrawlerOptions.launchContext && launcher) {
puppeteerCrawlerOptions.launchContext.launcher = launcher
if (puppeteerCrawlerOptions.launchContext && this.launcher) {
puppeteerCrawlerOptions.launchContext.launcher = this.launcher
}
// create the crawler
this.crawler = new PuppeteerCrawler(puppeteerCrawlerOptions)
}
const crawler = new PuppeteerCrawler(puppeteerCrawlerOptions)

async run() {
let interval = 5000
if (process.env.WEBHOOK_INTERVAL) {
interval = parseInt(process.env.WEBHOOK_INTERVAL)
Expand All @@ -89,7 +98,7 @@ export class Crawler {
}, interval)

try {
await this.crawler.run(this.urls)
await crawler.run()

await Webhook.get(this.config).active(this.config, {
nb_page_crawled: this.nb_page_crawled,
Expand All @@ -101,6 +110,7 @@ export class Crawler {
} finally {
clearInterval(intervalId)
}
await requestQueue.drop()
}

// Should we use `log`
Expand Down

0 comments on commit 8b9a28e

Please sign in to comment.