Merge pull request #28 from meilisearch/add_typescript
Add typescript
bidoubiwa authored Jun 21, 2023
2 parents 966675f + 4c04106 commit 9d984a3
Showing 16 changed files with 3,534 additions and 103 deletions.
134 changes: 131 additions & 3 deletions .gitignore
@@ -1,4 +1,132 @@
node_modules
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# TypeScript v1 declaration files
typings/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env
.env.test

# parcel-bundler cache (https://parceljs.org/)
.cache

# Next.js build output
.next

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and *not* Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

# misc
.DS_Store

# parcel
.parcel_cache/

############################
# CYPRESS
############################
cypress/screenshots
cypress/videos
cypress/support
cypress/plugins
cypress/fixtures

############################
# MISC
############################

.DS_Store
dist
package
.vscode
.idea
dist_default_export_in_index
no_default_export_in_index
storage
package-lock.json
.env
15 changes: 12 additions & 3 deletions package.json
@@ -16,10 +16,19 @@
"yargs": "^17.7.2"
},
"scripts": {
- "start": "node src/index.js",
- "serve": "node src/server.js",
+ "start": "tsc && node dist/src/index.js",
+ "serve": "tsc && node dist/src/server.js",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
},
"author": "It's not you it's me",
- "license": "ISC"
+ "license": "ISC",
+ "devDependencies": {
+ "@apify/log": "^2.1.3",
+ "@apify/tsconfig": "^0.1.0",
+ "@types/express": "^4.17.17",
+ "@types/prettier": "^2.7.3",
+ "@types/uuid": "^9.0.2",
+ "@types/yargs": "^17.0.24",
+ "typescript": "^5.1.3"
+ }
}
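
The rewritten start and serve scripts compile with tsc and then run the output from dist/src/, which implies a tsconfig that emits to dist/ while preserving the src/ layout. The tsconfig itself is not part of this diff; the following is a minimal sketch of what it plausibly contains, assuming it extends the newly added @apify/tsconfig preset and targets ES modules (the top-level await in src/index.ts requires ES2022 or later; rootDir is set to "." so that compiled files land at dist/src/*.js rather than dist/*.js):

{
  "extends": "@apify/tsconfig",
  "compilerOptions": {
    "module": "ES2022",
    "target": "ES2022",
    "outDir": "dist",
    "rootDir": "."
  },
  "include": ["src"]
}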
33 changes: 24 additions & 9 deletions src/crawler.js → src/crawler.ts
@@ -1,16 +1,28 @@
- import { createPuppeteerRouter, PuppeteerCrawler } from "crawlee";
+ import { createPuppeteerRouter, PuppeteerCrawler, Router, PuppeteerCrawlingContext } from "crawlee";
import { minimatch } from "minimatch";
import DefaultScraper from "./scrapers/default.js";
import DocsearchScraper from "./scrapers/docsearch.js";
import CustomScraper from "./scrapers/custom.js";
import SchemaScraper from "./scrapers/schema.js";
import { Sender } from "./sender.js";
+ import { Config, Scraper } from "./types.js";


+ type DefaultHandler = Parameters<Parameters<Router<PuppeteerCrawlingContext>['addDefaultHandler']>[0]>[0]

// Crawler class
// This class is responsible for crawling the urls and extracting content to send to Meilisearch
// It uses the createPuppeteerRouter method to create a router that will be used by the PuppeteerCrawler.
// The constructor takes a Sender object as a parameter
export default class Crawler {
- constructor(sender, config) {
+ sender: Sender
+ config: Config
+ urls: string[]
+ custom_crawler: string
+ scraper: Scraper
+ crawler: PuppeteerCrawler
+
+ constructor(sender: Sender, config: Config) {
console.info("Crawler::constructor");
this.sender = sender;
this.config = config;
@@ -19,7 +31,7 @@ export default class Crawler {
// init the custom scraper depending on whether config.strategy is docsearch, custom or default
this.scraper =
config.strategy == "docsearch"
- ? new DocsearchScraper(this.sender, config)
+ ? new DocsearchScraper(this.sender)
: config.strategy == "custom"
? new CustomScraper(this.sender, config)
: config.strategy == "schema"
@@ -28,6 +40,8 @@

// Create the router
let router = createPuppeteerRouter();

+ // type DefaultHandler = Parameters<typeof router.addDefaultHandler>[0];
router.addDefaultHandler(this.defaultHandler.bind(this));

// create the crawler
@@ -48,7 +62,8 @@
await this.crawler.run(this.urls);
}

- async defaultHandler({ request, enqueueLinks, page, log }) {
+ // Should we use `log`?
+ async defaultHandler({ request, enqueueLinks, page }: DefaultHandler) {
const title = await page.title();
console.log(`${title}`, { url: request.loadedUrl });
const crawled_globs = this.__generate_globs(this.urls);
@@ -62,7 +77,7 @@
this.config.exclude_indexed_urls || []
);

- if (!this.__is_paginated_url(request.loadedUrl)) {
+ if (request.loadedUrl && !this.__is_paginated_url(request.loadedUrl)) {
// check if the url is in the list of urls to scrape
if (
this.__match_globs(request.loadedUrl, indexed_globs) &&
@@ -90,7 +105,7 @@
});
}

- __generate_globs(urls) {
+ __generate_globs(urls: string[]) {
return urls.map((url) => {
if (url.endsWith("/")) {
return url + "**";
@@ -99,11 +114,11 @@
});
}

- __match_globs(url, globs) {
+ __match_globs(url: string, globs: string[]) {
return globs.some((glob) => minimatch(url, glob));
}

- __is_file_url(url) {
+ __is_file_url(url: string) {
const fileExtensions = [
".zip",
".pdf",
@@ -147,7 +162,7 @@
return fileExtensions.some((extension) => url.endsWith(extension));
}

- __is_paginated_url(url) {
+ __is_paginated_url(url: string) {
const urlObject = new URL(url);
const pathname = urlObject.pathname;
return /\/\d+\//.test(pathname);
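
The DefaultHandler alias near the top of the file is the most instructive TypeScript change here: rather than importing a context type for the route handler, it derives one from crawlee's own Router signature, so the handler parameter stays in sync with whatever the library declares. Below is the same one-liner from the diff, unpacked into named steps (the intermediate type names are for illustration only):

import { Router, PuppeteerCrawlingContext } from "crawlee";

// addDefaultHandler takes the handler function as its first argument...
type AddDefaultHandler = Router<PuppeteerCrawlingContext>["addDefaultHandler"];
type HandlerFn = Parameters<AddDefaultHandler>[0];

// ...and that handler receives the crawling context as its own first
// argument: request, enqueueLinks, page, log, and so on.
type DefaultHandler = Parameters<HandlerFn>[0];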
23 changes: 0 additions & 23 deletions src/crawler_process.js

This file was deleted.

21 changes: 21 additions & 0 deletions src/crawler_process.ts
@@ -0,0 +1,21 @@
import { Sender } from "./sender.js";
import Crawler from "./crawler.js";
import { Config } from "./types.js";

async function startCrawling(config: Config) {
const sender = new Sender(config);
await sender.init();

const crawler = new Crawler(sender, config);

await crawler.run();
await sender.finish();
}

// Listen for messages from the parent process
process.on("message", async (message: Config) => {
await startCrawling(message);
if (process.send) {
process.send("Crawling finished");
}
});
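
process.on("message") only fires when this file runs as a forked child with an IPC channel, so the worker implies a parent that spawns it and sends a Config across. That parent is not shown in this diff; the following is a minimal sketch of that side, assuming Node's child_process.fork, a compiled worker at dist/src/crawler_process.js, and a config.json on disk (all three paths are assumptions):

import { fork } from "child_process";
import fs from "fs";

// Fork the compiled worker and hand it the crawler config over IPC.
const config = JSON.parse(
  fs.readFileSync("./config.json", { encoding: "utf-8" })
);
const child = fork("./dist/src/crawler_process.js");
child.send(config);

// The worker replies "Crawling finished" once the run completes.
child.on("message", (message) => {
  console.log(message);
  child.disconnect();
});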
6 changes: 3 additions & 3 deletions src/index.js → src/index.ts
@@ -4,18 +4,18 @@ dotenv.config();
import fs from "fs";
import yargs from "yargs";
import { hideBin } from "yargs/helpers";
- import Sender from "./sender.js";
+ import { Sender } from "./sender.js";
import Crawler from "./crawler.js";

// Parse command line arguments and get a configuration file path
- const argv = yargs(hideBin(process.argv)).option("config", {
+ const argv = await yargs(hideBin(process.argv)).option("config", {
alias: "c",
describe: "Path to configuration file",
demandOption: true,
type: "string",
}).argv;

- const config = JSON.parse(fs.readFileSync(argv.config));
+ const config = JSON.parse(fs.readFileSync(argv.config, {encoding: 'utf-8'}));

const sender = new Sender(config);
await sender.init();
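
Two of the index.ts changes exist mainly to satisfy the compiler. yargs 17 types .argv as either the parsed arguments or a Promise of them, so await narrows it to the resolved object. And fs.readFileSync returns a Buffer unless an encoding is passed, while JSON.parse expects a string; the old JavaScript call only worked through an implicit toString(). A small sketch of the distinction:

import fs from "fs";

// No encoding: returns a Buffer, so JSON.parse(raw) fails to type-check.
const raw: Buffer = fs.readFileSync("config.json");

// With an encoding: returns a string, matching JSON.parse's signature.
const text: string = fs.readFileSync("config.json", { encoding: "utf-8" });
const config = JSON.parse(text);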
1 change: 1 addition & 0 deletions src/scrapers/custom.js
@@ -1,3 +1,4 @@
+ // TODO: file should be removed
import prettier from "prettier";
import { v4 as uuidv4 } from "uuid";

