Add typescript #28

Merged 3 commits on Jun 21, 2023

Changes from 2 commits
134 changes: 131 additions & 3 deletions .gitignore
@@ -1,4 +1,132 @@
node_modules
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# TypeScript v1 declaration files
typings/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env
.env.test

# parcel-bundler cache (https://parceljs.org/)
.cache

# Next.js build output
.next

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Comment in the public line if your project uses Gatsby and *not* Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

# misc
.DS_Store

# parcel
.parcel_cache/

############################
# CYPRESS
############################
cypress/screenshots
cypress/videos
cypress/support
cypress/plugins
cypress/fixtures

############################
# MISC
############################

.DS_Store
dist
package
.vscode
.idea
dist_default_export_in_index
no_default_export_in_index
storage
package-lock.json
.env
15 changes: 12 additions & 3 deletions package.json
@@ -16,10 +16,19 @@
"yargs": "^17.7.2"
},
"scripts": {
"start": "node src/index.js",
"serve": "node src/server.js",
"start": "tsc && node dist/src/index.js",
"serve": "tsc && node dist/src/server.js",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
},
"author": "It's not you it's me",
"license": "ISC"
"license": "ISC",
"devDependencies": {
"@apify/log": "^2.1.3",
"@apify/tsconfig": "^0.1.0",
"@types/express": "^4.17.17",
"@types/prettier": "^2.7.3",
"@types/uuid": "^9.0.2",
"@types/yargs": "^17.0.24",
"typescript": "^5.1.3"
}
}
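Note: the updated `start` and `serve` scripts compile with `tsc` before running the emitted JavaScript from `dist/src/`, which implies a tsconfig whose `outDir` is `dist` and whose `rootDir` covers `src`. That file isn't shown in this diff, so the following is only a sketch of what it might contain, building on the `@apify/tsconfig` base added to devDependencies; every option here is an assumption:

```jsonc
// tsconfig.json — hypothetical sketch; the actual file is not part of this diff
{
  "extends": "@apify/tsconfig",
  "compilerOptions": {
    "module": "ES2022",  // assumed: src/index.ts uses top-level await, which requires ES modules
    "target": "ES2022",
    "outDir": "dist",    // matches the dist/src/* paths in the npm scripts above
    "rootDir": "."       // assumed, so that src/ compiles to dist/src/
  },
  "include": ["src"]
}
```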
39 changes: 28 additions & 11 deletions src/crawler.js → src/crawler.ts
@@ -1,25 +1,39 @@
import { createPuppeteerRouter, PuppeteerCrawler } from "crawlee";
import { createPuppeteerRouter, PuppeteerCrawler, Router, PuppeteerCrawlingContext } from "crawlee";
import { minimatch } from "minimatch";
import DefaultScraper from "./scrapers/default.js";
import DocsearchScraper from "./scrapers/docsearch.js";
import CustomScraper from "./scrapers/custom.js";
import SchemaScraper from "./scrapers/schema.js";
import { Sender } from "./sender.js";
import { Config, Scraper } from "./types.js";


type DefaultHandler = Parameters<Parameters<Router<PuppeteerCrawlingContext>['addDefaultHandler']>[0]>[0]

// Crawler class
// This class is responsible for crawling the urls and extracting their content to send to Meilisearch.
// It uses the createPuppeteerRouter method to create a router that will be used by the PuppeteerCrawler.
// The constructor takes a Sender and a Config object as parameters.
export default class Crawler {
constructor(sender, config) {
sender: Sender
config: Config
urls: string[]
custom_crawler: string // TODO: remove
scraper: Scraper
crawler: PuppeteerCrawler

constructor(sender: Sender, config: Config) {
console.info("Crawler::constructor");
this.sender = sender;
this.config = config;
this.urls = config.crawled_urls;
this.custom_crawler = config.custom_crawler;
this.custom_crawler = config.custom_crawler; // TODO: remove
// init the custom scraper depending on whether config.strategy is docsearch, custom or default
this.scraper =
config.strategy == "docsearch"
? new DocsearchScraper(this.sender, config)
config.strategy == "docsearch" // TODO: rename to docssearch
bidoubiwa marked this conversation as resolved.
Show resolved Hide resolved
? new DocsearchScraper(this.sender)
: config.strategy == "custom"
? new CustomScraper(this.sender, config)
: config.strategy == "schema"
@@ -28,6 +42,8 @@ export default class Crawler {

//Create the router
let router = createPuppeteerRouter();

// type DefaultHandler = Parameters<typeof router.addDefaultHandler>[0];
router.addDefaultHandler(this.defaultHandler.bind(this));

// create the crawler
@@ -48,7 +64,8 @@
await this.crawler.run(this.urls);
}

async defaultHandler({ request, enqueueLinks, page, log }) {
// Should we use `log`?
async defaultHandler({ request, enqueueLinks, page }: DefaultHandler) {
const title = await page.title();
console.log(`${title}`, { url: request.loadedUrl });
const crawled_globs = this.__generate_globs(this.urls);
@@ -62,7 +79,7 @@
this.config.exclude_indexed_urls || []
);

if (!this.__is_paginated_url(request.loadedUrl)) {
if (request.loadedUrl && !this.__is_paginated_url(request.loadedUrl)) {
// check if the url is in the list of urls to scrape
if (
this.__match_globs(request.loadedUrl, indexed_globs) &&
@@ -90,7 +107,7 @@
});
}

__generate_globs(urls) {
__generate_globs(urls: string[]) {
return urls.map((url) => {
if (url.endsWith("/")) {
return url + "**";
@@ -99,11 +116,11 @@
});
}

__match_globs(url, globs) {
__match_globs(url: string, globs: string[]) {
return globs.some((glob) => minimatch(url, glob));
}

__is_file_url(url) {
__is_file_url(url: string) {
const fileExtensions = [
".zip",
".pdf",
@@ -147,7 +164,7 @@
return fileExtensions.some((extension) => url.endsWith(extension));
}

__is_paginated_url(url) {
__is_paginated_url(url: string) {
const urlObject = new URL(url);
const pathname = urlObject.pathname;
return /\/\d+\//.test(pathname);
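Note: the `DefaultHandler` alias at the top of this file is the key TypeScript trick in the migration. crawlee doesn't export the context type that `addDefaultHandler` passes to its callback, so the PR recovers it with two applications of the built-in `Parameters` utility. A self-contained sketch of the same pattern, using a made-up `RouterLike` interface rather than the real crawlee types:

```ts
// Recover a callback's argument type from an API that doesn't export it.
type FirstParam<F extends (...args: any[]) => any> = Parameters<F>[0];

// Hypothetical stand-in for crawlee's Router — for illustration only.
interface RouterLike {
  addDefaultHandler(
    handler: (ctx: { request: { loadedUrl?: string }; page: unknown }) => Promise<void>
  ): void;
}

// The first Parameters<...>[0] yields the handler function type;
// the second yields the context object the handler receives.
type Handler = FirstParam<RouterLike["addDefaultHandler"]>;
type Ctx = FirstParam<Handler>; // { request: { loadedUrl?: string }; page: unknown }
```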
23 changes: 0 additions & 23 deletions src/crawler_process.js

This file was deleted.

21 changes: 21 additions & 0 deletions src/crawler_process.ts
@@ -0,0 +1,21 @@
import { Sender } from "./sender.js";
import Crawler from "./crawler.js";
import { Config } from "./types.js";

async function startCrawling(config: Config) {
const sender = new Sender(config);
await sender.init();

const crawler = new Crawler(sender, config);

await crawler.run();
await sender.finish();
}

// Listen for messages from the parent thread
process.on("message", async (message: Config) => {
await startCrawling(message);
if (process.send) {
process.send("Crawling finished");
}
});
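Note: the `process.on("message", ...)` handler only makes sense when this file runs as a forked child. The parent side isn't part of this diff, so the following is just a sketch of how such a worker might be driven; the compiled path and the config fields are assumptions based on the npm scripts and the properties `crawler.ts` reads:

```ts
// Hypothetical parent-side sketch — not part of this PR.
import { fork } from "node:child_process";
import { Config } from "./types.js";

// Only crawled_urls and strategy are visible in this diff; everything else is assumed.
const config = {
  crawled_urls: ["https://example.com/docs/"],
  strategy: "default",
} as Config;

const child = fork("dist/src/crawler_process.js"); // assumed output path from the tsc build
child.send(config); // triggers startCrawling() in the worker's "message" handler
child.on("message", (msg) => {
  if (msg === "Crawling finished") child.disconnect(); // close IPC so the child can exit
});
```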
6 changes: 3 additions & 3 deletions src/index.js → src/index.ts
@@ -4,18 +4,18 @@ dotenv.config();
import fs from "fs";
import yargs from "yargs";
import { hideBin } from "yargs/helpers";
import Sender from "./sender.js";
import { Sender } from "./sender.js";
import Crawler from "./crawler.js";

// Parse command line arguments and get a configuration file path
const argv = yargs(hideBin(process.argv)).option("config", {
const argv = await yargs(hideBin(process.argv)).option("config", {
alias: "c",
describe: "Path to configuration file",
demandOption: true,
type: "string",
}).argv;

const config = JSON.parse(fs.readFileSync(argv.config));
const config = JSON.parse(fs.readFileSync(argv.config, {encoding: 'utf-8'}));

const sender = new Sender(config);
await sender.init();
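Note: two small fixes ride along here. Under `@types/yargs`, `.argv` is typed as `T | Promise<T>`, so awaiting it narrows the result to a plain object. And without an encoding, `fs.readFileSync` returns a `Buffer`, which doesn't satisfy the `string` parameter of `JSON.parse` in the type definitions; passing `{encoding: 'utf-8'}` fixes that. Based only on the fields this diff reads (`crawled_urls`, `strategy`, `custom_crawler`, `exclude_indexed_urls`), a config file passed via `--config` might look like this — any field not visible in the diff is omitted rather than guessed:

```json
{
  "crawled_urls": ["https://example.com/docs/"],
  "strategy": "default",
  "exclude_indexed_urls": []
}
```

It would then be run with something like `npm start -- --config config.json` (the `--` passes the flag through npm to the script).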
1 change: 1 addition & 0 deletions src/scrapers/custom.js
@@ -1,3 +1,4 @@
// TODO: file should be removed
import prettier from "prettier";
import { v4 as uuidv4 } from "uuid";
