Skip to content

Commit

Permalink
Add linter
Browse files Browse the repository at this point in the history
  • Loading branch information
bidoubiwa committed Jun 22, 2023
1 parent 9d984a3 commit 6071ab0
Show file tree
Hide file tree
Showing 18 changed files with 1,421 additions and 547 deletions.
6 changes: 6 additions & 0 deletions .eslintignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
node_modules
dist
examples
scripts
tests/env
coverage
61 changes: 61 additions & 0 deletions .eslintrc.cjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
module.exports = {
env: {
browser: true,
es6: true,
es2020: true,
'jest/globals': true,
node: true,
jasmine: true,
},
extends: [
'eslint:recommended',
'plugin:@typescript-eslint/recommended',
'plugin:@typescript-eslint/recommended-requiring-type-checking',
'plugin:prettier/recommended',
],
parser: '@typescript-eslint/parser',
parserOptions: {
ecmaVersion: 2019,
project: ['tsconfig.eslint.json'],
sourceType: 'module',
projectFolderIgnoreList: ['dist'],
},
plugins: ['@typescript-eslint', 'prettier', 'jest'],
rules: {
'no-dupe-class-members': 'off', // Off due to conflict with typescript overload functions
'prettier/prettier': [
'error',
{
singleQuote: true,
arrowParens: 'always',
semi: false,
bracketSpacing: true,
trailingComma: 'es5',
tsdoc: true,
printWidth: 80,
},
],
'@typescript-eslint/array-type': ['warn', { default: 'array-simple' }],
'@typescript-eslint/return-await': 'off',
'@typescript-eslint/no-explicit-any': 'off',
'@typescript-eslint/explicit-function-return-type': 'off',
'@typescript-eslint/member-delimiter-style': [
'error',
{
multiline: {
delimiter: 'none', // 'none' or 'semi' or 'comma'
requireLast: true,
},
singleline: {
delimiter: 'semi', // 'semi' or 'comma'
requireLast: false,
},
},
],
'comma-dangle': 'off',
'@typescript-eslint/ban-ts-ignore': 'off',
'@typescript-eslint/no-misused-promises': ['off'],
'@typescript-eslint/no-unsafe-member-access': ['off'],
'@typescript-eslint/no-unsafe-argument': 'off',
},
}
40 changes: 40 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# CI workflow: runs on every pull request and on pushes to the listed
# branches. Two independent jobs: ESLint, and a tsc build as a type check.
name: Tests

on:
  pull_request:
  push:
    # trying and staging branches are for BORS config
    branches:
      - trying
      - staging
      - main

jobs:
  # Runs the repository's `lint` script (ESLint) — see package.json.
  lint_tests:
    runs-on: ubuntu-latest
    name: lint tests
    steps:
      - uses: actions/checkout@v3
      - name: Setup node
        uses: actions/setup-node@v3
        with:
          node-version: 16
          # Cache yarn's package cache between runs to speed up installs.
          cache: 'yarn'
      - name: Install dependencies
        run: yarn
      - name: Run JS/TS linter
        run: yarn lint
  # Compiles the project with tsc via the `build` script; acts as the
  # type-check gate (job display name: types-check).
  build_test:
    runs-on: ubuntu-latest
    name: types-check
    steps:
      - uses: actions/checkout@v3
      - name: Setup node
        uses: actions/setup-node@v3
        with:
          node-version: 16
          cache: 'yarn'
      - name: Install dependencies
        run: yarn
      - name: Build project
        run: yarn build
7 changes: 7 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Run linting tests

```sh
yarn lint     # check for lint and formatting errors
yarn lint:fix # automatically fix the fixable errors
```

9 changes: 9 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@
"yargs": "^17.7.2"
},
"scripts": {
"build": "tsc",
"start": "tsc && node dist/src/index.js",
"serve": "tsc && node dist/src/server.js",
"lint": "eslint .",
"lint:fix": "eslint . --fix",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
},
"author": "It's not you it's me",
Expand All @@ -29,6 +32,12 @@
"@types/prettier": "^2.7.3",
"@types/uuid": "^9.0.2",
"@types/yargs": "^17.0.24",
"@typescript-eslint/eslint-plugin": "^5.60.0",
"@typescript-eslint/parser": "^5.60.0",
"eslint": "^8.43.0",
"eslint-config-prettier": "^8.8.0",
"eslint-plugin-jest": "^27.2.2",
"eslint-plugin-prettier": "^4.2.1",
"typescript": "^5.1.3"
}
}
182 changes: 94 additions & 88 deletions src/crawler.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
import { createPuppeteerRouter, PuppeteerCrawler, Router, PuppeteerCrawlingContext } from "crawlee";
import { minimatch } from "minimatch";
import DefaultScraper from "./scrapers/default.js";
import DocsearchScraper from "./scrapers/docsearch.js";
import CustomScraper from "./scrapers/custom.js";
import SchemaScraper from "./scrapers/schema.js";
import { Sender } from "./sender.js";
import { Config, Scraper } from "./types.js";


type DefaultHandler = Parameters<Parameters<Router<PuppeteerCrawlingContext>['addDefaultHandler']>[0]>[0]
import {
createPuppeteerRouter,
PuppeteerCrawler,
Router,
PuppeteerCrawlingContext,
} from 'crawlee'
import { minimatch } from 'minimatch'
import DefaultScraper from './scrapers/default.js'
import DocsearchScraper from './scrapers/docsearch.js'
import CustomScraper from './scrapers/custom.js'
import SchemaScraper from './scrapers/schema.js'
import { Sender } from './sender.js'
import { Config, Scraper } from './types.js'

type DefaultHandler = Parameters<
Parameters<Router<PuppeteerCrawlingContext>['addDefaultHandler']>[0]
>[0]

// Crawler class
// This class is responsible for crawling the urls and extract content to send to Meilisearch
Expand All @@ -23,26 +29,26 @@ export default class Crawler {
crawler: PuppeteerCrawler

constructor(sender: Sender, config: Config) {
console.info("Crawler::constructor");
this.sender = sender;
this.config = config;
this.urls = config.crawled_urls;
this.custom_crawler = config.custom_crawler;
console.info('Crawler::constructor')
this.sender = sender
this.config = config
this.urls = config.crawled_urls
this.custom_crawler = config.custom_crawler
// init the custome scraper depending on if config.strategy is docsearch, custom or default
this.scraper =
config.strategy == "docsearch"
config.strategy == 'docsearch'
? new DocsearchScraper(this.sender)
: config.strategy == "custom"
: config.strategy == 'custom'
? new CustomScraper(this.sender, config)
: config.strategy == "schema"
: config.strategy == 'schema'
? new SchemaScraper(this.sender, config)
: new DefaultScraper(this.sender, config);
: new DefaultScraper(this.sender, config)

//Create the router
let router = createPuppeteerRouter();
const router = createPuppeteerRouter()

// type DefaultHandler = Parameters<typeof router.addDefaultHandler>[0];
router.addDefaultHandler(this.defaultHandler.bind(this));
router.addDefaultHandler(this.defaultHandler.bind(this))

// create the crawler
this.crawler = new PuppeteerCrawler({
Expand All @@ -51,39 +57,39 @@ export default class Crawler {
launchContext: {
launchOptions: {
headless: config.headless || true,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
ignoreDefaultArgs: ["--disable-extensions"],
args: ['--no-sandbox', '--disable-setuid-sandbox'],
ignoreDefaultArgs: ['--disable-extensions'],
},
},
});
})
}

async run() {
await this.crawler.run(this.urls);
await this.crawler.run(this.urls)
}

// Should we use `log`
async defaultHandler({ request , enqueueLinks, page }: DefaultHandler ) {
const title = await page.title();
console.log(`${title}`, { url: request.loadedUrl });
const crawled_globs = this.__generate_globs(this.urls);
async defaultHandler({ request, enqueueLinks, page }: DefaultHandler) {
const title = await page.title()
console.log(`${title}`, { url: request.loadedUrl })
const crawled_globs = this.__generate_globs(this.urls)
const excluded_crawled_globs = this.__generate_globs(
this.config.exclude_crawled_urls || []
);
)
const indexed_globs = this.__generate_globs(
this.config.indexed_urls || this.urls
);
)
const excluded_indexed_globs = this.__generate_globs(
this.config.exclude_indexed_urls || []
);
)

if (request.loadedUrl && !this.__is_paginated_url(request.loadedUrl)) {
//check if the url is in the list of urls to scrap
if (
this.__match_globs(request.loadedUrl, indexed_globs) &&
!this.__match_globs(request.loadedUrl, excluded_indexed_globs)
) {
await this.scraper.get(request.loadedUrl, page);
await this.scraper.get(request.loadedUrl, page)
}
}

Expand All @@ -93,78 +99,78 @@ export default class Crawler {
transformRequestFunction: (req) => {
// exclude all links that are files not parsable by puppeteer
if (this.__is_file_url(req.url)) {
return false;
return false
}
// remove all query params to avoid duplicates
const urlObject = new URL(req.url);
urlObject.search = "";
req.url = urlObject.toString();
const urlObject = new URL(req.url)
urlObject.search = ''
req.url = urlObject.toString()

return req;
return req
},
});
})
}

__generate_globs(urls: string[]) {
return urls.map((url) => {
if (url.endsWith("/")) {
return url + "**";
if (url.endsWith('/')) {
return url + '**'
}
return url + "/**";
});
return url + '/**'
})
}

__match_globs(url: string, globs: string[]) {
return globs.some((glob) => minimatch(url, glob));
return globs.some((glob) => minimatch(url, glob))
}

__is_file_url(url: string) {
const fileExtensions = [
".zip",
".pdf",
".doc",
".docx",
".xls",
".xlsx",
".ppt",
".pptx",
".rar",
".tar",
".gz",
".tgz",
".7z",
".bz2",
".jpg",
".jpeg",
".png",
".gif",
".svg",
".css",
".js",
".xml",
".txt",
".csv",
".rtf",
".mp3",
".wav",
".mp4",
".avi",
".mkv",
".mov",
".flv",
".wmv",
".m4v",
".ogg",
".mpg",
".mpeg",
".swf",
];
return fileExtensions.some((extension) => url.endsWith(extension));
'.zip',
'.pdf',
'.doc',
'.docx',
'.xls',
'.xlsx',
'.ppt',
'.pptx',
'.rar',
'.tar',
'.gz',
'.tgz',
'.7z',
'.bz2',
'.jpg',
'.jpeg',
'.png',
'.gif',
'.svg',
'.css',
'.js',
'.xml',
'.txt',
'.csv',
'.rtf',
'.mp3',
'.wav',
'.mp4',
'.avi',
'.mkv',
'.mov',
'.flv',
'.wmv',
'.m4v',
'.ogg',
'.mpg',
'.mpeg',
'.swf',
]
return fileExtensions.some((extension) => url.endsWith(extension))
}

__is_paginated_url(url: string) {
const urlObject = new URL(url);
const pathname = urlObject.pathname;
return /\/\d+\//.test(pathname);
const urlObject = new URL(url)
const pathname = urlObject.pathname
return /\/\d+\//.test(pathname)
}
}
Loading

0 comments on commit 6071ab0

Please sign in to comment.