Skip to content

Commit

Permalink
Remove todos
Browse files Browse the repository at this point in the history
  • Loading branch information
bidoubiwa committed Jun 21, 2023
1 parent 6df4811 commit 4c04106
Show file tree
Hide file tree
Showing 7 changed files with 15 additions and 33 deletions.
8 changes: 3 additions & 5 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,19 @@ export default class Crawler {
sender: Sender
config: Config
urls: string[]
custom_crawler: string // TODO: remove
custom_crawler: string
scraper: Scraper
crawler: PuppeteerCrawler



constructor(sender: Sender, config: Config) {
console.info("Crawler::constructor");
this.sender = sender;
this.config = config;
this.urls = config.crawled_urls;
this.custom_crawler = config.custom_crawler; // TODO: remove
this.custom_crawler = config.custom_crawler;
    // init the custom scraper depending on whether config.strategy is docsearch, custom or default
this.scraper =
config.strategy == "docsearch" // TODO: rename to docssearch
config.strategy == "docsearch"
? new DocsearchScraper(this.sender)
: config.strategy == "custom"
? new CustomScraper(this.sender, config)
Expand Down
3 changes: 1 addition & 2 deletions src/scrapers/default.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ export default class DefaultScraper {
data.meta = meta;
data.image_url = this._get_image_url_from_meta(meta);
data.page_block = page_block;
let urls_tags = new URL(url).pathname.split("/"); // TODO: Rename to path_segments
let urls_tags = new URL(url).pathname.split("/");
data.urls_tags = urls_tags.slice(1, urls_tags.length - 1);

let id = await elem.evaluate((el) => el.id);
Expand Down Expand Up @@ -182,7 +182,6 @@ export default class DefaultScraper {
}

  // A function that reverse-engineers the hljs-generated HTML to extract the code
// TODO: Does it work?
async _extract_code_from_page(page: Page) {
let code = await page.evaluate(() => {
let code = "";
Expand Down
5 changes: 1 addition & 4 deletions src/scrapers/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import { Config, SchemaData } from "../types";
export default class SchemaScaper {
sender: Sender;
config: Config;
settings_sent: boolean; // TODO: Where is this used?
settings_sent: boolean;

constructor(sender: Sender, config: Config) {
console.info("SchemaScaper::constructor");
Expand Down Expand Up @@ -36,9 +36,6 @@ export default class SchemaScaper {
if (data.length === 0) return;

if (this.config.schema?.only_type) {
// TODO: Might be an error, should be `schema.only_type` and not `schema_config.only_type`
// Leaving old code in comment in case it is not an error
// if (data["@type"] !== this.config.schema_config?.only_type) return;
if (data["@type"] !== this.config.schema?.only_type) return;
}

Expand Down
8 changes: 0 additions & 8 deletions src/sender.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,7 @@ export class Sender {
}
}
} catch (e) {
// TODO: better console.log
console.log("try to delete a tmp index if it exists");
console.error(e);
}

if (this.config.primary_key) {
Expand All @@ -53,7 +51,6 @@ export class Sender {
} catch (e) {
console.log("try to create or update the index with the primary key");

// TODO: why?
await this.client.createIndex(this.index_name, {
primaryKey: this.config.primary_key,
});
Expand All @@ -62,9 +59,6 @@ export class Sender {
}

  // Add a JSON object to the queue


// TODO: should be better specified
async add(data: DocsSearchData | DefaultData) {
console.log("Sender::add");
if (this.config.primary_key) {
Expand Down Expand Up @@ -118,7 +112,5 @@ export class Sender {
{ indexes: [this.origin_index_name, this.index_name] },
]);
await this.client.index(this.index_name).waitForTask(task.taskUid);
// TODO: should we wait for task?
// await this.client.deleteIndex(this.index_name);
}
}
1 change: 0 additions & 1 deletion src/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ class Server {
this.app.post("/crawl", this.__crawl.bind(this));
this.app.post("/crawl/async", this.__crawl.bind(this));
this.app.post("/crawl/sync", this.__syncCrawl.bind(this));
// TODO: create route to empty taskQueue

this.app.listen(port, () =>
console.log(`Example app listening on port ${port}!`)
Expand Down
2 changes: 0 additions & 2 deletions src/taskQueue.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ import { MeiliSearch } from "meilisearch";
import { fork } from "child_process";
import { Config } from "./types";

// TODO: is not used
const redis_url = process.env.REDIS_URL;

export default class TaskQueue {
Expand All @@ -18,7 +17,6 @@ export default class TaskQueue {
else {
this.queue = new Queue("crawling");
}
// this.queue.obliterate({ force: true }); // TODO: add route to obliterate queue
this.queue.process(this.__process.bind(this));
this.queue.on("added", this.__jobAdded.bind(this));
this.queue.on("completed", this.__jobCompleted.bind(this));
Expand Down
21 changes: 10 additions & 11 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,21 @@ import DefaultScraper from "./scrapers/default";
import CustomScraper from "./scrapers/custom";
import SchemaScraper from "./scrapers/schema";

// TODO: find out which are mandatory or not
export type Config = {
meilisearch_index_name: string // TODO: rename to meilisearch_index_uid ?
meilisearch_host: string // TODO: rename to meilisearch_url
meilisearch_index_name: string
meilisearch_host: string
meilisearch_api_key: string
crawled_urls: string[] // TODO: rename to start_urls, as it is conventional
custom_crawler: string // TODO: Remove
crawled_urls: string[]
custom_crawler: string
queue?: string[]
primary_key?: string
batch_size?: number
custom_settings?: Settings // TODO: rename to meilisearch_settings
strategy?: 'docsearch' | 'default' | 'custom' | 'schema' // TODO: rename docsearch to docssearch
headless?: boolean // TODO: rename to wait_js ?
exclude_crawled_urls?: string[] // TODO: rename to `ignored_urls` ?
indexed_urls?: string[] // TODO: Rename, not sure what it does
exclude_indexed_urls?: string[] // TODO: rename
custom_settings?: Settings
strategy?: 'docsearch' | 'default' | 'custom' | 'schema'
headless?: boolean
exclude_crawled_urls?: string[]
indexed_urls?: string[]
exclude_indexed_urls?: string[]
schema?: SchemaConfig

}
Expand Down

0 comments on commit 4c04106

Please sign in to comment.