feat(cli): add cli ability (#16)
* feat(cli): add cli ability

* chore(cli): add ua parsing
j-mendez authored Feb 21, 2022
1 parent 2420f2c commit 0945328
Showing 2 changed files with 61 additions and 6 deletions.
13 changes: 7 additions & 6 deletions README.md
@@ -66,13 +66,14 @@ You can get a working example at [`example.rs`](./example.rs) and run it with
 cargo run --example example
 ```
 
-## TODO
+## Cli
 
-- [x] multi-threaded system
-- [x] respect _robot.txt_ file
-- [x] add configuration object for polite delay, etc..
-- [x] add polite delay
-- [ ] parse command line arguments
+The following can also be run via the command line to start the crawler.
+All website options are available except `website.on_link_find_callback`.
+
+```sh
+cargo run -- --domain https://choosealicense.com --verbose true --delay 2000
+```
 
 ## Contribute
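For reference, the other flags handled in `src/main.rs` below (`--respect_robots_txt`, `--verbose`, `--delay`, `--concurrency`, `--blacklist_url`, `--user_agent`) can be combined in the same way. A hypothetical invocation, with purely illustrative values, might look like:

```sh
cargo run -- --domain https://choosealicense.com \
  --respect_robots_txt true \
  --delay 2000 \
  --concurrency 4 \
  --blacklist_url https://choosealicense.com/licenses \
  --user_agent "myagent/1.0"
```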
54 changes: 54 additions & 0 deletions src/main.rs
@@ -0,0 +1,54 @@
extern crate spider;

use spider::website::Website;
use std::collections::HashMap;

/// Collect command line flags into a map: `--name value` pairs become
/// `name => value`, while short flags such as `-xy` become `x => "1"`, `y => "1"`.
fn parse_args(mut args: impl Iterator<Item = String>) -> HashMap<String, String> {
    let mut flags = HashMap::new();

    while let Some(arg) = args.next() {
        if let Some(flag) = arg.strip_prefix("-") {
            if let Some(option) = flag.strip_prefix("-") {
                // Long flag: the next argument is its value ("" if missing).
                flags.insert(option.into(), args.next().unwrap_or_default());
            } else {
                // Short flag(s): each character is stored with the value "1".
                for fchar in flag.chars() {
                    flags.insert(fchar.into(), String::from("1"));
                }
            }
        }
    }

    flags
}

fn main() {
    let options = parse_args(std::env::args());
    // `--domain` is required; indexing panics if it is missing.
    let mut website: Website = Website::new(&options["domain"]);

    if options.contains_key("respect_robots_txt") {
        website.configuration.respect_robots_txt = options["respect_robots_txt"] == "true";
    }
    if options.contains_key("verbose") {
        website.configuration.verbose = options["verbose"] == "true";
    }
    if options.contains_key("delay") {
        website.configuration.delay = options["delay"].parse::<u64>().unwrap();
    }
    if options.contains_key("concurrency") {
        website.configuration.concurrency = options["concurrency"].parse::<usize>().unwrap();
    }
    if options.contains_key("blacklist_url") {
        website.configuration.blacklist_url.push(options["blacklist_url"].to_string());
    }

    if options.contains_key("user_agent") {
        // Leak the heap string to obtain the 'static string slice the configuration expects.
        website.configuration.user_agent = Box::leak(options["user_agent"].to_owned().into_boxed_str());
    }

    // TODO: add on_link_find_callback eval function
    // if options.contains_key("on_link_find_callback") {
    //     website.on_link_find_callback = options["on_link_find_callback"];
    // }

    website.crawl();
}
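As a rough illustration, a hypothetical test module (not part of this commit) could be appended to `src/main.rs` to show how `parse_args` interprets arguments; the argument values here are only examples:

```rust
#[cfg(test)]
mod tests {
    use super::parse_args;

    #[test]
    fn parses_long_and_short_flags() {
        // Hand-built argument list; arguments without a leading '-'
        // (such as the program name from std::env::args()) are ignored.
        let args = ["--domain", "https://choosealicense.com", "--delay", "2000", "-v"]
            .iter()
            .map(|s| s.to_string());

        let flags = parse_args(args);

        // Long flags consume the next argument as their value.
        assert_eq!(flags["domain"], "https://choosealicense.com");
        assert_eq!(flags["delay"], "2000");
        // Short flags are stored with the placeholder "1", so `-v` does not
        // satisfy the `options["verbose"] == "true"` check in main;
        // pass `--verbose true` instead.
        assert_eq!(flags["v"], "1");
    }
}
```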
