Filter links before downloading / adding to the queue #175

Merged: 9 commits, Apr 24, 2022
29 changes: 27 additions & 2 deletions src/args.rs
@@ -86,6 +86,24 @@ pub struct Args {
)]
pub user_agent: String,

/// Regex filter to limit visiting pages to only matched ones
#[structopt(
long,
default_value = ".*",
parse(try_from_str = parse_regex),
help = "Regex filter to limit to only visiting pages that match this expression"
)]
pub include_visit: Regex,

/// Regex filter to exclude visiting pages that match this expression
#[structopt(
long,
default_value = "$^",
parse(try_from_str = parse_regex),
help = "Regex filter to exclude visiting pages that match this expression"
)]
pub exclude_visit: Regex,

/// Regex filter to limit saving pages to only matched ones
#[structopt(
short,
@@ -94,7 +112,7 @@ pub struct Args {
parse(try_from_str = parse_regex),
help = "Regex filter to limit to only saving pages that match this expression"
)]
pub include: Regex,
pub include_download: Regex,

/// Regex filter to exclude saving pages that match this expression
#[structopt(
@@ -104,7 +122,14 @@ pub struct Args {
parse(try_from_str = parse_regex),
help = "Regex filter to exclude saving pages that match this expression"
)]
pub exclude: Regex,
pub exclude_download: Regex,

/// If set, use the download filter's include/exclude regexes for the visit filter as well
#[structopt(
long,
help = "Use the dowload filter in/exclude regexes for visiting as well"
)]
pub visit_filter_is_download_filter: bool,

/// HTTP basic authentication credentials
#[structopt(
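Taken together, the options above split the old include/exclude pair into separate visit and download filters, plus a switch to reuse one for the other. A quick sketch of how the new flags might be invoked, mirroring the integration tests further down (--include-visit and --exclude-visit appear in those tests; the --visit-filter-is-download-filter spelling is assumed from structopt's default kebab-case naming of the field):

    # Follow only links containing "mp3"/"mp4"; everything actually fetched is still saved
    suckit http://example.com -o output --include-visit "mp[3-4]" -j 16

    # Reuse the download include/exclude regexes for visiting as well (assumed flag name)
    suckit http://example.com -o output --visit-filter-is-download-filter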
44 changes: 32 additions & 12 deletions src/scraper.rs
@@ -30,7 +30,7 @@ static MAX_EMPTY_RECEIVES: usize = 10;
static INFINITE_DEPTH: i32 = -1;

/// Sleep duration on empty recv()
static SLEEP_MILLIS: u64 = 100;
static SLEEP_MILLIS: u64 = 500;
static SLEEP_DURATION: time::Duration = time::Duration::from_millis(SLEEP_MILLIS);

/// Producer and Consumer data structure. Handles the incoming requests and
@@ -49,6 +49,12 @@ impl Scraper {
pub fn new(args: args::Args) -> Scraper {
let (tx, rx) = crossbeam::channel::unbounded();

let mut args = args;
if args.visit_filter_is_download_filter {
args.include_visit = args.include_download.clone();
args.exclude_visit = args.exclude_download.clone();
}

Scraper {
downloader: downloader::Downloader::new(
args.tries,
@@ -178,7 +184,7 @@ impl Scraper {

dom.find_urls_as_strings()
.into_iter()
.filter(|candidate| Scraper::should_visit(candidate))
.filter(|candidate| Scraper::should_visit(scraper, candidate))
.for_each(|next_url| {
let url_to_parse = Scraper::normalize_url(next_url.clone());

@@ -226,6 +232,10 @@ impl Scraper {
depth: i32,
ext_depth: i32,
) {
let download_filter_matches = !scraper.args.exclude_download.is_match(url.as_str())
&& scraper.args.include_download.is_match(url.as_str());
// download the page even if the download filter does not match,
// so its links can be discovered and added to the queue
match scraper.downloader.get(&url) {
Ok(response) => {
let data = match response.data {
@@ -246,10 +256,7 @@ impl Scraper {
let path_map = scraper.path_map.lock().unwrap();
let path = path_map.get(url.as_str()).unwrap();

if !scraper.args.dry_run
&& !scraper.args.exclude.is_match(url.as_str())
&& scraper.args.include.is_match(url.as_str())
{
if !scraper.args.dry_run && download_filter_matches {
match response.filename {
Some(filename) => {
disk::save_file(&filename, &data, &scraper.args.output);
@@ -274,7 +281,11 @@ impl Scraper {
scraper.visited_urls.lock().unwrap().insert(url.to_string());

if scraper.args.verbose {
info!("Visited: {}", url);
if download_filter_matches {
info!("Downloaded: {}", url);
} else {
info!("Visited: {}", url);
}
}
}

@@ -333,7 +344,10 @@ impl Scraper {
}

/// Whether a URL should be visited (ignores `mailto:`, `javascript:` and other pseudo-links)
fn should_visit(url: &str) -> bool {
fn should_visit(scraper: &Scraper, url: &str) -> bool {
if scraper.args.exclude_visit.is_match(url) || !scraper.args.include_visit.is_match(url) {
return false;
}
match Url::parse(url) {
/* The given candidate is a valid URL, and not a relative path to
* the next one. Therefore, we have to check if this URL is valid.
@@ -401,8 +415,11 @@ mod tests {
user_agent: "suckit".to_string(),
random_range: 0,
verbose: true,
include: Regex::new("jpg").unwrap(),
exclude: Regex::new("png").unwrap(),
include_visit: Regex::new(".*").unwrap(),
exclude_visit: Regex::new("^$").unwrap(),
include_download: Regex::new("jpg").unwrap(),
exclude_download: Regex::new("png").unwrap(),
visit_filter_is_download_filter: false,
auth: Vec::new(),
continue_on_error: true,
dry_run: false,
@@ -424,8 +441,11 @@ mod tests {
user_agent: "suckit".to_string(),
random_range: 5,
verbose: true,
include: Regex::new("jpg").unwrap(),
exclude: Regex::new("png").unwrap(),
include_visit: Regex::new(".*").unwrap(),
exclude_visit: Regex::new("^$").unwrap(),
include_download: Regex::new("jpg").unwrap(),
exclude_download: Regex::new("png").unwrap(),
visit_filter_is_download_filter: false,
auth: Vec::new(),
continue_on_error: true,
dry_run: false,
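Both call sites now apply the same predicate: a URL passes a filter pair only if it does not match the exclude regex and does match the include regex. A minimal sketch of that shared check (simplified, not the crate's actual API; `should_visit` gates queueing while `download_filter_matches` only gates saving to disk):

    // Simplified sketch: exclude takes precedence, then the URL must match include.
    fn passes_filter(include: &regex::Regex, exclude: &regex::Regex, url: &str) -> bool {
        !exclude.is_match(url) && include.is_match(url)
    }

With the defaults (include ".*", exclude "$^") every URL passes, so behaviour is unchanged when no filter is given.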
120 changes: 114 additions & 6 deletions tests/filters.rs
@@ -20,14 +20,118 @@ fn test_include_exclude() {
});

// Tests below are grouped together as they depend on the local_http_server above.
include_filter();
include_multiple_filters();
exclude_filter();
download_include_filter();
download_include_multiple_filters();
download_exclude_filter();

visit_include_filter();
visit_include_multiple_filters();
visit_exclude_filter();
}

// Test using the --include-visit flag to visit only pages matching the given pattern.
fn visit_include_filter() {
let output_dir = "w2";
let _ = std::fs::remove_dir_all(output_dir);

let files_dir = format!("{}/{}/", output_dir, IP);
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[
fixtures::HTTP_ADDR,
"-o",
output_dir,
"--include-visit",
"mp[3-4]",
"-j",
"16",
])
.stdout(Stdio::inherit())
.stderr(Stdio::inherit())
.spawn()
.unwrap();

let status = cmd.wait().unwrap();
assert!(status.success());
let paths = read_dir(&files_dir).unwrap();

assert_eq!(
paths.count() - 1, // minus one because of index.html which is downloaded unconditionally
get_file_count_with_pattern(".mp3", &files_dir).unwrap()
);

std::fs::remove_dir_all(output_dir).unwrap();
}

// Test combining multiple --include-visit patterns so that only matching pages are visited.
fn visit_include_multiple_filters() {
let output_dir = "w1";
let _ = std::fs::remove_dir_all(output_dir);

let files_dir = format!("{}/{}/", output_dir, IP);
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[
fixtures::HTTP_ADDR,
"-o",
output_dir,
"--include-visit",
"(mp[3-4])|(txt)",
"-j",
"16",
])
.stdout(Stdio::inherit())
.stderr(Stdio::inherit())
.spawn()
.unwrap();
let status = cmd.wait().unwrap();
assert!(status.success());
let paths = read_dir(&files_dir).unwrap();
let mp3_count = get_file_count_with_pattern(".mp3", &files_dir).unwrap();
let txt_count = get_file_count_with_pattern(".txt", &files_dir).unwrap();
assert_eq!(
paths.count() - 1, // minus one because of index.html which is downloaded unconditionally
mp3_count + txt_count
);

std::fs::remove_dir_all(output_dir).unwrap();
}

// Test using the --exclude-visit flag to skip visiting pages matching the given pattern.
fn visit_exclude_filter() {
let output_dir = "w3";
let _ = std::fs::remove_dir_all(output_dir);

let files_dir = format!("{}/{}/", output_dir, IP);
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[
fixtures::HTTP_ADDR,
"-o",
output_dir,
"--exclude-visit",
"jpe?g",
"-j",
"16",
])
.stdout(Stdio::inherit())
.stderr(Stdio::inherit())
.spawn()
.unwrap();

let status = cmd.wait().unwrap();
assert!(status.success());
let paths = read_dir(&files_dir).unwrap();
let mp3_count = get_file_count_with_pattern(".mp3", &files_dir).unwrap();
let txt_count = get_file_count_with_pattern(".txt", &files_dir).unwrap();
let index_file = 1;
assert_eq!(paths.count(), mp3_count + txt_count + index_file);

std::fs::remove_dir_all(output_dir).unwrap();
}

// Test using the include flag to download only pages matching the given pattern.
fn include_filter() {
fn download_include_filter() {
let output_dir = "w2";
let _ = std::fs::remove_dir_all(output_dir);

let files_dir = format!("{}/{}/", output_dir, IP);
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[
@@ -57,8 +161,10 @@
}

// Test combining multiple include patterns so that only matching pages are downloaded.
fn include_multiple_filters() {
fn download_include_multiple_filters() {
let output_dir = "w1";
let _ = std::fs::remove_dir_all(output_dir);

let files_dir = format!("{}/{}/", output_dir, IP);
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[
@@ -85,8 +191,10 @@
}

// Test using the exclude flag to skip downloading pages matching the given pattern.
fn exclude_filter() {
fn download_exclude_filter() {
let output_dir = "w3";
let _ = std::fs::remove_dir_all(output_dir);

let files_dir = format!("{}/{}/", output_dir, IP);
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[
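The new visit-filter tests share the fixture server with the existing download-filter tests in tests/filters.rs, so the whole group can be run on its own with cargo's per-file selector:

    cargo test --test filters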