From 4e557ddba8fd13769481dce7fe552be3cda0856f Mon Sep 17 00:00:00 2001
From: raphTec
Date: Thu, 10 Mar 2022 13:22:36 +0100
Subject: [PATCH 1/7] Filter links before downloading / adding to the queue

This commit speeds up scraping for scenarios where pages have a high
branch factor, that is, many links of which the majority are excluded
by the --exclude / --include rules. It also improves memory usage in
these scenarios, since the excluded links are no longer stored, and
reduces network traffic by not downloading them in the first place.
---
 src/scraper.rs | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/scraper.rs b/src/scraper.rs
index c72e7f4..27f9041 100644
--- a/src/scraper.rs
+++ b/src/scraper.rs
@@ -178,7 +178,7 @@ impl Scraper {
 
         dom.find_urls_as_strings()
             .into_iter()
-            .filter(|candidate| Scraper::should_visit(candidate))
+            .filter(|candidate| Scraper::should_visit(scraper, candidate))
             .for_each(|next_url| {
                 let url_to_parse = Scraper::normalize_url(next_url.clone());
 
@@ -246,10 +246,14 @@ impl Scraper {
                 let path_map = scraper.path_map.lock().unwrap();
                 let path = path_map.get(url.as_str()).unwrap();
 
-                if !scraper.args.dry_run
-                    && !scraper.args.exclude.is_match(url.as_str())
-                    && scraper.args.include.is_match(url.as_str())
-                {
+                // for the origin URL, we need to check the in/exclude rules since
+                // it is pushed into the channel unconditionally.
+                // we want to process its links, but maybe not download it.
+                // all other links are filtered before they are added to the channel.
+                let filter_rules_match = (depth > 0
+                    || (!scraper.args.exclude.is_match(url.as_str())
+                        && scraper.args.include.is_match(url.as_str())));
+                if !scraper.args.dry_run && filter_rules_match {
                     match response.filename {
                         Some(filename) => {
                             disk::save_file(&filename, &data, &scraper.args.output);
@@ -333,7 +337,10 @@ impl Scraper {
     }
 
     /// If a URL should be visited (ignores `mail:`, `javascript:` and other pseudo-links)
-    fn should_visit(url: &str) -> bool {
+    fn should_visit(scraper: &Scraper, url: &str) -> bool {
+        if scraper.args.exclude.is_match(url) || !scraper.args.include.is_match(url) {
+            return false;
+        }
         match Url::parse(url) {
             /* The given candidate is a valid URL, and not a relative path to
              * the next one. Therefore, we have to check if this URL is valid.

From 6927b38a0d1361891c8729cd7ea2a42ebf665b59 Mon Sep 17 00:00:00 2001
From: raphTec
Date: Mon, 14 Mar 2022 23:47:16 +0100
Subject: [PATCH 2/7] Fix lint warning (remove brackets)

---
 src/scraper.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/scraper.rs b/src/scraper.rs
index 27f9041..9d76f6d 100644
--- a/src/scraper.rs
+++ b/src/scraper.rs
@@ -250,9 +250,9 @@ impl Scraper {
                 // it is pushed into the channel unconditionally.
                 // we want to process its links, but maybe not download it.
                 // all other links are filtered before they are added to the channel.
-                let filter_rules_match = (depth > 0
+                let filter_rules_match = depth > 0
                     || (!scraper.args.exclude.is_match(url.as_str())
-                        && scraper.args.include.is_match(url.as_str())));
+                        && scraper.args.include.is_match(url.as_str()));
                 if !scraper.args.dry_run && filter_rules_match {
                     match response.filename {
                         Some(filename) => {

From 4df6528a7ca14f432876d86aed62cc5ed1005ba5 Mon Sep 17 00:00:00 2001
From: raphTec
Date: Tue, 15 Mar 2022 23:04:05 +0100
Subject: [PATCH 3/7] Add support for distinct download and visit regexes

This allows fine-grained control over whether a page is visited (that
is, its links analyzed) and whether it is saved to disk.
Decoupling the download and visit filters means the complete website
may still be explored while only downloading some of its files. To
speed up scraping, irrelevant links can easily be excluded from
visiting.
---
 src/args.rs    | 29 +++++++++++++++++++++++++++--
 src/scraper.rs | 41 +++++++++++++++++++++++++++--------------
 2 files changed, 54 insertions(+), 16 deletions(-)

diff --git a/src/args.rs b/src/args.rs
index c8be299..a1eee67 100644
--- a/src/args.rs
+++ b/src/args.rs
@@ -86,6 +86,24 @@ pub struct Args {
     )]
     pub user_agent: String,
 
+    /// Regex filter to limit visiting pages to only matched ones
+    #[structopt(
+        long,
+        default_value = ".*",
+        parse(try_from_str = parse_regex),
+        help = "Regex filter to limit to only visiting pages that match this expression"
+    )]
+    pub include_visit: Regex,
+
+    /// Regex filter to exclude visiting pages that match the expression
+    #[structopt(
+        long,
+        default_value = "$^",
+        parse(try_from_str = parse_regex),
+        help = "Regex filter to exclude visiting pages that match this expression"
+    )]
+    pub exclude_visit: Regex,
+
     /// Regex filter to limit saving pages to only matched ones
     #[structopt(
         short,
@@ -94,7 +112,7 @@ pub struct Args {
         parse(try_from_str = parse_regex),
         help = "Regex filter to limit to only saving pages that match this expression"
    )]
-    pub include: Regex,
+    pub include_download: Regex,
 
     /// Regex filter to limit saving pages to only matched ones
     #[structopt(
@@ -104,7 +122,14 @@ pub struct Args {
         parse(try_from_str = parse_regex),
         help = "Regex filter to exclude saving pages that match this expression"
     )]
-    pub exclude: Regex,
+    pub exclude_download: Regex,
+
+    /// If set, the visit filter is set to the values of the download filter
+    #[structopt(
+        long,
+        help = "Use the download filter in/exclude regexes for visiting as well"
+    )]
+    pub visit_filter_is_download_filter: bool,
 
     /// HTTP basic authentication credentials
     #[structopt(
diff --git a/src/scraper.rs b/src/scraper.rs
index 9d76f6d..89d4406 100644
--- a/src/scraper.rs
+++ b/src/scraper.rs
@@ -49,6 +49,12 @@ impl Scraper {
     pub fn new(args: args::Args) -> Scraper {
         let (tx, rx) = crossbeam::channel::unbounded();
 
+        let mut args = args;
+        if args.visit_filter_is_download_filter {
+            args.include_visit = args.include_download.clone();
+            args.exclude_visit = args.exclude_download.clone();
+        }
+
         Scraper {
             downloader: downloader::Downloader::new(
                 args.tries,
@@ -226,6 +232,10 @@ impl Scraper {
         depth: i32,
         ext_depth: i32,
     ) {
+        let download_filter_matches = !scraper.args.exclude_download.is_match(url.as_str())
+            && scraper.args.include_download.is_match(url.as_str());
+        // download the page even if the download filter does not match,
+        // so its links can be discovered and added to the queue
         match scraper.downloader.get(&url) {
             Ok(response) => {
                 let data = match response.data {
@@ -246,14 +256,7 @@ impl Scraper {
                 let path_map = scraper.path_map.lock().unwrap();
                 let path = path_map.get(url.as_str()).unwrap();
 
-                // for the origin URL, we need to check the in/exclude rules since
-                // it is pushed into the channel unconditionally.
-                // we want to process its links, but maybe not download it.
-                // all other links are filtered before they are added to the channel.
- let filter_rules_match = depth > 0 - || (!scraper.args.exclude.is_match(url.as_str()) - && scraper.args.include.is_match(url.as_str())); - if !scraper.args.dry_run && filter_rules_match { + if !scraper.args.dry_run && download_filter_matches { match response.filename { Some(filename) => { disk::save_file(&filename, &data, &scraper.args.output); @@ -278,7 +281,11 @@ impl Scraper { scraper.visited_urls.lock().unwrap().insert(url.to_string()); if scraper.args.verbose { - info!("Visited: {}", url); + if download_filter_matches { + info!("Downloaded: {}", url); + } else { + info!("Visited: {}", url); + } } } @@ -338,7 +345,7 @@ impl Scraper { /// If a URL should be visited (ignores `mail:`, `javascript:` and other pseudo-links) fn should_visit(scraper: &Scraper, url: &str) -> bool { - if scraper.args.exclude.is_match(url) || !scraper.args.include.is_match(url) { + if scraper.args.exclude_visit.is_match(url) || !scraper.args.include_visit.is_match(url) { return false; } match Url::parse(url) { @@ -408,8 +415,11 @@ mod tests { user_agent: "suckit".to_string(), random_range: 0, verbose: true, - include: Regex::new("jpg").unwrap(), - exclude: Regex::new("png").unwrap(), + include_visit: Regex::new(".*").unwrap(), + exclude_visit: Regex::new("^$").unwrap(), + include_download: Regex::new("jpg").unwrap(), + exclude_download: Regex::new("png").unwrap(), + visit_filter_is_download_filter: false, auth: Vec::new(), continue_on_error: true, dry_run: false, @@ -431,8 +441,11 @@ mod tests { user_agent: "suckit".to_string(), random_range: 5, verbose: true, - include: Regex::new("jpg").unwrap(), - exclude: Regex::new("png").unwrap(), + include_visit: Regex::new(".*").unwrap(), + exclude_visit: Regex::new("^$").unwrap(), + include_download: Regex::new("jpg").unwrap(), + exclude_download: Regex::new("png").unwrap(), + visit_filter_is_download_filter: false, auth: Vec::new(), continue_on_error: true, dry_run: false, From a82a57bdd511d58fcfa68ad7cbb04516a4bce210 Mon Sep 17 00:00:00 2001 From: raphTec Date: Wed, 16 Mar 2022 00:07:08 +0100 Subject: [PATCH 4/7] Increase sleep duration --- src/scraper.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scraper.rs b/src/scraper.rs index 89d4406..a398eee 100644 --- a/src/scraper.rs +++ b/src/scraper.rs @@ -30,7 +30,7 @@ static MAX_EMPTY_RECEIVES: usize = 10; static INFINITE_DEPTH: i32 = -1; /// Sleep duration on empty recv() -static SLEEP_MILLIS: u64 = 100; +static SLEEP_MILLIS: u64 = 500; static SLEEP_DURATION: time::Duration = time::Duration::from_millis(SLEEP_MILLIS); /// Producer and Consumer data structure. Handles the incoming requests and From 041347b01ef0eeca923c95ce5c776853b5d4520d Mon Sep 17 00:00:00 2001 From: raphTec Date: Thu, 17 Mar 2022 12:18:01 +0100 Subject: [PATCH 5/7] Rename existing tests --- tests/filters.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/filters.rs b/tests/filters.rs index 51fc861..88626a4 100644 --- a/tests/filters.rs +++ b/tests/filters.rs @@ -20,13 +20,13 @@ fn test_include_exclude() { }); // Tests below are grouped together as they depend on the local_http_server above. - include_filter(); - include_multiple_filters(); - exclude_filter(); + download_include_filter(); + download_include_multiple_filters(); + download_exclude_filter(); } // Test to use include flag for downloading pages only matching the given pattern. 
-fn include_filter() { +fn download_include_filter() { let output_dir = "w2"; let files_dir = format!("{}/{}/", output_dir, IP); let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) @@ -57,7 +57,7 @@ fn include_filter() { } // Test demonstrating usage of multiple include patterns for downloading pages only matching the given pattern. -fn include_multiple_filters() { +fn download_include_multiple_filters() { let output_dir = "w1"; let files_dir = format!("{}/{}/", output_dir, IP); let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) @@ -85,7 +85,7 @@ fn include_multiple_filters() { } // Test to use exclude flag for excluding pages matching the given pattern. -fn exclude_filter() { +fn download_exclude_filter() { let output_dir = "w3"; let files_dir = format!("{}/{}/", output_dir, IP); let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) From 1b3e3115f2bcc223383c3d8c3e8ddcf03aca3959 Mon Sep 17 00:00:00 2001 From: raphTec Date: Tue, 22 Mar 2022 22:24:44 +0100 Subject: [PATCH 6/7] Empty test directories before running tests Otherwise a failed test leaves the directory populated, which fails future test runs. --- tests/filters.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/filters.rs b/tests/filters.rs index 88626a4..4ade56e 100644 --- a/tests/filters.rs +++ b/tests/filters.rs @@ -28,6 +28,8 @@ fn test_include_exclude() { // Test to use include flag for downloading pages only matching the given pattern. fn download_include_filter() { let output_dir = "w2"; + let _ = std::fs::remove_dir_all(output_dir); + let files_dir = format!("{}/{}/", output_dir, IP); let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) .args(&[ @@ -59,6 +61,8 @@ fn download_include_filter() { // Test demonstrating usage of multiple include patterns for downloading pages only matching the given pattern. fn download_include_multiple_filters() { let output_dir = "w1"; + let _ = std::fs::remove_dir_all(output_dir); + let files_dir = format!("{}/{}/", output_dir, IP); let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) .args(&[ @@ -87,6 +91,8 @@ fn download_include_multiple_filters() { // Test to use exclude flag for excluding pages matching the given pattern. fn download_exclude_filter() { let output_dir = "w3"; + let _ = std::fs::remove_dir_all(output_dir); + let files_dir = format!("{}/{}/", output_dir, IP); let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) .args(&[ From 444453162ebd1355809e0df96b322a9409060686 Mon Sep 17 00:00:00 2001 From: raphTec Date: Tue, 22 Mar 2022 23:32:45 +0100 Subject: [PATCH 7/7] Add visit filter tests --- tests/filters.rs | 102 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/tests/filters.rs b/tests/filters.rs index 4ade56e..63b3902 100644 --- a/tests/filters.rs +++ b/tests/filters.rs @@ -23,6 +23,108 @@ fn test_include_exclude() { download_include_filter(); download_include_multiple_filters(); download_exclude_filter(); + + visit_include_filter(); + visit_include_multiple_filters(); + visit_exclude_filter(); +} + +// Test to use include flag for visiting pages only matching the given pattern. 
+fn visit_include_filter() { + let output_dir = "w2"; + let _ = std::fs::remove_dir_all(output_dir); + + let files_dir = format!("{}/{}/", output_dir, IP); + let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) + .args(&[ + fixtures::HTTP_ADDR, + "-o", + output_dir, + "--include-visit", + "mp[3-4]", + "-j", + "16", + ]) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()) + .spawn() + .unwrap(); + + let status = cmd.wait().unwrap(); + assert!(status.success()); + let paths = read_dir(&files_dir).unwrap(); + + assert_eq!( + paths.count() - 1, // minus one because of index.html which is downloaded unconditionally + get_file_count_with_pattern(".mp3", &files_dir).unwrap() + ); + + std::fs::remove_dir_all(output_dir).unwrap(); +} + +// Test demonstrating usage of multiple include patterns for visiting pages only matching the given pattern. +fn visit_include_multiple_filters() { + let output_dir = "w1"; + let _ = std::fs::remove_dir_all(output_dir); + + let files_dir = format!("{}/{}/", output_dir, IP); + let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) + .args(&[ + fixtures::HTTP_ADDR, + "-o", + output_dir, + "--include-visit", + "(mp[3-4])|(txt)", + "-j", + "16", + ]) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()) + .spawn() + .unwrap(); + let status = cmd.wait().unwrap(); + assert!(status.success()); + let paths = read_dir(&files_dir).unwrap(); + let mp3_count = get_file_count_with_pattern(".mp3", &files_dir).unwrap(); + let txt_count = get_file_count_with_pattern(".txt", &files_dir).unwrap(); + assert_eq!( + paths.count() - 1, // minus one because of index.html which is downloaded unconditionally + mp3_count + txt_count + ); + + std::fs::remove_dir_all(output_dir).unwrap(); +} + +// Test to use exclude flag for excluding pages matching the given pattern. +fn visit_exclude_filter() { + let output_dir = "w3"; + let _ = std::fs::remove_dir_all(output_dir); + + let files_dir = format!("{}/{}/", output_dir, IP); + let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) + .args(&[ + fixtures::HTTP_ADDR, + "-o", + output_dir, + "--exclude-visit", + "jpe?g", + "-j", + "16", + ]) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()) + .spawn() + .unwrap(); + + let status = cmd.wait().unwrap(); + assert!(status.success()); + let paths = read_dir(&files_dir).unwrap(); + let mp3_count = get_file_count_with_pattern(".mp3", &files_dir).unwrap(); + let txt_count = get_file_count_with_pattern(".txt", &files_dir).unwrap(); + let index_file = 1; + assert_eq!(paths.count(), mp3_count + txt_count + index_file); + + std::fs::remove_dir_all(output_dir).unwrap(); } // Test to use include flag for downloading pages only matching the given pattern.
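
Usage sketch for the filters introduced in this series. The tests above only exercise --include-visit and --exclude-visit; the long option names --include-download, --exclude-download and --visit-filter-is-download-filter are assumed here to be the flags structopt derives from the renamed fields.

    suckit http://example.com -o out --include-visit 'html?$' --include-download 'pdf$'

With this combination, every page that passes the visit filter is still fetched so that its links can be discovered, but only pages matching the download filter are saved to disk; passing --visit-filter-is-download-filter instead applies the download in-/exclude regexes to visiting as well.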