Filter links before downloading / adding to the queue #175

Merged: 9 commits, Apr 24, 2022
29 changes: 27 additions & 2 deletions src/args.rs
@@ -86,6 +86,24 @@ pub struct Args {
)]
pub user_agent: String,

/// Regex filter to limit visiting pages to only matched ones
#[structopt(
long,
default_value = ".*",
parse(try_from_str = parse_regex),
help = "Regex filter to limit to only visiting pages that match this expression"
)]
pub include_visit: Regex,

/// Regex filter to exclude visiting pages that match this expression
#[structopt(
long,
default_value = "$^",
parse(try_from_str = parse_regex),
help = "Regex filter to exclude visiting pages that match this expression"
)]
pub exclude_visit: Regex,

/// Regex filter to limit saving pages to only matched ones
#[structopt(
short,
@@ -94,7 +112,7 @@ pub struct Args {
parse(try_from_str = parse_regex),
help = "Regex filter to limit to only saving pages that match this expression"
)]
pub include: Regex,
pub include_download: Regex,

/// Regex filter to exclude saving pages that match this expression
#[structopt(
@@ -104,7 +122,14 @@ pub struct Args {
parse(try_from_str = parse_regex),
help = "Regex filter to exclude saving pages that match this expression"
)]
pub exclude: Regex,
pub exclude_download: Regex,

/// If set, use the download filter's include/exclude regexes for the visit filter as well
#[structopt(
long,
help = "Use the dowload filter in/exclude regexes for visiting as well"
)]
pub visit_filter_is_download_filter: bool,

/// HTTP basic authentication credentials
#[structopt(
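Taken together, the options above split the old include/exclude pair into separate visit and download filters, plus a switch to reuse one for the other. A quick sketch of how the new flags might be invoked, mirroring the integration tests further down (--include-visit and --exclude-visit appear in those tests; the --visit-filter-is-download-filter spelling is assumed from structopt's default kebab-case naming of the field):

    # Follow only links containing "mp3"/"mp4"; everything actually fetched is still saved
    suckit http://example.com -o output --include-visit "mp[3-4]" -j 16

    # Reuse the download include/exclude regexes for visiting as well (assumed flag name)
    suckit http://example.com -o output --visit-filter-is-download-filter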
44 changes: 32 additions & 12 deletions src/scraper.rs
@@ -30,7 +30,7 @@ static MAX_EMPTY_RECEIVES: usize = 10;
static INFINITE_DEPTH: i32 = -1;

/// Sleep duration on empty recv()
static SLEEP_MILLIS: u64 = 100;
static SLEEP_MILLIS: u64 = 500;
static SLEEP_DURATION: time::Duration = time::Duration::from_millis(SLEEP_MILLIS);

/// Producer and Consumer data structure. Handles the incoming requests and
@@ -49,6 +49,12 @@ impl Scraper {
pub fn new(args: args::Args) -> Scraper {
let (tx, rx) = crossbeam::channel::unbounded();

let mut args = args;
if args.visit_filter_is_download_filter {
args.include_visit = args.include_download.clone();
args.exclude_visit = args.exclude_download.clone();
}

Scraper {
downloader: downloader::Downloader::new(
args.tries,
@@ -178,7 +184,7 @@ impl Scraper {

dom.find_urls_as_strings()
.into_iter()
.filter(|candidate| Scraper::should_visit(candidate))
.filter(|candidate| Scraper::should_visit(scraper, candidate))
.for_each(|next_url| {
let url_to_parse = Scraper::normalize_url(next_url.clone());

@@ -226,6 +232,10 @@ impl Scraper {
depth: i32,
ext_depth: i32,
) {
let download_filter_matches = !scraper.args.exclude_download.is_match(url.as_str())
&& scraper.args.include_download.is_match(url.as_str());
// download the page even if the download filter does not match,
// so its links can be discovered and added to the queue
match scraper.downloader.get(&url) {
Ok(response) => {
let data = match response.data {
@@ -246,10 +256,7 @@ impl Scraper {
let path_map = scraper.path_map.lock().unwrap();
let path = path_map.get(url.as_str()).unwrap();

if !scraper.args.dry_run
&& !scraper.args.exclude.is_match(url.as_str())
&& scraper.args.include.is_match(url.as_str())
{
if !scraper.args.dry_run && download_filter_matches {
match response.filename {
Some(filename) => {
disk::save_file(&filename, &data, &scraper.args.output);
@@ -274,7 +281,11 @@ impl Scraper {
scraper.visited_urls.lock().unwrap().insert(url.to_string());

if scraper.args.verbose {
info!("Visited: {}", url);
if download_filter_matches {
info!("Downloaded: {}", url);
} else {
info!("Visited: {}", url);
}
}
}

@@ -333,7 +344,10 @@ impl Scraper {
}

/// Whether a URL should be visited (ignores `mailto:`, `javascript:` and other pseudo-links)
fn should_visit(url: &str) -> bool {
fn should_visit(scraper: &Scraper, url: &str) -> bool {
if scraper.args.exclude_visit.is_match(url) || !scraper.args.include_visit.is_match(url) {
return false;
}
match Url::parse(url) {
/* The given candidate is a valid URL, and not a relative path to
* the next one. Therefore, we have to check if this URL is valid.
@@ -401,8 +415,11 @@ mod tests {
user_agent: "suckit".to_string(),
random_range: 0,
verbose: true,
include: Regex::new("jpg").unwrap(),
exclude: Regex::new("png").unwrap(),
include_visit: Regex::new(".*").unwrap(),
exclude_visit: Regex::new("^$").unwrap(),
include_download: Regex::new("jpg").unwrap(),
exclude_download: Regex::new("png").unwrap(),
visit_filter_is_download_filter: false,
auth: Vec::new(),
continue_on_error: true,
dry_run: false,
@@ -424,8 +441,11 @@ mod tests {
user_agent: "suckit".to_string(),
random_range: 5,
verbose: true,
include: Regex::new("jpg").unwrap(),
exclude: Regex::new("png").unwrap(),
include_visit: Regex::new(".*").unwrap(),
exclude_visit: Regex::new("^$").unwrap(),
include_download: Regex::new("jpg").unwrap(),
exclude_download: Regex::new("png").unwrap(),
visit_filter_is_download_filter: false,
auth: Vec::new(),
continue_on_error: true,
dry_run: false,
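Both call sites now apply the same predicate: a URL passes a filter pair only if it does not match the exclude regex and does match the include regex. A minimal sketch of that shared check (simplified, not the crate's actual API; `should_visit` gates queueing while `download_filter_matches` only gates saving to disk):

    // Simplified sketch: exclude takes precedence, then the URL must match include.
    fn passes_filter(include: &regex::Regex, exclude: &regex::Regex, url: &str) -> bool {
        !exclude.is_match(url) && include.is_match(url)
    }

With the defaults (include ".*", exclude "$^") every URL passes, so behaviour is unchanged when no filter is given.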
120 changes: 114 additions & 6 deletions tests/filters.rs
@@ -20,14 +20,118 @@ fn test_include_exclude() {
});

// Tests below are grouped together as they depend on the local_http_server above.
include_filter();
include_multiple_filters();
exclude_filter();
download_include_filter();
download_include_multiple_filters();
download_exclude_filter();

visit_include_filter();
visit_include_multiple_filters();
visit_exclude_filter();
}

// Test using the --include-visit flag to visit only pages matching the given pattern.
fn visit_include_filter() {
let output_dir = "w2";
let _ = std::fs::remove_dir_all(output_dir);

let files_dir = format!("{}/{}/", output_dir, IP);
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[
fixtures::HTTP_ADDR,
"-o",
output_dir,
"--include-visit",
"mp[3-4]",
"-j",
"16",
])
.stdout(Stdio::inherit())
.stderr(Stdio::inherit())
.spawn()
.unwrap();

let status = cmd.wait().unwrap();
assert!(status.success());
let paths = read_dir(&files_dir).unwrap();

assert_eq!(
paths.count() - 1, // minus one because of index.html which is downloaded unconditionally
get_file_count_with_pattern(".mp3", &files_dir).unwrap()
);

std::fs::remove_dir_all(output_dir).unwrap();
}

// Test combining multiple --include-visit patterns so that only matching pages are visited.
fn visit_include_multiple_filters() {
let output_dir = "w1";
let _ = std::fs::remove_dir_all(output_dir);

let files_dir = format!("{}/{}/", output_dir, IP);
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[
fixtures::HTTP_ADDR,
"-o",
output_dir,
"--include-visit",
"(mp[3-4])|(txt)",
"-j",
"16",
])
.stdout(Stdio::inherit())
.stderr(Stdio::inherit())
.spawn()
.unwrap();
let status = cmd.wait().unwrap();
assert!(status.success());
let paths = read_dir(&files_dir).unwrap();
let mp3_count = get_file_count_with_pattern(".mp3", &files_dir).unwrap();
let txt_count = get_file_count_with_pattern(".txt", &files_dir).unwrap();
assert_eq!(
paths.count() - 1, // minus one because of index.html which is downloaded unconditionally
mp3_count + txt_count
);

std::fs::remove_dir_all(output_dir).unwrap();
}

// Test using the --exclude-visit flag to skip visiting pages matching the given pattern.
fn visit_exclude_filter() {
let output_dir = "w3";
let _ = std::fs::remove_dir_all(output_dir);

let files_dir = format!("{}/{}/", output_dir, IP);
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[
fixtures::HTTP_ADDR,
"-o",
output_dir,
"--exclude-visit",
"jpe?g",
"-j",
"16",
])
.stdout(Stdio::inherit())
.stderr(Stdio::inherit())
.spawn()
.unwrap();

let status = cmd.wait().unwrap();
assert!(status.success());
let paths = read_dir(&files_dir).unwrap();
let mp3_count = get_file_count_with_pattern(".mp3", &files_dir).unwrap();
let txt_count = get_file_count_with_pattern(".txt", &files_dir).unwrap();
let index_file = 1;
assert_eq!(paths.count(), mp3_count + txt_count + index_file);

std::fs::remove_dir_all(output_dir).unwrap();
}

// Test using the include flag to download only pages matching the given pattern.
fn include_filter() {
fn download_include_filter() {
let output_dir = "w2";
let _ = std::fs::remove_dir_all(output_dir);

let files_dir = format!("{}/{}/", output_dir, IP);
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[
@@ -57,8 +161,10 @@
}

// Test combining multiple include patterns so that only matching pages are downloaded.
fn include_multiple_filters() {
fn download_include_multiple_filters() {
let output_dir = "w1";
let _ = std::fs::remove_dir_all(output_dir);

let files_dir = format!("{}/{}/", output_dir, IP);
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[
@@ -85,8 +191,10 @@
}

// Test using the exclude flag to skip downloading pages matching the given pattern.
fn exclude_filter() {
fn download_exclude_filter() {
let output_dir = "w3";
let _ = std::fs::remove_dir_all(output_dir);

let files_dir = format!("{}/{}/", output_dir, IP);
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[
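The new visit-filter tests share the fixture server with the existing download-filter tests in tests/filters.rs, so the whole group can be run on its own with cargo's per-file selector:

    cargo test --test filters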