Skip to content

Commit

Permalink
search: add support for searching compressed files
Browse files Browse the repository at this point in the history
This commit adds opt-in support for searching compressed files during
recursive search. This behavior is only enabled when the
`-z/--search-zip` flag is passed to ripgrep. When enabled, a limited set
of common compression formats are recognized via file extension, and a
new process is spawned to perform the decompression. ripgrep then
searches the stdout of that spawned process.

Closes #539
  • Loading branch information
balajisivaraman authored and BurntSushi committed Jan 30, 2018
1 parent a8543f7 commit f007f94
Show file tree
Hide file tree
Showing 18 changed files with 373 additions and 24 deletions.
4 changes: 3 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ env:
addons:
apt:
packages:
# Needed for completion-function test
# Needed for completion-function test.
- zsh
# Needed for testing decompression search.
- xz-utils

matrix:
fast_finish: true
Expand Down
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ num_cpus = "1"
regex = "0.2.4"
same-file = "1"
termcolor = { version = "0.3.3", path = "termcolor" }
globset = { version = "0.2.1", path = "globset" }

[build-dependencies]
clap = "2.26"
Expand Down
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ increases the times to `2.640s` for ripgrep and `10.277s` for GNU grep.
as UTF-16, latin-1, GBK, EUC-JP, Shift_JIS and more. (Some support for
automatically detecting UTF-16 is provided. Other text encodings must be
specifically specified with the `-E/--encoding` flag.)
* `ripgrep` supports searching files compressed in a common format (currently
gzip, xz, lzma or bzip2) with the `-z/--search-zip` flag.

In other words, use `ripgrep` if you like speed, filtering by default, fewer
bugs, and Unicode support.
Expand All @@ -109,12 +111,10 @@ give you a glimpse at some important downsides or missing features of
support for Unicode categories (e.g., `\p{Sc}` to match currency symbols or
`\p{Lu}` to match any uppercase letter). (Fancier regexes will never be
supported.)
* `ripgrep` doesn't yet support searching compressed files. (Likely to be
supported in the future.)
* `ripgrep` doesn't have multiline search. (Unlikely to ever be supported.)

In other words, if you like fancy regexes, searching compressed files or
multiline search, then `ripgrep` may not quite meet your needs (yet).
In other words, if you like fancy regexes or multiline search, then `ripgrep`
may not quite meet your needs (yet).

### Feature comparison

Expand Down
1 change: 1 addition & 0 deletions complete/_rg
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ _rg() {
'(-w -x --line-regexp --word-regexp)'{-w,--word-regexp}'[only show matches surrounded by word boundaries]'
'(-e -f --file --files --regexp --type-list)1: :_rg_pattern'
'(--type-list)*:file:_files'
'(-z --search-zip)'{-z,--search-zip}'[search in compressed files]'
)

[[ ${_RG_COMPLETE_LIST_ARGS:-} == (1|t*|y*) ]] && {
Expand Down
13 changes: 10 additions & 3 deletions doc/rg.1
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,15 @@ Only show matches surrounded by line boundaries.
This is equivalent to putting ^...$ around the search pattern.
.RS
.RE
.TP
.B \-z, \-\-search\-zip
Search in compressed files.
Currently gz, bz2, xz and lzma formats are supported.
.RS
.PP
Note that ripgrep expects to find the decompression binaries for the
respective formats in your system\[aq]s PATH for use with this flag.
.RE
.SH LESS COMMON OPTIONS
.TP
.B \-A, \-\-after\-context \f[I]NUM\f[]
Expand Down Expand Up @@ -437,9 +446,7 @@ such part on a separate output line.
.TP
.B \-\-passthru, \-\-passthrough
Show both matching and non\-matching lines.
This is equivalent to adding ^ to the list of search patterns.
This option overrides \-\-count and cannot be used with
\-\-only\-matching or \-\-replace.
This option cannot be used with \-\-only\-matching or \-\-replace.
.RS
.RE
.TP
Expand Down
7 changes: 7 additions & 0 deletions doc/rg.1.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,13 @@ Project home page: https://github.com/BurntSushi/ripgrep
: Only show matches surrounded by line boundaries. This is equivalent to
putting ^...$ around the search pattern.

-z, --search-zip
: Search in compressed files. Currently gz, bz2, xz and lzma
formats are supported.

Note that ripgrep expects to find the decompression binaries for the
respective formats in your system's PATH for use with this flag.

# LESS COMMON OPTIONS

-A, --after-context *NUM*
Expand Down
4 changes: 4 additions & 0 deletions ignore/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ const DEFAULT_TYPES: &'static [(&'static str, &'static [&'static str])] = &[
("avro", &["*.avdl", "*.avpr", "*.avsc"]),
("awk", &["*.awk"]),
("bitbake", &["*.bb", "*.bbappend", "*.bbclass", "*.conf", "*.inc"]),
("bzip2", &["*.bz2"]),
("c", &["*.c", "*.h", "*.H"]),
("cabal", &["*.cabal"]),
("cbor", &["*.cbor"]),
Expand Down Expand Up @@ -137,6 +138,7 @@ const DEFAULT_TYPES: &'static [(&'static str, &'static [&'static str])] = &[
("fsharp", &["*.fs", "*.fsx", "*.fsi"]),
("gn", &["*.gn", "*.gni"]),
("go", &["*.go"]),
("gzip", &["*.gz"]),
("groovy", &["*.groovy", "*.gradle"]),
("h", &["*.h", "*.hpp"]),
("hbs", &["*.hbs"]),
Expand Down Expand Up @@ -184,6 +186,7 @@ const DEFAULT_TYPES: &'static [(&'static str, &'static [&'static str])] = &[
("lisp", &["*.el", "*.jl", "*.lisp", "*.lsp", "*.sc", "*.scm"]),
("log", &["*.log"]),
("lua", &["*.lua"]),
("lzma", &["*.lzma"]),
("m4", &["*.ac", "*.m4"]),
("make", &[
"gnumakefile", "Gnumakefile", "GNUmakefile",
Expand Down Expand Up @@ -276,6 +279,7 @@ const DEFAULT_TYPES: &'static [(&'static str, &'static [&'static str])] = &[
("wiki", &["*.mediawiki", "*.wiki"]),
("webidl", &["*.idl", "*.webidl", "*.widl"]),
("xml", &["*.xml", "*.xml.dist"]),
("xz", &["*.xz"]),
("yacc", &["*.y"]),
("yaml", &["*.yaml", "*.yml"]),
("zsh", &[
Expand Down
14 changes: 11 additions & 3 deletions src/app.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ pub fn app() -> App<'static, 'static> {
.arg(flag("type-clear")
.value_name("TYPE").takes_value(true)
.multiple(true).number_of_values(1))
.arg(flag("search-zip").short("z"))
}

struct Usage {
Expand Down Expand Up @@ -450,7 +451,8 @@ lazy_static! {
can be specified by using the --ignore-file flag several times. \
When specifying multiple ignore files, earlier files have lower \
precedence than later files. If you are looking for a way to \
include or exclude files and directories directly used -g instead.");
include or exclude files and directories directly used -g \
instead.");
doc!(h, "follow",
"Follow symbolic links.");
doc!(h, "max-count",
Expand Down Expand Up @@ -592,15 +594,21 @@ lazy_static! {
only clears the default type definitions that are found inside \
of ripgrep.\n\nNote that this MUST be passed to every \
invocation of ripgrep. Type settings are NOT persisted.");
doc!(h, "search-zip",
"Search in compressed files.",
"Search in compressed files. Currently gz, bz2, xz, and \
lzma files are supported. This option expects the decompression \
binaries to be available in the system PATH.");

h
};
}

fn validate_line_number_width(s: String) -> Result<(), String> {
if s.starts_with("0") {
Err(String::from("Custom padding characters are currently not supported. \
Please enter only a numeric value."))
Err(String::from(
"Custom padding characters are currently not supported. \
Please enter only a numeric value."))
} else {
validate_number(s)
}
Expand Down
3 changes: 3 additions & 0 deletions src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ pub struct Args {
type_list: bool,
types: Types,
with_filename: bool,
search_zip_files: bool
}

impl Args {
Expand Down Expand Up @@ -229,6 +230,7 @@ impl Args {
.no_messages(self.no_messages)
.quiet(self.quiet)
.text(self.text)
.search_zip_files(self.search_zip_files)
.build()
}

Expand Down Expand Up @@ -365,6 +367,7 @@ impl<'a> ArgMatches<'a> {
type_list: self.is_present("type-list"),
types: self.types()?,
with_filename: with_filename,
search_zip_files: self.is_present("search-zip")
};
if args.mmap {
debug!("will try to use memory maps");
Expand Down
191 changes: 191 additions & 0 deletions src/decompressor.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
use std::collections::HashMap;
use std::ffi::OsStr;
use std::fmt;
use std::io::{self, Read};
use std::path::Path;
use std::process::{self, Stdio};

use globset::{Glob, GlobSet, GlobSetBuilder};

/// A decompression command: the program to spawn together with the fixed CLI
/// arguments that are passed to it ahead of the path of the file to
/// decompress.
#[derive(Clone, Copy, Debug)]
struct DecompressionCommand {
    /// Program name handed to `process::Command::new`.
    cmd: &'static str,
    /// Arguments passed to `cmd` before the file path.
    args: &'static [&'static str],
}

impl DecompressionCommand {
    /// Create a new decompression command from a program name and its fixed
    /// arguments.
    fn new(
        cmd: &'static str,
        args: &'static [&'static str],
    ) -> DecompressionCommand {
        DecompressionCommand { cmd: cmd, args: args }
    }
}

impl fmt::Display for DecompressionCommand {
    /// Formats the command the way it would be typed in a shell: the program
    /// name followed by each argument, separated by single spaces.
    ///
    /// The pieces are written straight into the formatter, so no intermediate
    /// `String` is allocated and a command without arguments is rendered
    /// without a trailing space.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(self.cmd)?;
        for arg in self.args {
            write!(f, " {}", arg)?;
        }
        Ok(())
    }
}

lazy_static! {
    /// Maps a file extension (without the leading dot) to the command that
    /// is spawned to decompress files carrying that extension.
    static ref DECOMPRESSION_COMMANDS: HashMap<
        &'static str,
        DecompressionCommand,
    > = {
        // Argument list shared by gzip, bzip2 and xz.
        const ARGS: &[&str] = &["-d", "-c"];
        // .lzma files are handed to xz, which is told the container format
        // explicitly.
        const LZMA_ARGS: &[&str] = &["--format=lzma", "-d", "-c"];

        let mut m = HashMap::new();
        m.insert("gz", DecompressionCommand::new("gzip", ARGS));
        m.insert("bz2", DecompressionCommand::new("bzip2", ARGS));
        m.insert("xz", DecompressionCommand::new("xz", ARGS));
        m.insert("lzma", DecompressionCommand::new("xz", LZMA_ARGS));
        m
    };

    /// Globs matching file names in one of the supported compression
    /// formats. Keep in sync with the keys of `DECOMPRESSION_COMMANDS`.
    static ref SUPPORTED_COMPRESSION_FORMATS: GlobSet = {
        let mut builder = GlobSetBuilder::new();
        for pattern in &["*.gz", "*.bz2", "*.xz", "*.lzma"] {
            builder.add(Glob::new(pattern).unwrap());
        }
        builder.build().unwrap()
    };

    /// Globs matching compressed TAR archives. Such files are recognised as
    /// compressed but are not decompressed for searching.
    ///
    /// Every supported compression format needs its `*.tar.<ext>` form listed
    /// here; otherwise the archive is matched only by its final extension and
    /// gets decompressed into a raw tar stream. That is why `*.tar.lzma` is
    /// listed alongside the gz/xz/bz2 variants.
    static ref TAR_ARCHIVE_FORMATS: GlobSet = {
        let mut builder = GlobSetBuilder::new();
        let patterns = &[
            "*.tar.gz", "*.tar.xz", "*.tar.bz2", "*.tar.lzma",
            "*.tgz", "*.txz", "*.tbz2",
        ];
        for pattern in patterns {
            builder.add(Glob::new(pattern).unwrap());
        }
        builder.build().unwrap()
    };
}

/// DecompressionReader provides an `io::Read` implementation for a limited
/// set of compression formats.
///
/// Reading pulls bytes from the stdout of a spawned decompression process;
/// the process is waited on once its stdout reports end-of-file.
#[derive(Debug)]
pub struct DecompressionReader {
/// The command that was spawned; used when building error messages.
cmd: DecompressionCommand,
/// The spawned decompression process whose stdout is read and whose stderr
/// is consulted when the process exits unsuccessfully.
child: process::Child,
/// Set once the child's stdout has reported end-of-file; every subsequent
/// `read` then returns `Ok(0)` without touching the child again.
done: bool,
}

impl DecompressionReader {
    /// Returns a handle to the stdout of the spawned decompression process for
    /// `path`, which can be directly searched in the worker. When the returned
    /// value is exhausted, the underlying process is reaped. If the underlying
    /// process fails, then its stderr is read and converted into a normal
    /// io::Error.
    ///
    /// `None` is returned, after emitting a debug message, when:
    ///
    /// * `path` looks like a TAR archive,
    /// * `path` has no extension that is valid UTF-8,
    /// * no decompression command is registered for the extension, or
    /// * the decompression command could not be spawned.
    pub fn from_path(path: &Path) -> Option<DecompressionReader> {
        if is_tar_archive(path) {
            debug!("{}: skipping tar archive", path.display());
            return None;
        }
        let extension = match path.extension().and_then(OsStr::to_str) {
            Some(extension) => extension,
            None => {
                debug!(
                    "{}: failed to get compression extension",
                    path.display());
                return None;
            }
        };
        let decompression_cmd = match DECOMPRESSION_COMMANDS.get(extension) {
            Some(cmd) => *cmd,
            None => {
                debug!(
                    "{}: failed to get decompression command",
                    path.display());
                return None;
            }
        };
        // Both output streams are piped: stdout carries the decompressed
        // data, stderr is kept around so a failure can be reported later.
        let spawned = process::Command::new(decompression_cmd.cmd)
            .args(decompression_cmd.args)
            .arg(path)
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .spawn();
        match spawned {
            Ok(child) => {
                Some(DecompressionReader::new(decompression_cmd, child))
            }
            Err(err) => {
                // Spawning can fail for reasons other than a missing binary
                // (e.g. permissions), so report the actual error.
                debug!(
                    "{}: failed to spawn decompression command '{}': {}",
                    path.display(), decompression_cmd.cmd, err);
                None
            }
        }
    }

    /// Wraps an already spawned `child` running `cmd`. The reader starts out
    /// not yet exhausted.
    fn new(
        cmd: DecompressionCommand,
        child: process::Child,
    ) -> DecompressionReader {
        DecompressionReader {
            cmd: cmd,
            child: child,
            done: false,
        }
    }

    /// Builds the `io::Error` describing a failed decompression command.
    ///
    /// Whatever the child wrote to stderr is read to the end, lossily decoded
    /// as UTF-8, trimmed and appended to the message. If stderr is empty or
    /// was not captured, a generic message naming the command is used.
    ///
    /// The outer `Err` is returned only when reading the child's stderr
    /// itself fails.
    fn read_error(&mut self) -> io::Result<io::Error> {
        let mut errbytes = vec![];
        if let Some(stderr) = self.child.stderr.as_mut() {
            stderr.read_to_end(&mut errbytes)?;
        }
        let errstr = String::from_utf8_lossy(&errbytes);
        let errstr = errstr.trim();

        let msg = if errstr.is_empty() {
            format!("decompression command failed: '{}'", self.cmd)
        } else {
            format!(
                "decompression command '{}' failed: {}", self.cmd, errstr)
        };
        Ok(io::Error::new(io::ErrorKind::Other, msg))
    }
}

impl io::Read for DecompressionReader {
    /// Reads decompressed bytes from the child's stdout into `buf`.
    ///
    /// Returns `Ok(0)` once the child's output is exhausted. On the first
    /// end-of-file the child is waited on; if it exited unsuccessfully, an
    /// error built from its stderr is returned instead. Any later call
    /// returns `Ok(0)`.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        // A zero-length buffer makes the underlying read return 0 without
        // the stream being at EOF. Bail out early so that is not mistaken
        // for EOF, which would mark the reader as done and wait on a child
        // that may still be producing output.
        if self.done || buf.is_empty() {
            return Ok(0);
        }
        let nread = self.child.stdout
            .as_mut()
            .expect("decompression child must have a piped stdout")
            .read(buf)?;
        if nread == 0 {
            self.done = true;
            // Reap the child now that we're done reading.
            // If the command failed, report stderr as an error.
            //
            // NOTE(review): stderr is only drained here, after stdout hit
            // EOF. A child that writes a large amount to stderr before
            // closing stdout could presumably block on a full pipe; confirm
            // whether that needs handling.
            if !self.child.wait()?.success() {
                return Err(self.read_error()?);
            }
        }
        Ok(nread)
    }
}

/// Returns true if the given path matches one of the supported compression
/// formats or one of the known TAR archive formats.
///
/// The decision is made by glob-matching the path against the two format
/// sets.
pub fn is_compressed(path: &Path) -> bool {
is_supported_compression_format(path) || is_tar_archive(path)
}

/// Returns true if the given path matches any one of the supported
/// compression formats, i.e. any glob in `SUPPORTED_COMPRESSION_FORMATS`.
fn is_supported_compression_format(path: &Path) -> bool {
SUPPORTED_COMPRESSION_FORMATS.is_match(path)
}

/// Returns true if the given path matches any of the known TAR file formats,
/// i.e. any glob in `TAR_ARCHIVE_FORMATS`.
fn is_tar_archive(path: &Path) -> bool {
TAR_ARCHIVE_FORMATS.is_match(path)
}
Loading

0 comments on commit f007f94

Please sign in to comment.