Skip to content

Commit

Permalink
perf: Faster binary-file detection
Browse files Browse the repository at this point in the history
This switches us from a homegrown implementation to `content_inspector`
- Adds some optimizations by looking for the BoM.
- We used the same algorithm for finding Null bytes
- `content_inspector` caps how much of the buffer is searched, though

Besides performance, `content_inspector` also has some known-binary
magic numbers to avoid bad detections.

Fixes #34
  • Loading branch information
Ed Page committed Aug 21, 2020
1 parent 443aa5c commit e5dc10c
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 17 deletions.
10 changes: 10 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/typos/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ log = "0.4"
unicode-segmentation = "1.6.0"
derive_more = "0.99.9"
derive_setters = "0.1"
content_inspector = "0.2.4"
49 changes: 32 additions & 17 deletions crates/typos/src/checks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,16 @@ impl ParseIdentifiers {

let buffer = std::fs::read(path)
.map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?;
if !explicit && !self.binary && is_binary(&buffer) {
let msg = report::BinaryFile { path };
reporter.report(msg.into());
return Ok(typos_found);
if !explicit && !self.binary {
let content_type = content_inspector::inspect(&buffer);
if content_type.is_binary()
// HACK: We only support UTF-8 at the moment
|| (content_type != content_inspector::ContentType::UTF_8_BOM
&& content_type != content_inspector::ContentType::UTF_8)
let msg = report::BinaryFile { path };
reporter.report(msg.into());
return Ok(typos_found);
}
}

for line in buffer.lines() {
Expand Down Expand Up @@ -182,10 +188,16 @@ impl ParseWords {

let buffer = std::fs::read(path)
.map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?;
if !explicit && !self.binary && is_binary(&buffer) {
let msg = report::BinaryFile { path };
reporter.report(msg.into());
return Ok(typos_found);
if !explicit && !self.binary {
let content_type = content_inspector::inspect(&buffer);
// HACK: We only support UTF-8 at the moment
if content_type.is_binary()
|| (content_type != content_inspector::ContentType::UTF_8_BOM
&& content_type != content_inspector::ContentType::UTF_8)
let msg = report::BinaryFile { path };
reporter.report(msg.into());
return Ok(typos_found);
}
}

for line in buffer.lines() {
Expand Down Expand Up @@ -274,10 +286,18 @@ impl Checks {

let buffer = std::fs::read(path)
.map_err(|e| crate::ErrorKind::IoError.into_error().with_source(e))?;
if !explicit && !self.binary && is_binary(&buffer) {
let msg = report::BinaryFile { path };
reporter.report(msg.into());
return Ok(typos_found);
if !explicit && !self.binary {
let content_type = content_inspector::inspect(&buffer);
// HACK: We only support UTF-8 at the moment
if content_type.is_binary()
|| (content_type != content_inspector::ContentType::UTF_8_BOM
&& content_type != content_inspector::ContentType::UTF_8)
{
// HACK: we don't support alternative encodings atm
let msg = report::BinaryFile { path };
reporter.report(msg.into());
return Ok(typos_found);
}
}

for (line_idx, line) in buffer.lines().enumerate() {
Expand Down Expand Up @@ -318,8 +338,3 @@ impl Checks {
Ok(typos_found)
}
}

/// Heuristically decides whether `buffer` holds binary (non-text) data.
///
/// Only the leading 1024 bytes (or the whole buffer, when it is shorter) are
/// examined. The buffer is reported as binary when that prefix contains a NUL
/// byte; an empty buffer is therefore never reported as binary.
fn is_binary(buffer: &[u8]) -> bool {
    // Cap on how many leading bytes are scanned, so large files stay cheap to classify.
    const NULL_SCAN_LIMIT: usize = 1024;

    let null_max = std::cmp::min(buffer.len(), NULL_SCAN_LIMIT);
    // `contains` is the standard-library membership test on slices, so no
    // extension trait is needed just to look for a single byte.
    buffer[..null_max].contains(&b'\0')
}

0 comments on commit e5dc10c

Please sign in to comment.