diff --git a/benches/bash.rs b/benches/bash.rs index d61bcc6..e0cf0c6 100644 --- a/benches/bash.rs +++ b/benches/bash.rs @@ -18,14 +18,19 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| Bash::quote_vec(black_box(&alphanumeric_long))) }); - let complex_short = (1..=255u8).map(char::from).collect::(); - c.bench_function("bash escape complex", |b| { - b.iter(|| Bash::quote_vec(black_box(&complex_short))) + let bytes_short = (1..=255u8).map(char::from).collect::(); + c.bench_function("bash escape bytes", |b| { + b.iter(|| Bash::quote_vec(black_box(&bytes_short))) }); - let complex_long = complex_short.repeat(1000); - c.bench_function("bash escape complex long", |b| { - b.iter(|| Bash::quote_vec(black_box(&complex_long))) + let bytes_long = bytes_short.repeat(1000); + c.bench_function("bash escape bytes long", |b| { + b.iter(|| Bash::quote_vec(black_box(&bytes_long))) + }); + + let utf8 = ('\x01'..=char::MAX).collect::(); + c.bench_function("bash escape utf-8", |b| { + b.iter(|| Bash::quote_vec(black_box(&utf8))) }); } diff --git a/benches/fish.rs b/benches/fish.rs index 9c13e13..dc50d93 100644 --- a/benches/fish.rs +++ b/benches/fish.rs @@ -18,14 +18,19 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| Fish::quote_vec(black_box(&alphanumeric_long))) }); - let complex_short = (1..=255u8).map(char::from).collect::(); - c.bench_function("fish escape complex", |b| { - b.iter(|| Fish::quote_vec(black_box(&complex_short))) + let bytes_short = (1..=255u8).map(char::from).collect::(); + c.bench_function("fish escape bytes", |b| { + b.iter(|| Fish::quote_vec(black_box(&bytes_short))) }); - let complex_long = complex_short.repeat(1000); - c.bench_function("fish escape complex long", |b| { - b.iter(|| Fish::quote_vec(black_box(&complex_long))) + let bytes_long = bytes_short.repeat(1000); + c.bench_function("fish escape bytes long", |b| { + b.iter(|| Fish::quote_vec(black_box(&bytes_long))) + }); + + let utf8 = ('\x01'..=char::MAX).collect::(); + c.bench_function("fish escape utf-8", |b| { + b.iter(|| Fish::quote_vec(black_box(&utf8))) }); } diff --git a/benches/sh.rs b/benches/sh.rs index 9718cfc..2ff96df 100644 --- a/benches/sh.rs +++ b/benches/sh.rs @@ -4,28 +4,33 @@ use shell_quote::Sh; fn criterion_benchmark(c: &mut Criterion) { let empty_string = ""; - c.bench_function("sh escape empty", |b| { + c.bench_function("sh/dash escape empty", |b| { b.iter(|| Sh::quote_vec(black_box(empty_string))) }); let alphanumeric_short = "abcdefghijklmnopqrstuvwxyz0123456789"; - c.bench_function("sh escape a-z", |b| { + c.bench_function("sh/dash escape a-z", |b| { b.iter(|| Sh::quote_vec(black_box(alphanumeric_short))) }); let alphanumeric_long = alphanumeric_short.repeat(1000); - c.bench_function("sh escape a-z long", |b| { + c.bench_function("sh/dash escape a-z long", |b| { b.iter(|| Sh::quote_vec(black_box(&alphanumeric_long))) }); - let complex_short = (1..=255u8).map(char::from).collect::(); - c.bench_function("sh escape complex", |b| { - b.iter(|| Sh::quote_vec(black_box(&complex_short))) + let bytes_short = (1..=255u8).map(char::from).collect::(); + c.bench_function("sh/dash escape bytes", |b| { + b.iter(|| Sh::quote_vec(black_box(&bytes_short))) }); - let complex_long = complex_short.repeat(1000); - c.bench_function("sh escape complex long", |b| { - b.iter(|| Sh::quote_vec(black_box(&complex_long))) + let bytes_long = bytes_short.repeat(1000); + c.bench_function("sh/dash escape bytes long", |b| { + b.iter(|| Sh::quote_vec(black_box(&bytes_long))) + }); + + let utf8 = ('\x01'..=char::MAX).collect::(); + c.bench_function("sh/dash escape utf-8", |b| { + b.iter(|| Sh::quote_vec(black_box(&utf8))) }); } diff --git a/src/bash.rs b/src/bash.rs index 42e329b..a317e8a 100644 --- a/src/bash.rs +++ b/src/bash.rs @@ -108,8 +108,8 @@ impl Bash { /// - The string as-is, if no escaping is necessary. /// - An [ANSI-C escaped string][ansi-c-quoting], like `$'foo\nbar'`. /// - /// See [`quote_into`](#method.quote_into) for a variant that extends an - /// existing `Vec` instead of allocating a new one. + /// See [`quote_into_vec`][`Self::quote_into_vec`] for a variant that + /// extends an existing `Vec` instead of allocating a new one. /// /// # Examples /// @@ -123,17 +123,17 @@ impl Bash { /// https://www.gnu.org/software/bash/manual/html_node/ANSI_002dC-Quoting.html /// pub fn quote_vec<'a, S: ?Sized + Into>>(s: S) -> Vec { + // Here, previously, in the `Escape` cases, an optimisation + // precalculated the required capacity of the output `Vec` to avoid + // reallocations later on, but benchmarks showed that it was slower. It + // _may_ have lowered maximum RAM required, but that was not measured. match s.into() { Quotable::Bytes(bytes) => match bytes::escape_prepare(bytes) { bytes::Prepared::Empty => vec![b'\'', b'\''], bytes::Prepared::Inert => bytes.into(), bytes::Prepared::Escape(esc) => { - // This may be a pointless optimisation, but calculate the - // memory needed to avoid reallocations as we construct the - // output. Since we'll generate a $'…' string, add 3 bytes. - let size: usize = esc.iter().map(bytes::escape_size).sum(); - let mut sout = Vec::with_capacity(size + 3); - bytes::escape_chars(esc, &mut sout); // Do the work. + let mut sout = Vec::new(); + bytes::escape_chars(esc, &mut sout); sout } }, @@ -141,12 +141,8 @@ impl Bash { text::Prepared::Empty => vec![b'\'', b'\''], text::Prepared::Inert => text.into(), text::Prepared::Escape(esc) => { - // This may be a pointless optimisation, but calculate the - // memory needed to avoid reallocations as we construct the - // output. Since we'll generate a $'…' string, add 3 bytes. - let size: usize = esc.iter().map(text::escape_size).sum(); - let mut sout = Vec::with_capacity(size + 3); - text::escape_chars(esc, &mut sout); // Do the work. + let mut sout = Vec::new(); + text::escape_chars(esc, &mut sout); sout } }, @@ -155,7 +151,7 @@ impl Bash { /// Quote a string of bytes into an existing `Vec`. /// - /// See [`quote`](#method.quote) for more details. + /// See [`quote_vec`][`Self::quote_vec`] for more details. /// /// # Examples /// @@ -169,34 +165,20 @@ impl Bash { /// ``` /// pub fn quote_into_vec<'a, S: ?Sized + Into>>(s: S, sout: &mut Vec) { + // Here, previously, in the `Escape` cases, an optimisation + // precalculated the required capacity of the output `Vec` to avoid + // reallocations later on, but benchmarks showed that it was slower. It + // _may_ have lowered maximum RAM required, but that was not measured. match s.into() { Quotable::Bytes(bytes) => match bytes::escape_prepare(bytes) { bytes::Prepared::Empty => sout.extend(b"''"), bytes::Prepared::Inert => sout.extend(bytes), - bytes::Prepared::Escape(esc) => { - // This may be a pointless optimisation, but calculate the - // memory needed to avoid reallocations as we construct the - // output. Since we'll generate a $'…' string, add 3 bytes. - let size: usize = esc.iter().map(bytes::escape_size).sum(); - sout.reserve(size + 3); - let cap = sout.capacity(); - bytes::escape_chars(esc, sout); // Do the work. - debug_assert_eq!(cap, sout.capacity()); // No reallocations. - } + bytes::Prepared::Escape(esc) => bytes::escape_chars(esc, sout), }, Quotable::Text(text) => match text::escape_prepare(text) { text::Prepared::Empty => sout.extend(b"''"), text::Prepared::Inert => sout.extend(text.as_bytes()), - text::Prepared::Escape(esc) => { - // This may be a pointless optimisation, but calculate the - // memory needed to avoid reallocations as we construct the - // output. Since we'll generate a $'…' string, add 3 bytes. - let size: usize = esc.iter().map(text::escape_size).sum(); - sout.reserve(size + 3); - let cap = sout.capacity(); - text::escape_chars(esc, sout); // Do the work. - debug_assert_eq!(cap, sout.capacity()); // No reallocations. - } + text::Prepared::Escape(esc) => text::escape_chars(esc, sout), }, } } @@ -205,7 +187,8 @@ impl Bash { // ---------------------------------------------------------------------------- mod bytes { - use crate::{ascii::Char, util::u8_to_hex_escape}; + use super::u8_to_hex_escape; + use crate::ascii::Char; pub enum Prepared { Empty, @@ -252,34 +235,13 @@ mod bytes { } sout.push(b'\''); } - - pub fn escape_size(char: &Char) -> usize { - use Char::*; - match char { - Bell => 2, - Backspace => 2, - Escape => 2, - FormFeed => 2, - NewLine => 2, - CarriageReturn => 2, - HorizontalTab => 2, - VerticalTab => 2, - Control(_) => 4, - Backslash => 2, - SingleQuote => 2, - DoubleQuote => 1, - Delete => 4, - PrintableInert(_) => 1, - Printable(_) => 1, - Extended(_) => 4, - } - } } // ---------------------------------------------------------------------------- mod text { - use crate::{utf8::Char, util::u8_to_hex_escape}; + use super::u8_to_hex_escape; + use crate::utf8::Char; pub enum Prepared { Empty, @@ -327,26 +289,32 @@ mod text { } sout.push(b'\''); } +} - pub fn escape_size(ch: &Char) -> usize { - use Char::*; - match ch { - Bell => 2, - Backspace => 2, - Escape => 2, - FormFeed => 2, - NewLine => 2, - CarriageReturn => 2, - HorizontalTab => 2, - VerticalTab => 2, - Control(_) => 4, - Backslash => 2, - SingleQuote => 2, - DoubleQuote => 1, - Delete => 4, - PrintableInert(_) => 1, - Printable(_) => 1, - Utf8(ch) => ch.len_utf8(), - } +// ---------------------------------------------------------------------------- + +/// Escape a byte as a 4-byte hex escape sequence. +/// +/// The `\\xHH` format (backslash, a literal "x", two hex characters) is +/// understood by many shells. +#[inline] +fn u8_to_hex_escape(ch: u8) -> [u8; 4] { + const HEX_DIGITS: &[u8] = b"0123456789ABCDEF"; + [ + b'\\', + b'x', + HEX_DIGITS[(ch >> 4) as usize], + HEX_DIGITS[(ch & 0xF) as usize], + ] +} + +#[cfg(test)] +#[test] +fn test_u8_to_hex_escape() { + for ch in u8::MIN..=u8::MAX { + let expected = format!("\\x{ch:02X}"); + let observed = u8_to_hex_escape(ch); + let observed = std::str::from_utf8(&observed).unwrap(); + assert_eq!(observed, &expected); } } diff --git a/src/fish.rs b/src/fish.rs index 673d315..d92190d 100644 --- a/src/fish.rs +++ b/src/fish.rs @@ -78,8 +78,8 @@ impl Fish { /// - The string as-is, if no escaping is necessary. /// - An escaped string, like `'foo \'bar'`, `\a'ABC'` /// - /// See [`quote_into`](#method.quote_into) for a variant that extends an - /// existing `Vec` instead of allocating a new one. + /// See [`quote_into_vec`][`Self::quote_into_vec`] for a variant that + /// extends an existing `Vec` instead of allocating a new one. /// /// # Examples /// @@ -94,8 +94,8 @@ impl Fish { bytes::Prepared::Empty => vec![b'\'', b'\''], bytes::Prepared::Inert => bytes.into(), bytes::Prepared::Escape(esc) => { - let mut sout = Vec::with_capacity(esc.len() + 2); - bytes::escape_chars(esc, &mut sout); // Do the work. + let mut sout = Vec::new(); + bytes::escape_chars(esc, &mut sout); sout } }, @@ -103,8 +103,8 @@ impl Fish { text::Prepared::Empty => vec![b'\'', b'\''], text::Prepared::Inert => text.into(), text::Prepared::Escape(esc) => { - let mut sout = Vec::with_capacity(esc.len() + 2); - text::escape_chars(esc, &mut sout); // Do the work. + let mut sout = Vec::new(); + text::escape_chars(esc, &mut sout); sout } }, @@ -113,7 +113,7 @@ impl Fish { /// Quote a string of bytes into an existing `Vec`. /// - /// See [`quote`](#method.quote) for more details. + /// See [`quote_vec`][`Self::quote_vec`] for more details. /// /// # Examples /// @@ -131,18 +131,12 @@ impl Fish { Quotable::Bytes(bytes) => match bytes::escape_prepare(bytes) { bytes::Prepared::Empty => sout.extend(b"''"), bytes::Prepared::Inert => sout.extend(bytes), - bytes::Prepared::Escape(esc) => { - sout.reserve(esc.len() + 2); - bytes::escape_chars(esc, sout); // Do the work. - } + bytes::Prepared::Escape(esc) => bytes::escape_chars(esc, sout), }, Quotable::Text(text) => match text::escape_prepare(text) { text::Prepared::Empty => sout.extend(b"''"), text::Prepared::Inert => sout.extend(text.as_bytes()), - text::Prepared::Escape(esc) => { - sout.reserve(esc.len() + 2); - text::escape_chars(esc, sout); // Do the work. - } + text::Prepared::Escape(esc) => text::escape_chars(esc, sout), }, } } @@ -151,7 +145,8 @@ impl Fish { // ---------------------------------------------------------------------------- mod bytes { - use crate::{ascii::Char, util::u8_to_hex_escape_uppercase_x}; + use super::u8_to_hex_escape_uppercase_x; + use crate::ascii::Char; pub enum Prepared { Empty, @@ -226,7 +221,8 @@ mod bytes { // ---------------------------------------------------------------------------- mod text { - use crate::{utf8::Char, util::u8_to_hex_escape_uppercase_x}; + use super::u8_to_hex_escape_uppercase_x; + use crate::utf8::Char; pub enum Prepared { Empty, @@ -298,3 +294,39 @@ mod text { } } } + +// ---------------------------------------------------------------------------- + +/// Escape a byte as a 4-byte hex escape sequence _with uppercase "X"_. +/// +/// The `\\XHH` format (backslash, a literal "X", two hex characters) is +/// understood by fish. The `\\xHH` format is _also_ understood, but until fish +/// 3.6.0 it had a weirdness. From the [release notes][]: +/// +/// > The `\\x` and `\\X` escape syntax is now equivalent. `\\xAB` previously +/// > behaved the same as `\\XAB`, except that it would error if the value “AB” +/// > was larger than “7f” (127 in decimal, the highest ASCII value). +/// +/// [release notes]: https://github.com/fish-shell/fish-shell/releases/tag/3.6.0 +/// +#[inline] +fn u8_to_hex_escape_uppercase_x(ch: u8) -> [u8; 4] { + const HEX_DIGITS: &[u8] = b"0123456789ABCDEF"; + [ + b'\\', + b'X', + HEX_DIGITS[(ch >> 4) as usize], + HEX_DIGITS[(ch & 0xF) as usize], + ] +} + +#[cfg(test)] +#[test] +fn test_u8_to_hex_escape_uppercase_x() { + for ch in u8::MIN..=u8::MAX { + let expected = format!("\\X{ch:02X}"); + let observed = u8_to_hex_escape_uppercase_x(ch); + let observed = std::str::from_utf8(&observed).unwrap(); + assert_eq!(observed, &expected); + } +} diff --git a/src/lib.rs b/src/lib.rs index fe8f86f..d16767f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,7 +16,6 @@ mod bash; mod fish; mod sh; mod utf8; -mod util; #[cfg(feature = "bash")] pub use bash::Bash; diff --git a/src/sh.rs b/src/sh.rs index ef032f0..8ca16fa 100644 --- a/src/sh.rs +++ b/src/sh.rs @@ -124,8 +124,8 @@ impl Sh { /// - The string as-is, if no quoting is necessary. /// - A string containing single-quoted sections, like `foo' bar'`. /// - /// See [`quote_into`](#method.quote_into) for a variant that extends an - /// existing `Vec` instead of allocating a new one. + /// See [`quote_into_vec`][`Self::quote_into_vec`] for a variant that + /// extends an existing `Vec` instead of allocating a new one. /// /// # Examples /// @@ -144,21 +144,12 @@ impl Sh { Prepared::Empty => vec![b'\'', b'\''], Prepared::Inert => bytes.into(), Prepared::Escape(esc) => { - // This may be a pointless optimisation, but calculate the - // memory needed to avoid reallocations as we construct the - // output. Since we'll generate a '…' string, add 2 bytes. Each - // single quote can consume an extra 1 to 3 bytes, so we assume - // the worse. - let quotes: usize = esc - .iter() - .filter(|char| **char == Char::SingleQuote) - .count(); - - let size: usize = esc.len() + 2 + (quotes * 3); - let mut sout = Vec::with_capacity(size); - let cap = sout.capacity(); - escape_chars(esc, &mut sout); // Do the work. - debug_assert_eq!(cap, sout.capacity()); // No reallocations. + // Here, previously, an optimisation precalculated the required + // capacity of the output `Vec` to avoid reallocations later on, + // but benchmarks showed that it was slower. It _may_ have + // lowered maximum RAM required, but that was not measured. + let mut sout = Vec::new(); + escape_chars(esc, &mut sout); sout } } @@ -166,7 +157,7 @@ impl Sh { /// Quote a string of bytes into an existing `Vec`. /// - /// See [`quote`](#method.quote) for more details. + /// See [`quote_vec`][`Self::quote_vec`] for more details. /// /// # Examples /// @@ -188,21 +179,11 @@ impl Sh { Prepared::Empty => sout.extend(b"''"), Prepared::Inert => sout.extend(bytes), Prepared::Escape(esc) => { - // This may be a pointless optimisation, but calculate the - // memory needed to avoid reallocations as we construct the - // output. Since we'll generate a '…' string, add 2 bytes. Each - // single quote can consume an extra 1 to 3 bytes, so we assume - // the worse. - let quotes: usize = esc - .iter() - .filter(|char| **char == Char::SingleQuote) - .count(); - - let size: usize = esc.len() + 2 + (quotes * 3); - sout.reserve(size); - let cap = sout.capacity(); - escape_chars(esc, sout); // Do the work. - debug_assert_eq!(cap, sout.capacity()); // No reallocations. + // Here, previously, an optimisation precalculated the required + // capacity of the output `Vec` to avoid reallocations later on, + // but benchmarks showed that it was slower. It _may_ have + // lowered maximum RAM required, but that was not measured. + escape_chars(esc, sout); } } } @@ -230,40 +211,40 @@ fn escape_prepare(sin: &[u8]) -> Prepared { } fn escape_chars(esc: Vec, sout: &mut Vec) { - let mut inside_quotes_now = false; + let mut inside_quotes = false; for mode in esc { use Char::*; match mode { PrintableInert(ch) | Extended(ch) => sout.push(ch), Control(ch) | Printable(ch) => { - if inside_quotes_now { + if inside_quotes { sout.push(ch); } else { sout.push(b'\''); - inside_quotes_now = true; + inside_quotes = true; sout.push(ch); } } SingleQuote => { - if inside_quotes_now { + if inside_quotes { sout.extend(b"'\\'"); - inside_quotes_now = false; + inside_quotes = false; } else { sout.extend(b"\\'"); } } ch => { - if inside_quotes_now { + if inside_quotes { sout.push(ch.code()); } else { sout.push(b'\''); - inside_quotes_now = true; + inside_quotes = true; sout.push(ch.code()); } } } } - if inside_quotes_now { + if inside_quotes { sout.push(b'\''); } } diff --git a/src/utf8.rs b/src/utf8.rs index 0fe96b3..2e9e06c 100644 --- a/src/utf8.rs +++ b/src/utf8.rs @@ -1,8 +1,8 @@ #![cfg(any(feature = "bash", feature = "fish", feature = "sh"))] -//! Scanner for ASCII control codes, shell metacharacters, printable characters, -//! and extended codes, i.e. classify each byte in a stream according to where -//! it appears in extended ASCII. +//! Scanner for control codes, shell metacharacters, printable characters, and +//! UTF-8 sequences, i.e. classify each byte in a stream according to where it +//! appears in UTF-8. #[derive(PartialEq)] pub(crate) enum Char { diff --git a/src/util.rs b/src/util.rs deleted file mode 100644 index 30ae2bc..0000000 --- a/src/util.rs +++ /dev/null @@ -1,64 +0,0 @@ -/// Escape a byte as a 4-byte hex escape sequence. -/// -/// The `\\xHH` format (backslash, a literal "x", two hex characters) is -/// understood by many shells. -#[inline] -#[cfg(feature = "bash")] -pub(crate) fn u8_to_hex_escape(ch: u8) -> [u8; 4] { - const HEX_DIGITS: &[u8] = b"0123456789ABCDEF"; - [ - b'\\', - b'x', - HEX_DIGITS[(ch >> 4) as usize], - HEX_DIGITS[(ch & 0xF) as usize], - ] -} - -/// Escape a byte as a 4-byte hex escape sequence _with uppercase "X"_. -/// -/// The `\\XHH` format (backslash, a literal "X", two hex characters) is -/// understood by fish. The `\\xHH` format is _also_ understood, but until fish -/// 3.6.0 it had a weirdness. From the [release notes][]: -/// -/// > The `\\x` and `\\X` escape syntax is now equivalent. `\\xAB` previously -/// > behaved the same as `\\XAB`, except that it would error if the value “AB” -/// > was larger than “7f” (127 in decimal, the highest ASCII value). -/// -/// [release notes]: https://github.com/fish-shell/fish-shell/releases/tag/3.6.0 -/// -#[inline] -#[cfg(feature = "fish")] -pub(crate) fn u8_to_hex_escape_uppercase_x(ch: u8) -> [u8; 4] { - const HEX_DIGITS: &[u8] = b"0123456789ABCDEF"; - [ - b'\\', - b'X', - HEX_DIGITS[(ch >> 4) as usize], - HEX_DIGITS[(ch & 0xF) as usize], - ] -} - -#[cfg(test)] -mod tests { - #[test] - #[cfg(feature = "bash")] - fn test_u8_to_hex_escape() { - for ch in u8::MIN..=u8::MAX { - let expected = format!("\\x{ch:02X}"); - let observed = super::u8_to_hex_escape(ch); - let observed = std::str::from_utf8(&observed).unwrap(); - assert_eq!(observed, &expected); - } - } - - #[test] - #[cfg(feature = "fish")] - fn test_u8_to_hex_escape_uppercase_x() { - for ch in u8::MIN..=u8::MAX { - let expected = format!("\\X{ch:02X}"); - let observed = super::u8_to_hex_escape_uppercase_x(ch); - let observed = std::str::from_utf8(&observed).unwrap(); - assert_eq!(observed, &expected); - } - } -}