diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index 81c2c280..92f7014e 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -1,6 +1,13 @@ +# Next + +- Config methods are const +- Added `EncoderStringWriter` to allow encoding directly to a String +- `EncoderWriter` now owns its delegate writer rather than keeping a reference to it (though refs still work) + - As a consequence, it is now possible to extract the delegate writer from an `EncoderWriter` via `finish()`, which returns `Result<W>` instead of `Result<()>`. + # 0.12.2 -Add `BinHex` alphabet +- Add `BinHex` alphabet # 0.12.1 diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs index 07f88721..3d27bbb7 100644 --- a/benches/benchmarks.rs +++ b/benches/benchmarks.rs @@ -123,6 +123,32 @@ fn do_encode_bench_stream(b: &mut Bencher, &size: &usize) { }); } +fn do_encode_bench_string_stream(b: &mut Bencher, &size: &usize) { + let mut v: Vec = Vec::with_capacity(size); + fill(&mut v); + + b.iter(|| { + let mut stream_enc = write::EncoderStringWriter::new(TEST_CONFIG); + stream_enc.write_all(&v).unwrap(); + stream_enc.flush().unwrap(); + let _ = stream_enc.into_inner(); + }); +} + +fn do_encode_bench_string_reuse_buf_stream(b: &mut Bencher, &size: &usize) { + let mut v: Vec = Vec::with_capacity(size); + fill(&mut v); + + let mut buf = String::new(); + b.iter(|| { + buf.clear(); + let mut stream_enc = write::EncoderStringWriter::from(&mut buf, TEST_CONFIG); + stream_enc.write_all(&v).unwrap(); + stream_enc.flush().unwrap(); + let _ = stream_enc.into_inner(); + }); +} + fn fill(v: &mut Vec) { let cap = v.capacity(); // weak randomness is plenty; we just want to not be completely friendly to the branch predictor @@ -147,6 +173,8 @@ fn encode_benchmarks(byte_sizes: &[usize]) -> ParameterizedBenchmark { .with_function("encode_reuse_buf", do_encode_bench_reuse_buf) .with_function("encode_slice", do_encode_bench_slice) .with_function("encode_reuse_buf_stream", do_encode_bench_stream) + 
.with_function("encode_string_stream", do_encode_bench_string_stream) + .with_function("encode_string_reuse_buf_stream", do_encode_bench_string_reuse_buf_stream) } fn decode_benchmarks(byte_sizes: &[usize]) -> ParameterizedBenchmark { diff --git a/examples/make_tables.rs b/examples/make_tables.rs index 5ef3075f..db6fcf2b 100644 --- a/examples/make_tables.rs +++ b/examples/make_tables.rs @@ -164,8 +164,14 @@ fn print_decode_table(alphabet: &[u8], const_name: &str, indent_depth: usize) { } fn check_alphabet(alphabet: &[u8]) { + // ensure all characters are distinct assert_eq!(64, alphabet.len()); let mut set: HashSet = HashSet::new(); set.extend(alphabet); assert_eq!(64, set.len()); + + // must be ASCII to be valid as single UTF-8 bytes + for &b in alphabet { + assert!(b <= 0x7F_u8); + } } diff --git a/src/write/encoder.rs b/src/write/encoder.rs index bece69b3..8a48f438 100644 --- a/src/write/encoder.rs +++ b/src/write/encoder.rs @@ -25,27 +25,24 @@ const MIN_ENCODE_CHUNK_SIZE: usize = 3; /// use std::io::Write; /// /// // use a vec as the simplest possible `Write` -- in real code this is probably a file, etc. 
-/// let mut wrapped_writer = Vec::new(); -/// { -/// let mut enc = base64::write::EncoderWriter::new( -/// &mut wrapped_writer, base64::STANDARD); +/// let mut enc = base64::write::EncoderWriter::new(Vec::new(), base64::STANDARD); /// -/// // handle errors as you normally would -/// enc.write_all(b"asdf").unwrap(); -/// // could leave this out to be called by Drop, if you don't care -/// // about handling errors -/// enc.finish().unwrap(); +/// // handle errors as you normally would +/// enc.write_all(b"asdf").unwrap(); /// -/// } +/// // could leave this out to be called by Drop, if you don't care +/// // about handling errors or getting the delegate writer back +/// let delegate = enc.finish().unwrap(); /// /// // base64 was written to the writer -/// assert_eq!(b"YXNkZg==", &wrapped_writer[..]); +/// assert_eq!(b"YXNkZg==", &delegate[..]); /// /// ``` /// /// # Panics /// -/// Calling `write()` after `finish()` is invalid and will panic. +/// Calling `write()` (or related methods) or `finish()` after `finish()` has completed without +/// error is invalid and will panic. /// /// # Errors /// @@ -56,10 +53,12 @@ const MIN_ENCODE_CHUNK_SIZE: usize = 3; /// /// It has some minor performance loss compared to encoding slices (a couple percent). /// It does not do any heap allocation. -pub struct EncoderWriter<'a, W: 'a + Write> { +pub struct EncoderWriter { config: Config, - /// Where encoded data is written to - w: &'a mut W, + /// Where encoded data is written to. It's an Option as it's None immediately before Drop is + /// called so that finish() can return the underlying writer. None implies that finish() has + /// been called successfully. + delegate: Option, /// Holds a partial chunk, if any, after the last `write()`, so that we may then fill the chunk /// with the next `write()`, encode it, then proceed with the rest of the input normally. 
extra_input: [u8; MIN_ENCODE_CHUNK_SIZE], @@ -70,13 +69,11 @@ pub struct EncoderWriter<'a, W: 'a + Write> { output: [u8; BUF_SIZE], /// How much of `output` is occupied with encoded data that couldn't be written last time output_occupied_len: usize, - /// True iff padding / partial last chunk has been written. - finished: bool, /// panic safety: don't write again in destructor if writer panicked while we were writing to it panicked: bool, } -impl<'a, W: Write> fmt::Debug for EncoderWriter<'a, W> { +impl fmt::Debug for EncoderWriter { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!( f, @@ -89,17 +86,16 @@ impl<'a, W: Write> fmt::Debug for EncoderWriter<'a, W> { } } -impl<'a, W: Write> EncoderWriter<'a, W> { +impl EncoderWriter { /// Create a new encoder that will write to the provided delegate writer `w`. - pub fn new(w: &'a mut W, config: Config) -> EncoderWriter<'a, W> { + pub fn new(w: W, config: Config) -> EncoderWriter { EncoderWriter { config, - w, + delegate: Some(w), extra_input: [0u8; MIN_ENCODE_CHUNK_SIZE], extra_input_occupied_len: 0, output: [0u8; BUF_SIZE], output_occupied_len: 0, - finished: false, panicked: false, } } @@ -107,20 +103,41 @@ impl<'a, W: Write> EncoderWriter<'a, W> { /// Encode all remaining buffered data and write it, including any trailing incomplete input /// triples and associated padding. /// - /// Once this succeeds, no further writes can be performed, as that would produce invalid - /// base64. + /// Once this succeeds, no further writes or calls to this method are allowed. /// - /// This may write to the delegate writer multiple times if the delegate writer does not accept all input provided - /// to its `write` each invocation. + /// This may write to the delegate writer multiple times if the delegate writer does not accept + /// all input provided to its `write` each invocation. 
+ /// + /// If you don't care about error handling, it is not necessary to call this function, as the + /// equivalent finalization is done by the Drop impl. + /// + /// Returns the writer that this was constructed around. /// /// # Errors /// - /// The first error that is not of [`ErrorKind::Interrupted`] will be returned. - pub fn finish(&mut self) -> Result<()> { - if self.finished { - return Ok(()); + /// The first error that is not of `ErrorKind::Interrupted` will be returned. + pub fn finish(&mut self) -> Result { + // If we could consume self in finish(), we wouldn't have to worry about this case, but + // finish() is retryable in the face of I/O errors, so we can't consume here. + if self.delegate.is_none() { + panic!("Encoder has already had finish() called") }; + self.write_final_leftovers()?; + + let writer = self.delegate.take().expect("Writer must be present"); + + Ok(writer) + } + + /// Write any remaining buffered data to the delegate writer. + fn write_final_leftovers(&mut self) -> Result<()> { + if self.delegate.is_none() { + // finish() has already successfully called this, and we are now in drop() with a None + // writer, so just no-op + return Ok(()); + } + self.write_all_encoded_output()?; if self.extra_input_occupied_len > 0 { @@ -138,7 +155,6 @@ impl<'a, W: Write> EncoderWriter<'a, W> { self.extra_input_occupied_len = 0; } - self.finished = true; Ok(()) } @@ -152,7 +168,11 @@ impl<'a, W: Write> EncoderWriter<'a, W> { /// that no write took place. 
fn write_to_delegate(&mut self, current_output_len: usize) -> Result<()> { self.panicked = true; - let res = self.w.write(&self.output[..current_output_len]); + let res = self + .delegate + .as_mut() + .expect("Writer must be present") + .write(&self.output[..current_output_len]); self.panicked = false; res.map(|consumed| { @@ -197,7 +217,7 @@ impl<'a, W: Write> EncoderWriter<'a, W> { } } -impl<'a, W: Write> Write for EncoderWriter<'a, W> { +impl Write for EncoderWriter { /// Encode input and then write to the delegate writer. /// /// Under non-error circumstances, this returns `Ok` with the value being the number of bytes @@ -215,7 +235,7 @@ impl<'a, W: Write> Write for EncoderWriter<'a, W> { /// /// Any errors emitted by the delegate writer are returned. fn write(&mut self, input: &[u8]) -> Result { - if self.finished { + if self.delegate.is_none() { panic!("Cannot write more after calling finish()"); } @@ -339,17 +359,23 @@ impl<'a, W: Write> Write for EncoderWriter<'a, W> { /// Because this is usually treated as OK to call multiple times, it will *not* flush any /// incomplete chunks of input or write padding. + /// # Errors + /// + /// The first error that is not of [`ErrorKind::Interrupted`] will be returned. 
fn flush(&mut self) -> Result<()> { self.write_all_encoded_output()?; - self.w.flush() + self.delegate + .as_mut() + .expect("Writer must be present") + .flush() } } -impl<'a, W: Write> Drop for EncoderWriter<'a, W> { +impl Drop for EncoderWriter { fn drop(&mut self) { if !self.panicked { // like `BufWriter`, ignore errors during drop - let _ = self.finish(); + let _ = self.write_final_leftovers(); } } } diff --git a/src/write/encoder_string_writer.rs b/src/write/encoder_string_writer.rs new file mode 100644 index 00000000..a2033c4a --- /dev/null +++ b/src/write/encoder_string_writer.rs @@ -0,0 +1,174 @@ +use crate::Config; +use std::io; +use std::io::Write; +use super::encoder::EncoderWriter; + +/// A `Write` implementation that base64-encodes data using the provided config and accumulates the +/// resulting base64 in memory, which is then exposed as a String via `into_inner()`. +/// +/// # Examples +/// +/// Buffer base64 in a new String: +/// +/// ``` +/// use std::io::Write; +/// +/// let mut enc = base64::write::EncoderStringWriter::new(base64::STANDARD); +/// +/// enc.write_all(b"asdf").unwrap(); +/// +/// // get the resulting String +/// let b64_string = enc.into_inner(); +/// +/// assert_eq!("YXNkZg==", &b64_string); +/// ``` +/// +/// Or, append to an existing String: +/// +/// ``` +/// use std::io::Write; +/// +/// let mut buf = String::from("base64: "); +/// +/// let mut enc = base64::write::EncoderStringWriter::from(&mut buf, base64::STANDARD); +/// +/// enc.write_all(b"asdf").unwrap(); +/// +/// // release the &mut reference on buf +/// let _ = enc.into_inner(); +/// +/// assert_eq!("base64: YXNkZg==", &buf); +/// ``` +/// +/// # Panics +/// +/// Calling `write()` (or related methods) or `finish()` after `finish()` has completed without +/// error is invalid and will panic. +/// +/// # Performance +/// +/// Because it has to validate that the base64 is UTF-8, it is about 80% as fast as writing plain +/// bytes to an `io::Write`. 
+pub struct EncoderStringWriter { + encoder: EncoderWriter>, +} + +impl EncoderStringWriter { + /// Create a EncoderStringWriter that will append to the provided `StrConsumer`. + pub fn from(str_consumer: S, config: Config) -> Self { + EncoderStringWriter { encoder: EncoderWriter::new(Utf8SingleCodeUnitWriter { str_consumer }, config) } + } + + /// Encode all remaining buffered data, including any trailing incomplete input triples and + /// associated padding. + /// + /// Once this succeeds, no further writes or calls to this method are allowed. + /// + /// Returns the base64-encoded form of the accumulated written data. + pub fn into_inner(mut self) -> S { + self.encoder.finish() + .expect("Writing to a Vec should never fail") + .str_consumer + } +} + +impl EncoderStringWriter { + /// Create a EncoderStringWriter that will encode into a new String with the provided config. + pub fn new(config: Config) -> Self { + EncoderStringWriter::from(String::new(), config) + } +} + +impl Write for EncoderStringWriter { + fn write(&mut self, buf: &[u8]) -> io::Result { + self.encoder.write(buf) + } + + fn flush(&mut self) -> io::Result<()> { + self.encoder.flush() + } +} + +/// An abstraction around consuming `str`s produced by base64 encoding. +pub trait StrConsumer { + /// Consume the base64 encoded data in `buf` + fn consume(&mut self, buf: &str); +} + +/// As for io::Write, `StrConsumer` is implemented automatically for `&mut S`. +impl StrConsumer for &mut S { + fn consume(&mut self, buf: &str) { + (**self).consume(buf) + } +} + +/// Pushes the str onto the end of the String +impl StrConsumer for String { + fn consume(&mut self, buf: &str) { + self.push_str(buf) + } +} + +/// A `Write` that only can handle bytes that are valid single-byte UTF-8 code units. +/// +/// This is safe because we only use it when writing base64, which is always valid UTF-8. 
+struct Utf8SingleCodeUnitWriter { + str_consumer: S +} + +impl io::Write for Utf8SingleCodeUnitWriter { + fn write(&mut self, buf: &[u8]) -> io::Result { + // Because we expect all input to be valid utf-8 individual bytes, we can encode any buffer + // length + let s = std::str::from_utf8(buf) + .expect("Input must be valid UTF-8"); + + self.str_consumer.consume(s); + + Ok(buf.len()) + } + + fn flush(&mut self) -> io::Result<()> { + // no op + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use crate::encode_config_buf; + use crate::tests::random_config; + use rand::Rng; + use std::io::Write; + use crate::write::encoder_string_writer::EncoderStringWriter; + + #[test] + fn every_possible_split_of_input() { + let mut rng = rand::thread_rng(); + let mut orig_data = Vec::::new(); + let mut normal_encoded = String::new(); + + let size = 5_000; + + for i in 0..size { + orig_data.clear(); + normal_encoded.clear(); + + for _ in 0..size { + orig_data.push(rng.gen()); + } + + let config = random_config(&mut rng); + encode_config_buf(&orig_data, config, &mut normal_encoded); + + let mut stream_encoder = EncoderStringWriter::new(config); + // Write the first i bytes, then the rest + stream_encoder.write_all(&orig_data[0..i]).unwrap(); + stream_encoder.write_all(&orig_data[i..]).unwrap(); + + let stream_encoded = stream_encoder.into_inner(); + + assert_eq!(normal_encoded, stream_encoded); + } + } +} diff --git a/src/write/encoder_tests.rs b/src/write/encoder_tests.rs index 59a6127a..09b4d3a2 100644 --- a/src/write/encoder_tests.rs +++ b/src/write/encoder_tests.rs @@ -436,7 +436,7 @@ fn writes_that_only_write_part_of_input_and_sometimes_interrupt_produce_correct_ } } - stream_encoder.finish().unwrap(); + let _ = stream_encoder.finish().unwrap(); assert_eq!(orig_len, bytes_consumed); } @@ -500,7 +500,7 @@ fn do_encode_random_config_matches_normal_encode(max_input_len: usize) { bytes_consumed += input_len; } - stream_encoder.finish().unwrap(); + let _ = 
stream_encoder.finish().unwrap(); assert_eq!(orig_len, bytes_consumed); } diff --git a/src/write/mod.rs b/src/write/mod.rs index f8ed7076..98cb48c4 100644 --- a/src/write/mod.rs +++ b/src/write/mod.rs @@ -1,6 +1,8 @@ //! Implementations of `io::Write` to transparently handle base64. mod encoder; +mod encoder_string_writer; pub use self::encoder::EncoderWriter; +pub use self::encoder_string_writer::EncoderStringWriter; #[cfg(test)] mod encoder_tests;