From 2460adc87a493e34d838d6d1e2650f48ff8bd617 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Mon, 20 Nov 2017 12:42:40 -0800 Subject: [PATCH 1/5] Add Read::size_snapshot --- src/libstd/fs.rs | 17 +++++++++++++++++ src/libstd/io/buffered.rs | 9 +++++++++ src/libstd/io/cursor.rs | 10 ++++++++++ src/libstd/io/impls.rs | 15 +++++++++++++++ src/libstd/io/mod.rs | 39 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 90 insertions(+) diff --git a/src/libstd/fs.rs b/src/libstd/fs.rs index b07733d3c803c..4e42a4a3c84a6 100644 --- a/src/libstd/fs.rs +++ b/src/libstd/fs.rs @@ -449,6 +449,19 @@ impl Read for File { self.inner.read(buf) } + fn size_snapshot(&self) -> Option { + if let Ok(meta) = self.metadata() { + let len = meta.len(); + let size = len as usize; + // Don't trust a length of zero. For example, "pseudofiles" on Linux + // like /proc/meminfo report a size of 0. + if size != 0 && size as u64 == len { + return Some(size); + } + } + None + } + #[inline] unsafe fn initializer(&self) -> Initializer { Initializer::nop() @@ -473,6 +486,10 @@ impl<'a> Read for &'a File { self.inner.read(buf) } + fn size_snapshot(&self) -> Option { + (**self).size_snapshot() + } + #[inline] unsafe fn initializer(&self) -> Initializer { Initializer::nop() diff --git a/src/libstd/io/buffered.rs b/src/libstd/io/buffered.rs index 6d3fbc9d26822..9989bd01ab8c9 100644 --- a/src/libstd/io/buffered.rs +++ b/src/libstd/io/buffered.rs @@ -211,6 +211,15 @@ impl Read for BufReader { Ok(nread) } + #[inline] + fn size_snapshot(&self) -> Option { + let buffered_len = self.cap - self.pos; + if let Some(size) = self.inner.size_snapshot() { + return buffered_len.checked_add(size); + } + None + } + // we can't skip unconditionally because of the large buffer case in read. unsafe fn initializer(&self) -> Initializer { self.inner.initializer() diff --git a/src/libstd/io/cursor.rs b/src/libstd/io/cursor.rs index b5ea5531b65a7..b6192435e6c71 100644 --- a/src/libstd/io/cursor.rs +++ b/src/libstd/io/cursor.rs @@ -237,6 +237,16 @@ impl Read for Cursor where T: AsRef<[u8]> { Ok(()) } + fn size_snapshot(&self) -> Option { + if let Some(diff) = (self.inner.as_ref().len() as u64).checked_sub(self.pos) { + let size = diff as usize; + if size as u64 == diff { + return Some(size); + } + } + None + } + #[inline] unsafe fn initializer(&self) -> Initializer { Initializer::nop() diff --git a/src/libstd/io/impls.rs b/src/libstd/io/impls.rs index fe1179a3b4a18..4c5a94da3ac1e 100644 --- a/src/libstd/io/impls.rs +++ b/src/libstd/io/impls.rs @@ -23,6 +23,11 @@ impl<'a, R: Read + ?Sized> Read for &'a mut R { (**self).read(buf) } + #[inline] + fn size_snapshot(&self) -> Option { + (**self).size_snapshot() + } + #[inline] unsafe fn initializer(&self) -> Initializer { (**self).initializer() @@ -92,6 +97,11 @@ impl Read for Box { (**self).read(buf) } + #[inline] + fn size_snapshot(&self) -> Option { + (**self).size_snapshot() + } + #[inline] unsafe fn initializer(&self) -> Initializer { (**self).initializer() @@ -181,6 +191,11 @@ impl<'a> Read for &'a [u8] { Ok(amt) } + #[inline] + fn size_snapshot(&self) -> Option { + Some(self.len()) + } + #[inline] unsafe fn initializer(&self) -> Initializer { Initializer::nop() diff --git a/src/libstd/io/mod.rs b/src/libstd/io/mod.rs index b7a3695b47096..bddf78e912f8c 100644 --- a/src/libstd/io/mod.rs +++ b/src/libstd/io/mod.rs @@ -553,6 +553,21 @@ pub trait Read { Initializer::zeroing() } + /// Return a snapshot of how many bytes would be read from this source until EOF + /// if read immediately, or None if that is unknown. Depending on the source, the + /// size may change at any time, so this isn't a guarantee that exactly that number + /// of bytes will actually be read. + /// + /// This is used by [`read_to_end`] and [`read_to_string`] to pre-allocate a memory buffer. + /// + /// [`read_to_end`]: #method.read_to_end + /// [`read_to_string`]: #method.read_to_string + #[unstable(feature = "read_size_snapshot", issue = /* FIXME */ "0")] + #[inline] + fn size_snapshot(&self) -> Option { + None + } + /// Read all bytes until EOF in this source, placing them into `buf`. /// /// All bytes read from this source will be appended to the specified buffer @@ -1729,6 +1744,19 @@ impl Read for Chain { self.second.read(buf) } + fn size_snapshot(&self) -> Option { + if self.done_first { + self.second.size_snapshot() + } else { + if let Some(second_size) = self.second.size_snapshot() { + if let Some(first_size) = self.first.size_snapshot() { + return first_size.checked_add(second_size); + } + } + None + } + } + unsafe fn initializer(&self) -> Initializer { let initializer = self.first.initializer(); if initializer.should_initialize() { @@ -1927,6 +1955,17 @@ impl Read for Take { Ok(n) } + fn size_snapshot(&self) -> Option { + if let Some(inner_size) = self.inner.size_snapshot() { + let min = cmp::min(self.limit, inner_size as u64); + let size = min as usize; + if size as u64 == min { + return Some(size); + } + } + None + } + unsafe fn initializer(&self) -> Initializer { self.inner.initializer() } From e90bbcab827227ca47a3e65751eece5aec718355 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Tue, 21 Nov 2017 13:41:43 -0800 Subject: [PATCH 2/5] Pre-allocate in Read::read_to_end and read_to_string based on size_snapshot In the case of reading a normal file, this creates a buffer of the desired size up front so it avoids over-allocation and re-allocation. It adds an fstat syscall, but in the case of reading a normal file, this eliminates the final 0-byte read syscall at the end. This also changes the default read size pattern from 32,64,128,256,... to 32,32,64,128,... so that default allocation sizes and file reads are aligned at power-of-two boundaries. --- src/libstd/io/mod.rs | 52 +++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/src/libstd/io/mod.rs b/src/libstd/io/mod.rs index bddf78e912f8c..7459161dc9572 100644 --- a/src/libstd/io/mod.rs +++ b/src/libstd/io/mod.rs @@ -366,32 +366,50 @@ fn append_to_string(buf: &mut String, f: F) -> Result fn read_to_end(r: &mut R, buf: &mut Vec) -> Result { let start_len = buf.len(); let mut g = Guard { len: buf.len(), buf: buf }; - let ret; + let size_snapshot = r.size_snapshot(); + + // Determine the size to start reading with. + let initial_resize_len = if let Some(size) = size_snapshot { + // We know the (present) size. Don't use reserve_exact because when the + // initial size of buf is zero, reserve should still give us exactly the + // size we request, and when it's non-zero, we're concatenating things. + g.buf.reserve(size); + start_len + size + } else { + // We don't know the size. Start with a relatively small guess. + g.buf.reserve(32); + g.buf.capacity() + }; + unsafe { + g.buf.set_len(initial_resize_len); + r.initializer().initialize(&mut g.buf[g.len..]); + } + loop { - if g.len == g.buf.len() { - unsafe { - g.buf.reserve(32); - let capacity = g.buf.capacity(); - g.buf.set_len(capacity); - r.initializer().initialize(&mut g.buf[g.len..]); - } + match r.read(&mut g.buf[g.len..]) { + Ok(0) => break, + Ok(n) => g.len += n, + Err(ref e) if e.kind() == ErrorKind::Interrupted => continue, + Err(e) => return Err(e), } - match r.read(&mut g.buf[g.len..]) { - Ok(0) => { - ret = Ok(g.len - start_len); + if g.len == g.buf.len() { + if size_snapshot.is_some() { + // We finished what the snapshot told us, so we're done. + debug_assert_eq!(size_snapshot.unwrap(), g.len - start_len); break; } - Ok(n) => g.len += n, - Err(ref e) if e.kind() == ErrorKind::Interrupted => {} - Err(e) => { - ret = Err(e); - break; + // We've used up our available buffer space; allocate more. + g.buf.reserve(32); + let capacity = g.buf.capacity(); + unsafe { + g.buf.set_len(capacity); + r.initializer().initialize(&mut g.buf[g.len..]); } } } - ret + Ok(g.len - start_len) } /// The `Read` trait allows for reading bytes from a source. From 430681f2d34ceb1de7be3ab6b32800fffb97b9ec Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Wed, 22 Nov 2017 09:13:44 -0800 Subject: [PATCH 3/5] Document that Metadata::len() returning 0 means an unknown size. --- src/libstd/fs.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/libstd/fs.rs b/src/libstd/fs.rs index 4e42a4a3c84a6..52aafca8d27a6 100644 --- a/src/libstd/fs.rs +++ b/src/libstd/fs.rs @@ -799,6 +799,8 @@ impl Metadata { /// Returns the size of the file, in bytes, this metadata is for. /// + /// As a special case, a size of `0` indicates that the size is unknown. + /// /// # Examples /// /// ``` From c54fa2837ee1f9c50ca45f6840eebe8097f1eef2 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Thu, 30 Nov 2017 15:26:07 -0800 Subject: [PATCH 4/5] Account for current position in `File::size_snapshot` --- src/libstd/fs.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/libstd/fs.rs b/src/libstd/fs.rs index 52aafca8d27a6..d15c4bf278efa 100644 --- a/src/libstd/fs.rs +++ b/src/libstd/fs.rs @@ -452,11 +452,15 @@ impl Read for File { fn size_snapshot(&self) -> Option { if let Ok(meta) = self.metadata() { let len = meta.len(); - let size = len as usize; - // Don't trust a length of zero. For example, "pseudofiles" on Linux - // like /proc/meminfo report a size of 0. - if size != 0 && size as u64 == len { - return Some(size); + if let Ok(position) = self.inner.seek(SeekFrom::Current(0)) { + if let Some(distance) = len.checked_sub(position) { + let size = distance as usize; + // Don't trust a length of zero. For example, "pseudofiles" + // on Linux like /proc/meminfo report a size of 0. + if size != 0 && size as u64 == distance { + return Some(size); + } + } } } None From 5123b2319f7ef6936115e122e9079efc8ee8d1a8 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Thu, 30 Nov 2017 15:27:24 -0800 Subject: [PATCH 5/5] Add a comment mentioning that I/O errors are ignored. --- src/libstd/fs.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/libstd/fs.rs b/src/libstd/fs.rs index d15c4bf278efa..89570fde91aa1 100644 --- a/src/libstd/fs.rs +++ b/src/libstd/fs.rs @@ -450,6 +450,8 @@ impl Read for File { } fn size_snapshot(&self) -> Option { + // Ignore I/O errors; we're just querying the size and position of an + // already-open file in preparation for reading from it. if let Ok(meta) = self.metadata() { let len = meta.len(); if let Ok(position) = self.inner.seek(SeekFrom::Current(0)) {