Skip to content

Commit

Permalink
Rollforward [] which was rolled back in [] because it
Browse files Browse the repository at this point in the history
caused some tests using appengine to fail with IOExceptions when calling
FileInputStream.available().  The workaround in the rollforward is to not call
the method.  The one case where we were calling it we had actually already
called a similar method, so trying again doesn't seem valuable anyway.

*** Original change description ***

Implement ByteSource.asCharSource(charset).read() using the decoding string
constructor instead of streaming the contents into a StringBuilder.

this allows us to avoid a number of copies that are currently happening for
each character (1. into a temporary CharBuffer, 2. into a StringBuilder, 3 into
the String char[]) and replace it with simply whatever is required by
ByteSource.read() and the String(byte[], charset) constructor.  For certain
ByteSource implementations (like FileByteSource) ByteSource.read() can often
presize the byte[] correctly and the string constructor can also do some things
to guess at the correct array sizes and avoid copies in the common case.

Benchmarks have shown that this should improve the speed of Files.toString
significantly.

***

-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=162275181
  • Loading branch information
lukesandberg authored and cgdecker committed Jul 24, 2017
1 parent b87e1f1 commit 22da091
Show file tree
Hide file tree
Showing 6 changed files with 316 additions and 8 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
/*
* Copyright (C) 2017 The Guava Authors
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/

package com.google.common.io;

import com.google.caliper.BeforeExperiment;
import com.google.caliper.Benchmark;
import com.google.caliper.Param;
import com.google.caliper.api.VmOptions;
import com.google.common.base.Optional;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Random;

/**
* Benchmarks for various potential implementations of {@code ByteSource.asCharSource(...).read()}.
*/
// These benchmarks allocate a lot of data so use a large heap
@VmOptions({"-Xms12g", "-Xmx12g", "-d64"})
public class ByteSourceAsCharSourceReadBenchmark {
enum ReadStrategy {
TO_BYTE_ARRAY_NEW_STRING {
@Override
String read(ByteSource byteSource, Charset cs) throws IOException {
return new String(byteSource.read(), cs);
}
},
USING_CHARSTREAMS_COPY {
@Override
String read(ByteSource byteSource, Charset cs) throws IOException {
StringBuilder sb = new StringBuilder();
try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) {
CharStreams.copy(reader, sb);
}
return sb.toString();
}
},
// It really seems like this should be faster than TO_BYTE_ARRAY_NEW_STRING. But it just isn't
// my best guess is that the jdk authors have spent more time optimizing that callpath than this
// one. (StringCoding$StringDecoder vs. StreamDecoder). StringCoding has a ton of special cases
// theoretically we could duplicate all that logic here to try to beat 'new String' or at least
// come close.
USING_DECODER_WITH_SIZE_HINT {
@Override
String read(ByteSource byteSource, Charset cs) throws IOException {
Optional<Long> size = byteSource.sizeIfKnown();
// if we know the size and it fits in an int
if (size.isPresent() && size.get().longValue() == size.get().intValue()) {
// otherwise try to presize a StringBuilder
// it is kind of lame that we need to construct a decoder to access this value.
// if this is a concern we could add special cases for some known charsets (like utf8)
// or we could avoid inputstreamreader and use the decoder api directly
// TODO(lukes): in a real implementation we would need to handle overflow conditions
int maxChars = (int) (size.get().intValue() * cs.newDecoder().maxCharsPerByte());
char[] buffer = new char[maxChars];
int bufIndex = 0;
int remaining = buffer.length;
try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) {
int nRead = 0;
while (remaining > 0 && (nRead = reader.read(buffer, bufIndex, remaining)) != -1) {
bufIndex += nRead;
remaining -= nRead;
}
if (nRead == -1) {
// we reached EOF
return new String(buffer, 0, bufIndex);
}
// otherwise we got the size wrong. This can happen if the size changes between when
// we called sizeIfKnown and when we started reading the file (or i guess if
// maxCharsPerByte is wrong)
// Fallback to an incremental approach
StringBuilder builder = new StringBuilder(bufIndex + 32);
builder.append(buffer, 0, bufIndex);
buffer = null; // release for gc
CharStreams.copy(reader, builder);
return builder.toString();
}

} else {
return TO_BYTE_ARRAY_NEW_STRING.read(byteSource, cs);
}
}
};

abstract String read(ByteSource byteSource, Charset cs) throws IOException;
}

@Param({"UTF-8"})
String charsetName;

@Param ReadStrategy strategy;

@Param({"10", "1024", "1048576"})
int size;

Charset charset;
ByteSource data;

@BeforeExperiment
public void setUp() {
charset = Charset.forName(charsetName);
StringBuilder sb = new StringBuilder();
Random random = new Random(0xdeadbeef); // for unpredictable but reproducible behavior
sb.ensureCapacity(size);
for (int k = 0; k < size; k++) {
// [9-127) includes all ascii non-control characters
sb.append((char) (random.nextInt(127 - 9) + 9));
}
String string = sb.toString();
sb.setLength(0);
data = ByteSource.wrap(string.getBytes(charset));
}

@Benchmark
public int timeCopy(int reps) throws IOException {
int r = 0;
final Charset localCharset = charset;
final ByteSource localData = data;
final ReadStrategy localStrategy = strategy;
for (int i = 0; i < reps; i++) {
r += localStrategy.read(localData, localCharset).hashCode();
}
return r;
}
}
12 changes: 12 additions & 0 deletions android/guava/src/com/google/common/io/ByteSource.java
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,18 @@ public Reader openStream() throws IOException {
return new InputStreamReader(ByteSource.this.openStream(), charset);
}

@Override
public String read() throws IOException {
// Reading all the data as a byte array is more efficient than the default read()
// implementation because:
// 1. the string constructor can avoid an extra copy most of the time by correctly sizing the
// internal char array (hard to avoid using StringBuilder)
// 2. we avoid extra copies into temporary buffers altogether
// The downside is that this will cause us to store the file bytes in memory twice for a short
// amount of time.
return new String(ByteSource.this.read(), charset);
}

@Override
public String toString() {
return ByteSource.this.toString() + ".asCharSource(" + charset + ")";
Expand Down
13 changes: 9 additions & 4 deletions android/guava/src/com/google/common/io/Files.java
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,15 @@ static byte[] readFile(InputStream in, long expectedSize) throws IOException {
}

// some special files may return size 0 but have content, so read
// the file normally in that case
return expectedSize == 0
? ByteStreams.toByteArray(in)
: ByteStreams.toByteArray(in, (int) expectedSize);
// the file normally in that case guessing at the buffer size to use. Note, there is no point
// in calling the 'toByteArray' overload that doesn't take a size because that calls
// InputStream.available(), but our caller has already done that. So instead just guess that
// the file is 4K bytes long and rely on the fallback in toByteArray to expand the buffer if
// needed.
// This also works around an app-engine bug where FileInputStream.available() consistently
// throws an IOException for certain files, even though FileInputStream.getChannel().size() does
// not!
return ByteStreams.toByteArray(in, expectedSize == 0 ? 4096 : (int) expectedSize);
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
/*
* Copyright (C) 2017 The Guava Authors
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/

package com.google.common.io;

import com.google.caliper.BeforeExperiment;
import com.google.caliper.Benchmark;
import com.google.caliper.Param;
import com.google.caliper.api.VmOptions;
import com.google.common.base.Optional;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Random;

/**
* Benchmarks for various potential implementations of {@code ByteSource.asCharSource(...).read()}.
*/
// These benchmarks allocate a lot of data so use a large heap
@VmOptions({"-Xms12g", "-Xmx12g", "-d64"})
public class ByteSourceAsCharSourceReadBenchmark {
enum ReadStrategy {
TO_BYTE_ARRAY_NEW_STRING {
@Override
String read(ByteSource byteSource, Charset cs) throws IOException {
return new String(byteSource.read(), cs);
}
},
USING_CHARSTREAMS_COPY {
@Override
String read(ByteSource byteSource, Charset cs) throws IOException {
StringBuilder sb = new StringBuilder();
try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) {
CharStreams.copy(reader, sb);
}
return sb.toString();
}
},
// It really seems like this should be faster than TO_BYTE_ARRAY_NEW_STRING. But it just isn't
// my best guess is that the jdk authors have spent more time optimizing that callpath than this
// one. (StringCoding$StringDecoder vs. StreamDecoder). StringCoding has a ton of special cases
// theoretically we could duplicate all that logic here to try to beat 'new String' or at least
// come close.
USING_DECODER_WITH_SIZE_HINT {
@Override
String read(ByteSource byteSource, Charset cs) throws IOException {
Optional<Long> size = byteSource.sizeIfKnown();
// if we know the size and it fits in an int
if (size.isPresent() && size.get().longValue() == size.get().intValue()) {
// otherwise try to presize a StringBuilder
// it is kind of lame that we need to construct a decoder to access this value.
// if this is a concern we could add special cases for some known charsets (like utf8)
// or we could avoid inputstreamreader and use the decoder api directly
// TODO(lukes): in a real implementation we would need to handle overflow conditions
int maxChars = (int) (size.get().intValue() * cs.newDecoder().maxCharsPerByte());
char[] buffer = new char[maxChars];
int bufIndex = 0;
int remaining = buffer.length;
try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) {
int nRead = 0;
while (remaining > 0 && (nRead = reader.read(buffer, bufIndex, remaining)) != -1) {
bufIndex += nRead;
remaining -= nRead;
}
if (nRead == -1) {
// we reached EOF
return new String(buffer, 0, bufIndex);
}
// otherwise we got the size wrong. This can happen if the size changes between when
// we called sizeIfKnown and when we started reading the file (or i guess if
// maxCharsPerByte is wrong)
// Fallback to an incremental approach
StringBuilder builder = new StringBuilder(bufIndex + 32);
builder.append(buffer, 0, bufIndex);
buffer = null; // release for gc
CharStreams.copy(reader, builder);
return builder.toString();
}

} else {
return TO_BYTE_ARRAY_NEW_STRING.read(byteSource, cs);
}
}
};

abstract String read(ByteSource byteSource, Charset cs) throws IOException;
}

@Param({"UTF-8"})
String charsetName;

@Param ReadStrategy strategy;

@Param({"10", "1024", "1048576"})
int size;

Charset charset;
ByteSource data;

@BeforeExperiment
public void setUp() {
charset = Charset.forName(charsetName);
StringBuilder sb = new StringBuilder();
Random random = new Random(0xdeadbeef); // for unpredictable but reproducible behavior
sb.ensureCapacity(size);
for (int k = 0; k < size; k++) {
// [9-127) includes all ascii non-control characters
sb.append((char) (random.nextInt(127 - 9) + 9));
}
String string = sb.toString();
sb.setLength(0);
data = ByteSource.wrap(string.getBytes(charset));
}

@Benchmark
public int timeCopy(int reps) throws IOException {
int r = 0;
final Charset localCharset = charset;
final ByteSource localData = data;
final ReadStrategy localStrategy = strategy;
for (int i = 0; i < reps; i++) {
r += localStrategy.read(localData, localCharset).hashCode();
}
return r;
}
}
12 changes: 12 additions & 0 deletions guava/src/com/google/common/io/ByteSource.java
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,18 @@ public Reader openStream() throws IOException {
return new InputStreamReader(ByteSource.this.openStream(), charset);
}

@Override
public String read() throws IOException {
// Reading all the data as a byte array is more efficient than the default read()
// implementation because:
// 1. the string constructor can avoid an extra copy most of the time by correctly sizing the
// internal char array (hard to avoid using StringBuilder)
// 2. we avoid extra copies into temporary buffers altogether
// The downside is that this will cause us to store the file bytes in memory twice for a short
// amount of time.
return new String(ByteSource.this.read(), charset);
}

@Override
public String toString() {
return ByteSource.this.toString() + ".asCharSource(" + charset + ")";
Expand Down
13 changes: 9 additions & 4 deletions guava/src/com/google/common/io/Files.java
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,15 @@ static byte[] readFile(InputStream in, long expectedSize) throws IOException {
}

// some special files may return size 0 but have content, so read
// the file normally in that case
return expectedSize == 0
? ByteStreams.toByteArray(in)
: ByteStreams.toByteArray(in, (int) expectedSize);
// the file normally in that case guessing at the buffer size to use. Note, there is no point
// in calling the 'toByteArray' overload that doesn't take a size because that calls
// InputStream.available(), but our caller has already done that. So instead just guess that
// the file is 4K bytes long and rely on the fallback in toByteArray to expand the buffer if
// needed.
// This also works around an app-engine bug where FileInputStream.available() consistently
// throws an IOException for certain files, even though FileInputStream.getChannel().size() does
// not!
return ByteStreams.toByteArray(in, expectedSize == 0 ? 4096 : (int) expectedSize);
}

/**
Expand Down

0 comments on commit 22da091

Please sign in to comment.