-
Notifications
You must be signed in to change notification settings - Fork 10.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement ByteSource.asCharSource(charset).read() using the decoding …
…string constructor instead of streaming the contents into a StringBuilder. this allows us to avoid a number of copies that are currently happening for each character (1. into a temporary CharBuffer, 2. into a StringBuilder, 3 into the String char[]) and replace it with simply whatever is required by ByteSource.read() and the String(byte[], charset) constructor. For certain ByteSource implementations (like FileByteSource) ByteSource.read() can often presize the byte[] correctly and the string constructor can also do some things to guess at the correct array sizes and avoid copies in the common case. Benchmarks have shown that this should improve the speed of Files.toString significantly. further work should also be done for the normal CharSource.read implementation. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=162220754
- Loading branch information
1 parent
71b5b85
commit ee12894
Showing
4 changed files
with
298 additions
and
0 deletions.
There are no files selected for viewing
137 changes: 137 additions & 0 deletions
137
android/guava-tests/benchmark/com/google/common/io/ByteSourceAsCharSourceReadBenchmark.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
/* | ||
* Copyright (C) 2017 The Guava Authors | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except | ||
* in compliance with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed under the License | ||
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express | ||
* or implied. See the License for the specific language governing permissions and limitations under | ||
* the License. | ||
*/ | ||
|
||
package com.google.common.io; | ||
|
||
import com.google.caliper.BeforeExperiment; | ||
import com.google.caliper.Benchmark; | ||
import com.google.caliper.Param; | ||
import com.google.caliper.api.VmOptions; | ||
import com.google.common.base.Optional; | ||
import java.io.IOException; | ||
import java.io.InputStreamReader; | ||
import java.nio.charset.Charset; | ||
import java.util.Random; | ||
|
||
/** | ||
* Benchmarks for various potential implementations of {@code ByteSource.asCharSource(...).read()}. | ||
*/ | ||
// These benchmarks allocate a lot of data so use a large heap | ||
@VmOptions({"-Xms12g", "-Xmx12g", "-d64"}) | ||
public class ByteSourceAsCharSourceReadBenchmark { | ||
enum ReadStrategy { | ||
TO_BYTE_ARRAY_NEW_STRING { | ||
@Override | ||
String read(ByteSource byteSource, Charset cs) throws IOException { | ||
return new String(byteSource.read(), cs); | ||
} | ||
}, | ||
USING_CHARSTREAMS_COPY { | ||
@Override | ||
String read(ByteSource byteSource, Charset cs) throws IOException { | ||
StringBuilder sb = new StringBuilder(); | ||
try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) { | ||
CharStreams.copy(reader, sb); | ||
} | ||
return sb.toString(); | ||
} | ||
}, | ||
// It really seems like this should be faster than TO_BYTE_ARRAY_NEW_STRING. But it just isn't | ||
// my best guess is that the jdk authors have spent more time optimizing that callpath than this | ||
// one. (StringCoding$StringDecoder vs. StreamDecoder). StringCoding has a ton of special cases | ||
// theoretically we could duplicate all that logic here to try to beat 'new String' or at least | ||
// come close. | ||
USING_DECODER_WITH_SIZE_HINT { | ||
@Override | ||
String read(ByteSource byteSource, Charset cs) throws IOException { | ||
Optional<Long> size = byteSource.sizeIfKnown(); | ||
// if we know the size and it fits in an int | ||
if (size.isPresent() && size.get().longValue() == size.get().intValue()) { | ||
// otherwise try to presize a StringBuilder | ||
// it is kind of lame that we need to construct a decoder to access this value. | ||
// if this is a concern we could add special cases for some known charsets (like utf8) | ||
// or we could avoid inputstreamreader and use the decoder api directly | ||
// TODO(lukes): in a real implementation we would need to handle overflow conditions | ||
int maxChars = (int) (size.get().intValue() * cs.newDecoder().maxCharsPerByte()); | ||
char[] buffer = new char[maxChars]; | ||
int bufIndex = 0; | ||
int remaining = buffer.length; | ||
try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) { | ||
int nRead = 0; | ||
while (remaining > 0 && (nRead = reader.read(buffer, bufIndex, remaining)) != -1) { | ||
bufIndex += nRead; | ||
remaining -= nRead; | ||
} | ||
if (nRead == -1) { | ||
// we reached EOF | ||
return new String(buffer, 0, bufIndex); | ||
} | ||
// otherwise we got the size wrong. This can happen if the size changes between when | ||
// we called sizeIfKnown and when we started reading the file (or i guess if | ||
// maxCharsPerByte is wrong) | ||
// Fallback to an incremental approach | ||
StringBuilder builder = new StringBuilder(bufIndex + 32); | ||
builder.append(buffer, 0, bufIndex); | ||
buffer = null; // release for gc | ||
CharStreams.copy(reader, builder); | ||
return builder.toString(); | ||
} | ||
|
||
} else { | ||
return TO_BYTE_ARRAY_NEW_STRING.read(byteSource, cs); | ||
} | ||
} | ||
}; | ||
|
||
abstract String read(ByteSource byteSource, Charset cs) throws IOException; | ||
} | ||
|
||
@Param({"UTF-8"}) | ||
String charsetName; | ||
|
||
@Param ReadStrategy strategy; | ||
|
||
@Param({"10", "1024", "1048576"}) | ||
int size; | ||
|
||
Charset charset; | ||
ByteSource data; | ||
|
||
@BeforeExperiment | ||
public void setUp() { | ||
charset = Charset.forName(charsetName); | ||
StringBuilder sb = new StringBuilder(); | ||
Random random = new Random(0xdeadbeef); // for unpredictable but reproducible behavior | ||
sb.ensureCapacity(size); | ||
for (int k = 0; k < size; k++) { | ||
// [9-127) includes all ascii non-control characters | ||
sb.append((char) (random.nextInt(127 - 9) + 9)); | ||
} | ||
String string = sb.toString(); | ||
sb.setLength(0); | ||
data = ByteSource.wrap(string.getBytes(charset)); | ||
} | ||
|
||
@Benchmark | ||
public int timeCopy(int reps) throws IOException { | ||
int r = 0; | ||
final Charset localCharset = charset; | ||
final ByteSource localData = data; | ||
final ReadStrategy localStrategy = strategy; | ||
for (int i = 0; i < reps; i++) { | ||
r += localStrategy.read(localData, localCharset).hashCode(); | ||
} | ||
return r; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
137 changes: 137 additions & 0 deletions
137
guava-tests/benchmark/com/google/common/io/ByteSourceAsCharSourceReadBenchmark.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
/* | ||
* Copyright (C) 2017 The Guava Authors | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except | ||
* in compliance with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed under the License | ||
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express | ||
* or implied. See the License for the specific language governing permissions and limitations under | ||
* the License. | ||
*/ | ||
|
||
package com.google.common.io; | ||
|
||
import com.google.caliper.BeforeExperiment; | ||
import com.google.caliper.Benchmark; | ||
import com.google.caliper.Param; | ||
import com.google.caliper.api.VmOptions; | ||
import com.google.common.base.Optional; | ||
import java.io.IOException; | ||
import java.io.InputStreamReader; | ||
import java.nio.charset.Charset; | ||
import java.util.Random; | ||
|
||
/** | ||
* Benchmarks for various potential implementations of {@code ByteSource.asCharSource(...).read()}. | ||
*/ | ||
// These benchmarks allocate a lot of data so use a large heap | ||
@VmOptions({"-Xms12g", "-Xmx12g", "-d64"}) | ||
public class ByteSourceAsCharSourceReadBenchmark { | ||
enum ReadStrategy { | ||
TO_BYTE_ARRAY_NEW_STRING { | ||
@Override | ||
String read(ByteSource byteSource, Charset cs) throws IOException { | ||
return new String(byteSource.read(), cs); | ||
} | ||
}, | ||
USING_CHARSTREAMS_COPY { | ||
@Override | ||
String read(ByteSource byteSource, Charset cs) throws IOException { | ||
StringBuilder sb = new StringBuilder(); | ||
try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) { | ||
CharStreams.copy(reader, sb); | ||
} | ||
return sb.toString(); | ||
} | ||
}, | ||
// It really seems like this should be faster than TO_BYTE_ARRAY_NEW_STRING. But it just isn't | ||
// my best guess is that the jdk authors have spent more time optimizing that callpath than this | ||
// one. (StringCoding$StringDecoder vs. StreamDecoder). StringCoding has a ton of special cases | ||
// theoretically we could duplicate all that logic here to try to beat 'new String' or at least | ||
// come close. | ||
USING_DECODER_WITH_SIZE_HINT { | ||
@Override | ||
String read(ByteSource byteSource, Charset cs) throws IOException { | ||
Optional<Long> size = byteSource.sizeIfKnown(); | ||
// if we know the size and it fits in an int | ||
if (size.isPresent() && size.get().longValue() == size.get().intValue()) { | ||
// otherwise try to presize a StringBuilder | ||
// it is kind of lame that we need to construct a decoder to access this value. | ||
// if this is a concern we could add special cases for some known charsets (like utf8) | ||
// or we could avoid inputstreamreader and use the decoder api directly | ||
// TODO(lukes): in a real implementation we would need to handle overflow conditions | ||
int maxChars = (int) (size.get().intValue() * cs.newDecoder().maxCharsPerByte()); | ||
char[] buffer = new char[maxChars]; | ||
int bufIndex = 0; | ||
int remaining = buffer.length; | ||
try (InputStreamReader reader = new InputStreamReader(byteSource.openStream(), cs)) { | ||
int nRead = 0; | ||
while (remaining > 0 && (nRead = reader.read(buffer, bufIndex, remaining)) != -1) { | ||
bufIndex += nRead; | ||
remaining -= nRead; | ||
} | ||
if (nRead == -1) { | ||
// we reached EOF | ||
return new String(buffer, 0, bufIndex); | ||
} | ||
// otherwise we got the size wrong. This can happen if the size changes between when | ||
// we called sizeIfKnown and when we started reading the file (or i guess if | ||
// maxCharsPerByte is wrong) | ||
// Fallback to an incremental approach | ||
StringBuilder builder = new StringBuilder(bufIndex + 32); | ||
builder.append(buffer, 0, bufIndex); | ||
buffer = null; // release for gc | ||
CharStreams.copy(reader, builder); | ||
return builder.toString(); | ||
} | ||
|
||
} else { | ||
return TO_BYTE_ARRAY_NEW_STRING.read(byteSource, cs); | ||
} | ||
} | ||
}; | ||
|
||
abstract String read(ByteSource byteSource, Charset cs) throws IOException; | ||
} | ||
|
||
@Param({"UTF-8"}) | ||
String charsetName; | ||
|
||
@Param ReadStrategy strategy; | ||
|
||
@Param({"10", "1024", "1048576"}) | ||
int size; | ||
|
||
Charset charset; | ||
ByteSource data; | ||
|
||
@BeforeExperiment | ||
public void setUp() { | ||
charset = Charset.forName(charsetName); | ||
StringBuilder sb = new StringBuilder(); | ||
Random random = new Random(0xdeadbeef); // for unpredictable but reproducible behavior | ||
sb.ensureCapacity(size); | ||
for (int k = 0; k < size; k++) { | ||
// [9-127) includes all ascii non-control characters | ||
sb.append((char) (random.nextInt(127 - 9) + 9)); | ||
} | ||
String string = sb.toString(); | ||
sb.setLength(0); | ||
data = ByteSource.wrap(string.getBytes(charset)); | ||
} | ||
|
||
@Benchmark | ||
public int timeCopy(int reps) throws IOException { | ||
int r = 0; | ||
final Charset localCharset = charset; | ||
final ByteSource localData = data; | ||
final ReadStrategy localStrategy = strategy; | ||
for (int i = 0; i < reps; i++) { | ||
r += localStrategy.read(localData, localCharset).hashCode(); | ||
} | ||
return r; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters