Skip to content

Commit

Permalink
AVRO-4074: Optimization for Serializing ASCII Strings (#3198)
Browse files Browse the repository at this point in the history
  • Loading branch information
belugabehr authored Oct 7, 2024
1 parent 67263a3 commit 515edcd
Showing 1 changed file with 40 additions and 0 deletions.
40 changes: 40 additions & 0 deletions lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@
*/
public abstract class BinaryEncoder extends Encoder {

// Reusable scratch buffer for writing short ASCII-only strings. writeString(String)
// copies the characters in here (one byte per char) instead of allocating a new
// byte array, so only strings of at most 128 chars can take that path.
// NOTE(review): one buffer per encoder instance, so concurrent writeString calls on
// the same instance would presumably clobber each other -- confirm the threading contract.
private final byte[] stringBuffer = new byte[128];

/**
 * Writes a null value. The body is empty, so this method itself emits no bytes
 * to the output.
 *
 * @throws IOException declared by the signature; nothing in this body throws it
 */
@Override
public void writeNull() throws IOException {
}
Expand All @@ -48,10 +51,47 @@ public void writeString(Utf8 utf8) throws IOException {

/**
 * Writes a string as a length (via {@code writeInt}) followed by the string's
 * bytes (via {@code writeFixed}).
 *
 * <p>
 * Three paths are taken, in order:
 * <ol>
 * <li>Empty string: delegates to {@code writeZero()} and returns (presumably
 * this emits a zero length -- confirm against {@code writeZero}).</li>
 * <li>String no longer than {@code stringBuffer} whose chars are all below
 * 0x80: the chars are narrowed to bytes in the reusable buffer and written from
 * there, avoiding a per-call byte array allocation.</li>
 * <li>Anything else: the string is encoded with
 * {@code String.getBytes(StandardCharsets.UTF_8)} and the resulting array is
 * written.</li>
 * </ol>
 *
 * @param string the string to write; a {@code null} value fails on the
 *               {@code isEmpty()} call with a {@code NullPointerException}
 * @throws IOException if one of the underlying write calls fails
 */
@Override
public void writeString(String string) throws IOException {
/* empty string short-circuit */
if (string.isEmpty()) {
writeZero();
return;
}

/*
 * Assume the String is ASCII. If the ASCII String fits into the existing
 * buffer, copy the characters into the buffer and write it to the underlying
 * Encoder. If the String is too long, or ends up not being ASCII, then
 * fall back to the default JDK mechanism for handling String to byte array.
 */
final int stringLength = string.length();
if (stringLength <= stringBuffer.length) {
boolean onlyAscii = true;
/* The loop condition stops the scan at the first non-ASCII character. */
for (int i = 0; onlyAscii && (i < stringLength); i++) {
/*
 * The char data type is a single 16-bit UTF-16 code unit. ASCII is a 7-bit
 * character encoding. Therefore, if the value is larger than 127, it cannot
 * be ASCII. If it is ASCII, it is safe to narrow the char to a byte.
 */
final char c = string.charAt(i);
if (c >= 0x80) {
/*
 * Bytes already copied into stringBuffer are simply abandoned; the fall-back
 * path below does not read the buffer.
 */
onlyAscii = false;
} else {
stringBuffer[i] = (byte) c;
}
}
if (onlyAscii) {
/*
 * Every char was stored as exactly one byte, so the char count is also the
 * number of bytes to write.
 */
writeInt(stringLength);
writeFixed(stringBuffer, 0, stringLength);
return;
}
}

/*
 * The standard JDK way of turning Strings into byte arrays. Handles the
 * non-ASCII case. However, for ASCII this has the overhead of instantiating a
 * new byte array (which pollutes the heap), and then copying the underlying
 * bytes into the array.
 */
byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
writeInt(bytes.length);
writeFixed(bytes, 0, bytes.length);
Expand Down

0 comments on commit 515edcd

Please sign in to comment.