fix: update GzipReadableByteChannel to be tolerant of one byte reads (#2512)

BenWhitehead · web-flow · commit 87b63f4995f7 · 2024-04-24T14:32:55.000-04:00
If the ReadChannel chunkSize is 0 there will be no library buffering performed during read calls. If gzip decompression support is enabled on the read channel, there is a possibility someone could read fewer than the 4 bytes we initially read.

This change updates the byte tracking to ensure single byte reads are supported.
diff --git a/google-cloud-storage/src/main/java/com/google/cloud/storage/GzipReadableByteChannel.java b/google-cloud-storage/src/main/java/com/google/cloud/storage/GzipReadableByteChannel.java
@@ -36,6 +36,7 @@ final class GzipReadableByteChannel implements UnbufferedReadableByteChannel {
 
   private boolean retEOF = false;
   private ScatteringByteChannel delegate;
+  private ByteBuffer leftovers;
 
   GzipReadableByteChannel(UnbufferedReadableByteChannel source, ApiFuture<String> contentEncoding) {
     this.source = source;
@@ -51,11 +52,11 @@ public long read(ByteBuffer[] dsts, int offset, int length) throws IOException {
     // if our delegate is null, that means this is the first read attempt
     if (delegate == null) {
       // try to determine if the underlying data coming out of `source` is gzip
-      byte[] first4 = new byte[4]; // 4 bytes = 32-bits
-      final ByteBuffer wrap = ByteBuffer.wrap(first4);
-      // Step 1: initiate a read of the first 4 bytes of the object
+      byte[] firstByte = new byte[1];
+      ByteBuffer wrap = ByteBuffer.wrap(firstByte);
+      // Step 1: initiate a read of the first byte of the object
       //   this will have minimal overhead as the messages coming from gcs are inherently windowed
-      //   if the object size is between 5 and 2MiB the remaining bytes will be held in the channel
+      //   if the object size is between 2 and 2MiB the remaining bytes will be held in the channel
       //   for later read.
       source.read(wrap);
       try {
@@ -65,13 +66,13 @@ public long read(ByteBuffer[] dsts, int offset, int length) throws IOException {
         //   this will have a copy impact as we are no longer controlling all the buffers
         if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
           // to wire gzip decompression into the byte path:
-          // Create an input stream of the first4 bytes we already read
-          ByteArrayInputStream first4again = new ByteArrayInputStream(first4);
+          // Create an input stream of the firstByte bytes we already read
+          ByteArrayInputStream firstByteAgain = new ByteArrayInputStream(firstByte);
           // Create an InputStream facade of source
           InputStream sourceInputStream = Channels.newInputStream(source);
-          // create a new InputStream with the first4 bytes prepended to source
+          // create a new InputStream with the firstByte bytes prepended to source
           SequenceInputStream first4AndSource =
-              new SequenceInputStream(first4again, sourceInputStream);
+              new SequenceInputStream(firstByteAgain, sourceInputStream);
           // add gzip decompression
           GZIPInputStream decompress =
               new GZIPInputStream(new OptimisticAvailabilityInputStream(first4AndSource));
@@ -84,14 +85,22 @@ public long read(ByteBuffer[] dsts, int offset, int length) throws IOException {
           // to source
           wrap.flip();
           bytesRead += Buffers.copy(wrap, dsts, offset, length);
+          if (wrap.hasRemaining()) {
+            leftovers = wrap;
+          }
           delegate = source;
         }
       } catch (InterruptedException | ExecutionException e) {
         throw new IOException(e);
       }
+    } else if (leftovers != null && leftovers.hasRemaining()) {
+      bytesRead += Buffers.copy(leftovers, dsts, offset, length);
+      if (!leftovers.hasRemaining()) {
+        leftovers = null;
+      }
     }
 
-    // Because we're pre-reading a few bytes of the object in order to determine if we need to
+    // Because we're pre-reading a byte of the object in order to determine if we need to
     // plumb in gzip decompress, there is the possibility we will reach EOF while probing.
     // In order to maintain correctness of EOF propagation, determine if we will need to signal EOF
     // upon the next read.
diff --git a/google-cloud-storage/src/test/java/com/google/cloud/storage/ITReadChannelGzipHandlingTest.java b/google-cloud-storage/src/test/java/com/google/cloud/storage/ITReadChannelGzipHandlingTest.java
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.cloud.storage;
+
+import static com.google.common.truth.Truth.assertThat;
+
+import com.google.cloud.ReadChannel;
+import com.google.cloud.storage.Storage.BlobSourceOption;
+import com.google.cloud.storage.TransportCompatibility.Transport;
+import com.google.cloud.storage.it.runner.StorageITRunner;
+import com.google.cloud.storage.it.runner.annotations.Backend;
+import com.google.cloud.storage.it.runner.annotations.CrossRun;
+import com.google.cloud.storage.it.runner.annotations.Inject;
+import com.google.cloud.storage.it.runner.registry.ObjectsFixture;
+import com.google.cloud.storage.it.runner.registry.ObjectsFixture.ObjectAndContent;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+@RunWith(StorageITRunner.class)
+@CrossRun(
+    backends = {Backend.PROD},
+    transports = {Transport.HTTP, Transport.GRPC})
+public final class ITReadChannelGzipHandlingTest {
+
+  @Inject public Storage storage;
+  @Inject public ObjectsFixture objFixture;
+
+  @Test
+  public void nonGzipObjectReadOneByteAtATimeNoLibraryBuffering() throws IOException {
+    ObjectAndContent obj512KiB = objFixture.getObj512KiB();
+    BlobInfo info = obj512KiB.getInfo();
+    BlobId blobId = info.getBlobId();
+    byte[] bytes = new byte[1];
+    BlobSourceOption attemptGzipDecompression = BlobSourceOption.shouldReturnRawInputStream(false);
+    try (ReadChannel reader = storage.reader(blobId, attemptGzipDecompression)) {
+      reader.setChunkSize(0);
+
+      // read zero bytes, to trigger things to startup but don't actually pull out any bytes yes
+      reader.read(ByteBuffer.allocate(0));
+
+      byte[] content = obj512KiB.getContent().getBytes();
+      for (int i = 0; i < info.getSize(); i++) {
+        int read = reader.read(ByteBuffer.wrap(bytes));
+        assertThat(read).isEqualTo(1);
+        byte b = bytes[0];
+        assertThat(b).isEqualTo(content[i]);
+      }
+    }
+  }
+}