bazelbuild · fmeum · May 19, 2023 · Sep 5, 2023
diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/BUILD b/src/main/java/com/google/devtools/build/lib/bazel/repository/BUILD
@@ -50,6 +50,7 @@ java_library(
         "//src/main/java/com/google/devtools/common/options",
         "//src/main/java/net/starlark/java/eval",
         "//third_party:apache_commons_compress",
+        "//third_party:auto_service",
         "//third_party:auto_value",
         "//third_party:flogger",
         "//third_party:guava",

diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/CompressedTarFunction.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/CompressedTarFunction.java
@@ -15,8 +15,10 @@
 package com.google.devtools.build.lib.bazel.repository;
 
 import static com.google.devtools.build.lib.bazel.repository.StripPrefixedPath.maybeDeprefixSymlink;
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
 
-import com.google.common.base.Optional;
+import com.google.auto.service.AutoService;
 import com.google.common.io.ByteStreams;
 import com.google.devtools.build.lib.bazel.repository.DecompressorValue.Decompressor;
 import com.google.devtools.build.lib.vfs.FileSystemUtils;
@@ -25,16 +27,33 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.spi.CharsetProvider;
+import java.util.Collections;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.Map;
+import java.util.Optional;
 import java.util.Set;
+import java.util.UUID;
 import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
 import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
 
 /**
  * Common code for unarchiving a compressed TAR file.
+ *
+ * <p>TAR file entries commonly use one of two formats: PAX, which uses UTF-8 encoding for all
+ * strings, and USTAR, which does not specify an encoding. This class interprets USTAR headers as
+ * latin-1, thus preserving the original bytes of the header without enforcing any particular
+ * encoding. Internally, for file system operations, all strings are converted into Bazel's internal
+ * representation of raw bytes stored as latin-1 strings.
  */
 public abstract class CompressedTarFunction implements Decompressor {
   protected abstract InputStream getDecompressorStream(DecompressorDescriptor descriptor)
@@ -54,20 +73,23 @@ public Path decompress(DecompressorDescriptor descriptor)
     Map<Path, PathFragment> symlinks = new HashMap<>();
 
     try (InputStream decompressorStream = getDecompressorStream(descriptor)) {
-      TarArchiveInputStream tarStream = new TarArchiveInputStream(decompressorStream);
+      // USTAR tar headers use an unspecified encoding whereas PAX tar headers always use UTF-8.
+      // We can specify the encoding to use for USTAR headers, but the Charset used for PAX headers
+      // is fixed to UTF-8. We thus specify a custom Charset for the former so that we can
+      // distinguish between the two.
+      TarArchiveInputStream tarStream =
+          new TarArchiveInputStream(decompressorStream, MarkedIso88591Charset.NAME);
       TarArchiveEntry entry;
       while ((entry = tarStream.getNextTarEntry()) != null) {
-        String entryName = entry.getName();
+        String entryName = toRawBytesString(entry.getName());
         entryName = renameFiles.getOrDefault(entryName, entryName);
-        StripPrefixedPath entryPath = StripPrefixedPath.maybeDeprefix(entryName, prefix);
+        StripPrefixedPath entryPath =
+            StripPrefixedPath.maybeDeprefix(entryName.getBytes(ISO_8859_1), prefix);
         foundPrefix = foundPrefix || entryPath.foundPrefix();
 
         if (prefix.isPresent() && !foundPrefix) {
-          Optional<String> suggestion =
-              CouldNotFindPrefixException.maybeMakePrefixSuggestion(entryPath.getPathFragment());
-          if (suggestion.isPresent()) {
-            availablePrefixes.add(suggestion.get());
-          }
+          CouldNotFindPrefixException.maybeMakePrefixSuggestion(entryPath.getPathFragment())
+              .ifPresent(availablePrefixes::add);
         }
 
         if (entryPath.skip()) {
@@ -80,8 +102,11 @@ public Path decompress(DecompressorDescriptor descriptor)
           filePath.createDirectoryAndParents();
         } else {
           if (entry.isSymbolicLink() || entry.isLink()) {
-            PathFragment targetName = PathFragment.create(entry.getLinkName());
-            targetName = maybeDeprefixSymlink(targetName, prefix, descriptor.destinationPath());
+            PathFragment targetName =
+                maybeDeprefixSymlink(
+                    toRawBytesString(entry.getLinkName()).getBytes(ISO_8859_1),
+                    prefix,
+                    descriptor.destinationPath());
             if (entry.isSymbolicLink()) {
               symlinks.put(filePath, targetName);
             } else {
@@ -135,4 +160,99 @@ public Path decompress(DecompressorDescriptor descriptor)
 
     return descriptor.destinationPath();
   }
+
+  /**
+   * Converts a name decoded with either UTF-8 (PAX headers) or the special {@link
+   * MarkedIso88591Charset} into a string holding the name's raw bytes, one char per byte.
+   */
+  private static String toRawBytesString(String name) {
+    Optional<String> rawIfMarked = MarkedIso88591Charset.getRawBytesStringIfMarked(name);
+    if (rawIfMarked.isPresent()) {
+      return rawIfMarked.get();
+    }
+    // No marker: the name was decoded as UTF-8, so re-encode it and widen each byte to a char.
+    return new String(name.getBytes(UTF_8), ISO_8859_1);
+  }
+
+  @AutoService(CharsetProvider.class)
+  public static class MarkedIso88591CharsetProvider extends CharsetProvider {
+    private static final Charset CHARSET = new MarkedIso88591Charset();
+
+    @Override
+    public Iterator<Charset> charsets() {
+      // This charset is only meant for internal use within CompressedTarFunction and thus should
+      // not be discoverable.
+      return Collections.emptyIterator();
+    }
+
+    @Override
+    public Charset charsetForName(String charsetName) {
+      return MarkedIso88591Charset.NAME.equals(charsetName) ? CHARSET : null;
+    }
+  }
+
+  /**
+   * A charset that decodes ISO-8859-1, i.e., produces a String that contains the raw decoded
+   * bytes, and appends a marker to the end of the string to indicate that it was decoded with this
+   * charset.
+   */
+  private static class MarkedIso88591Charset extends Charset {
+    // The name doubles as the marker appended to decoded strings. It must not collide with the
+    // name of any other charset, must not appear in archive entry names by chance and is internal
+    // to CompressedTarFunction. This is best served by a cryptographically random UUID, which is
+    // generated once when this class is initialized.
+    private static final String NAME = UUID.randomUUID().toString();
+
+    private MarkedIso88591Charset() {
+      super(NAME, new String[0]);
+    }
+
+    /** Returns {@code s} with every marker occurrence removed, or empty if it contains none. */
+    public static Optional<String> getRawBytesStringIfMarked(String s) {
+      // The marker may appear anywhere: TarArchiveInputStream manipulates the raw name in certain
+      // cases (e.g., appending a '/' to directory names). It is removed as a literal, not a regex.
+      if (s.contains(NAME)) {
+        return Optional.of(s.replace(NAME, ""));
+      }
+      return Optional.empty();
+    }
+
+    @Override
+    public CharsetDecoder newDecoder() {
+      return new CharsetDecoder(this, 1, 1) {
+        @Override
+        protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
+          // A simple unoptimized ISO-8859-1 decoder: each byte becomes the char of the same value.
+          while (in.hasRemaining()) {
+            if (!out.hasRemaining()) {
+              return CoderResult.OVERFLOW;
+            }
+            out.put((char) (in.get() & 0xFF));
+          }
+          return CoderResult.UNDERFLOW;
+        }
+
+        @Override
+        protected CoderResult implFlush(CharBuffer out) {
+          // Append the marker to the end of the output to indicate that it was decoded with this
+          // charset. On OVERFLOW, the caller is expected to retry with a larger buffer.
+          if (out.remaining() < NAME.length()) {
+            return CoderResult.OVERFLOW;
+          }
+          out.put(NAME);
+          return CoderResult.UNDERFLOW;
+        }
+      };
+    }
+
+    @Override
+    public CharsetEncoder newEncoder() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean contains(Charset cs) {
+      return false;
+    }
+  }
 }
diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorDescriptor.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorDescriptor.java
@@ -15,10 +15,10 @@
 package com.google.devtools.build.lib.bazel.repository;
 
 import com.google.auto.value.AutoValue;
-import com.google.common.base.Optional;
 import com.google.common.collect.ImmutableMap;
 import com.google.devtools.build.lib.vfs.Path;
 import java.util.Map;
+import java.util.Optional;
 
 /** Description of an archive to be decompressed. */
 @AutoValue

diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorValue.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorValue.java
@@ -14,14 +14,17 @@
 
 package com.google.devtools.build.lib.bazel.repository;
 
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Optional;
 import com.google.devtools.build.lib.rules.repository.RepositoryFunction.RepositoryFunctionException;
 import com.google.devtools.build.lib.vfs.Path;
 import com.google.devtools.build.lib.vfs.PathFragment;
 import com.google.devtools.build.skyframe.SkyFunctionException.Transience;
 import com.google.devtools.build.skyframe.SkyValue;
 import java.io.IOException;
+import java.util.Optional;
 import java.util.Set;
 import net.starlark.java.eval.Starlark;
 
@@ -59,9 +62,14 @@ private static String prepareErrorMessage(String prefix, Set<String> availablePr
       }
 
       public static Optional<String> maybeMakePrefixSuggestion(PathFragment pathFragment) {
-        return pathFragment.isMultiSegment()
-            ? Optional.of(pathFragment.getSegment(0))
-            : Optional.absent();
+        if (!pathFragment.isMultiSegment()) {
+          return Optional.empty();
+        }
+        String rawFirstSegment = pathFragment.getSegment(0);
+        // Users can only specify prefixes from Starlark, which is planned to use UTF-8 for all
+        // strings, but currently still collects the raw bytes in a latin-1 string. We thus recover
+        // those bytes and optimistically decode them as UTF-8 here, for display purposes only.
+        return Optional.of(new String(rawFirstSegment.getBytes(ISO_8859_1), UTF_8));
       }
     }
 

diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/StripPrefixedPath.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/StripPrefixedPath.java
@@ -14,11 +14,13 @@
 
 package com.google.devtools.build.lib.bazel.repository;
 
-import com.google.common.base.Optional;
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
 import com.google.common.base.Preconditions;
 import com.google.devtools.build.lib.concurrent.ThreadSafety;
 import com.google.devtools.build.lib.vfs.Path;
 import com.google.devtools.build.lib.vfs.PathFragment;
+import java.util.Optional;
 
 /**
  * Utility class for removing a prefix from an archive's path.
@@ -36,17 +38,19 @@ public final class StripPrefixedPath {
    * could cause collisions, if a zip file had one entry for bin/some-binary and another entry for
    * /bin/some-binary.
    *
-   * Note that the prefix is stripped to move the files up one level, so if you have an entry
+   * <p>Note that the prefix is stripped to move the files up one level, so if you have an entry
    * "foo/../bar" and a prefix of "foo", the result will be "bar" not "../bar".
    */
-  public static StripPrefixedPath maybeDeprefix(String entry, Optional<String> prefix) {
+  public static StripPrefixedPath maybeDeprefix(byte[] entry, Optional<String> prefix) {
     Preconditions.checkNotNull(entry);
     PathFragment entryPath = relativize(entry);
-    if (!prefix.isPresent()) {
+    if (prefix.isEmpty()) {
       return new StripPrefixedPath(entryPath, false, false);
     }
 
-    PathFragment prefixPath = relativize(prefix.get());
+    // Bazel parses Starlark files, which are the ultimate source of prefixes, as Latin-1
+    // (ISO-8859-1).
+    PathFragment prefixPath = relativize(prefix.get().getBytes(ISO_8859_1));
     boolean found = false;
     boolean skip = false;
     if (entryPath.startsWith(prefixPath)) {
@@ -64,8 +68,8 @@ public static StripPrefixedPath maybeDeprefix(String entry, Optional<String> pre
   /**
    * Normalize the path and, if it is absolute, make it relative (e.g., /foo/bar becomes foo/bar).
    */
-  private static PathFragment relativize(String path) {
-    PathFragment entryPath = PathFragment.create(path);
+  private static PathFragment relativize(byte[] path) {
+    PathFragment entryPath = createPathFragment(path);
     if (entryPath.isAbsolute()) {
       entryPath = entryPath.toRelative();
     }
@@ -79,10 +83,10 @@ private StripPrefixedPath(PathFragment pathFragment, boolean found, boolean skip
   }
 
   public static PathFragment maybeDeprefixSymlink(
-      PathFragment linkPathFragment, Optional<String> prefix, Path root) {
-    boolean wasAbsolute = linkPathFragment.isAbsolute();
+      byte[] rawTarget, Optional<String> prefix, Path root) {
+    boolean wasAbsolute = createPathFragment(rawTarget).isAbsolute();
     // Strip the prefix from the link path if set.
-    linkPathFragment = maybeDeprefix(linkPathFragment.getPathString(), prefix).getPathFragment();
+    PathFragment linkPathFragment = maybeDeprefix(rawTarget, prefix).getPathFragment();
     if (wasAbsolute) {
       // Recover the path to an absolute path as maybeDeprefix() relativize the path
       // even if the prefix is not set
@@ -103,4 +107,10 @@ public boolean skip() {
     return skip;
   }
 
+  static PathFragment createPathFragment(byte[] rawBytes) {
+    // Bazel internally represents paths as raw bytes stored in Latin-1 strings; the conversion is
+    // lossless since new String(b, ISO_8859_1).getBytes(ISO_8859_1) equals b for every byte[] b.
+    String latin1Path = new String(rawBytes, ISO_8859_1);
+    return PathFragment.create(latin1Path);
+  }
 }