From 11251a3bb3845763dc85e43c019f3237c887cf04 Mon Sep 17 00:00:00 2001 From: Fabian Meumertzheim Date: Fri, 19 May 2023 10:06:37 +0200 Subject: [PATCH 1/2] Fix handling of non-ASCII characters in archive entry file names When creating a `PathFragment` from a ZIP or TAR entry file name, the raw bytes of the name are now wrapped into a Latin-1 encoded String, which is how Bazel internally represents file paths. Previously, ZIP entries as well as TAR entries with PAX headers would result in ordinary decoded Java strings, resulting in corrupted file names when passed to Bazel's file system operations. --- .../devtools/build/lib/bazel/repository/BUILD | 1 + .../repository/CompressedTarFunction.java | 23 ++++--- .../repository/DecompressorDescriptor.java | 2 +- .../bazel/repository/DecompressorValue.java | 16 +++-- .../bazel/repository/StripPrefixedPath.java | 30 ++++++--- .../lib/bazel/repository/ZipDecompressor.java | 33 +++++----- .../repository/StripPrefixedPathTest.java | 40 ++++++------ src/test/shell/bazel/bazel_workspaces_test.sh | 63 +++++++++++++++++++ 8 files changed, 149 insertions(+), 59 deletions(-) diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/BUILD b/src/main/java/com/google/devtools/build/lib/bazel/repository/BUILD index a882276a8273a7..0c8809977e69f1 100644 --- a/src/main/java/com/google/devtools/build/lib/bazel/repository/BUILD +++ b/src/main/java/com/google/devtools/build/lib/bazel/repository/BUILD @@ -41,6 +41,7 @@ java_library( "//src/main/java/com/google/devtools/build/lib/rules:repository/repository_function", "//src/main/java/com/google/devtools/build/lib/rules:repository/workspace_base_rule", "//src/main/java/com/google/devtools/build/lib/rules:repository/workspace_configured_target_factory", + "//src/main/java/com/google/devtools/build/lib/unsafe:string", "//src/main/java/com/google/devtools/build/lib/util", "//src/main/java/com/google/devtools/build/lib/util:os", "//src/main/java/com/google/devtools/build/lib/vfs", diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/CompressedTarFunction.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/CompressedTarFunction.java index f38e7d56894ac2..dd591595b11a19 100644 --- a/src/main/java/com/google/devtools/build/lib/bazel/repository/CompressedTarFunction.java +++ b/src/main/java/com/google/devtools/build/lib/bazel/repository/CompressedTarFunction.java @@ -15,8 +15,8 @@ package com.google.devtools.build.lib.bazel.repository; import static com.google.devtools.build.lib.bazel.repository.StripPrefixedPath.maybeDeprefixSymlink; +import static java.nio.charset.StandardCharsets.UTF_8; -import com.google.common.base.Optional; import com.google.common.io.ByteStreams; import com.google.devtools.build.lib.bazel.repository.DecompressorValue.Decompressor; import com.google.devtools.build.lib.vfs.FileSystemUtils; @@ -29,6 +29,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Map; +import java.util.Optional; import java.util.Set; import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; @@ -54,20 +55,21 @@ public Path decompress(DecompressorDescriptor descriptor) Map symlinks = new HashMap<>(); try (InputStream decompressorStream = getDecompressorStream(descriptor)) { - TarArchiveInputStream tarStream = new TarArchiveInputStream(decompressorStream); + // USTAR tar headers use an unspecified encoding whereas PAX tar headers always use UTF-8. + // Since this would result in ambiguity with Bazel forcing the default JVM encoding to be + // ISO-8859-1, we explicitly specify the encoding as UTF-8. + TarArchiveInputStream tarStream = new TarArchiveInputStream(decompressorStream, UTF_8.name()); TarArchiveEntry entry; while ((entry = tarStream.getNextTarEntry()) != null) { String entryName = entry.getName(); entryName = renameFiles.getOrDefault(entryName, entryName); - StripPrefixedPath entryPath = StripPrefixedPath.maybeDeprefix(entryName, prefix); + StripPrefixedPath entryPath = + StripPrefixedPath.maybeDeprefix(entryName.getBytes(UTF_8), prefix); foundPrefix = foundPrefix || entryPath.foundPrefix(); if (prefix.isPresent() && !foundPrefix) { - Optional suggestion = - CouldNotFindPrefixException.maybeMakePrefixSuggestion(entryPath.getPathFragment()); - if (suggestion.isPresent()) { - availablePrefixes.add(suggestion.get()); - } + CouldNotFindPrefixException.maybeMakePrefixSuggestion(entryPath.getPathFragment()) + .ifPresent(availablePrefixes::add); } if (entryPath.skip()) { @@ -80,8 +82,9 @@ public Path decompress(DecompressorDescriptor descriptor) filePath.createDirectoryAndParents(); } else { if (entry.isSymbolicLink() || entry.isLink()) { - PathFragment targetName = PathFragment.create(entry.getLinkName()); - targetName = maybeDeprefixSymlink(targetName, prefix, descriptor.destinationPath()); + PathFragment targetName = + maybeDeprefixSymlink( + entry.getLinkName().getBytes(UTF_8), prefix, descriptor.destinationPath()); if (entry.isSymbolicLink()) { symlinks.put(filePath, targetName); } else { diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorDescriptor.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorDescriptor.java index 2676fdba14d091..c40d5ab35f38da 100644 --- a/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorDescriptor.java +++ b/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorDescriptor.java @@ -15,10 +15,10 @@ package com.google.devtools.build.lib.bazel.repository; import com.google.auto.value.AutoValue; -import com.google.common.base.Optional; import com.google.common.collect.ImmutableMap; import com.google.devtools.build.lib.vfs.Path; import java.util.Map; +import java.util.Optional; /** Description of an archive to be decompressed. */ @AutoValue diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorValue.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorValue.java index e32a2decb18741..9deddba06b887f 100644 --- a/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorValue.java +++ b/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorValue.java @@ -14,14 +14,17 @@ package com.google.devtools.build.lib.bazel.repository; +import static java.nio.charset.StandardCharsets.ISO_8859_1; +import static java.nio.charset.StandardCharsets.UTF_8; + import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Optional; import com.google.devtools.build.lib.rules.repository.RepositoryFunction.RepositoryFunctionException; import com.google.devtools.build.lib.vfs.Path; import com.google.devtools.build.lib.vfs.PathFragment; import com.google.devtools.build.skyframe.SkyFunctionException.Transience; import com.google.devtools.build.skyframe.SkyValue; import java.io.IOException; +import java.util.Optional; import java.util.Set; import net.starlark.java.eval.Starlark; @@ -59,9 +62,14 @@ private static String prepareErrorMessage(String prefix, Set availablePr } public static Optional maybeMakePrefixSuggestion(PathFragment pathFragment) { - return pathFragment.isMultiSegment() - ? Optional.of(pathFragment.getSegment(0)) - : Optional.absent(); + if (!pathFragment.isMultiSegment()) { + return Optional.empty(); + } + String rawFirstSegment = pathFragment.getSegment(0); + // Users can only specify prefixes from Starlark, where strings always use UTF-8, so we + // optimistically decode with it here even though we do not know the original encoding. + return Optional.of(new String(rawFirstSegment.getBytes(ISO_8859_1), UTF_8)); +// return Optional.of(rawFirstSegment); } } diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/StripPrefixedPath.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/StripPrefixedPath.java index 77c5815e79a50d..5852b4d0ea5303 100644 --- a/src/main/java/com/google/devtools/build/lib/bazel/repository/StripPrefixedPath.java +++ b/src/main/java/com/google/devtools/build/lib/bazel/repository/StripPrefixedPath.java @@ -14,11 +14,13 @@ package com.google.devtools.build.lib.bazel.repository; -import com.google.common.base.Optional; +import static java.nio.charset.StandardCharsets.ISO_8859_1; + import com.google.common.base.Preconditions; import com.google.devtools.build.lib.concurrent.ThreadSafety; import com.google.devtools.build.lib.vfs.Path; import com.google.devtools.build.lib.vfs.PathFragment; +import java.util.Optional; /** * Utility class for removing a prefix from an archive's path. @@ -36,17 +38,19 @@ public final class StripPrefixedPath { * could cause collisions, if a zip file had one entry for bin/some-binary and another entry for * /bin/some-binary. * - * Note that the prefix is stripped to move the files up one level, so if you have an entry + *

Note that the prefix is stripped to move the files up one level, so if you have an entry * "foo/../bar" and a prefix of "foo", the result will be "bar" not "../bar". */ - public static StripPrefixedPath maybeDeprefix(String entry, Optional prefix) { + public static StripPrefixedPath maybeDeprefix(byte[] entry, Optional prefix) { Preconditions.checkNotNull(entry); PathFragment entryPath = relativize(entry); - if (!prefix.isPresent()) { + if (prefix.isEmpty()) { return new StripPrefixedPath(entryPath, false, false); } - PathFragment prefixPath = relativize(prefix.get()); + // Bazel parses Starlark files, which are the ultimate source of prefixes, as Latin-1 + // (ISO-8859-1). + PathFragment prefixPath = relativize(prefix.get().getBytes(ISO_8859_1)); boolean found = false; boolean skip = false; if (entryPath.startsWith(prefixPath)) { @@ -64,8 +68,8 @@ public static StripPrefixedPath maybeDeprefix(String entry, Optional pre /** * Normalize the path and, if it is absolute, make it relative (e.g., /foo/bar becomes foo/bar). */ - private static PathFragment relativize(String path) { - PathFragment entryPath = PathFragment.create(path); + private static PathFragment relativize(byte[] path) { + PathFragment entryPath = createPathFragment(path); if (entryPath.isAbsolute()) { entryPath = entryPath.toRelative(); } @@ -79,10 +83,10 @@ private StripPrefixedPath(PathFragment pathFragment, boolean found, boolean skip } public static PathFragment maybeDeprefixSymlink( - PathFragment linkPathFragment, Optional prefix, Path root) { - boolean wasAbsolute = linkPathFragment.isAbsolute(); + byte[] rawTarget, Optional prefix, Path root) { + boolean wasAbsolute = createPathFragment(rawTarget).isAbsolute(); // Strip the prefix from the link path if set. - linkPathFragment = maybeDeprefix(linkPathFragment.getPathString(), prefix).getPathFragment(); + PathFragment linkPathFragment = maybeDeprefix(rawTarget, prefix).getPathFragment(); if (wasAbsolute) { // Recover the path to an absolute path as maybeDeprefix() relativize the path // even if the prefix is not set @@ -103,4 +107,10 @@ public boolean skip() { return skip; } + static PathFragment createPathFragment(byte[] rawBytes) { + // Bazel internally represents paths as raw bytes by using the Latin-1 encoding, which has the + // property that (new String(bytes, ISO_8859_1)).getBytes(ISO_8859_1)) equals bytes for every + // byte array bytes. + return PathFragment.create(new String(rawBytes, ISO_8859_1)); + } } diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/ZipDecompressor.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/ZipDecompressor.java index 8990e7c4c08e64..4a8f7709a5798b 100644 --- a/src/main/java/com/google/devtools/build/lib/bazel/repository/ZipDecompressor.java +++ b/src/main/java/com/google/devtools/build/lib/bazel/repository/ZipDecompressor.java @@ -15,9 +15,9 @@ package com.google.devtools.build.lib.bazel.repository; import static com.google.devtools.build.lib.bazel.repository.StripPrefixedPath.maybeDeprefixSymlink; +import static java.nio.charset.StandardCharsets.UTF_8; import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Optional; import com.google.common.base.Preconditions; import com.google.common.io.ByteStreams; import com.google.devtools.build.lib.bazel.repository.DecompressorValue.Decompressor; @@ -29,11 +29,11 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -import java.nio.charset.Charset; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.Map; +import java.util.Optional; import java.util.Set; import javax.annotation.Nullable; @@ -89,7 +89,8 @@ public Path decompress(DecompressorDescriptor descriptor) for (ZipFileEntry entry : entries) { String entryName = entry.getName(); entryName = renameFiles.getOrDefault(entryName, entryName); - StripPrefixedPath entryPath = StripPrefixedPath.maybeDeprefix(entryName, prefix); + StripPrefixedPath entryPath = + StripPrefixedPath.maybeDeprefix(entryName.getBytes(UTF_8), prefix); foundPrefix = foundPrefix || entryPath.foundPrefix(); if (entryPath.skip()) { continue; @@ -102,12 +103,9 @@ public Path decompress(DecompressorDescriptor descriptor) Set prefixes = new HashSet<>(); for (ZipFileEntry entry : entries) { StripPrefixedPath entryPath = - StripPrefixedPath.maybeDeprefix(entry.getName(), Optional.absent()); - Optional suggestion = - CouldNotFindPrefixException.maybeMakePrefixSuggestion(entryPath.getPathFragment()); - if (suggestion.isPresent()) { - prefixes.add(suggestion.get()); - } + StripPrefixedPath.maybeDeprefix(entry.getName().getBytes(UTF_8), Optional.empty()); + CouldNotFindPrefixException.maybeMakePrefixSuggestion(entryPath.getPathFragment()) + .ifPresent(prefixes::add); } throw new CouldNotFindPrefixException(prefix.get(), prefixes); } @@ -146,17 +144,22 @@ private static void extractZipEntry( // For symlinks, the "compressed data" is actually the target name. int read = reader.getInputStream(entry).read(buffer); Preconditions.checkState(read == buffer.length); - PathFragment target = PathFragment.create(new String(buffer, Charset.defaultCharset())); + + PathFragment target = StripPrefixedPath.createPathFragment(buffer); if (target.containsUplevelReferences()) { PathFragment pointsTo = strippedRelativePath.getParentDirectory().getRelative(target); if (pointsTo.containsUplevelReferences()) { - throw new IOException("Zip entries cannot refer to files outside of their directory: " - + reader.getFilename() + " has a symlink " + strippedRelativePath + " pointing to " - + target); + throw new IOException( + "Zip entries cannot refer to files outside of their directory: " + + reader.getFilename() + + " has a symlink " + + strippedRelativePath + + " pointing to " + + new String(buffer, UTF_8)); } } - target = maybeDeprefixSymlink(target, prefix, destinationDirectory); - symlinks.put(outputPath, target); + + symlinks.put(outputPath, maybeDeprefixSymlink(buffer, prefix, destinationDirectory)); } else { try (InputStream input = reader.getInputStream(entry); OutputStream output = outputPath.getOutputStream()) { diff --git a/src/test/java/com/google/devtools/build/lib/bazel/repository/StripPrefixedPathTest.java b/src/test/java/com/google/devtools/build/lib/bazel/repository/StripPrefixedPathTest.java index df1ea4db0fb19f..784b16348e0580 100644 --- a/src/test/java/com/google/devtools/build/lib/bazel/repository/StripPrefixedPathTest.java +++ b/src/test/java/com/google/devtools/build/lib/bazel/repository/StripPrefixedPathTest.java @@ -15,13 +15,14 @@ package com.google.devtools.build.lib.bazel.repository; import static com.google.common.truth.Truth.assertThat; +import static java.nio.charset.StandardCharsets.UTF_8; -import com.google.common.base.Optional; import com.google.devtools.build.lib.clock.BlazeClock; import com.google.devtools.build.lib.util.OS; import com.google.devtools.build.lib.vfs.DigestHashFunction; import com.google.devtools.build.lib.vfs.PathFragment; import com.google.devtools.build.lib.vfs.inmemoryfs.InMemoryFileSystem; +import java.util.Optional; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -33,31 +34,32 @@ public class StripPrefixedPathTest { @Test public void testStrip() { - StripPrefixedPath result = StripPrefixedPath.maybeDeprefix("foo/bar", Optional.of("foo")); + StripPrefixedPath result = + StripPrefixedPath.maybeDeprefix("foo/bar".getBytes(UTF_8), Optional.of("foo")); assertThat(PathFragment.create("bar")).isEqualTo(result.getPathFragment()); assertThat(result.foundPrefix()).isTrue(); assertThat(result.skip()).isFalse(); - result = StripPrefixedPath.maybeDeprefix("foo", Optional.of("foo")); + result = StripPrefixedPath.maybeDeprefix("foo".getBytes(UTF_8), Optional.of("foo")); assertThat(result.skip()).isTrue(); - result = StripPrefixedPath.maybeDeprefix("bar/baz", Optional.of("foo")); + result = StripPrefixedPath.maybeDeprefix("bar/baz".getBytes(UTF_8), Optional.of("foo")); assertThat(result.foundPrefix()).isFalse(); - result = StripPrefixedPath.maybeDeprefix("foof/bar", Optional.of("foo")); + result = StripPrefixedPath.maybeDeprefix("foof/bar".getBytes(UTF_8), Optional.of("foo")); assertThat(result.foundPrefix()).isFalse(); } @Test public void testAbsolute() { - StripPrefixedPath result = StripPrefixedPath.maybeDeprefix( - "/foo/bar", Optional.absent()); + StripPrefixedPath result = + StripPrefixedPath.maybeDeprefix("/foo/bar".getBytes(UTF_8), Optional.empty()); assertThat(result.getPathFragment()).isEqualTo(PathFragment.create("foo/bar")); - result = StripPrefixedPath.maybeDeprefix("///foo/bar/baz", Optional.absent()); + result = StripPrefixedPath.maybeDeprefix("///foo/bar/baz".getBytes(UTF_8), Optional.empty()); assertThat(result.getPathFragment()).isEqualTo(PathFragment.create("foo/bar/baz")); - result = StripPrefixedPath.maybeDeprefix("/foo/bar/baz", Optional.of("/foo")); + result = StripPrefixedPath.maybeDeprefix("/foo/bar/baz".getBytes(UTF_8), Optional.of("/foo")); assertThat(result.getPathFragment()).isEqualTo(PathFragment.create("bar/baz")); } @@ -66,21 +68,21 @@ public void testWindowsAbsolute() { if (OS.getCurrent() != OS.WINDOWS) { return; } - StripPrefixedPath result = StripPrefixedPath.maybeDeprefix( - "c:/foo/bar", Optional.absent()); + StripPrefixedPath result = + StripPrefixedPath.maybeDeprefix("c:/foo/bar".getBytes(UTF_8), Optional.empty()); assertThat(result.getPathFragment()).isEqualTo(PathFragment.create("foo/bar")); } @Test public void testNormalize() { - StripPrefixedPath result = StripPrefixedPath.maybeDeprefix( - "../bar", Optional.absent()); + StripPrefixedPath result = + StripPrefixedPath.maybeDeprefix("../bar".getBytes(UTF_8), Optional.empty()); assertThat(result.getPathFragment()).isEqualTo(PathFragment.create("../bar")); - result = StripPrefixedPath.maybeDeprefix("foo/../baz", Optional.absent()); + result = StripPrefixedPath.maybeDeprefix("foo/../baz".getBytes(UTF_8), Optional.empty()); assertThat(result.getPathFragment()).isEqualTo(PathFragment.create("baz")); - result = StripPrefixedPath.maybeDeprefix("foo/../baz", Optional.of("foo")); + result = StripPrefixedPath.maybeDeprefix("foo/../baz".getBytes(UTF_8), Optional.of("foo")); assertThat(result.getPathFragment()).isEqualTo(PathFragment.create("baz")); } @@ -91,23 +93,23 @@ public void testDeprefixSymlink() { PathFragment relativeNoPrefix = StripPrefixedPath.maybeDeprefixSymlink( - PathFragment.create("a/b"), Optional.absent(), fileSystem.getPath("/usr")); + "a/b".getBytes(UTF_8), Optional.empty(), fileSystem.getPath("/usr")); // there is no attempt to get absolute path for the relative symlinks target path assertThat(relativeNoPrefix).isEqualTo(PathFragment.create("a/b")); PathFragment absoluteNoPrefix = StripPrefixedPath.maybeDeprefixSymlink( - PathFragment.create("/a/b"), Optional.absent(), fileSystem.getPath("/usr")); + "/a/b".getBytes(UTF_8), Optional.empty(), fileSystem.getPath("/usr")); assertThat(absoluteNoPrefix).isEqualTo(PathFragment.create("/usr/a/b")); PathFragment absolutePrefix = StripPrefixedPath.maybeDeprefixSymlink( - PathFragment.create("/root/a/b"), Optional.of("root"), fileSystem.getPath("/usr")); + "/root/a/b".getBytes(UTF_8), Optional.of("root"), fileSystem.getPath("/usr")); assertThat(absolutePrefix).isEqualTo(PathFragment.create("/usr/a/b")); PathFragment relativePrefix = StripPrefixedPath.maybeDeprefixSymlink( - PathFragment.create("root/a/b"), Optional.of("root"), fileSystem.getPath("/usr")); + "root/a/b".getBytes(UTF_8), Optional.of("root"), fileSystem.getPath("/usr")); // there is no attempt to get absolute path for the relative symlinks target path assertThat(relativePrefix).isEqualTo(PathFragment.create("a/b")); } diff --git a/src/test/shell/bazel/bazel_workspaces_test.sh b/src/test/shell/bazel/bazel_workspaces_test.sh index a33b443f17c143..943ee774a85275 100755 --- a/src/test/shell/bazel/bazel_workspaces_test.sh +++ b/src/test/shell/bazel/bazel_workspaces_test.sh @@ -451,6 +451,69 @@ function test_extract_rename_files() { ensure_output_contains_exactly_once "external/repo/out_dir/renamed-A.txt" "Second file: A" } +function test_extract_pax_tar_non_ascii_file_names() { + local archive_tar="${TEST_TMPDIR}/pax.tar" + + pushd "${TEST_TMPDIR}" + mkdir "Ä_pax_∅" + echo "bar" > "Ä_pax_∅/Ä_foo_∅.txt" + tar --format=pax -cvf pax.tar "Ä_pax_∅" + popd + + set_workspace_command " + repository_ctx.extract('${archive_tar}', 'out_dir', 'Ä_pax_∅/')" + + build_and_process_log --exclude_rule "repository @local_config_cc" + + ensure_contains_exactly 'location: .*repos.bzl:3:25' 1 + ensure_contains_atleast 'context: "repository @repo"' 2 + ensure_contains_exactly 'extract_event' 1 + + ensure_output_contains_exactly_once "external/repo/out_dir/Ä_foo_∅.txt" "bar" +} + +function test_extract_ustar_tar_non_ascii_file_names() { + local archive_tar="${TEST_TMPDIR}/ustar.tar" + + pushd "${TEST_TMPDIR}" + mkdir "Ä_ustar_∅" + echo "bar" > "Ä_ustar_∅/Ä_foo_∅.txt" + tar --format=ustar -cvf ustar.tar "Ä_ustar_∅" + popd + + set_workspace_command " + repository_ctx.extract('${archive_tar}', 'out_dir', 'Ä_ustar_∅/')" + + build_and_process_log --exclude_rule "repository @local_config_cc" + + ensure_contains_exactly 'location: .*repos.bzl:3:25' 1 + ensure_contains_atleast 'context: "repository @repo"' 2 + ensure_contains_exactly 'extract_event' 1 + + ensure_output_contains_exactly_once "external/repo/out_dir/Ä_foo_∅.txt" "bar" +} + +function test_extract_default_zip_non_ascii_file_names() { + local archive_tar="${TEST_TMPDIR}/default.zip" + + pushd "${TEST_TMPDIR}" + mkdir "Ä_default_∅" + echo "bar" > "Ä_default_∅/Ä_foo_∅.txt" + zip default.zip -r "Ä_default_∅" + popd + + set_workspace_command " + repository_ctx.extract('${archive_tar}', 'out_dir', 'Ä_default_∅/')" + + build_and_process_log --exclude_rule "repository @local_config_cc" + + ensure_contains_exactly 'location: .*repos.bzl:3:25' 1 + ensure_contains_atleast 'context: "repository @repo"' 2 + ensure_contains_exactly 'extract_event' 1 + + ensure_output_contains_exactly_once "external/repo/out_dir/Ä_foo_∅.txt" "bar" +} + function test_file() { set_workspace_command 'repository_ctx.file("filefile.sh", "echo filefile", True)' From fe1d084db2fe55ac0f7c65abbef87641b2c57b01 Mon Sep 17 00:00:00 2001 From: Fabian Meumertzheim Date: Tue, 5 Sep 2023 10:13:42 +0200 Subject: [PATCH 2/2] Use custom Charset to preserve non-UTF8 PAX headers --- .../devtools/build/lib/bazel/repository/BUILD | 2 +- .../repository/CompressedTarFunction.java | 129 +++++++++++++++++- .../bazel/repository/DecompressorValue.java | 6 +- src/test/shell/bazel/bazel_workspaces_test.sh | 39 +++++- 4 files changed, 163 insertions(+), 13 deletions(-) diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/BUILD b/src/main/java/com/google/devtools/build/lib/bazel/repository/BUILD index 0c8809977e69f1..9cf86d6c572743 100644 --- a/src/main/java/com/google/devtools/build/lib/bazel/repository/BUILD +++ b/src/main/java/com/google/devtools/build/lib/bazel/repository/BUILD @@ -41,7 +41,6 @@ java_library( "//src/main/java/com/google/devtools/build/lib/rules:repository/repository_function", "//src/main/java/com/google/devtools/build/lib/rules:repository/workspace_base_rule", "//src/main/java/com/google/devtools/build/lib/rules:repository/workspace_configured_target_factory", - "//src/main/java/com/google/devtools/build/lib/unsafe:string", "//src/main/java/com/google/devtools/build/lib/util", "//src/main/java/com/google/devtools/build/lib/util:os", "//src/main/java/com/google/devtools/build/lib/vfs", @@ -51,6 +50,7 @@ java_library( "//src/main/java/com/google/devtools/common/options", "//src/main/java/net/starlark/java/eval", "//third_party:apache_commons_compress", + "//third_party:auto_service", "//third_party:auto_value", "//third_party:flogger", "//third_party:guava", diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/CompressedTarFunction.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/CompressedTarFunction.java index dd591595b11a19..425aed987d5fc4 100644 --- a/src/main/java/com/google/devtools/build/lib/bazel/repository/CompressedTarFunction.java +++ b/src/main/java/com/google/devtools/build/lib/bazel/repository/CompressedTarFunction.java @@ -15,8 +15,10 @@ package com.google.devtools.build.lib.bazel.repository; import static com.google.devtools.build.lib.bazel.repository.StripPrefixedPath.maybeDeprefixSymlink; +import static java.nio.charset.StandardCharsets.ISO_8859_1; import static java.nio.charset.StandardCharsets.UTF_8; +import com.google.auto.service.AutoService; import com.google.common.io.ByteStreams; import com.google.devtools.build.lib.bazel.repository.DecompressorValue.Decompressor; import com.google.devtools.build.lib.vfs.FileSystemUtils; @@ -25,17 +27,33 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; +import java.nio.charset.spi.CharsetProvider; +import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.Map; import java.util.Optional; import java.util.Set; +import java.util.UUID; import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; /** * Common code for unarchiving a compressed TAR file. + * + *

TAR file entries commonly use one of two formats: PAX, which uses UTF-8 encoding for all + * strings, and USTAR, which does not specify an encoding. This class interprets USTAR headers as + * latin-1, thus preserving the original bytes of the header without enforcing any particular + * encoding. Internally, for file system operations, all strings are converted into Bazel's internal + * representation of raw bytes stored as latin-1 strings. */ public abstract class CompressedTarFunction implements Decompressor { protected abstract InputStream getDecompressorStream(DecompressorDescriptor descriptor) @@ -56,15 +74,17 @@ public Path decompress(DecompressorDescriptor descriptor) try (InputStream decompressorStream = getDecompressorStream(descriptor)) { // USTAR tar headers use an unspecified encoding whereas PAX tar headers always use UTF-8. - // Since this would result in ambiguity with Bazel forcing the default JVM encoding to be - // ISO-8859-1, we explicitly specify the encoding as UTF-8. - TarArchiveInputStream tarStream = new TarArchiveInputStream(decompressorStream, UTF_8.name()); + // We can specify the encoding to use for USTAR headers, but the Charset used for PAX headers + // is fixed to UTF-8. We thus specify a custom Charset for the former so that we can + // distinguish between the two. + TarArchiveInputStream tarStream = + new TarArchiveInputStream(decompressorStream, MarkedIso88591Charset.NAME); TarArchiveEntry entry; while ((entry = tarStream.getNextTarEntry()) != null) { - String entryName = entry.getName(); + String entryName = toRawBytesString(entry.getName()); entryName = renameFiles.getOrDefault(entryName, entryName); StripPrefixedPath entryPath = - StripPrefixedPath.maybeDeprefix(entryName.getBytes(UTF_8), prefix); + StripPrefixedPath.maybeDeprefix(entryName.getBytes(ISO_8859_1), prefix); foundPrefix = foundPrefix || entryPath.foundPrefix(); if (prefix.isPresent() && !foundPrefix) { @@ -84,7 +104,9 @@ public Path decompress(DecompressorDescriptor descriptor) if (entry.isSymbolicLink() || entry.isLink()) { PathFragment targetName = maybeDeprefixSymlink( - entry.getLinkName().getBytes(UTF_8), prefix, descriptor.destinationPath()); + toRawBytesString(entry.getLinkName()).getBytes(ISO_8859_1), + prefix, + descriptor.destinationPath()); if (entry.isSymbolicLink()) { symlinks.put(filePath, targetName); } else { @@ -138,4 +160,99 @@ public Path decompress(DecompressorDescriptor descriptor) return descriptor.destinationPath(); } + + /** + * Returns a string that contains the raw bytes of the given string encoded in ISO-8859-1, + * assuming that the given string was encoded with either UTF-8 or the special + * {@link MarkedIso88591Charset}. + */ + private static String toRawBytesString(String name) { + // Marked strings are already encoded in ISO-8859-1. Other strings originate from PAX headers + // and are thus encoded in UTF-8, which we decode to the raw bytes and then re-encode trivially + // in ISO-8859-1. + return MarkedIso88591Charset.getRawBytesStringIfMarked(name) + .orElseGet(() -> new String(name.getBytes(UTF_8), ISO_8859_1)); + } + + @AutoService(CharsetProvider.class) + public static class MarkedIso88591CharsetProvider extends CharsetProvider { + private static final Charset CHARSET = new MarkedIso88591Charset(); + + @Override + public Iterator charsets() { + // This charset is only meant for internal use within CompressedTarFunction and thus should + // not be discoverable. + return Collections.emptyIterator(); + } + + @Override + public Charset charsetForName(String charsetName) { + return MarkedIso88591Charset.NAME.equals(charsetName) ? CHARSET : null; + } + } + + /** + * A charset that decodes ISO-8859-1, i.e., produces a String that contains the raw decoded + * bytes, and appends a marker to the end of the string to indicate that it was decoded with this + * charset. + */ + private static class MarkedIso88591Charset extends Charset { + // The name + // * must not collide with the name of any other charset. + // * must not appear in archive entry names by chance. + // * is internal to CompressedTarFunction. + // This is best served by a cryptographically random UUID, generated at startup. + private static final String NAME = UUID.randomUUID().toString(); + + private MarkedIso88591Charset() { + super(NAME, new String[0]); + } + + public static Optional getRawBytesStringIfMarked(String s) { + // Check for the marker in all positions as TarArchiveInputStream manipulates the raw name in + // certain cases (for example, appending a '/' to directory names). + if (s.contains(NAME)) { + return Optional.of(s.replaceAll(NAME, "")); + } + return Optional.empty(); + } + + @Override + public CharsetDecoder newDecoder() { + return new CharsetDecoder(this, 1, 1) { + @Override + protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) { + // A simple unoptimized ISO-8859-1 decoder. + while (in.hasRemaining()) { + if (!out.hasRemaining()) { + return CoderResult.OVERFLOW; + } + out.put((char) (in.get() & 0xFF)); + } + return CoderResult.UNDERFLOW; + } + + @Override + protected CoderResult implFlush(CharBuffer out) { + // Append the marker to the end of the buffer to indicate that it was decoded with this + // charset. + if (out.remaining() < NAME.length()) { + return CoderResult.OVERFLOW; + } + out.put(NAME); + return CoderResult.UNDERFLOW; + } + }; + } + + @Override + public CharsetEncoder newEncoder() { + throw new UnsupportedOperationException(); + } + + @Override + public boolean contains(Charset cs) { + return false; + } + } } diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorValue.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorValue.java index 9deddba06b887f..98ca4b4fa7e0ee 100644 --- a/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorValue.java +++ b/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorValue.java @@ -66,10 +66,10 @@ public static Optional maybeMakePrefixSuggestion(PathFragment pathFragme return Optional.empty(); } String rawFirstSegment = pathFragment.getSegment(0); - // Users can only specify prefixes from Starlark, where strings always use UTF-8, so we - // optimistically decode with it here even though we do not know the original encoding. + // Users can only specify prefixes from Starlark, which is planned to use UTF-8 for all + // strings, but currently still collects the raw bytes in a latin-1 string. We thus + // optimistically decode the raw bytes with UTF-8 here for display purposes. return Optional.of(new String(rawFirstSegment.getBytes(ISO_8859_1), UTF_8)); -// return Optional.of(rawFirstSegment); } } diff --git a/src/test/shell/bazel/bazel_workspaces_test.sh b/src/test/shell/bazel/bazel_workspaces_test.sh index 943ee774a85275..28079bed1dd68f 100755 --- a/src/test/shell/bazel/bazel_workspaces_test.sh +++ b/src/test/shell/bazel/bazel_workspaces_test.sh @@ -451,7 +451,10 @@ function test_extract_rename_files() { ensure_output_contains_exactly_once "external/repo/out_dir/renamed-A.txt" "Second file: A" } -function test_extract_pax_tar_non_ascii_file_names() { +# Regression test for https://github.com/bazelbuild/bazel/issues/12986 +# Verifies that tar entries with PAX headers, which are always encoded in UTF-8, are extracted +# correctly. +function test_extract_pax_tar_non_ascii_utf8_file_names() { local archive_tar="${TEST_TMPDIR}/pax.tar" pushd "${TEST_TMPDIR}" @@ -472,7 +475,9 @@ function test_extract_pax_tar_non_ascii_file_names() { ensure_output_contains_exactly_once "external/repo/out_dir/Ä_foo_∅.txt" "bar" } -function test_extract_ustar_tar_non_ascii_file_names() { +# Verifies that tar entries with USTAR headers, for which an encoding isn't specified, are extracted +# correctly if that encoding happens to be UTF-8. +function test_extract_ustar_tar_non_ascii_utf8_file_names() { local archive_tar="${TEST_TMPDIR}/ustar.tar" pushd "${TEST_TMPDIR}" @@ -493,7 +498,35 @@ function test_extract_ustar_tar_non_ascii_file_names() { ensure_output_contains_exactly_once "external/repo/out_dir/Ä_foo_∅.txt" "bar" } -function test_extract_default_zip_non_ascii_file_names() { +# Verifies that tar entries with USTAR headers, for which an encoding isn't specified, are extracted +# correctly if that encoding is not UTF-8. +function test_extract_ustar_tar_non_ascii_non_utf8_file_names() { + if is_darwin; then + echo "Skipping test on macOS due to lack of support for non-UTF-8 filenames" + return + fi + + local archive_tar="${TEST_TMPDIR}/ustar.tar" + + pushd "${TEST_TMPDIR}" + mkdir "Ä_ustar_latin1_∅" + echo "bar" > "$(echo -e 'Ä_ustar_latin1_∅/\xC4_foo_latin1_\xD6.txt')" + tar --format=ustar -cvf ustar.tar "Ä_ustar_latin1_∅" + popd + + set_workspace_command " + repository_ctx.extract('${archive_tar}', 'out_dir', 'Ä_ustar_latin1_∅/')" + + build_and_process_log --exclude_rule "repository @local_config_cc" + + ensure_contains_exactly 'location: .*repos.bzl:3:25' 1 + ensure_contains_atleast 'context: "repository @repo"' 2 + ensure_contains_exactly 'extract_event' 1 + + ensure_output_contains_exactly_once "$(echo -e 'external/repo/out_dir/\xC4_foo_latin1_\xD6.txt')" "bar" +} + +function test_extract_default_zip_non_ascii_utf8_file_names() { local archive_tar="${TEST_TMPDIR}/default.zip" pushd "${TEST_TMPDIR}"