Skip to content

Commit

Permalink
Use custom Charset to preserve non-UTF8 PAX headers
Browse files Browse the repository at this point in the history
  • Loading branch information
fmeum committed Sep 27, 2023
1 parent 11251a3 commit c10070b
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ java_library(
"//src/main/java/com/google/devtools/build/lib/rules:repository/repository_function",
"//src/main/java/com/google/devtools/build/lib/rules:repository/workspace_base_rule",
"//src/main/java/com/google/devtools/build/lib/rules:repository/workspace_configured_target_factory",
"//src/main/java/com/google/devtools/build/lib/unsafe:string",
"//src/main/java/com/google/devtools/build/lib/util",
"//src/main/java/com/google/devtools/build/lib/util:os",
"//src/main/java/com/google/devtools/build/lib/vfs",
Expand All @@ -51,6 +50,7 @@ java_library(
"//src/main/java/com/google/devtools/common/options",
"//src/main/java/net/starlark/java/eval",
"//third_party:apache_commons_compress",
"//third_party:auto_service",
"//third_party:auto_value",
"//third_party:flogger",
"//third_party:guava",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@
package com.google.devtools.build.lib.bazel.repository;

import static com.google.devtools.build.lib.bazel.repository.StripPrefixedPath.maybeDeprefixSymlink;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.UTF_8;

import com.google.auto.service.AutoService;
import com.google.common.io.ByteStreams;
import com.google.devtools.build.lib.bazel.repository.DecompressorValue.Decompressor;
import com.google.devtools.build.lib.vfs.FileSystemUtils;
Expand All @@ -25,17 +27,33 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.spi.CharsetProvider;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;

/**
* Common code for unarchiving a compressed TAR file.
*
* <p>TAR file entries commonly use one of two formats: PAX, which uses UTF-8 encoding for all
* strings, and USTAR, which does not specify an encoding. This class interprets USTAR headers as
* latin-1, thus preserving the original bytes of the header without enforcing any particular
* encoding. Internally, for file system operations, all strings are converted into Bazel's internal
* representation of raw bytes stored as latin-1 strings.
*/
public abstract class CompressedTarFunction implements Decompressor {
protected abstract InputStream getDecompressorStream(DecompressorDescriptor descriptor)
Expand All @@ -56,15 +74,17 @@ public Path decompress(DecompressorDescriptor descriptor)

try (InputStream decompressorStream = getDecompressorStream(descriptor)) {
// USTAR tar headers use an unspecified encoding whereas PAX tar headers always use UTF-8.
// Since this would result in ambiguity with Bazel forcing the default JVM encoding to be
// ISO-8859-1, we explicitly specify the encoding as UTF-8.
TarArchiveInputStream tarStream = new TarArchiveInputStream(decompressorStream, UTF_8.name());
// We can specify the encoding to use for USTAR headers, but the Charset used for PAX headers
// is fixed to UTF-8. We thus specify a custom Charset for the former so that we can
// distinguish between the two.
TarArchiveInputStream tarStream =
new TarArchiveInputStream(decompressorStream, MarkedIso88591Charset.NAME);
TarArchiveEntry entry;
while ((entry = tarStream.getNextTarEntry()) != null) {
String entryName = entry.getName();
String entryName = toRawBytesString(entry.getName());
entryName = renameFiles.getOrDefault(entryName, entryName);
StripPrefixedPath entryPath =
StripPrefixedPath.maybeDeprefix(entryName.getBytes(UTF_8), prefix);
StripPrefixedPath.maybeDeprefix(entryName.getBytes(ISO_8859_1), prefix);
foundPrefix = foundPrefix || entryPath.foundPrefix();

if (prefix.isPresent() && !foundPrefix) {
Expand All @@ -84,7 +104,9 @@ public Path decompress(DecompressorDescriptor descriptor)
if (entry.isSymbolicLink() || entry.isLink()) {
PathFragment targetName =
maybeDeprefixSymlink(
entry.getLinkName().getBytes(UTF_8), prefix, descriptor.destinationPath());
toRawBytesString(entry.getLinkName()).getBytes(ISO_8859_1),
prefix,
descriptor.destinationPath());
if (entry.isSymbolicLink()) {
symlinks.put(filePath, targetName);
} else {
Expand Down Expand Up @@ -138,4 +160,99 @@ public Path decompress(DecompressorDescriptor descriptor)

return descriptor.destinationPath();
}

/**
* Returns a string that contains the raw bytes of the given string encoded in ISO-8859-1,
* assuming that the given string was encoded with either UTF-8 or the special
* {@link MarkedIso88591Charset}.
*/
private static String toRawBytesString(String name) {
// Marked strings are already encoded in ISO-8859-1. Other strings originate from PAX headers
// and are thus encoded in UTF-8, which we decode to the raw bytes and then re-encode trivially
// in ISO-8859-1.
return MarkedIso88591Charset.getRawBytesStringIfMarked(name)
.orElseGet(() -> new String(name.getBytes(UTF_8), ISO_8859_1));
}

@AutoService(CharsetProvider.class)
public static class MarkedIso88591CharsetProvider extends CharsetProvider {
private static final Charset CHARSET = new MarkedIso88591Charset();

@Override
public Iterator<Charset> charsets() {
// This charset is only meant for internal use within CompressedTarFunction and thus should
// not be discoverable.
return Collections.emptyIterator();
}

@Override
public Charset charsetForName(String charsetName) {
return MarkedIso88591Charset.NAME.equals(charsetName) ? CHARSET : null;
}
}

/**
* A charset that decodes ISO-8859-1, i.e., produces a String that contains the raw decoded
* bytes, and appends a marker to the end of the string to indicate that it was decoded with this
* charset.
*/
private static class MarkedIso88591Charset extends Charset {
// The name
// * must not collide with the name of any other charset.
// * must not appear in archive entry names by chance.
// * is internal to CompressedTarFunction.
// This is best served by a cryptographically random UUID, generated at startup.
private static final String NAME = UUID.randomUUID().toString();

private MarkedIso88591Charset() {
super(NAME, new String[0]);
}

public static Optional<String> getRawBytesStringIfMarked(String s) {
// Check for the marker in all positions as TarArchiveInputStream manipulates the raw name in
// certain cases (for example, appending a '/' to directory names).
if (s.contains(NAME)) {
return Optional.of(s.replaceAll(NAME, ""));
}
return Optional.empty();
}

@Override
public CharsetDecoder newDecoder() {
return new CharsetDecoder(this, 1, 1) {
@Override
protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
// A simple unoptimized ISO-8859-1 decoder.
while (in.hasRemaining()) {
if (!out.hasRemaining()) {
return CoderResult.OVERFLOW;
}
out.put((char) (in.get() & 0xFF));
}
return CoderResult.UNDERFLOW;
}

@Override
protected CoderResult implFlush(CharBuffer out) {
// Append the marker to the end of the buffer to indicate that it was decoded with this
// charset.
if (out.remaining() < NAME.length()) {
return CoderResult.OVERFLOW;
}
out.put(NAME);
return CoderResult.UNDERFLOW;
}
};
}

@Override
public CharsetEncoder newEncoder() {
throw new UnsupportedOperationException();
}

@Override
public boolean contains(Charset cs) {
return false;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ public static Optional<String> maybeMakePrefixSuggestion(PathFragment pathFragme
// Users can only specify prefixes from Starlark, where strings always use UTF-8, so we
// optimistically decode with it here even though we do not know the original encoding.
return Optional.of(new String(rawFirstSegment.getBytes(ISO_8859_1), UTF_8));
// return Optional.of(rawFirstSegment);
}
}

Expand Down
39 changes: 36 additions & 3 deletions src/test/shell/bazel/bazel_workspaces_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,10 @@ function test_extract_rename_files() {
ensure_output_contains_exactly_once "external/repo/out_dir/renamed-A.txt" "Second file: A"
}

function test_extract_pax_tar_non_ascii_file_names() {
# Regression test for https://github.com/bazelbuild/bazel/issues/12986
# Verifies that tar entries with PAX headers, which are always encoded in UTF-8, are extracted
# correctly.
function test_extract_pax_tar_non_ascii_utf8_file_names() {
local archive_tar="${TEST_TMPDIR}/pax.tar"

pushd "${TEST_TMPDIR}"
Expand All @@ -472,7 +475,9 @@ function test_extract_pax_tar_non_ascii_file_names() {
ensure_output_contains_exactly_once "external/repo/out_dir/Ä_foo_∅.txt" "bar"
}

function test_extract_ustar_tar_non_ascii_file_names() {
# Verifies that tar entries with USTAR headers, for which an encoding isn't specified, are extracted
# correctly if that encoding happens to be UTF-8.
function test_extract_ustar_tar_non_ascii_utf8_file_names() {
local archive_tar="${TEST_TMPDIR}/ustar.tar"

pushd "${TEST_TMPDIR}"
Expand All @@ -493,7 +498,35 @@ function test_extract_ustar_tar_non_ascii_file_names() {
ensure_output_contains_exactly_once "external/repo/out_dir/Ä_foo_∅.txt" "bar"
}

function test_extract_default_zip_non_ascii_file_names() {
# Verifies that tar entries with USTAR headers, for which an encoding isn't specified, are extracted
# correctly if that encoding is not UTF-8.
function test_extract_ustar_tar_non_ascii_non_utf8_file_names() {
if is_darwin; then
echo "Skipping test on macOS due to lack of support for non-UTF-8 filenames"
return
fi

local archive_tar="${TEST_TMPDIR}/ustar.tar"

pushd "${TEST_TMPDIR}"
mkdir "Ä_ustar_latin1_∅"
echo "bar" > "$(echo -e 'Ä_ustar_latin1_∅/\xC4_foo_latin1_\xD6.txt')"
tar --format=ustar -cvf ustar.tar "Ä_ustar_latin1_∅"
popd

set_workspace_command "
repository_ctx.extract('${archive_tar}', 'out_dir', 'Ä_ustar_latin1_∅/')"

build_and_process_log --exclude_rule "repository @local_config_cc"

ensure_contains_exactly 'location: .*repos.bzl:3:25' 1
ensure_contains_atleast 'context: "repository @repo"' 2
ensure_contains_exactly 'extract_event' 1

ensure_output_contains_exactly_once "$(echo -e 'external/repo/out_dir/\xC4_foo_latin1_\xD6.txt')" "bar"
}

function test_extract_default_zip_non_ascii_utf8_file_names() {
local archive_tar="${TEST_TMPDIR}/default.zip"

pushd "${TEST_TMPDIR}"
Expand Down

0 comments on commit c10070b

Please sign in to comment.