Skip to content

Commit

Permalink
Use custom Charset to preserve non-UTF8 PAX headers
Browse files Browse the repository at this point in the history
  • Loading branch information
fmeum committed Sep 5, 2023
1 parent 2875771 commit 20e7202
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ java_library(
"//src/main/java/com/google/devtools/common/options",
"//src/main/java/net/starlark/java/eval",
"//third_party:apache_commons_compress",
"//third_party:auto_service",
"//third_party:auto_value",
"//third_party:flogger",
"//third_party:guava",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@
package com.google.devtools.build.lib.bazel.repository;

import static com.google.devtools.build.lib.bazel.repository.StripPrefixedPath.maybeDeprefixSymlink;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.UTF_8;

import com.google.auto.service.AutoService;
import com.google.common.io.ByteStreams;
import com.google.devtools.build.lib.bazel.repository.DecompressorValue.Decompressor;
import com.google.devtools.build.lib.vfs.FileSystemUtils;
Expand All @@ -25,12 +27,22 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.spi.CharsetProvider;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;

Expand All @@ -56,15 +68,17 @@ public Path decompress(DecompressorDescriptor descriptor)

try (InputStream decompressorStream = getDecompressorStream(descriptor)) {
// USTAR tar headers use an unspecified encoding whereas PAX tar headers always use UTF-8.
// Since this would result in ambiguity with Bazel forcing the default JVM encoding to be
// ISO-8859-1, we explicitly specify the encoding as UTF-8.
TarArchiveInputStream tarStream = new TarArchiveInputStream(decompressorStream, UTF_8.name());
// We can specify the encoding to use for USTAR headers, but the Charset used for PAX headers
// is fixed to UTF-8. We thus specify a custom Charset for the former so that we can
// distinguish between the two.
TarArchiveInputStream tarStream =
new TarArchiveInputStream(decompressorStream, MarkedIso88591Charset.NAME);
TarArchiveEntry entry;
while ((entry = tarStream.getNextTarEntry()) != null) {
String entryName = entry.getName();
String entryName = toRawBytesString(entry.getName());
entryName = renameFiles.getOrDefault(entryName, entryName);
StripPrefixedPath entryPath =
StripPrefixedPath.maybeDeprefix(entryName.getBytes(UTF_8), prefix);
StripPrefixedPath.maybeDeprefix(entryName.getBytes(ISO_8859_1), prefix);
foundPrefix = foundPrefix || entryPath.foundPrefix();

if (prefix.isPresent() && !foundPrefix) {
Expand All @@ -84,7 +98,9 @@ public Path decompress(DecompressorDescriptor descriptor)
if (entry.isSymbolicLink() || entry.isLink()) {
PathFragment targetName =
maybeDeprefixSymlink(
entry.getLinkName().getBytes(UTF_8), prefix, descriptor.destinationPath());
toRawBytesString(entry.getLinkName()).getBytes(ISO_8859_1),
prefix,
descriptor.destinationPath());
if (entry.isSymbolicLink()) {
symlinks.put(filePath, targetName);
} else {
Expand Down Expand Up @@ -138,4 +154,87 @@ public Path decompress(DecompressorDescriptor descriptor)

return descriptor.destinationPath();
}

private static String toRawBytesString(String name) {
// Marked strings are already encoded in ISO-8859-1, otherwise the string is encoded in UTF-8
// and we need to re-encode it in ISO-8859-1.
return MarkedIso88591Charset.getStringIfMarked(name)
.orElseGet(() -> new String(name.getBytes(UTF_8), ISO_8859_1));
}

@AutoService(CharsetProvider.class)
public static class MarkedIso88591CharsetProvider extends CharsetProvider {
private static final Charset CHARSET = new MarkedIso88591Charset();

@Override
public Iterator<Charset> charsets() {
// This charset is only meant for internal use within CompressedTarFunction and thus should
// not be discoverable.
return Collections.emptyIterator();
}

@Override
public Charset charsetForName(String charsetName) {
return MarkedIso88591Charset.NAME.equals(charsetName) ? CHARSET : null;
}
}

/**
* A charset that decodes ISO-8859-1 and appends a marker to the end of the string to indicate
* that it was decoded with this charset.
*/
private static class MarkedIso88591Charset extends Charset {
private static final String NAME = UUID.randomUUID().toString();

private MarkedIso88591Charset() {
super(NAME, new String[0]);
}

public static Optional<String> getStringIfMarked(String s) {
// Check for the marker in all positions as TarArchiveInputStream manipulates the raw name in
// certain cases (for example, appending a '/' to directory names).
if (s.contains(NAME)) {
return Optional.of(s.replaceAll(NAME, ""));
}
return Optional.empty();
}

@Override
public CharsetDecoder newDecoder() {
return new CharsetDecoder(this, 1, 1) {
@Override
protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
// A simple unoptimized ISO-8859-1 decoder.
while (in.hasRemaining()) {
if (!out.hasRemaining()) {
return CoderResult.OVERFLOW;
}
out.put((char) (in.get() & 0xFF));
}
return CoderResult.UNDERFLOW;
}

@Override
protected CoderResult implFlush(CharBuffer out) {
// Append the marker to the end of the buffer to indicate that it was decoded with this
// charset.
if (out.remaining() < NAME.length()) {
return CoderResult.OVERFLOW;
}
out.put(NAME);
return CoderResult.UNDERFLOW;
}
};
}

@Override
public CharsetEncoder newEncoder() {
throw new UnsupportedOperationException();
}

@Override
public boolean contains(Charset cs) {
return false;
}
}
}
27 changes: 24 additions & 3 deletions src/test/shell/bazel/bazel_workspaces_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,7 @@ function test_extract_rename_files() {
ensure_output_contains_exactly_once "external/repo/out_dir/renamed-A.txt" "Second file: A"
}

function test_extract_pax_tar_non_ascii_file_names() {
function test_extract_pax_tar_non_ascii_utf8_file_names() {
local archive_tar="${TEST_TMPDIR}/pax.tar"

pushd "${TEST_TMPDIR}"
Expand All @@ -472,7 +472,7 @@ function test_extract_pax_tar_non_ascii_file_names() {
ensure_output_contains_exactly_once "external/repo/out_dir/Ä_foo_∅.txt" "bar"
}

function test_extract_ustar_tar_non_ascii_file_names() {
function test_extract_ustar_tar_non_ascii_utf8_file_names() {
local archive_tar="${TEST_TMPDIR}/ustar.tar"

pushd "${TEST_TMPDIR}"
Expand All @@ -493,7 +493,28 @@ function test_extract_ustar_tar_non_ascii_file_names() {
ensure_output_contains_exactly_once "external/repo/out_dir/Ä_foo_∅.txt" "bar"
}

function test_extract_default_zip_non_ascii_file_names() {
function test_extract_ustar_tar_non_ascii_non_utf8_file_names() {
local archive_tar="${TEST_TMPDIR}/ustar.tar"

pushd "${TEST_TMPDIR}"
mkdir "Ä_ustar_latin1_∅"
echo "bar" > "$(echo -e 'Ä_ustar_latin1_∅/\xC4_foo_latin1_\xD6.txt')"
tar --format=ustar -cvf ustar.tar "Ä_ustar_latin1_∅"
popd

set_workspace_command "
repository_ctx.extract('${archive_tar}', 'out_dir', 'Ä_ustar_latin1_∅/')"

build_and_process_log --exclude_rule "repository @local_config_cc"

ensure_contains_exactly 'location: .*repos.bzl:3:25' 1
ensure_contains_atleast 'context: "repository @repo"' 2
ensure_contains_exactly 'extract_event' 1

ensure_output_contains_exactly_once "$(echo -e 'external/repo/out_dir/\xC4_foo_latin1_\xD6.txt')" "bar"
}

function test_extract_default_zip_non_ascii_utf8_file_names() {
local archive_tar="${TEST_TMPDIR}/default.zip"

pushd "${TEST_TMPDIR}"
Expand Down

0 comments on commit 20e7202

Please sign in to comment.