Skip to content

Commit

Permalink
On Unix, encode spawned process argv as utf-8 if required by sun.jnu.…
Browse files Browse the repository at this point in the history
…encoding

Starting with JDK 19, on Unix platforms, the argument vector passed to
ProcessBuilder gets encoded to bytes by java.lang.ProcessImpl using the
sun.jnu.encoding encoding.

This causes a problem on macOS, where
* we switched to JDK 20 as of ecf9b9a
* (as on all platforms) argument strings originating from Starlark are
  in Bazel's pseudo latin1 encoding (i.e. byte arrays stored as strings)
* sun.jnu.encoding is hard-coded as utf-8 by the JVM, regardless of
  what we set for the file.encoding property.

This means we need to recode argv from pseudo latin1 to utf-8 before
passing them to ProcessBuilder, so that ProcessImpl can reverse the
process and pass correctly encoded byte arrays to the OS.

Partially addresses #18792

PiperOrigin-RevId: 544366528
Change-Id: I1acb70e489123e0baa190c569e6625259c39de78
  • Loading branch information
tetromino authored and copybara-github committed Jun 29, 2023
1 parent 22d81a2 commit f00439d
Show file tree
Hide file tree
Showing 6 changed files with 174 additions and 1 deletion.
2 changes: 2 additions & 0 deletions src/main/java/com/google/devtools/build/lib/shell/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ java_library(
"//src/main/java/com/google/devtools/build/lib/jni",
"//src/main/java/com/google/devtools/build/lib/util:describable_execution_unit",
"//src/main/java/com/google/devtools/build/lib/util:os",
"//src/main/java/com/google/devtools/build/lib/util:string",
"//src/main/java/com/google/devtools/build/lib/vfs",
"//src/main/java/com/google/devtools/build/lib/vfs:pathfragment",
"//src/main/java/com/google/devtools/build/lib/windows:processes",
Expand All @@ -45,6 +46,7 @@ bootstrap_java_library(
"//src/main/java/com/google/devtools/build/lib/util:describable_execution_unit",
"//src/main/java/com/google/devtools/build/lib/util:filetype",
"//src/main/java/com/google/devtools/build/lib/util:os",
"//src/main/java/com/google/devtools/build/lib/util:string",
"//src/main/java/com/google/devtools/build/lib/vfs:pathfragment",
"//src/main/java/com/google/devtools/build/lib/windows:processes",
"//third_party:auto_value-jars",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,17 @@

package com.google.devtools.build.lib.shell;

import static com.google.common.collect.ImmutableList.toImmutableList;

import com.google.common.collect.ImmutableList;
import com.google.devtools.build.lib.shell.SubprocessBuilder.StreamAction;
import com.google.devtools.build.lib.util.StringUtil;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.lang.ProcessBuilder.Redirect;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

Expand Down Expand Up @@ -150,7 +155,15 @@ private synchronized Process start(ProcessBuilder builder) throws IOException {
@Override
public Subprocess create(SubprocessBuilder params) throws IOException {
ProcessBuilder builder = new ProcessBuilder();
builder.command(params.getArgv());
ImmutableList<String> argv = params.getArgv();
if (Runtime.version().feature() >= 19
&& Objects.equals(System.getProperty("sun.jnu.encoding"), "UTF-8")) {
// On JDK 19 and newer, java.lang.ProcessImpl#start encodes argv using sun.jnu.encoding, so if
// sun.jnu.encoding is set to UTF-8, our argv needs to be UTF-8. (Note that on some platforms,
// for example on macOS, sun.jnu.encoding is hard-coded in the JVM as UTF-8.)
argv = argv.stream().map(StringUtil::decodeBytestringUtf8).collect(toImmutableList());
}
builder.command(argv);
if (params.getEnv() != null) {
builder.environment().clear();
builder.environment().putAll(params.getEnv());
Expand Down
53 changes: 53 additions & 0 deletions src/test/shell/integration/unicode_test.bzl
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Copyright 2023 The Bazel Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Rule implementations exercised in unicode_test"""

def _run_executable_rule_impl(ctx):
out = ctx.outputs.out
ctx.actions.run(
executable = ctx.executable.executable,
arguments = [out.path] + ctx.attr.extra_arguments,
outputs = [out],
)
return [DefaultInfo(files = depset([out]))]

run_executable_rule = rule(
implementation = _run_executable_rule_impl,
doc = "Runs `executable` via ctx.actions.run() with `out` as the first argument and `extra_arguments` as remaining arguments",
attrs = {
"executable": attr.label(allow_single_file = True, executable = True, cfg = "exec"),
"out": attr.output(),
"extra_arguments": attr.string_list(),
},
)

def _write_file_rule_impl(ctx):
out = ctx.outputs.out
ctx.actions.write(
output = out,
content = ctx.attr.content,
is_executable = ctx.attr.is_executable,
)
return [DefaultInfo(files = depset([out]))]

write_file_rule = rule(
implementation = _write_file_rule_impl,
doc = "Writes `content` to `out` via ctx.actions.write()",
attrs = {
"content": attr.string(),
"out": attr.output(),
"is_executable": attr.bool(),
},
)
71 changes: 71 additions & 0 deletions src/test/shell/integration/unicode_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/bin/bash
#
# Copyright 2023 The Bazel Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Test of Bazel's unicode i/o in actions

# --- begin runfiles.bash initialization v3 ---
# Copy-pasted from the Bazel Bash runfiles library v3.
set -uo pipefail; set +e; f=bazel_tools/tools/bash/runfiles/runfiles.bash
source "${RUNFILES_DIR:-/dev/null}/$f" 2>/dev/null || \
source "$(grep -sm1 "^$f " "${RUNFILES_MANIFEST_FILE:-/dev/null}" | cut -f2- -d' ')" 2>/dev/null || \
source "$0.runfiles/$f" 2>/dev/null || \
source "$(grep -sm1 "^$f " "$0.runfiles_manifest" | cut -f2- -d' ')" 2>/dev/null || \
source "$(grep -sm1 "^$f " "$0.exe.runfiles_manifest" | cut -f2- -d' ')" 2>/dev/null || \
{ echo>&2 "ERROR: cannot find $f"; exit 1; }; f=; set -e
# --- end runfiles.bash initialization v3 ---

source "$(rlocation "io_bazel/src/test/shell/integration_test_setup.sh")" \
|| { echo "integration_test_setup.sh not found!" >&2; exit 1; }

touch WORKSPACE
cp "$(rlocation "io_bazel/src/test/shell/integration/unicode_test_BUILD")" BUILD
cp "$(rlocation "io_bazel/src/test/shell/integration/unicode_test.bzl")" .
cp "$(rlocation "io_bazel/src/test/shell/integration/unicode_test_expected.txt")" .

function test_unicode_genrule_cmd {
local test_name="genrule_cmd"
bazel build --genrule_strategy=local --spawn_strategy=local \
--verbose_failures "${test_name}" >& "$TEST_log" \
|| fail "expected build to succeed"

diff -u "${PRODUCT_NAME}-genfiles/${test_name}.out" \
unicode_test_expected.txt \
>>"${TEST_log}" 2>&1 || fail "Output not as expected"
}

function test_unicode_action_run_argument {
local test_name="action_run_argument"
bazel build --genrule_strategy=local --spawn_strategy=local \
--verbose_failures "${test_name}" >& "$TEST_log" \
|| fail "expected build to succeed"

diff -u "${PRODUCT_NAME}-bin/${test_name}.out" \
unicode_test_expected.txt \
>>"${TEST_log}" 2>&1 || fail "Output not as expected"
}

function test_unicode_action_write_content {
local test_name="action_write_content"
bazel build --genrule_strategy=local --spawn_strategy=local \
--verbose_failures "${test_name}" >& "$TEST_log" \
|| fail "expected build to succeed"

diff -u "${PRODUCT_NAME}-bin/${test_name}.out" \
unicode_test_expected.txt \
>>"${TEST_log}" 2>&1 || fail "Output not as expected"
}

run_suite "Integration tests for ${PRODUCT_NAME}'s unicode i/o in actions"
33 changes: 33 additions & 0 deletions src/test/shell/integration/unicode_test_BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# BUILD file for unicode_test
load(":unicode_test.bzl", "run_executable_rule", "write_file_rule")

# In Russian and Bengali: "Down with mojibake! We want unicode!"
non_ascii_string = "Долой кракозябры! আমরা ইউনিকোড চাই!"

genrule(
name = "genrule_cmd",
cmd = "echo -n \"%s\" > \"$@\"" % non_ascii_string,
outs = ["genrule_cmd.out"],
)

write_file_rule(
name = "shell_echo",
content = """#!/bin/bash
outfile=$1; shift
exec echo -n $@ > $outfile""",
out = "shell_echo.sh",
is_executable = True,
)

run_executable_rule(
name = "action_run_argument",
executable = "shell_echo.sh",
extra_arguments = [non_ascii_string],
out = "action_run_argument.out",
)

write_file_rule(
name = "action_write_content",
content = non_ascii_string,
out = "action_write_content.out",
)
1 change: 1 addition & 0 deletions src/test/shell/integration/unicode_test_expected.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Долой кракозябры! আমরা ইউনিকোড চাই!

0 comments on commit f00439d

Please sign in to comment.