From 43c526f2b1b46a9e6d19c43a01e1144d51ca1111 Mon Sep 17 00:00:00 2001 From: Max Zerzouri Date: Sun, 16 May 2021 09:18:51 +0000 Subject: [PATCH] Update `@base64`, `utf8bytelength` and `fromjson` to handle binary strings --- docs/content/manual/manual.yml | 1 - src/builtin.c | 107 ++++++++++++++++++++++++++++----- tests/base64.test | 10 +++ tests/shtest | 19 ++++-- 4 files changed, 116 insertions(+), 21 deletions(-) diff --git a/docs/content/manual/manual.yml b/docs/content/manual/manual.yml index aee1ed47f6..649d74761c 100644 --- a/docs/content/manual/manual.yml +++ b/docs/content/manual/manual.yml @@ -1916,7 +1916,6 @@ sections: * `@base64d`: The inverse of `@base64`, input is decoded as specified by RFC 4648. - Note\: If the decoded string is not UTF-8, the results are undefined. This syntax can be combined with string interpolation in a useful way. You can follow a `@foo` token with a string diff --git a/src/builtin.c b/src/builtin.c index 1c6b08cd4d..0b5f11bded 100644 --- a/src/builtin.c +++ b/src/builtin.c @@ -464,10 +464,55 @@ static jv f_dump(jq_state *jq, jv input) { static jv f_json_parse(jq_state *jq, jv input) { if (jv_get_kind(input) != JV_KIND_STRING) return type_error(input, "only strings can be parsed"); - jv res = jv_parse_sized(jv_string_value(input), - jv_string_length_bytes(jv_copy(input))); + + const char* i = jv_string_value(input); + const char* end = i + jv_string_length_bytes(jv_copy(input)); + + struct jv_parser* parser = jv_parser_new(0); + int count = 0; + jv value = jv_invalid(); + while (i != NULL) { + const int max_utf8_len = 4; + unsigned char buf[100 + max_utf8_len]; + int buflen = 0; + int c; + while ((buflen + max_utf8_len < sizeof(buf)) && (i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) { + if (c >= -0xFF && c <= -0x80) { + // Invalid UTF-8 byte, pass through + buf[buflen++] = -c; + } else + buflen += jvp_utf8_encode(c, buf + buflen); + } + jv_parser_set_buf(parser, buf, buflen, i != NULL); + for (;;) { + jv next = jv_parser_next(parser); + if (!jv_is_valid(next)) { + if (jv_invalid_has_msg(jv_copy(next))) { + count++; + jv_free(value); + value = next; + i = NULL; + } + break; + } + jv_free(value); + if (count++ == 0) + value = next; + else { + jv_free(next); + value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values")); + i = NULL; + break; + } + } + } + jv_parser_free(parser); jv_free(input); - return res; + if (count == 0) { + jv_free(value); + value = jv_invalid_with_msg(jv_string("Expected JSON value")); + } + return value; } static jv f_tonumber(jq_state *jq, jv input) { @@ -514,7 +559,19 @@ static jv f_tostring(jq_state *jq, jv input) { static jv f_utf8bytelength(jq_state *jq, jv input) { if (jv_get_kind(input) != JV_KIND_STRING) return type_error(input, "only strings have UTF-8 byte length"); - return jv_number(jv_string_length_bytes(input)); + const char* i = jv_string_value(input); + const char* end = i + jv_string_length_bytes(jv_copy(input)); + int len = 0; + int c; + while ((i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) { + if (c >= -0xFF && c <= -0x80) { + // Invalid UTF-8 byte, will be passed through + len++; + } else + len += jvp_utf8_encode_length(c); + } + jv_free(input); + return jv_number(len); } #define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" @@ -689,21 +746,41 @@ static jv f_format(jq_state *jq, jv input, jv fmt) { jv_free(fmt); input = f_tostring(jq, input); jv line = jv_string(""); - const unsigned char* data = (const unsigned char*)jv_string_value(input); - int len = jv_string_length_bytes(jv_copy(input)); - for (int i=0; i= 3 ? 3 : len-i; - for (int j=0; j<3; j++) { + const char* i = jv_string_value(input); + const char* end = i + jv_string_length_bytes(jv_copy(input)); + uint32_t code = 0; + int n = 0; + int c; + while ((i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) { + unsigned char ubuf[4]; + int len = 0; + if (c >= -0xFF && c <= -0x80) { + // Invalid UTF-8 byte, pass through + ubuf[len++] = -c; + } else + len += jvp_utf8_encode(c, ubuf); + for (int x = 0; x < len; x++) { code <<= 8; - code |= j < n ? (unsigned)data[i+j] : 0; + code |= ubuf[x]; + if (++n == 3) { + char buf[4]; + for (int j = 0; j < 4; j++) + buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f]; + line = jv_string_append_buf(line, buf, sizeof(buf)); + n = 0; + code = 0; + } } + } + if (n > 0) { + assert(n < 3); + code <<= 8*(3 - n); char buf[4]; - for (int j=0; j<4; j++) { + for (int j = 0; j < 4; j++) buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f]; - } - if (n < 3) buf[3] = '='; - if (n < 2) buf[2] = '='; + buf[3] = '='; + if (n < 2) + buf[2] = '='; line = jv_string_append_buf(line, buf, sizeof(buf)); } jv_free(input); diff --git a/tests/base64.test b/tests/base64.test index 0f82b0b71d..6507bb83b7 100644 --- a/tests/base64.test +++ b/tests/base64.test @@ -33,3 +33,13 @@ . | try @base64d catch . "QUJDa" "string (\"QUJDa\") trailing base64 byte found" + +# random binary data +(. | @base64d | @base64) == . +"zns0Su1i4JjDfGiR95WOcU8iiPMOrfJTUBm9P1ot2qIMiyk04b0WSIFNTMD7w9ziMV8nSbwpPqNl3JKF1eWZrRRg24rbvh66O1e7Z1xIGPNqTqm+jdzRCkWSryR+67wXRVgD6Q==" +true + +# replace lone surrogates +@base64 +"foo\udca9\ud83dbar" +"Zm9v77+977+9YmFy" diff --git a/tests/shtest b/tests/shtest index eabdf26275..7a1a5f54b7 100755 --- a/tests/shtest +++ b/tests/shtest @@ -123,11 +123,20 @@ cmp $d/out $d/expected clean=false -# Invalid UTF-8 bytes are preserved when encoding/decoding JSON -dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null -$VALGRIND $Q $JQ -sR . $d/rand >$d/out.json -$VALGRIND $Q $JQ -j . $d/out.json >$d/out -cmp $d/out $d/rand +# Invalid UTF-8 bytes are preserved when encoding/decoding JSON and base64 and concatenating binary strings +if dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null; then + $VALGRIND $Q $JQ -sR . $d/rand >$d/out.json + $VALGRIND $Q $JQ -j . $d/out.json >$d/out + cmp $d/out $d/rand + $VALGRIND $Q $JQ -jR fromjson $d/out.json >$d/out + cmp $d/out $d/rand + $VALGRIND $Q $JQ -j '@base64 | @base64d' $d/out.json >$d/out + cmp $d/out $d/rand + base64 $d/rand | $VALGRIND $Q $JQ -R '@base64d' | $VALGRIND $Q $JQ -sj 'add' >$d/out + cmp $d/out $d/rand + $VALGRIND $Q $JQ -nj '$a' --rawfile a $d/rand >$d/out + cmp $d/out $d/rand +fi clean=true