diff --git a/node.gyp b/node.gyp
index f50aea3475598d..05063223baf0fa 100644
--- a/node.gyp
+++ b/node.gyp
@@ -235,6 +235,8 @@
         'src/req-wrap.h',
         'src/req-wrap-inl.h',
         'src/string_bytes.h',
+        'src/string_search.h',
+        'src/string_utils.h',
         'src/stream_base.h',
         'src/stream_base-inl.h',
         'src/stream_wrap.h',
diff --git a/src/node_url.cc b/src/node_url.cc
index 2bdc080953f294..95f7a4ff1e2bfa 100644
--- a/src/node_url.cc
+++ b/src/node_url.cc
@@ -9,6 +9,7 @@
 #include "base-object.h"
 #include "base-object-inl.h"
 #include "node_i18n.h"
+#include "string_utils.h"
 
 #include <string>
 #include <vector>
@@ -853,9 +854,18 @@ static url_host_type ParseHost(url_host* host,
   // First, we have to percent decode
   PercentDecode(input, length, &decoded);
 
-  // Then we have to punycode toASCII
-  if (!ToASCII(&decoded, &decoded))
-    goto end;
+  // Match browser behavior for ASCII only domains
+  // and do not run them through ToASCII algorithm.
+  if (!stringutils::ContainsNonAscii(decoded.data(), decoded.size())) {
+    // Lowercase ASCII domains
+    for (size_t n = 0; n < decoded.size(); n++) {
+      decoded[n] = ASCIILowercase(decoded[n]);
+    }
+  } else {
+    // Then we have to Unicode IDNA toASCII
+    if (!ToASCII(&decoded, &decoded))
+      goto end;
+  }
 
   // If any of the following characters are still present, we have to fail
   for (size_t n = 0; n < decoded.size(); n++) {
diff --git a/src/string_bytes.cc b/src/string_bytes.cc
index 636d7da25d3291..2d33b1bbdf2f00 100644
--- a/src/string_bytes.cc
+++ b/src/string_bytes.cc
@@ -25,6 +25,7 @@
 #include "node.h"
 #include "node_buffer.h"
 #include "v8.h"
+#include "string_utils.h"
 
 #include <limits.h>
 #include <string.h>  // memcpy
@@ -550,60 +551,6 @@ size_t StringBytes::Size(Isolate* isolate,
   return data_size;
 }
 
-
-
-
-static bool contains_non_ascii_slow(const char* buf, size_t len) {
-  for (size_t i = 0; i < len; ++i) {
-    if (buf[i] & 0x80)
-      return true;
-  }
-  return false;
-}
-
-
-static bool contains_non_ascii(const char* src, size_t len) {
-  if (len < 16) {
-    return contains_non_ascii_slow(src, len);
-  }
-
-  const unsigned bytes_per_word = sizeof(uintptr_t);
-  const unsigned align_mask = bytes_per_word - 1;
-  const unsigned unaligned = reinterpret_cast<uintptr_t>(src) & align_mask;
-
-  if (unaligned > 0) {
-    const unsigned n = bytes_per_word - unaligned;
-    if (contains_non_ascii_slow(src, n))
-      return true;
-    src += n;
-    len -= n;
-  }
-
-
-#if defined(_WIN64) || defined(_LP64)
-  const uintptr_t mask = 0x8080808080808080ll;
-#else
-  const uintptr_t mask = 0x80808080l;
-#endif
-
-  const uintptr_t* srcw = reinterpret_cast<const uintptr_t*>(src);
-
-  for (size_t i = 0, n = len / bytes_per_word; i < n; ++i) {
-    if (srcw[i] & mask)
-      return true;
-  }
-
-  const unsigned remainder = len & align_mask;
-  if (remainder > 0) {
-    const size_t offset = len - remainder;
-    if (contains_non_ascii_slow(src + offset, remainder))
-      return true;
-  }
-
-  return false;
-}
-
-
 static void force_ascii_slow(const char* src, char* dst, size_t len) {
   for (size_t i = 0; i < len; ++i) {
     dst[i] = src[i] & 0x7f;
@@ -709,7 +656,7 @@ MaybeLocal<Value> StringBytes::Encode(Isolate* isolate,
     }
 
     case ASCII:
-      if (contains_non_ascii(buf, buflen)) {
+      if (stringutils::ContainsNonAscii(buf, buflen)) {
         char* out = node::UncheckedMalloc(buflen);
         if (out == nullptr) {
           *error = SB_MALLOC_FAILED_ERROR;
diff --git a/src/string_utils.h b/src/string_utils.h
new file mode 100644
index 00000000000000..a6f9978bf4621a
--- /dev/null
+++ b/src/string_utils.h
@@ -0,0 +1,66 @@
+#ifndef SRC_STRING_UTILS_H_
+#define SRC_STRING_UTILS_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace node {
+namespace stringutils {
+
+// Byte-at-a-time scan: returns true if any of the first `len` bytes of `buf`
+// has its high bit set (i.e. is not 7-bit ASCII).
+inline bool contains_non_ascii_slow(const char* buf, size_t len) {
+  for (size_t i = 0; i < len; ++i) {
+    if (buf[i] & 0x80)
+      return true;
+  }
+  return false;
+}
+
+// Returns true if any of the first `len` bytes of `src` has its high bit set.
+// Inputs of 16 bytes or more are scanned one machine word at a time after the
+// unaligned head has been checked byte-wise; the tail is checked byte-wise.
+inline bool ContainsNonAscii(const char* src, size_t len) {
+  if (len < 16) {
+    return contains_non_ascii_slow(src, len);
+  }
+
+  const unsigned bytes_per_word = sizeof(uintptr_t);
+  const unsigned align_mask = bytes_per_word - 1;
+  const unsigned unaligned = reinterpret_cast<uintptr_t>(src) & align_mask;
+
+  if (unaligned > 0) {
+    const unsigned n = bytes_per_word - unaligned;
+    if (contains_non_ascii_slow(src, n))
+      return true;
+    src += n;
+    len -= n;
+  }
+
+#if defined(_WIN64) || defined(_LP64)
+  const uintptr_t mask = 0x8080808080808080ll;
+#else
+  const uintptr_t mask = 0x80808080l;
+#endif
+
+  const uintptr_t* srcw = reinterpret_cast<const uintptr_t*>(src);
+
+  for (size_t i = 0, n = len / bytes_per_word; i < n; ++i) {
+    if (srcw[i] & mask)
+      return true;
+  }
+
+  const unsigned remainder = len & align_mask;
+  if (remainder > 0) {
+    const size_t offset = len - remainder;
+    if (contains_non_ascii_slow(src + offset, remainder))
+      return true;
+  }
+
+  return false;
+}
+
+}  // namespace stringutils
+}  // namespace node
+
+#endif  // SRC_STRING_UTILS_H_
diff --git a/test/fixtures/url-domains-with-hyphens.js b/test/fixtures/url-domains-with-hyphens.js
new file mode 100644
index 00000000000000..5bd1136d1e6809
--- /dev/null
+++ b/test/fixtures/url-domains-with-hyphens.js
@@ -0,0 +1,27 @@
+'use strict';
+
+module.exports = {
+  valid: [
+    // URLs with hyphen
+    {
+      ascii: 'r4---sn-a5mlrn7s.gevideo.com',
+      unicode: 'r4---sn-a5mlrn7s.gevideo.com'
+    },
+    {
+      ascii: '-sn-a5mlrn7s.gevideo.com',
+      unicode: '-sn-a5mlrn7s.gevideo.com'
+    },
+    {
+      ascii: 'sn-a5mlrn7s-.gevideo.com',
+      unicode: 'sn-a5mlrn7s-.gevideo.com'
+    },
+    {
+      ascii: '-sn-a5mlrn7s-.gevideo.com',
+      unicode: '-sn-a5mlrn7s-.gevideo.com'
+    },
+    {
+      ascii: '-sn--a5mlrn7s-.gevideo.com',
+      unicode: '-sn--a5mlrn7s-.gevideo.com'
+    }
+  ]
+};
diff --git a/test/parallel/test-whatwg-url-domainto.js b/test/parallel/test-whatwg-url-domainto.js
index 13ff9968705b98..4e1bee2ab55ed8 100644
--- a/test/parallel/test-whatwg-url-domainto.js
+++ b/test/parallel/test-whatwg-url-domainto.js
@@ -11,6 +11,7 @@ const { domainToASCII, domainToUnicode } = require('url');
 
 // Tests below are not from WPT.
 const tests = require('../fixtures/url-idna.js');
+const testsHyphenDomains = require('../fixtures/url-domains-with-hyphens.js');
 
 {
   const expectedError = common.expectsError(
@@ -34,6 +35,13 @@ const tests = require('../fixtures/url-idna.js');
   }
 }
 
+{
+  for (const [i, { ascii, unicode }] of testsHyphenDomains.valid.entries()) {
+    assert.strictEqual(ascii, domainToASCII(unicode),
+                       `domainToASCII(${i + 1})`);
+  }
+}
+
 {
   const convertFunc = {
     ascii: domainToASCII,