Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

url: fast path ascii domains, do not run ToASCII #13030

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions node.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,8 @@
'src/req-wrap.h',
'src/req-wrap-inl.h',
'src/string_bytes.h',
'src/string_search.h',
'src/string_utils.h',
'src/stream_base.h',
'src/stream_base-inl.h',
'src/stream_wrap.h',
Expand Down
18 changes: 15 additions & 3 deletions src/node_url.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "base-object.h"
#include "base-object-inl.h"
#include "node_i18n.h"
#include "string_utils.h"

#include <string>
#include <vector>
Expand Down Expand Up @@ -837,6 +838,7 @@ static url_host_type ParseHost(url_host* host,
url_host_type type = HOST_TYPE_FAILED;
const char* pointer = input;
std::string decoded;
const char *buf = NULL;

if (length == 0)
goto end;
Expand All @@ -853,9 +855,19 @@ static url_host_type ParseHost(url_host* host,
// First, we have to percent decode
PercentDecode(input, length, &decoded);

// Then we have to punycode toASCII
if (!ToASCII(&decoded, &decoded))
goto end;
// Match browser behavior for ASCII only domains
// and do not run them through ToASCII algorithm.
buf = decoded.c_str();
if (!stringutils::ContainsNonAscii(buf, strlen(buf))) {
// Lowercase ASCII domains
for (size_t n = 0; n < decoded.size(); n++) {
decoded[n] = ASCIILowercase(decoded[n]);
}
} else {
// Then we have to Unicode IDNA toASCII
if (!ToASCII(&decoded, &decoded))
goto end;
}

// If any of the following characters are still present, we have to fail
for (size_t n = 0; n < decoded.size(); n++) {
Expand Down
57 changes: 2 additions & 55 deletions src/string_bytes.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "node.h"
#include "node_buffer.h"
#include "v8.h"
#include "string_utils.h"

#include <limits.h>
#include <string.h> // memcpy
Expand Down Expand Up @@ -550,60 +551,6 @@ size_t StringBytes::Size(Isolate* isolate,
return data_size;
}




// Byte-at-a-time scan: reports whether any of the first `len` bytes of `buf`
// has its high bit (0x80) set, i.e. falls outside the 7-bit ASCII range.
// Returns false for an empty range.
static bool contains_non_ascii_slow(const char* buf, size_t len) {
  const char* const end = buf + len;
  for (const char* cursor = buf; cursor != end; ++cursor) {
    if ((*cursor & 0x80) != 0)
      return true;
  }
  return false;
}


// Returns true if any of the `len` bytes starting at `src` has its high bit
// set (i.e. is not 7-bit ASCII), false otherwise.
//
// Inputs shorter than 16 bytes are scanned byte by byte. Longer inputs are
// scanned one machine word at a time: the leading bytes up to the next
// word-aligned address and any trailing partial word are handed to
// contains_non_ascii_slow(); every full word in between is tested against a
// mask that has 0x80 in each byte.
static bool contains_non_ascii(const char* src, size_t len) {
  if (len < 16) {
    return contains_non_ascii_slow(src, len);
  }

  const size_t bytes_per_word = sizeof(uintptr_t);
  const size_t align_mask = bytes_per_word - 1;
  const size_t unaligned = reinterpret_cast<uintptr_t>(src) & align_mask;

  if (unaligned > 0) {
    // Consume the bytes in front of the first word boundary. len >= 16 here,
    // so subtracting at most bytes_per_word - 1 cannot underflow.
    const size_t n = bytes_per_word - unaligned;
    if (contains_non_ascii_slow(src, n))
      return true;
    src += n;
    len -= n;
  }

  // 0x80 replicated into every byte of a word. All-ones / 0xFF yields
  // 0x0101...01 for any word width, so no platform macro is needed and the
  // mask can never be narrower than uintptr_t.
  const uintptr_t mask = ~static_cast<uintptr_t>(0) / 0xFF * 0x80;

  const size_t word_count = len / bytes_per_word;
  for (size_t i = 0; i < word_count; ++i) {
    // Load through memcpy instead of dereferencing a casted uintptr_t*:
    // reading char storage through a uintptr_t lvalue violates the aliasing
    // rules, while memcpy is well defined and compiles to a plain load.
    uintptr_t word;
    memcpy(&word, src + i * bytes_per_word, sizeof(word));
    if (word & mask)
      return true;
  }

  // Bytes left over after the last full word.
  const size_t remainder = len & align_mask;
  if (remainder > 0) {
    const size_t offset = len - remainder;
    if (contains_non_ascii_slow(src + offset, remainder))
      return true;
  }

  return false;
}


static void force_ascii_slow(const char* src, char* dst, size_t len) {
for (size_t i = 0; i < len; ++i) {
dst[i] = src[i] & 0x7f;
Expand Down Expand Up @@ -709,7 +656,7 @@ MaybeLocal<Value> StringBytes::Encode(Isolate* isolate,
}

case ASCII:
if (contains_non_ascii(buf, buflen)) {
if (stringutils::ContainsNonAscii(buf, buflen)) {
char* out = node::UncheckedMalloc(buflen);
if (out == nullptr) {
*error = SB_MALLOC_FAILED_ERROR;
Expand Down
61 changes: 61 additions & 0 deletions src/string_utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@

#ifndef SRC_STRING_UTILS_H_
#define SRC_STRING_UTILS_H_

#include <cstddef>
#include <cstdint>
#include <cstring>

namespace node {
namespace stringutils {

// Byte-at-a-time scan: returns true if any of the first `len` bytes of `buf`
// has its high bit (0x80) set, i.e. is not 7-bit ASCII. Returns false for an
// empty range.
inline static bool contains_non_ascii_slow(const char* buf, size_t len) {
  for (size_t i = 0; i < len; ++i) {
    if (buf[i] & 0x80)
      return true;
  }
  return false;
}

// Returns true if any of the `len` bytes starting at `src` has its high bit
// set (i.e. is not 7-bit ASCII), false otherwise.
//
// Inputs shorter than 16 bytes are scanned byte by byte. Longer inputs are
// scanned one machine word at a time: the leading bytes up to the next
// word-aligned address and any trailing partial word are handed to
// contains_non_ascii_slow(); every full word in between is tested against a
// mask that has 0x80 in each byte.
inline bool ContainsNonAscii(const char* src, size_t len) {
  if (len < 16) {
    return contains_non_ascii_slow(src, len);
  }

  const size_t bytes_per_word = sizeof(uintptr_t);
  const size_t align_mask = bytes_per_word - 1;
  const size_t unaligned = reinterpret_cast<uintptr_t>(src) & align_mask;

  if (unaligned > 0) {
    // Consume the bytes in front of the first word boundary. len >= 16 here,
    // so subtracting at most bytes_per_word - 1 cannot underflow.
    const size_t n = bytes_per_word - unaligned;
    if (contains_non_ascii_slow(src, n))
      return true;
    src += n;
    len -= n;
  }

  // 0x80 replicated into every byte of a word. All-ones / 0xFF yields
  // 0x0101...01 for any word width, so no platform macro is needed and the
  // mask can never be narrower than uintptr_t.
  const uintptr_t mask = ~static_cast<uintptr_t>(0) / 0xFF * 0x80;

  const size_t word_count = len / bytes_per_word;
  for (size_t i = 0; i < word_count; ++i) {
    // Load through memcpy instead of dereferencing a casted uintptr_t*:
    // reading char storage through a uintptr_t lvalue violates the aliasing
    // rules, while memcpy is well defined and compiles to a plain load.
    uintptr_t word;
    std::memcpy(&word, src + i * bytes_per_word, sizeof(word));
    if (word & mask)
      return true;
  }

  // Bytes left over after the last full word.
  const size_t remainder = len & align_mask;
  if (remainder > 0) {
    const size_t offset = len - remainder;
    if (contains_non_ascii_slow(src + offset, remainder))
      return true;
  }

  return false;
}

}  // namespace stringutils
}  // namespace node

#endif // SRC_STRING_UTILS_H_
27 changes: 27 additions & 0 deletions test/fixtures/url-domains-with-hyphens.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
'use strict';

module.exports = {
valid: [
// URLs with hyphen
{
ascii: 'r4---sn-a5mlrn7s.gevideo.com',
unicode: 'r4---sn-a5mlrn7s.gevideo.com'
},
{
ascii: '-sn-a5mlrn7s.gevideo.com',
unicode: '-sn-a5mlrn7s.gevideo.com'
},
{
ascii: 'sn-a5mlrn7s-.gevideo.com',
unicode: 'sn-a5mlrn7s-.gevideo.com'
},
{
ascii: '-sn-a5mlrn7s-.gevideo.com',
unicode: '-sn-a5mlrn7s-.gevideo.com'
},
{
ascii: '-sn--a5mlrn7s-.gevideo.com',
unicode: '-sn--a5mlrn7s-.gevideo.com'
}
]
}
8 changes: 8 additions & 0 deletions test/parallel/test-whatwg-url-domainto.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ const { domainToASCII, domainToUnicode } = require('url');

// Tests below are not from WPT.
const tests = require('../fixtures/url-idna.js');
const testsHyphenDomains = require('../fixtures/url-domains-with-hyphens.js');

{
const expectedError = common.expectsError(
Expand All @@ -34,6 +35,13 @@ const tests = require('../fixtures/url-idna.js');
}
}

{
for (const [i, { ascii, unicode }] of testsHyphenDomains.valid.entries()) {
assert.strictEqual(ascii, domainToASCII(unicode),
Copy link
Member

@joyeecheung joyeecheung May 15, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it's only for testing that those domains won't get converted, maybe just make the test cases an array of strings and use `assert.strictEqual(domain, domainToASCII(domain))`?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh... never mind, it is supposed to check that uppercase characters will be converted to lowercase ones... (are there any in the test cases?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Made the change, reverting it. Thanks for catching.

`domainToASCII(${i + 1})`);
}
}

{
const convertFunc = {
ascii: domainToASCII,
Expand Down