diff --git a/src/node_buffer.cc b/src/node_buffer.cc
index af53c98efbd3c8..21960f88528d3a 100644
--- a/src/node_buffer.cc
+++ b/src/node_buffer.cc
@@ -28,6 +28,7 @@
 
 #include <assert.h>
 #include <string.h> // memcpy
+#include <limits.h>
 
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 
@@ -247,7 +248,7 @@ Handle<Value> Buffer::BinarySlice(const Arguments &args) {
 }
 
 
-static bool contains_non_ascii(const char* buf, size_t len) {
+static bool contains_non_ascii_slow(const char* buf, size_t len) {
   for (size_t i = 0; i < len; ++i) {
     if (buf[i] & 0x80) return true;
   }
@@ -255,13 +256,106 @@ static bool contains_non_ascii(const char* buf, size_t len) {
 }
 
 
-static void force_ascii(const char* src, char* dst, size_t len) {
+// Returns true if any of the |len| bytes at |src| has its high bit set.
+// Inputs of 16+ bytes are scanned a word at a time once |src| is aligned.
+static bool contains_non_ascii(const char* src, size_t len) {
+  if (len < 16) {
+    return contains_non_ascii_slow(src, len);
+  }
+
+  const unsigned bytes_per_word = BITS_PER_LONG / CHAR_BIT;
+  const unsigned align_mask = bytes_per_word - 1;
+  const unsigned unaligned = reinterpret_cast<uintptr_t>(src) & align_mask;
+
+  if (unaligned > 0) {
+    const unsigned n = bytes_per_word - unaligned;
+    if (contains_non_ascii_slow(src, n)) return true;
+    src += n;
+    len -= n;
+  }
+
+#if BITS_PER_LONG == 64
+  typedef uint64_t word;
+  const uint64_t mask = 0x8080808080808080ll;
+#else
+  typedef uint32_t word;
+  const uint32_t mask = 0x80808080l;
+#endif
+
+  const word* srcw = reinterpret_cast<const word*>(src);
+
+  for (size_t i = 0, n = len / bytes_per_word; i < n; ++i) {
+    if (srcw[i] & mask) return true;
+  }
+
+  const unsigned remainder = len & align_mask;
+  if (remainder > 0) {
+    const size_t offset = len - remainder;
+    if (contains_non_ascii_slow(src + offset, remainder)) return true;
+  }
+
+  return false;
+}
+
+
+static void force_ascii_slow(const char* src, char* dst, size_t len) {
   for (size_t i = 0; i < len; ++i) {
     dst[i] = src[i] & 0x7f;
   }
 }
 
 
+// Copies |len| bytes from |src| to |dst|, clearing the high bit of each byte.
+// Words are used only when |src| and |dst| have the same misalignment.
+static void force_ascii(const char* src, char* dst, size_t len) {
+  if (len < 16) {
+    force_ascii_slow(src, dst, len);
+    return;
+  }
+
+  const unsigned bytes_per_word = BITS_PER_LONG / CHAR_BIT;
+  const unsigned align_mask = bytes_per_word - 1;
+  const unsigned src_unalign = reinterpret_cast<uintptr_t>(src) & align_mask;
+  const unsigned dst_unalign = reinterpret_cast<uintptr_t>(dst) & align_mask;
+
+  // Word stores need |dst| aligned as well; on any mismatch go bytewise.
+  if (src_unalign != dst_unalign) {
+    force_ascii_slow(src, dst, len);
+    return;
+  }
+
+  if (src_unalign > 0) {
+    const unsigned unalign = bytes_per_word - src_unalign;
+    force_ascii_slow(src, dst, unalign);
+    src += unalign;
+    dst += unalign;
+    // |unalign| bytes were just copied, so |len| shrinks by that many.
+    len -= unalign;
+  }
+
+#if BITS_PER_LONG == 64
+  typedef uint64_t word;
+  const uint64_t mask = ~0x8080808080808080ll;
+#else
+  typedef uint32_t word;
+  const uint32_t mask = ~0x80808080l;
+#endif
+
+  const word* srcw = reinterpret_cast<const word*>(src);
+  word* dstw = reinterpret_cast<word*>(dst);
+
+  for (size_t i = 0, n = len / bytes_per_word; i < n; ++i) {
+    dstw[i] = srcw[i] & mask;
+  }
+
+  const unsigned remainder = len & align_mask;
+  if (remainder > 0) {
+    const size_t offset = len - remainder;
+    force_ascii_slow(src + offset, dst + offset, remainder);
+  }
+}
+
+
 Handle<Value> Buffer::AsciiSlice(const Arguments &args) {
   HandleScope scope;
   Buffer *parent = ObjectWrap::Unwrap<Buffer>(args.This());
diff --git a/src/node_internals.h b/src/node_internals.h
index 0502ec90586e42..10b1742fb54f23 100644
--- a/src/node_internals.h
+++ b/src/node_internals.h
@@ -48,6 +48,12 @@ inline static int snprintf(char* buf, unsigned int len, const char* fmt, ...) {
 }
 #endif
 
+#if defined(__x86_64__)
+# define BITS_PER_LONG 64
+#else
+# define BITS_PER_LONG 32
+#endif
+
 #ifndef offset_of
 // g++ in strict mode complains loudly about the system offsetof() macro
 // because it uses NULL as the base address.
diff --git a/test/simple/test-buffer-ascii.js b/test/simple/test-buffer-ascii.js
index a741a3db1e5ac9..784597a48e40ed 100644
--- a/test/simple/test-buffer-ascii.js
+++ b/test/simple/test-buffer-ascii.js
@@ -25,3 +25,21 @@ var assert = require('assert');
 // ASCII conversion in node.js simply masks off the high bits,
 // it doesn't do transliteration.
 assert.equal(Buffer('hérité').toString('ascii'), 'hC)ritC)');
+
+// 71 characters, 78 bytes. The ’ character is a triple-byte sequence.
+var input = 'C’est, graphiquement, la réunion d’un accent aigu ' +
+            'et d’un accent grave.';
+
+var expected = 'Cb\u0000\u0019est, graphiquement, la rC)union ' +
+               'db\u0000\u0019un accent aigu et db\u0000\u0019un ' +
+               'accent grave.';
+
+var buf = Buffer(input);
+
+for (var i = 0; i < expected.length; ++i) {
+  assert.equal(buf.slice(i).toString('ascii'), expected.slice(i));
+
+  // Skip remainder of multi-byte sequence.
+  if (input.charCodeAt(i) > 65535) ++i;
+  if (input.charCodeAt(i) > 127) ++i;
+}