diff --git a/Data/Aeson/Parser/Internal.hs b/Data/Aeson/Parser/Internal.hs index 63393c14c..a506a976a 100644 --- a/Data/Aeson/Parser/Internal.hs +++ b/Data/Aeson/Parser/Internal.hs @@ -34,35 +34,20 @@ module Data.Aeson.Parser.Internal import Prelude () import Prelude.Compat -import Control.Monad.IO.Class (liftIO) import Data.Aeson.Types.Internal (IResult(..), JSONPath, Result(..), Value(..)) import Data.Attoparsec.ByteString.Char8 (Parser, char, endOfInput, scientific, skipSpace, string) -import Data.Bits ((.|.), shiftL) -import Data.ByteString.Internal (ByteString(..)) -import Data.Char (chr) import Data.Text (Text) -import Data.Text.Encoding (decodeUtf8') -import Data.Text.Internal.Encoding.Utf8 (ord2, ord3, ord4) -import Data.Text.Internal.Unsafe.Char (ord) -import Data.Vector as Vector (Vector, empty, fromList, reverse) -import Data.Word (Word8) -import Foreign.ForeignPtr (withForeignPtr) -import Foreign.Marshal.Utils (copyBytes) -import Foreign.Ptr (Ptr, minusPtr, plusPtr) -import Foreign.Storable (poke) -import System.IO.Unsafe (unsafePerformIO) +import Data.Vector as Vector (Vector, empty, fromListN, reverse) import qualified Data.Attoparsec.ByteString as A import qualified Data.Attoparsec.Lazy as L -import qualified Data.Attoparsec.Zepto as Z import qualified Data.ByteString as B -import qualified Data.ByteString.Internal as B import qualified Data.ByteString.Lazy as L -import qualified Data.ByteString.Unsafe as B import qualified Data.HashMap.Strict as H +import Data.Aeson.Parser.Unescape #if MIN_VERSION_ghc_prim(0,3,1) -import GHC.Base (Int#, (==#), isTrue#, orI#, word2Int#) +import GHC.Base (Int#, (==#), isTrue#, word2Int#) import GHC.Word (Word8(W8#)) #endif @@ -125,16 +110,18 @@ objectValues str val = do w <- A.peekWord8' if w == CLOSE_CURLY then A.anyWord8 >> return H.empty - else loop H.empty + else loop [] where - loop m0 = do + -- Why use acc pattern here, you may ask? because 'H.fromList' use 'unsafeInsert' + -- and it's much faster because it's doing in place update to the 'HashMap'! + loop acc = do k <- str <* skipSpace <* char ':' v <- val <* skipSpace - let !m = H.insert k v m0 ch <- A.satisfy $ \w -> w == COMMA || w == CLOSE_CURLY + let acc' = (k, v) : acc if ch == COMMA - then skipSpace >> loop m - else return m + then skipSpace >> loop acc' + else return (H.fromList acc') {-# INLINE objectValues #-} array_ :: Parser Value @@ -151,14 +138,14 @@ arrayValues val = do w <- A.peekWord8' if w == CLOSE_SQUARE then A.anyWord8 >> return Vector.empty - else loop [] + else loop [] 1 where - loop acc = do + loop acc !len = do v <- val <* skipSpace ch <- A.satisfy $ \w -> w == COMMA || w == CLOSE_SQUARE if ch == COMMA - then skipSpace >> loop (v:acc) - else return (Vector.reverse (Vector.fromList (v:acc))) + then skipSpace >> loop (v:acc) (len+1) + else return (Vector.reverse (Vector.fromListN len (v:acc))) {-# INLINE arrayValues #-} -- | Parse any JSON value. You should usually 'json' in preference to @@ -214,101 +201,30 @@ jstring = A.word8 DOUBLE_QUOTE *> jstring_ jstring_ :: Parser Text {-# INLINE jstring_ #-} jstring_ = {-# SCC "jstring_" #-} do - (s, fin) <- A.runScanner startState go - _ <- A.anyWord8 - s1 <- if isEscaped fin - then case unescape s of - Right r -> return r - Left err -> fail err - else return s - case decodeUtf8' s1 of + s <- A.scan startState go <* A.anyWord8 + case unescapeText s of Right r -> return r Left err -> fail $ show err where #if MIN_VERSION_ghc_prim(0,3,1) - isEscaped (S _ escaped) = isTrue# escaped - startState = S 0# 0# - go (S a b) (W8# c) - | isTrue# a = Just (S 0# b) + startState = S 0# + go (S a) (W8# c) + | isTrue# a = Just (S 0#) | isTrue# (word2Int# c ==# 34#) = Nothing -- double quote | otherwise = let a' = word2Int# c ==# 92# -- backslash - in Just (S a' (orI# a' b)) + in Just (S a') -data S = S Int# Int# +data S = S Int# #else - isEscaped (S _ escaped) = escaped - startState = S False False - go (S a b) c - | a = Just (S False b) + startState = False + go a c + | a = Just False | c == DOUBLE_QUOTE = Nothing | otherwise = let a' = c == backslash - in Just (S a' (a' || b)) + in Just a' where backslash = BACKSLASH - -data S = S !Bool !Bool #endif -unescape :: ByteString -> Either String ByteString -unescape s = unsafePerformIO $ do - let len = B.length s - fp <- B.mallocByteString len - -- We perform no bounds checking when writing to the destination - -- string, as unescaping always makes it shorter than the source. - withForeignPtr fp $ \ptr -> do - ret <- Z.parseT (go ptr) s - case ret of - Left err -> return (Left err) - Right p -> do - let newlen = p `minusPtr` ptr - slop = len - newlen - Right <$> if slop >= 128 && slop >= len `quot` 4 - then B.create newlen $ \np -> copyBytes np ptr newlen - else return (PS fp 0 newlen) - where - go ptr = do - h <- Z.takeWhile (/=BACKSLASH) - let rest = do - start <- Z.take 2 - let !slash = B.unsafeHead start - !t = B.unsafeIndex start 1 - escape = case B.elemIndex t "\"\\/ntbrfu" of - Just i -> i - _ -> 255 - if slash /= BACKSLASH || escape == 255 - then fail "invalid JSON escape sequence" - else - if t /= 117 -- 'u' - then copy h ptr >>= word8 (B.unsafeIndex mapping escape) >>= go - else do - a <- hexQuad - if a < 0xd800 || a > 0xdfff - then copy h ptr >>= charUtf8 (chr a) >>= go - else do - b <- Z.string "\\u" *> hexQuad - if a <= 0xdbff && b >= 0xdc00 && b <= 0xdfff - then let !c = ((a - 0xd800) `shiftL` 10) + - (b - 0xdc00) + 0x10000 - in copy h ptr >>= charUtf8 (chr c) >>= go - else fail "invalid UTF-16 surrogates" - done <- Z.atEnd - if done - then copy h ptr - else rest - mapping = "\"\\/\n\t\b\r\f" - -hexQuad :: Z.ZeptoT IO Int -hexQuad = do - s <- Z.take 4 - let hex n | w >= C_0 && w <= C_9 = w - C_0 - | w >= C_a && w <= C_f = w - 87 - | w >= C_A && w <= C_F = w - 55 - | otherwise = 255 - where w = fromIntegral $ B.unsafeIndex s n - a = hex 0; b = hex 1; c = hex 2; d = hex 3 - if (a .|. b .|. c .|. d) /= 255 - then return $! d .|. (c `shiftL` 4) .|. (b `shiftL` 8) .|. (a `shiftL` 12) - else fail "invalid hex escape" - decodeWith :: Parser Value -> (Value -> Result a) -> L.ByteString -> Maybe a decodeWith p to s = case L.parse p s of @@ -372,38 +288,3 @@ jsonEOF = json <* skipSpace <* endOfInput -- end-of-input. See also: 'json''. jsonEOF' :: Parser Value jsonEOF' = json' <* skipSpace <* endOfInput - -word8 :: Word8 -> Ptr Word8 -> Z.ZeptoT IO (Ptr Word8) -word8 w ptr = do - liftIO $ poke ptr w - return $! ptr `plusPtr` 1 - -copy :: ByteString -> Ptr Word8 -> Z.ZeptoT IO (Ptr Word8) -copy (PS fp off len) ptr = - liftIO . withForeignPtr fp $ \src -> do - copyBytes ptr (src `plusPtr` off) len - return $! ptr `plusPtr` len - -charUtf8 :: Char -> Ptr Word8 -> Z.ZeptoT IO (Ptr Word8) -charUtf8 ch ptr - | ch < '\x80' = liftIO $ do - poke ptr (fromIntegral (ord ch)) - return $! ptr `plusPtr` 1 - | ch < '\x800' = liftIO $ do - let (a,b) = ord2 ch - poke ptr a - poke (ptr `plusPtr` 1) b - return $! ptr `plusPtr` 2 - | ch < '\xffff' = liftIO $ do - let (a,b,c) = ord3 ch - poke ptr a - poke (ptr `plusPtr` 1) b - poke (ptr `plusPtr` 2) c - return $! ptr `plusPtr` 3 - | otherwise = liftIO $ do - let (a,b,c,d) = ord4 ch - poke ptr a - poke (ptr `plusPtr` 1) b - poke (ptr `plusPtr` 2) c - poke (ptr `plusPtr` 3) d - return $! ptr `plusPtr` 4 diff --git a/Data/Aeson/Parser/Unescape.hs b/Data/Aeson/Parser/Unescape.hs new file mode 100644 index 000000000..d3682fd16 --- /dev/null +++ b/Data/Aeson/Parser/Unescape.hs @@ -0,0 +1,51 @@ +{-# LANGUAGE ForeignFunctionInterface #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE UnliftedFFITypes #-} + +module Data.Aeson.Parser.Unescape ( + unescapeText +) where + +import Control.Exception (evaluate, throw, try) +import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) +import Data.ByteString as B +import Data.ByteString.Internal as B hiding (c2w) +import qualified Data.Text.Array as A +import Data.Text.Encoding.Error (UnicodeException (..)) +import Data.Text.Internal (Text (..)) +import Data.Text.Internal.Private (runText) +import Data.Text.Unsafe (unsafeDupablePerformIO) +import Data.Word (Word8) +import Foreign.C.Types (CInt (..), CSize (..)) +import Foreign.ForeignPtr (withForeignPtr) +import Foreign.Marshal.Utils (with) +import Foreign.Ptr (Ptr, plusPtr) +import Foreign.Storable (peek) +import GHC.Base (MutableByteArray#) + +foreign import ccall unsafe "_js_decode_string" c_js_decode + :: MutableByteArray# s -> Ptr CSize + -> Ptr Word8 -> Ptr Word8 -> IO CInt + +unescapeText' :: ByteString -> Text +unescapeText' (PS fp off len) = runText $ \done -> do + let go dest = withForeignPtr fp $ \ptr -> + with (0::CSize) $ \destOffPtr -> do + let end = ptr `plusPtr` (off + len) + loop curPtr = do + res <- c_js_decode (A.maBA dest) destOffPtr curPtr end + case res of + 0 -> do + n <- peek destOffPtr + unsafeSTToIO (done dest (fromIntegral n)) + _ -> + throw (DecodeError desc Nothing) + loop (ptr `plusPtr` off) + (unsafeIOToST . go) =<< A.new len + where + desc = "Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream" +{-# INLINE unescapeText' #-} + +unescapeText :: ByteString -> Either UnicodeException Text +unescapeText = unsafeDupablePerformIO . try . evaluate . unescapeText' +{-# INLINE unescapeText #-} diff --git a/aeson.cabal b/aeson.cabal index 273660c74..05bded1c1 100644 --- a/aeson.cabal +++ b/aeson.cabal @@ -52,6 +52,7 @@ extra-source-files: examples/*.hs examples/Twitter/*.hs include/*.h + cbits/*.c flag developer description: operate in developer mode @@ -86,6 +87,7 @@ library Data.Aeson.Encoding.Builder Data.Aeson.Internal.Functions Data.Aeson.Parser.Internal + Data.Aeson.Parser.Unescape Data.Aeson.Parser.Time Data.Aeson.Types.FromJSON Data.Aeson.Types.Generic @@ -136,6 +138,7 @@ library ghc-options: -O2 -Wall include-dirs: include + c-sources: cbits/unescape_string.c test-suite tests default-language: Haskell2010 diff --git a/benchmarks/aeson-benchmarks.cabal b/benchmarks/aeson-benchmarks.cabal index b6cee5b97..6a2ce9ea1 100644 --- a/benchmarks/aeson-benchmarks.cabal +++ b/benchmarks/aeson-benchmarks.cabal @@ -11,7 +11,7 @@ flag bytestring-builder library hs-source-dirs: .. . - + c-sources: ../cbits/unescape_string.c exposed-modules: Data.Aeson Data.Aeson.Encoding diff --git a/cbits/unescape_string.c b/cbits/unescape_string.c new file mode 100644 index 000000000..2d3120946 --- /dev/null +++ b/cbits/unescape_string.c @@ -0,0 +1,150 @@ +// Copyright (c) 2008-2009 Bjoern Hoehrmann +// Copyright (c) 2015, Ondrej Palkovsky +// Copyright (c) 2016, Winterland + +#include +#include +#include + + +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 12 + +static const uint8_t utf8d[] = { + // The first part of the table maps bytes to character classes that + // to reduce the size of the transition table and create bitmasks. + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, + + // The second part is a transition table that maps a combination + // of a state of the automaton and a character class to a state. + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, +}; + +static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) { + uint32_t type = utf8d[byte]; + + *codep = (*state != UTF8_ACCEPT) ? + (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = utf8d[256 + *state + type]; + return *state; +} + +static inline uint16_t decode_hex(uint32_t c) +{ + if (c >= '0' && c <= '9') return c - '0'; + else if (c >= 'a' && c <= 'f') return c - 'a' + 10; + else if (c >= 'A' && c <= 'F') return c - 'A' + 10; + return 0xFFFF; // Should not happen +} + +// Decode, return non-zero value on error +int _js_decode_string(uint16_t *const dest, size_t *destoff, + const uint8_t *s, const uint8_t *const srcend) +{ + uint16_t *d = dest + *destoff; + uint32_t state = 0; + uint32_t codepoint; + + uint8_t surrogate = 0; + uint16_t temp_hex = 0; + uint16_t unidata; + + // Optimized version of dispatch when just an ASCII char is expected + #define DISPATCH_ASCII(label) {\ + if (s >= srcend) {\ + return -1;\ + }\ + codepoint = *s++;\ + goto label;\ + } + + standard: + // Test end of stream + while (s < srcend) { + if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { + if (state == UTF8_REJECT) { return -1; } + continue; + } + + if (codepoint == '\\') + DISPATCH_ASCII(backslash) + else if (codepoint <= 0xffff) + *d++ = (uint16_t) codepoint; + else { + *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); + *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); + } + } + *destoff = d - dest; + // Exit point + return (state != UTF8_ACCEPT); + backslash: + switch (codepoint) { + case '"': + case '\\': + case '/': + *d++ = (uint16_t) codepoint; + goto standard; + break; + case 'b': *d++ = '\b';goto standard; + case 'f': *d++ = '\f';goto standard; + case 'n': *d++ = '\n';goto standard; + case 'r': *d++ = '\r';goto standard; + case 't': *d++ = '\t';goto standard; + case 'u': DISPATCH_ASCII(unicode1);;break; + default: + return -1; + } + unicode1: + temp_hex = decode_hex(codepoint); + if (temp_hex == 0xFFFF) { return -1; } + else unidata = temp_hex << 12; + DISPATCH_ASCII(unicode2); + unicode2: + temp_hex = decode_hex(codepoint); + if (temp_hex == 0xFFFF) { return -1; } + else unidata |= temp_hex << 8; + DISPATCH_ASCII(unicode3); + unicode3: + temp_hex = decode_hex(codepoint); + if (temp_hex == 0xFFFF) { return -1; } + else unidata |= temp_hex << 4; + DISPATCH_ASCII(unicode4); + unicode4: + temp_hex = decode_hex(codepoint); + if (temp_hex == 0xFFFF) { return -1; } + else unidata |= temp_hex; + *d++ = (uint16_t) unidata; + + if (surrogate) { + if (unidata <= 0xDC00 || unidata >= 0xDFFF) // is not low surrogate + return -1; + surrogate = 0; + } else if (unidata >= 0xD800 && unidata <= 0xDBFF ) { // is high surrogate + surrogate = 1; + DISPATCH_ASCII(surrogate1); + } else if (unidata >= 0xDC00 && unidata <= 0xDFFF) { // is low surrogate + return -1; + } + goto standard; + surrogate1: + if (codepoint != '\\') { return -1; } + DISPATCH_ASCII(surrogate2) + surrogate2: + if (codepoint != 'u') { return -1; } + DISPATCH_ASCII(unicode1) +} +