Skip to content

Commit

Permalink
big boi rewrite
Browse files Browse the repository at this point in the history
  • Loading branch information
emilypi committed Jun 16, 2020
1 parent 10f4255 commit 17e7c3a
Show file tree
Hide file tree
Showing 9 changed files with 380 additions and 285 deletions.
8 changes: 6 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
#
# haskell-ci 'base16-bytestring.cabal'
#
# To regenerate the script (for example after adjusting tested-with) run
#
# haskell-ci regenerate
#
# For more information, see https://github.com/haskell-CI/haskell-ci
#
# version: 0.9.20200406
# version: 0.10.1
#
version: ~> 1.0
language: c
Expand Down Expand Up @@ -162,5 +166,5 @@ script:
- rm -f cabal.project.local
- ${CABAL} v2-build $WITHCOMPILER --disable-tests --disable-benchmarks all

# REGENDATA ("0.9.20200406",["base16-bytestring.cabal"])
# REGENDATA ("0.10.1",["base16-bytestring.cabal"])
# EOF
183 changes: 64 additions & 119 deletions Data/ByteString/Base16.hs
Original file line number Diff line number Diff line change
@@ -1,32 +1,39 @@
{-# LANGUAGE BangPatterns, MagicHash #-}

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE MagicHash #-}
-- |
-- Module : Data.ByteString.Base16
-- Copyright : (c) 2011 MailRank, Inc.
--
-- License : BSD
-- Maintainer : bos@serpentine.com
-- Maintainer : Herbert Valerio Riedel <hvr@gnu.org>,
-- Mikhail Glushenkov <mikhail.glushenkov@gmail.com>,
-- Emily Pillmore <emilypi@cohomolo.gy>
-- Stability : experimental
-- Portability : GHC
--
-- Fast and efficient encoding and decoding of base16-encoded strings.

--
module Data.ByteString.Base16
(
encode
, decode
) where
( encode
, decode
, decodeLenient
) where

import Data.ByteString (empty)
import Data.ByteString.Base16.Internal
import Data.ByteString.Internal

import Foreign.ForeignPtr
import Foreign.Ptr

import GHC.ForeignPtr
#if __GLASGOW_HASKELL__ >= 702
import System.IO.Unsafe (unsafeDupablePerformIO)
#else
import GHC.IO (unsafeDupablePerformIO)
#endif

import Data.ByteString.Char8 (empty)
import Data.ByteString.Internal (ByteString(..), createAndTrim', unsafeCreate)
import Data.Bits (shiftL)
import Foreign.ForeignPtr (ForeignPtr, withForeignPtr)
import Foreign.Ptr (Ptr, minusPtr, plusPtr)
import Foreign.Storable (peek, poke)
import System.IO.Unsafe (unsafePerformIO)
import GHC.Prim
import GHC.Types
import GHC.Word

-- | Encode a string into base16 form. The result will always be a
-- multiple of 2 bytes in length.
Expand All @@ -37,108 +44,46 @@ import GHC.Word
encode :: ByteString -> ByteString
encode (PS sfp soff slen)
| slen > maxBound `div` 2 =
error "Data.ByteString.Base16.encode: input too long"
| otherwise = unsafeCreate (slen*2) $ \dptr ->
withForeignPtr sfp $ \sptr ->
enc (sptr `plusPtr` soff) dptr
where
enc sptr = go sptr where
e = sptr `plusPtr` slen
go s d | s == e = return ()
| otherwise = do
x <- peek8 s
poke d (tlookup tableHi x)
poke (d `plusPtr` 1) (tlookup tableLo x)
go (s `plusPtr` 1) (d `plusPtr` 2)
tlookup :: Addr# -> Int -> Word8
tlookup table (I# index) = W8# (indexWord8OffAddr# table index)
!tableLo =
"\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
\\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
\\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
\\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
\\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
\\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
\\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
\\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
\\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
\\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
\\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
\\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
\\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
\\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
\\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
\\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66"#
!tableHi =
"\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\
\\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\
\\x32\x32\x32\x32\x32\x32\x32\x32\x32\x32\x32\x32\x32\x32\x32\x32\
\\x33\x33\x33\x33\x33\x33\x33\x33\x33\x33\x33\x33\x33\x33\x33\x33\
\\x34\x34\x34\x34\x34\x34\x34\x34\x34\x34\x34\x34\x34\x34\x34\x34\
\\x35\x35\x35\x35\x35\x35\x35\x35\x35\x35\x35\x35\x35\x35\x35\x35\
\\x36\x36\x36\x36\x36\x36\x36\x36\x36\x36\x36\x36\x36\x36\x36\x36\
\\x37\x37\x37\x37\x37\x37\x37\x37\x37\x37\x37\x37\x37\x37\x37\x37\
\\x38\x38\x38\x38\x38\x38\x38\x38\x38\x38\x38\x38\x38\x38\x38\x38\
\\x39\x39\x39\x39\x39\x39\x39\x39\x39\x39\x39\x39\x39\x39\x39\x39\
\\x61\x61\x61\x61\x61\x61\x61\x61\x61\x61\x61\x61\x61\x61\x61\x61\
\\x62\x62\x62\x62\x62\x62\x62\x62\x62\x62\x62\x62\x62\x62\x62\x62\
\\x63\x63\x63\x63\x63\x63\x63\x63\x63\x63\x63\x63\x63\x63\x63\x63\
\\x64\x64\x64\x64\x64\x64\x64\x64\x64\x64\x64\x64\x64\x64\x64\x64\
\\x65\x65\x65\x65\x65\x65\x65\x65\x65\x65\x65\x65\x65\x65\x65\x65\
\\x66\x66\x66\x66\x66\x66\x66\x66\x66\x66\x66\x66\x66\x66\x66\x66"#
error "Data.ByteString.Base16.encode: input too long"
| otherwise = unsafeCreate (slen * 2) $ \dptr ->
withForeignPtr sfp $ \sptr ->
encodeLoop dptr
(sptr `plusPtr` soff)
(sptr `plusPtr` (soff + slen))

-- | Decode a string from base16 form. The first element of the
-- returned tuple contains the decoded data. The second element starts
-- at the first invalid base16 sequence in the original string.
-- | Decode a base16-encoded 'ByteString' value.
--
-- Examples:
-- If errors are encountered during the decoding process,
-- then they will be returned in the @Left@ clause of the
-- coproduct.
--
-- > decode "666f6f" == ("foo", "")
-- > decode "66quux" == ("f", "quux")
-- > decode "666quux" == ("f", "6quux")
decode :: ByteString -> (ByteString, ByteString)
decode (PS sfp soff slen) =
unsafePerformIO . createAndTrim' (slen `div` 2) $ \dptr ->
withForeignPtr sfp $ \sptr ->
dec (sptr `plusPtr` soff) dptr
where
dec sptr = go sptr where
e = sptr `plusPtr` if odd slen then slen - 1 else slen
go s d | s == e = let len = e `minusPtr` sptr
in return (0, len `div` 2, ps sfp (soff+len) (slen-len))
| otherwise = do
hi <- hex `fmap` peek8 s
lo <- hex `fmap` peek8 (s `plusPtr` 1)
if lo == 0xff || hi == 0xff
then let len = s `minusPtr` sptr
in return (0, len `div` 2, ps sfp (soff+len) (slen-len))
else do
poke d . fromIntegral $ lo + (hi `shiftL` 4)
go (s `plusPtr` 2) (d `plusPtr` 1)

hex (I# index) = W8# (indexWord8OffAddr# table index)
!table =
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
\\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\xff\xff\xff\xff\xff\xff\
\\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff\
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
\\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff\
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
\\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"#
decode :: ByteString -> Either String ByteString
decode (PS sfp soff slen)
| slen == 0 = Right empty
| r /= 0 = Left "invalid bytestring size"
| otherwise = unsafeDupablePerformIO $ do
dfp <- mallocPlainForeignPtrBytes q
withForeignPtr dfp $ \dptr ->
withForeignPtr sfp $ \sptr ->
decodeLoop dfp dptr
(plusPtr sptr soff)
(plusPtr sptr (soff + slen))
where
!q = slen `quot` 2
!r = slen `rem` 2

peek8 :: Ptr Word8 -> IO Int
peek8 p = fromIntegral `fmap` peek p

ps :: ForeignPtr Word8 -> Int -> Int -> ByteString
ps fp off len
| len <= 0 = empty
| otherwise = PS fp off len
decodeLenient :: ByteString -> ByteString
decodeLenient (PS !sfp !soff !slen)
| slen == 0 = empty
| otherwise = unsafeDupablePerformIO $ do
dfp <- mallocPlainForeignPtrBytes dlen
withForeignPtr dfp $ \dptr ->
withForeignPtr sfp $ \sptr ->
lenientLoop
dfp
dptr
(plusPtr sptr soff)
(plusPtr sptr (soff + slen))
where
!q = slen `quot` 2
!dlen = q * 2
158 changes: 158 additions & 0 deletions Data/ByteString/Base16/Internal.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE MagicHash #-}
module Data.ByteString.Base16.Internal
( -- * worker loops
encodeLoop
, decodeLoop
, lenientLoop
-- * utils
, c2w
, aix
, reChunk
, unsafeShiftR
) where


import Data.Bits ((.&.), (.|.))
import qualified Data.ByteString as B
import Data.ByteString.Internal (ByteString(..))
import Data.Char (ord)

import Foreign.ForeignPtr
import Foreign.Ptr
import Foreign.Storable

import GHC.Word
import GHC.Exts
(Int(I#), Addr#, indexWord8OffAddr#, word2Int#, uncheckedShiftRL#)


-- ------------------------------------------------------------------ --
-- Loops

encodeLoop
:: Ptr Word8
-> Ptr Word8
-> Ptr Word8
-> IO ()
encodeLoop !dptr !sptr !end = go dptr sptr
where
!hex = "0123456789abcdef"#

go !dst !src
| src == end = return ()
| otherwise = do
!t <- peek src

poke dst (aix (unsafeShiftR t 4) hex)
poke (plusPtr dst 1) (aix (t .&. 0x0f) hex)

go (plusPtr dst 2) (plusPtr src 1)
{-# INLINE encodeLoop #-}

decodeLoop
:: ForeignPtr Word8
-> Ptr Word8
-> Ptr Word8
-> Ptr Word8
-> IO (Either String ByteString)
decodeLoop !dfp !dptr !sptr !end = go dptr sptr
where
err !src = return . Left
$ "invalid character at offset: "
++ show (src `minusPtr` sptr)

!lo = "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\xff\xff\xff\xff\xff\xff\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"#

!hi = "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x00\x10\x20\x30\x40\x50\x60\x70\x80\x90\xff\xff\xff\xff\xff\xff\xff\xa0\xb0\xc0\xd0\xe0\xf0\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xa0\xb0\xc0\xd0\xe0\xf0\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"#

go !dst !src
| src == end = return (Right (PS dfp 0 (src `minusPtr` sptr)))
| otherwise = do
!x <- peek src
!y <- peek (plusPtr src 1)

let !a = aix x hi
!b = aix y lo

if a == 0xff
then err src
else
if b == 0xff
then err (plusPtr src 1)
else do
poke dst (a .|. b)
go (plusPtr dst 1) (plusPtr src 2)
{-# INLINE decodeLoop #-}

lenientLoop
:: ForeignPtr Word8
-> Ptr Word8
-> Ptr Word8
-> Ptr Word8
-> IO ByteString
lenientLoop !dfp !dptr !sptr !end = goHi dptr sptr 0
where
!lo = "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\xff\xff\xff\xff\xff\xff\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"#

!hi = "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x00\x10\x20\x30\x40\x50\x60\x70\x80\x90\xff\xff\xff\xff\xff\xff\xff\xa0\xb0\xc0\xd0\xe0\xf0\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xa0\xb0\xc0\xd0\xe0\xf0\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"#

goHi !dst !src !n
| src == end = return (PS dfp 0 n)
| otherwise = do
!x <- peek src

let !a = aix x hi

if a == 0xff
then goHi dst (plusPtr src 1) n
else goLo dst (plusPtr src 1) a n

goLo !dst !src !a !n
| src == end = return (PS dfp 0 n)
| otherwise = do
!y <- peek src

let !b = aix y lo

if b == 0xff
then goLo dst (plusPtr src 1) a n
else do
poke dst (a .|. b)
goHi (plusPtr dst 1) (plusPtr src 1) (n + 1)
{-# LANGUAGE lenientLoop #-}


-- ------------------------------------------------------------------ --
-- Utils

aix :: Word8 -> Addr# -> Word8
aix (W8# w) table = W8# (indexWord8OffAddr# table (word2Int# w))
{-# INLINE aix #-}

-- | Form a list of chunks, and rechunk the list of bytestrings
-- into length multiples of 2
--
reChunk :: [ByteString] -> [ByteString]
reChunk [] = []
reChunk (c:cs) = case B.length c `divMod` 2 of
(_, 0) -> c : reChunk cs
(n, _) -> case B.splitAt (n * 2) c of
~(m, q) -> m : cont_ q cs
where
cont_ q [] = [q]
cont_ q (a:as) = case B.splitAt 1 a of
~(x, y) -> let q' = B.append q x
in if B.length q' == 2
then
let as' = if B.null y then as else y:as
in q' : reChunk as'
else cont_ q' as

unsafeShiftR :: Word8 -> Int -> Word8
unsafeShiftR (W8# x#) (I# i#) = W8# (x# `uncheckedShiftRL#` i#)
{-# INLINE unsafeShiftR #-}

c2w :: Char -> Word8
c2w = fromIntegral . ord
{-# INLINE c2w #-}
Loading

0 comments on commit 17e7c3a

Please sign in to comment.