diff --git a/src/Text/Parsing/StringParser/String.purs b/src/Text/Parsing/StringParser/CodePoints.purs similarity index 91% rename from src/Text/Parsing/StringParser/String.purs rename to src/Text/Parsing/StringParser/CodePoints.purs index 1f0b2b9..3a8a9d6 100644 --- a/src/Text/Parsing/StringParser/String.purs +++ b/src/Text/Parsing/StringParser/CodePoints.purs @@ -1,6 +1,9 @@ --- | Primitive parsers for strings. - -module Text.Parsing.StringParser.String +-- | Primitive parsers for strings, parsing based on code points. +-- | +-- | These functions will be much slower than the `CodeUnits` alternatives, but +-- | will behave correctly in the presence of Unicode characters made up of +-- | multiple code units. +module Text.Parsing.StringParser.CodePoints ( eof , anyChar , anyDigit @@ -27,8 +30,9 @@ import Data.Char (toCharCode) import Data.Either (Either(..)) import Data.Foldable (class Foldable, foldMap, elem, notElem) import Data.Maybe (Maybe(..)) -import Data.String (Pattern(..), drop, length, indexOf', stripPrefix) +import Data.String.CodePoints (drop, length, indexOf', stripPrefix) import Data.String.CodeUnits (charAt, singleton) +import Data.String.Pattern (Pattern(..)) import Data.String.Regex as Regex import Data.String.Regex.Flags (noFlags) import Text.Parsing.StringParser (Parser(..), ParseError(..), try, fail) diff --git a/src/Text/Parsing/StringParser/CodeUnits.purs b/src/Text/Parsing/StringParser/CodeUnits.purs new file mode 100644 index 0000000..7100480 --- /dev/null +++ b/src/Text/Parsing/StringParser/CodeUnits.purs @@ -0,0 +1,150 @@ +-- | Primitive parsers for strings, parsing based on code units. +-- | +-- | These functions will be much faster than the `CodePoints` alternatives, but +-- | will behave incorrectly when dealing with Unicode characters that consist +-- | of multiple code units. +module Text.Parsing.StringParser.CodeUnits + ( eof + , anyChar + , anyDigit + , string + , satisfy + , char + , whiteSpace + , skipSpaces + , oneOf + , noneOf + , lowerCaseChar + , upperCaseChar + , anyLetter + , alphaNum + , regex + ) where + +import Prelude + +import Control.Alt ((<|>)) +import Data.Array ((..)) +import Data.Array.NonEmpty as NEA +import Data.Char (toCharCode) +import Data.Either (Either(..)) +import Data.Foldable (class Foldable, foldMap, elem, notElem) +import Data.Maybe (Maybe(..)) +import Data.String.CodeUnits (charAt, singleton) +import Data.String.CodeUnits as SCU +import Data.String.Pattern (Pattern(..)) +import Data.String.Regex as Regex +import Data.String.Regex.Flags (noFlags) +import Text.Parsing.StringParser (Parser(..), ParseError(..), try, fail) +import Text.Parsing.StringParser.Combinators (many, ()) + +-- | Match the end of the file. +eof :: Parser Unit +eof = Parser \s -> + case s of + { str, pos } | pos < SCU.length str -> Left { pos, error: ParseError "Expected EOF" } + _ -> Right { result: unit, suffix: s } + +-- | Match any character. +anyChar :: Parser Char +anyChar = Parser \{ str, pos } -> + case charAt pos str of + Just chr -> Right { result: chr, suffix: { str, pos: pos + 1 } } + Nothing -> Left { pos, error: ParseError "Unexpected EOF" } + +-- | Match any digit. +anyDigit :: Parser Char +anyDigit = try do + c <- anyChar + if c >= '0' && c <= '9' + then pure c + else fail $ "Character " <> show c <> " is not a digit" + +-- | Match the specified string. +string :: String -> Parser String +string nt = Parser \s -> + case s of + { str, pos } | SCU.indexOf' (Pattern nt) pos str == Just pos -> Right { result: nt, suffix: { str, pos: pos + SCU.length nt } } + { pos } -> Left { pos, error: ParseError ("Expected '" <> nt <> "'.") } + +-- | Match a character satisfying the given predicate. +satisfy :: (Char -> Boolean) -> Parser Char +satisfy f = try do + c <- anyChar + if f c + then pure c + else fail $ "Character " <> show c <> " did not satisfy predicate" + +-- | Match the specified character. +char :: Char -> Parser Char +char c = satisfy (_ == c) "Could not match character " <> show c + +-- | Match many whitespace characters. +whiteSpace :: Parser String +whiteSpace = do + cs <- many (satisfy \ c -> c == '\n' || c == '\r' || c == ' ' || c == '\t') + pure (foldMap singleton cs) + +-- | Skip many whitespace characters. +skipSpaces :: Parser Unit +skipSpaces = void whiteSpace + +-- | Match one of the characters in the foldable structure. +oneOf :: forall f. Foldable f => f Char -> Parser Char +oneOf = satisfy <<< flip elem + +-- | Match any character not in the foldable structure. +noneOf :: forall f. Foldable f => f Char -> Parser Char +noneOf = satisfy <<< flip notElem + +-- | Match any lower case character. +lowerCaseChar :: Parser Char +lowerCaseChar = try do + c <- anyChar + if toCharCode c `elem` (97 .. 122) + then pure c + else fail $ "Expected a lower case character but found " <> show c + +-- | Match any upper case character. +upperCaseChar :: Parser Char +upperCaseChar = try do + c <- anyChar + if toCharCode c `elem` (65 .. 90) + then pure c + else fail $ "Expected an upper case character but found " <> show c + +-- | Match any letter. +anyLetter :: Parser Char +anyLetter = lowerCaseChar <|> upperCaseChar "Expected a letter" + +-- | Match a letter or a number. +alphaNum :: Parser Char +alphaNum = anyLetter <|> anyDigit "Expected a letter or a number" + +-- | match the regular expression +regex :: String -> Parser String +regex pat = + case Regex.regex pattern noFlags of + Left _ -> + fail $ "Text.Parsing.StringParser.String.regex': illegal regex " <> pat + Right r -> + matchRegex r + where + -- ensure the pattern only matches the current position in the parse + pattern = + case SCU.stripPrefix (Pattern "^") pat of + Nothing -> + "^" <> pat + _ -> + pat + matchRegex :: Regex.Regex -> Parser String + matchRegex r = + Parser \{ str, pos } -> + let + remainder = SCU.drop pos str + in + case NEA.head <$> Regex.match r remainder of + Just (Just matched) -> + Right { result: matched, suffix: { str, pos: pos + SCU.length matched } } + _ -> + Left { pos, error: ParseError "no match" } diff --git a/test/CodePoints.purs b/test/CodePoints.purs new file mode 100644 index 0000000..2fb37d2 --- /dev/null +++ b/test/CodePoints.purs @@ -0,0 +1,99 @@ +module Test.CodePoints where + +import Prelude hiding (between) + +import Control.Alt ((<|>)) +import Data.Either (isLeft, isRight, Either(..)) +import Data.Foldable (fold) +import Data.List (List(Nil), (:)) +import Data.List.Lazy (take, repeat) +import Data.List.NonEmpty (NonEmptyList(..)) +import Data.NonEmpty ((:|)) +import Data.String.CodeUnits (singleton) +import Data.String.Common as SC +import Data.Unfoldable (replicate) +import Effect (Effect) +import Test.Assert (assert', assert) +import Text.Parsing.StringParser (Parser, runParser, try) +import Text.Parsing.StringParser.Combinators (many1, endBy1, sepBy1, optionMaybe, many, manyTill, many1Till, chainl, fix, between) +import Text.Parsing.StringParser.Expr (Assoc(..), Operator(..), buildExprParser) +import Text.Parsing.StringParser.CodePoints (anyDigit, eof, string, anyChar, regex) + +parens :: forall a. Parser a -> Parser a +parens = between (string "(") (string ")") + +nested :: Parser Int +nested = fix $ \p -> (do + _ <- string "a" + pure 0) <|> ((+) 1) <$> parens p + +opTest :: Parser String +opTest = chainl (singleton <$> anyChar) (string "+" $> append) "" + +digit :: Parser Int +digit = string "0" $> 0 + <|> string "1" $> 1 + <|> string "2" $> 2 + <|> string "3" $> 3 + <|> string "4" $> 4 + <|> string "5" $> 5 + <|> string "6" $> 6 + <|> string "7" $> 7 + <|> string "8" $> 8 + <|> string "9" $> 9 + +exprTest :: Parser Int +exprTest = buildExprParser [ [Infix (string "/" >>= \_ -> pure div) AssocRight] + , [Infix (string "*" >>= \_ -> pure mul) AssocRight] + , [Infix (string "-" >>= \_ -> pure sub) AssocRight] + , [Infix (string "+" >>= \_ -> pure add) AssocRight] + ] digit + +tryTest :: Parser String + -- reduce the possible array of matches to 0 or 1 elements to aid Array pattern matching +tryTest = + try (string "aa" <> string "bb") <|> + (string "aa" <> string "cc") + +canParse :: forall a. Parser a -> String -> Boolean +canParse p input = isRight $ runParser p input + +parseFail :: forall a. Parser a -> String -> Boolean +parseFail p input = isLeft $ runParser p input + +expectResult :: forall a. Eq a => a -> Parser a -> String -> Boolean +expectResult res p input = runParser p input == Right res + +testCodePoints :: Effect Unit +testCodePoints = do + assert' "many should not blow the stack" $ canParse (many (string "a")) (SC.joinWith "" $ replicate 100000 "a") + assert' "many failing after" $ parseFail (do + as <- many (string "a") + eof + pure as) (SC.joinWith "" (replicate 100000 "a") <> "b" ) + + assert $ expectResult 3 nested "(((a)))" + assert $ expectResult ("a":"a":"a":Nil) (many (string "a")) "aaa" + assert $ parseFail (many1 (string "a")) "" + assert $ canParse (parens (do + _ <- string "a" + optionMaybe $ string "b")) "(ab)" + assert $ expectResult (NonEmptyList ("a" :| "a":"a":Nil)) (string "a" `sepBy1` string ",") "a,a,a" + assert $ canParse (do + as <- string "a" `endBy1` string "," + eof + pure as) "a,a,a," + assert' "opTest" $ expectResult "abc" opTest "a+b+c" + assert' "exprTest" $ expectResult (-3) exprTest "1*2+3/4-5" + assert' "tryTest "$ canParse tryTest "aacc" + assert $ expectResult (NonEmptyList ('0' :| '1':'2':'3':'4':Nil)) (many1 anyDigit) "01234/" + assert $ expectResult (NonEmptyList ('5' :| '6':'7':'8':'9':Nil)) (many1 anyDigit) "56789:" + assert $ expectResult "aaaa" (regex "a+") "aaaab" + assert $ expectResult ("a":"a":"a":Nil) (manyTill (string "a") (string "b")) "aaab" + assert $ expectResult Nil (manyTill (string "a") (string "b")) "b" + assert $ expectResult (NonEmptyList ("a" :| "a":"a":Nil)) (many1Till (string "a") (string "b")) "aaab" + assert $ parseFail (many1Till (string "a") (string "b")) "b" + -- check against overflow + assert $ canParse (many1Till (string "a") (string "and")) $ (fold <<< take 10000 $ repeat "a") <> "and" + -- check correct order + assert $ expectResult (NonEmptyList ('a' :| 'b':'c':Nil)) (many1Till anyChar (string "d")) "abcd" diff --git a/test/CodeUnits.purs b/test/CodeUnits.purs new file mode 100644 index 0000000..ff81cec --- /dev/null +++ b/test/CodeUnits.purs @@ -0,0 +1,99 @@ +module Test.CodeUnits where + +import Prelude hiding (between) + +import Control.Alt ((<|>)) +import Data.Either (isLeft, isRight, Either(..)) +import Data.Foldable (fold) +import Data.List (List(Nil), (:)) +import Data.List.Lazy (take, repeat) +import Data.List.NonEmpty (NonEmptyList(..)) +import Data.NonEmpty ((:|)) +import Data.String.CodeUnits (singleton) +import Data.String.Common as SC +import Data.Unfoldable (replicate) +import Effect (Effect) +import Test.Assert (assert', assert) +import Text.Parsing.StringParser (Parser, runParser, try) +import Text.Parsing.StringParser.Combinators (many1, endBy1, sepBy1, optionMaybe, many, manyTill, many1Till, chainl, fix, between) +import Text.Parsing.StringParser.Expr (Assoc(..), Operator(..), buildExprParser) +import Text.Parsing.StringParser.CodeUnits (anyDigit, eof, string, anyChar, regex) + +parens :: forall a. Parser a -> Parser a +parens = between (string "(") (string ")") + +nested :: Parser Int +nested = fix $ \p -> (do + _ <- string "a" + pure 0) <|> ((+) 1) <$> parens p + +opTest :: Parser String +opTest = chainl (singleton <$> anyChar) (string "+" $> append) "" + +digit :: Parser Int +digit = string "0" $> 0 + <|> string "1" $> 1 + <|> string "2" $> 2 + <|> string "3" $> 3 + <|> string "4" $> 4 + <|> string "5" $> 5 + <|> string "6" $> 6 + <|> string "7" $> 7 + <|> string "8" $> 8 + <|> string "9" $> 9 + +exprTest :: Parser Int +exprTest = buildExprParser [ [Infix (string "/" >>= \_ -> pure div) AssocRight] + , [Infix (string "*" >>= \_ -> pure mul) AssocRight] + , [Infix (string "-" >>= \_ -> pure sub) AssocRight] + , [Infix (string "+" >>= \_ -> pure add) AssocRight] + ] digit + +tryTest :: Parser String + -- reduce the possible array of matches to 0 or 1 elements to aid Array pattern matching +tryTest = + try (string "aa" <> string "bb") <|> + (string "aa" <> string "cc") + +canParse :: forall a. Parser a -> String -> Boolean +canParse p input = isRight $ runParser p input + +parseFail :: forall a. Parser a -> String -> Boolean +parseFail p input = isLeft $ runParser p input + +expectResult :: forall a. Eq a => a -> Parser a -> String -> Boolean +expectResult res p input = runParser p input == Right res + +testCodeUnits :: Effect Unit +testCodeUnits = do + assert' "many should not blow the stack" $ canParse (many (string "a")) (SC.joinWith "" $ replicate 100000 "a") + assert' "many failing after" $ parseFail (do + as <- many (string "a") + eof + pure as) (SC.joinWith "" (replicate 100000 "a") <> "b" ) + + assert $ expectResult 3 nested "(((a)))" + assert $ expectResult ("a":"a":"a":Nil) (many (string "a")) "aaa" + assert $ parseFail (many1 (string "a")) "" + assert $ canParse (parens (do + _ <- string "a" + optionMaybe $ string "b")) "(ab)" + assert $ expectResult (NonEmptyList ("a" :| "a":"a":Nil)) (string "a" `sepBy1` string ",") "a,a,a" + assert $ canParse (do + as <- string "a" `endBy1` string "," + eof + pure as) "a,a,a," + assert' "opTest" $ expectResult "abc" opTest "a+b+c" + assert' "exprTest" $ expectResult (-3) exprTest "1*2+3/4-5" + assert' "tryTest "$ canParse tryTest "aacc" + assert $ expectResult (NonEmptyList ('0' :| '1':'2':'3':'4':Nil)) (many1 anyDigit) "01234/" + assert $ expectResult (NonEmptyList ('5' :| '6':'7':'8':'9':Nil)) (many1 anyDigit) "56789:" + assert $ expectResult "aaaa" (regex "a+") "aaaab" + assert $ expectResult ("a":"a":"a":Nil) (manyTill (string "a") (string "b")) "aaab" + assert $ expectResult Nil (manyTill (string "a") (string "b")) "b" + assert $ expectResult (NonEmptyList ("a" :| "a":"a":Nil)) (many1Till (string "a") (string "b")) "aaab" + assert $ parseFail (many1Till (string "a") (string "b")) "b" + -- check against overflow + assert $ canParse (many1Till (string "a") (string "and")) $ (fold <<< take 10000 $ repeat "a") <> "and" + -- check correct order + assert $ expectResult (NonEmptyList ('a' :| 'b':'c':Nil)) (many1Till anyChar (string "d")) "abcd" diff --git a/test/Main.purs b/test/Main.purs index 94d4c5a..c1f178f 100644 --- a/test/Main.purs +++ b/test/Main.purs @@ -1,99 +1,16 @@ module Test.Main where -import Prelude hiding (between) +import Prelude -import Control.Alt ((<|>)) -import Data.Either (isLeft, isRight, Either(..)) -import Data.Foldable (fold) -import Data.List (List(Nil), (:)) -import Data.List.Lazy (take, repeat) -import Data.List.NonEmpty (NonEmptyList(..)) -import Data.NonEmpty ((:|)) -import Data.String (joinWith) -import Data.String.CodeUnits (singleton) -import Data.Unfoldable (replicate) import Effect (Effect) -import Test.Assert (assert', assert) -import Text.Parsing.StringParser (Parser, runParser, try) -import Text.Parsing.StringParser.Combinators (many1, endBy1, sepBy1, optionMaybe, many, manyTill, many1Till, chainl, fix, between) -import Text.Parsing.StringParser.Expr (Assoc(..), Operator(..), buildExprParser) -import Text.Parsing.StringParser.String (anyDigit, eof, string, anyChar, regex) - -parens :: forall a. Parser a -> Parser a -parens = between (string "(") (string ")") - -nested :: Parser Int -nested = fix $ \p -> (do - _ <- string "a" - pure 0) <|> ((+) 1) <$> parens p - -opTest :: Parser String -opTest = chainl (singleton <$> anyChar) (string "+" $> append) "" - -digit :: Parser Int -digit = string "0" $> 0 - <|> string "1" $> 1 - <|> string "2" $> 2 - <|> string "3" $> 3 - <|> string "4" $> 4 - <|> string "5" $> 5 - <|> string "6" $> 6 - <|> string "7" $> 7 - <|> string "8" $> 8 - <|> string "9" $> 9 - -exprTest :: Parser Int -exprTest = buildExprParser [ [Infix (string "/" >>= \_ -> pure div) AssocRight] - , [Infix (string "*" >>= \_ -> pure mul) AssocRight] - , [Infix (string "-" >>= \_ -> pure sub) AssocRight] - , [Infix (string "+" >>= \_ -> pure add) AssocRight] - ] digit - -tryTest :: Parser String - -- reduce the possible array of matches to 0 or 1 elements to aid Array pattern matching -tryTest = - try (string "aa" <> string "bb") <|> - (string "aa" <> string "cc") - -canParse :: forall a. Parser a -> String -> Boolean -canParse p input = isRight $ runParser p input - -parseFail :: forall a. Parser a -> String -> Boolean -parseFail p input = isLeft $ runParser p input - -expectResult :: forall a. (Eq a) => a -> Parser a -> String -> Boolean -expectResult res p input = runParser p input == Right res +import Effect.Console (log) +import Test.CodePoints (testCodePoints) +import Test.CodeUnits (testCodeUnits) main :: Effect Unit main = do - assert' "many should not blow the stack" $ canParse (many (string "a")) (joinWith "" $ replicate 100000 "a") - assert' "many failing after" $ parseFail (do - as <- many (string "a") - eof - pure as) (joinWith "" (replicate 100000 "a") <> "b" ) + log "Testing CodePoint parsing\n" + testCodePoints - assert $ expectResult 3 nested "(((a)))" - assert $ expectResult ("a":"a":"a":Nil) (many (string "a")) "aaa" - assert $ parseFail (many1 (string "a")) "" - assert $ canParse (parens (do - _ <- string "a" - optionMaybe $ string "b")) "(ab)" - assert $ expectResult (NonEmptyList ("a" :| "a":"a":Nil)) (string "a" `sepBy1` string ",") "a,a,a" - assert $ canParse (do - as <- string "a" `endBy1` string "," - eof - pure as) "a,a,a," - assert' "opTest" $ expectResult "abc" opTest "a+b+c" - assert' "exprTest" $ expectResult (-3) exprTest "1*2+3/4-5" - assert' "tryTest "$ canParse tryTest "aacc" - assert $ expectResult (NonEmptyList ('0' :| '1':'2':'3':'4':Nil)) (many1 anyDigit) "01234/" - assert $ expectResult (NonEmptyList ('5' :| '6':'7':'8':'9':Nil)) (many1 anyDigit) "56789:" - assert $ expectResult "aaaa" (regex "a+") "aaaab" - assert $ expectResult ("a":"a":"a":Nil) (manyTill (string "a") (string "b")) "aaab" - assert $ expectResult Nil (manyTill (string "a") (string "b")) "b" - assert $ expectResult (NonEmptyList ("a" :| "a":"a":Nil)) (many1Till (string "a") (string "b")) "aaab" - assert $ parseFail (many1Till (string "a") (string "b")) "b" - -- check against overflow - assert $ canParse (many1Till (string "a") (string "and")) $ (fold <<< take 10000 $ repeat "a") <> "and" - -- check correct order - assert $ expectResult (NonEmptyList ('a' :| 'b':'c':Nil)) (many1Till anyChar (string "d")) "abcd" + log "\n\nTesting CodeUnit parsing\n" + testCodeUnits