diff --git a/CHANGELOG.md b/CHANGELOG.md index 81f9506..424879e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ Breaking changes: prevents a compiler error (i.e. `MixedAssociativityError`) without causing issues with `<$>`. - Rename module prefix from `Text.Parsing.Parser` to `Parsing` (#169 by @jamesdbrock) +- Delete the `regex` parser and replace it with `mkRegex`. (#170 by @jamesdbrock) New features: diff --git a/bench/Json/Parsing.purs b/bench/Json/Parsing.purs index cf6defa..bb109d1 100644 --- a/bench/Json/Parsing.purs +++ b/bench/Json/Parsing.purs @@ -4,13 +4,18 @@ import Prelude hiding (between) import Bench.Json.Common (Json(..)) import Control.Lazy (defer) +import Data.Either (Either(..)) import Data.List (List) import Data.Maybe (Maybe(..)) import Data.Number as Number +import Data.String.Regex.Flags (noFlags) import Data.Tuple (Tuple(..)) +import Effect.Exception (throw) +import Effect.Unsafe (unsafePerformEffect) import Parsing (ParserT, fail) import Parsing.Combinators (between, choice, sepBy, try) import Parsing.String (regex, skipSpaces, string) +import Partial.Unsafe (unsafeCrashWith) json :: forall m. Monad m => ParserT String m Json json = defer \_ -> @@ -38,15 +43,18 @@ jsonArray = defer \_ -> json `sepBy` (try (skipSpaces *> string ",")) jsonString :: forall m. Monad m => ParserT String m String -jsonString = between (string "\"") (string "\"") do - regex {} """\\"|[^"]*""" +jsonString = case regex """\\"|[^"]*""" noFlags of + Left err -> unsafeCrashWith err + Right p -> between (string "\"") (string "\"") p jsonNumber :: forall m. Monad m => ParserT String m Number -jsonNumber = do - n <- regex {} """(\+|-)?(\d+(\.\d*)?|\d*\.\d+)([eE](\+|-)?\d+)?""" - case Number.fromString n of - Just n' -> pure n' - Nothing -> fail "Expected number" +jsonNumber = case regex """(\+|-)?(\d+(\.\d*)?|\d*\.\d+)([eE](\+|-)?\d+)?""" noFlags of + Left err -> unsafeCrashWith err + Right p -> do + n <- p + case Number.fromString n of + Just n' -> pure n' + Nothing -> fail "Expected number" jsonBoolean :: forall m. Monad m => ParserT String m Boolean jsonBoolean = choice diff --git a/spago.dhall b/spago.dhall index eb0952f..33140c1 100644 --- a/spago.dhall +++ b/spago.dhall @@ -17,7 +17,6 @@ , "numbers" , "partial" , "prelude" - , "record" , "strings" , "tailrec" , "transformers" diff --git a/src/Parsing/String.purs b/src/Parsing/String.purs index 25780fb..0302dd7 100644 --- a/src/Parsing/String.purs +++ b/src/Parsing/String.purs @@ -37,7 +37,6 @@ module Parsing.String , noneOfCodePoints , match , regex - , RegexFlagsRow , consumeWith ) where @@ -55,12 +54,10 @@ import Data.String (CodePoint, Pattern(..), codePointAt, length, null, singleton import Data.String as String import Data.String.CodeUnits as SCU import Data.String.Regex as Regex -import Data.String.Regex.Flags (RegexFlags(..), RegexFlagsRec) +import Data.String.Regex.Flags (RegexFlags) import Data.Tuple (Tuple(..), fst) import Partial.Unsafe (unsafePartial) -import Prim.Row (class Nub, class Union) -import Record (merge) -import Parsing (ParseError(..), ParseState(..), ParserT(..), fail) +import Parsing (ParseError(..), ParseState(..), ParserT(..)) import Parsing.Combinators ((), (<~?>)) import Parsing.Pos (Position(..)) @@ -229,101 +226,71 @@ match p = do -- boundary. pure $ Tuple (SCU.take (SCU.length input1 - SCU.length input2) input1) x --- | Parser which uses the `Data.String.Regex` module to match the regular --- | expression pattern passed as the `String` --- | argument to the parser. +-- | Compile a regular expression string into a regular expression parser. +-- | +-- | This function will use the `Data.String.Regex.regex` function to compile and return a parser which can be used +-- | in a `ParserT String m` monad. -- | -- | This parser will try to match the regular expression pattern starting -- | at the current parser position. On success, it will return the matched -- | substring. -- | --- | If the `Regex` pattern string fails to compile then this parser will fail. --- | (Note: It’s not possible to use a precompiled `Regex` because this parser --- | must set flags and make adjustments to the `Regex` pattern string.) +-- | [*MDN Regular Expressions Cheatsheet*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Cheatsheet) +-- | +-- | This function should be called outside the context of a `ParserT String m` monad, because this function might +-- | fail with a `Left` RegExp compilation error message. +-- | If you call this function inside of the `ParserT String m` monad and then `fail` the parse when the compilation fails, +-- | then that could be confusing because a parser failure is supposed to indicate an invalid input string. +-- | If the compilation failure occurs in an `alt` then the compilation failure might not be reported at all and instead +-- | the input string would be parsed incorrectly. -- | -- | This parser may be useful for quickly consuming a large section of the --- | input `String`, because in a JavaScript runtime environment the `RegExp` +-- | input `String`, because in a JavaScript runtime environment the RegExp -- | runtime is a lot faster than primitive parsers. -- | --- | [*MDN Regular Expressions Cheatsheet*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Cheatsheet) +-- | #### Example +-- | +-- | This example shows how to compile and run the `xMany` parser which will +-- | capture the regular expression pattern `x*`. +-- | +-- | ```purescript +-- | case regex "x*" noFlags of +-- | Left compileError -> unsafeCrashWith $ "xMany failed to compile: " <> compileError +-- | Right xMany -> runParser "xxxZ" do +-- | xMany +-- | ``` -- | -- | #### Flags -- | --- | The `Record flags` argument to the parser is for `Regex` flags. Here are --- | the default flags. +-- | Set `RegexFlags` with the `Semigroup` instance like this. -- | -- | ```purescript --- | { dotAll: true --- | ignoreCase: false --- | unicode: true --- | } +-- | regex "x*" (dotAll <> ignoreCase) -- | ``` -- | --- | To use the defaults, pass --- | `{}` as the flags argument. For case-insensitive pattern matching, pass --- | `{ignoreCase: true}` as the flags argument. --- | --- | The other `Data.String.Regex.Flags.RegexFlagsRec` fields are mostly --- | nonsense in the context of parsing --- | and use of the other flags may cause strange behavior in the parser. +-- | The `dotAll`, `unicode`, and `ignoreCase` flags might make sense for a `regex` parser. The other flags will +-- | probably cause surprising behavior and you should avoid them. -- | -- | [*MDN Advanced searching with flags*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#advanced_searching_with_flags) +regex :: forall m. String -> RegexFlags -> Either String (ParserT String m String) +regex pattern flags = + Regex.regex ("^(" <> pattern <> ")") flags <#> \regexobj -> + consumeWith \input -> do + case NonEmptyArray.head <$> Regex.match regexobj input of + Just (Just consumed) -> do + let remainder = SCU.drop (SCU.length consumed) input + Right { value: consumed, consumed, remainder } + _ -> + Left "No Regex pattern match" + +-- | Consume a portion of the input string while yielding a value. -- | --- | #### Example +-- | Takes a consumption function which takes the remaining input `String` +-- | as its argument and returns three fields: -- | --- | ``` --- | runParser "ababXX" (regex {} "(ab)+") --- | ``` --- | ``` --- | (Right "abab") --- | ``` -regex - :: forall m flags f_ - . Monad m - => Union flags RegexFlagsRow f_ - => Nub f_ RegexFlagsRow - => Record flags - -> String - -> ParserT String m String -regex flags pattern = - -- Prefix a ^ to ensure the pattern only matches the current position in the parse - case Regex.regex ("^(" <> pattern <> ")") flags' of - Left paterr -> - fail $ "Regex pattern error " <> paterr - Right regexobj -> - consumeWith \input -> do - case NonEmptyArray.head <$> Regex.match regexobj input of - Just (Just consumed) -> do - let remainder = SCU.drop (SCU.length consumed) input - Right { value: consumed, consumed, remainder } - _ -> - Left "No Regex pattern match" - where - flags' = RegexFlags - ( merge flags - { dotAll: true - , global: false - , ignoreCase: false - , multiline: false - , sticky: false - , unicode: true - } :: RegexFlagsRec - ) - --- | The fields from `Data.String.Regex.Flags.RegexFlagsRec`. -type RegexFlagsRow = - ( dotAll :: Boolean - , global :: Boolean - , ignoreCase :: Boolean - , multiline :: Boolean - , sticky :: Boolean - , unicode :: Boolean - ) - --- | Consumes a portion of the input string while yielding a value. -- | * `value` is the value to return. --- | * `consumed` is the input that was consumed and is used to update the parser position. --- | * `remainder` is the new input state. +-- | * `consumed` is the input `String` that was consumed. It is used to update the parser position. +-- | * `remainder` is the new remaining input `String`. consumeWith :: forall m a . (String -> Either String { value :: a, consumed :: String, remainder :: String }) diff --git a/test/Main.purs b/test/Main.purs index cd1ca41..bd9ebb1 100644 --- a/test/Main.purs +++ b/test/Main.purs @@ -1,3 +1,8 @@ +-- Run tests: +-- +-- spago -x spago-dev.dhall test +-- + module Test.Main where import Prelude hiding (between, when) @@ -16,9 +21,10 @@ import Data.Number (infinity, isNaN) import Data.String.CodePoints as SCP import Data.String.CodeUnits (fromCharArray, singleton) import Data.String.CodeUnits as SCU +import Data.String.Regex.Flags (RegexFlags, ignoreCase, noFlags) import Data.Tuple (Tuple(..)) import Effect (Effect) -import Effect.Console (logShow) +import Effect.Console (log, logShow) import Partial.Unsafe (unsafePartial) import Test.Assert (assert') import Parsing (ParseError(..), Parser, ParserT, fail, parseErrorMessage, parseErrorPosition, position, region, runParser) @@ -26,7 +32,7 @@ import Parsing.Combinators (between, chainl, chainl1Rec, chainlRec, chainr1Rec, import Parsing.Expr (Assoc(..), Operator(..), buildExprParser) import Parsing.Language (haskellDef, haskellStyle, javaStyle) import Parsing.Pos (Position(..), initialPos) -import Parsing.String (anyChar, anyCodePoint, char, eof, noneOfCodePoints, oneOfCodePoints, regex, rest, satisfy, string, takeN, whiteSpace) +import Parsing.String (anyChar, anyCodePoint, char, eof, regex, noneOfCodePoints, oneOfCodePoints, rest, satisfy, string, takeN, whiteSpace) import Parsing.String.Basic (intDecimal, number, letter) import Parsing.Token (TokenParser, makeTokenParser, match, token, when) import Parsing.Token as Parser.Token @@ -94,6 +100,14 @@ manySatisfyTest = do _ <- char '?' pure (fromCharArray r) +mkRegexTest :: String -> String -> String -> RegexFlags -> (Parser String String -> Parser String String) -> Effect Unit +mkRegexTest input expected pattern flags pars = + case regex pattern flags of + Left err -> assert' ("error: " <> show err) false + Right p -> parseTest input expected $ pars p + +-- TODO everything is stack-safe now. +-- -- This test doesn't test the actual stack safety of these combinators, mainly -- because I don't know how to come up with an example guaranteed to be large -- enough to overflow the stack. But thankfully, their stack safety is more or @@ -559,6 +573,7 @@ javaStyleTest = do main :: Effect Unit main = do + log "\nTESTS String\n" parseErrorTestPosition (many $ char 'f' *> char '?') "foo" @@ -667,6 +682,8 @@ main = do parseErrorTestPosition (string "a\nb\nc\n" *> eof) "a\nb\nc\nd\n" (Position { column: 1, line: 4 }) parseErrorTestPosition (string "\ta" *> eof) "\tab" (Position { column: 10, line: 1 }) + log "\nTESTS number\n" + parseTest "Infinity" infinity number parseTest "+Infinity" infinity number parseTest "-Infinity" (negate infinity) number @@ -681,6 +698,7 @@ main = do parseTest "-6.0" (-6.0) number parseTest "+6.0" (6.0) number + log "\nTESTS Operator\n" -- test from issue #161 -- all the below operators should play well together parseErrorTestMessage @@ -749,24 +767,23 @@ main = do -- TODO This shows the current limitations of the number parser. Ideally this parse should fail. parseTest "1..3" 1.0 $ number <* eof + log "\nTESTS intDecimal\n" parseTest "-300" (-300) intDecimal - parseTest "regex-" "regex" (regex {} "regex" <* char '-' <* eof) - parseTest "-regex" "regex" (char '-' *> regex {} "regex" <* eof) - parseTest "regexregex" "regexregex" (regex {} "(regex)*") - parseTest "regexregex" "regex" (regex {} "(^regex)*") - parseTest "ReGeX" "ReGeX" (regex { ignoreCase: true } "regex") - parseTest "regexcapregexcap" "regexcap" (regex {} "(?regexcap)") - parseTest "regexcapregexcap" "regexcap" (regex {} "(((?(r)e(g)excap)))") - - -- Maybe it is nonsense to allow multiline regex. - -- Because an end-of-line regex pattern `$` will match but then the - -- newline character will not be consumed. - -- Also why does this test fail? I think it should succeed. - -- parseTest "regex\nregex\n" "regex\nregex\n" (regex {dotAll: false, multiline: true} "(^regex$)+") + log "\nTESTS Regex\n" + mkRegexTest "regex-" "regex" "regex" noFlags (\regex -> regex <* char '-' <* eof) + mkRegexTest "-regex" "regex" "regex" noFlags (\regex -> char '-' *> regex <* eof) + mkRegexTest "regexregex" "regexregex" "(regex)*" noFlags identity + mkRegexTest "regexregex" "regex" "(^regex)*" noFlags identity + mkRegexTest "ReGeX" "ReGeX" "regex" ignoreCase identity + mkRegexTest "regexcapregexcap" "regexcap" "(?regexcap)" noFlags identity + mkRegexTest "regexcapregexcap" "regexcap" "(((?(r)e(g)excap)))" noFlags identity + log "\nTESTS Stack Safe Loops\n" stackSafeLoopsTest + log "\nTESTS Token Parser\n" + tokenParserIdentifierTest tokenParserReservedTest tokenParserOperatorTest @@ -799,18 +816,21 @@ main = do tokenParserCommaSepTest tokenParserCommaSep1Test + log "\nTESTS Haskell Style\n" haskellStyleTest + log "\nTESTS Java Style\n" javaStyleTest + log "\nTESTS region\n" + let + prependContext m' (ParseError m pos) = ParseError (m' <> m) pos + p = region (prependContext "context1 ") $ do + _ <- string "a" + region (prependContext "context2 ") $ do + string "b" case runParser "aa" p of Right _ -> assert' "error: ParseError expected!" false Left (ParseError message _) -> do let messageExpected = "context1 context2 Expected \"b\"" assert' ("expected message: " <> messageExpected <> ", message: " <> message) (message == messageExpected) logShow messageExpected - where - prependContext m' (ParseError m pos) = ParseError (m' <> m) pos - p = region (prependContext "context1 ") $ do - _ <- string "a" - region (prependContext "context2 ") $ do - string "b"