diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 98eb9a4..8500fca 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,12 +26,15 @@ jobs: .spago output - - name: Install dev dependencies + - name: Install dependencies run: spago install - name: Build source run: spago build --no-install --purs-args '--censor-lib --strict --censor-codes='UserDefinedWarning'' + - name: Install dev dependencies + run: spago -x spago-dev.dhall install + - name: Run tests run: spago -x spago-dev.dhall test --no-install diff --git a/CHANGELOG.md b/CHANGELOG.md index 033b3ed..d1b070b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ Breaking changes: New features: +Add `regex` parser. + Bugfixes: Other improvements: diff --git a/bower.json b/bower.json index 6575ff9..7cb6c39 100644 --- a/bower.json +++ b/bower.json @@ -26,6 +26,7 @@ "purescript-newtype": "^v4.0.0", "purescript-numbers": "^v8.0.0", "purescript-prelude": "^v5.0.1", + "purescript-record": "^v3.0.0", "purescript-strings": "^v5.0.0", "purescript-tailrec": "^v5.0.1", "purescript-transformers": "^v5.1.0", diff --git a/spago.dhall b/spago.dhall index 9f3233a..ffd8164 100644 --- a/spago.dhall +++ b/spago.dhall @@ -14,6 +14,7 @@ , "newtype" , "numbers" , "prelude" + , "record" , "strings" , "tailrec" , "transformers" diff --git a/src/Text/Parsing/Parser/String.purs b/src/Text/Parsing/Parser/String.purs index c69fc62..a979c75 100644 --- a/src/Text/Parsing/Parser/String.purs +++ b/src/Text/Parsing/Parser/String.purs @@ -36,19 +36,27 @@ module Text.Parsing.Parser.String , noneOf , noneOfCodePoints , match + , regex + , RegexFlagsRow ) where import Prelude hiding (between) import Control.Monad.State (get, put) import Data.Array (notElem) +import Data.Array.NonEmpty as NonEmptyArray import Data.Char (fromCharCode) import Data.CodePoint.Unicode (isSpace) +import Data.Either (Either(..)) import Data.Foldable (elem) import Data.Maybe (Maybe(..)) import Data.String (CodePoint, Pattern(..), length, null, singleton, splitAt, stripPrefix, uncons) import Data.String.CodeUnits as SCU +import Data.String.Regex as Regex +import Data.String.Regex.Flags (RegexFlags(..), RegexFlagsRec) import Data.Tuple (Tuple(..), fst) +import Prim.Row (class Nub, class Union) +import Record (merge) import Text.Parsing.Parser (ParseState(..), ParserT, consume, fail) import Text.Parsing.Parser.Combinators (skipMany, tryRethrow, (), (<~?>)) import Text.Parsing.Parser.Pos (Position(..)) @@ -208,3 +216,82 @@ match p = do -- | to something other than `newtype CodePoint = CodePoint Int`. unCodePoint :: CodePoint -> Int unCodePoint = unsafeCoerce + +-- | Parser which uses the `Data.String.Regex` module to match the regular +-- | expression pattern passed as the `String` +-- | argument to the parser. +-- | +-- | This parser will try to match the regular expression pattern starting +-- | at the current parser position. On success, it will return the matched +-- | substring. +-- | +-- | This parser may be useful for quickly consuming a large section of the +-- | input `String`, because in a JavaScript runtime environment the `RegExp` +-- | runtime is a lot faster than primitive parsers. +-- | +-- | [*MDN Regular Expressions Cheatsheet*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Cheatsheet) +-- | +-- | The `Record flags` argument to the parser is for `Regex` flags. Here are +-- | the default flags. +-- | +-- | ```purescript +-- | { dotAll: true +-- | ignoreCase: false +-- | unicode: true +-- | } +-- | ``` +-- | +-- | If you want to use the defaults then pass +-- | `{}` as the flags argument. For case-insensitive pattern matching, pass +-- | `{ignoreCase: true}` as the flags argument. +-- | The other `Data.String.Regex.Flags.RegexFlagsRec` fields are mostly +-- | nonsense in the context of parsing +-- | and use of the other flags may cause strange behavior in the parser. +-- | +-- | [*MDN Advanced searching with flags*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#advanced_searching_with_flags) +-- | +-- | If the `Regex` pattern string fails to compile then this parser will fail. +-- | (Note: It’s not possible to use a precompiled `Regex` because this parser +-- | must set flags and make adjustments to the `Regex` pattern string.) +regex + :: forall m flags f_ + . Monad m + => Union flags RegexFlagsRow f_ + => Nub f_ RegexFlagsRow + => Record flags + -> String + -> ParserT String m String +regex flags pattern = + -- Prefix a ^ to ensure the pattern only matches the current position in the parse + case Regex.regex ("^(" <> pattern <> ")") flags' of + Left paterr -> + fail $ "Regex pattern error " <> paterr + Right regexobj -> do + ParseState input position _ <- get + case NonEmptyArray.head <$> Regex.match regexobj input of + Just (Just matched) -> do + let remainder = SCU.drop (SCU.length matched) input + put $ ParseState remainder (updatePosString position matched) true + pure matched + _ -> fail $ "No Regex pattern match" + where + flags' = RegexFlags + ( merge flags + { dotAll: true + , global: false + , ignoreCase: false + , multiline: false + , sticky: false + , unicode: true + } :: RegexFlagsRec + ) + +-- | The fields from `Data.String.Regex.Flags.RegexFlagsRec`. +type RegexFlagsRow = + ( dotAll :: Boolean + , global :: Boolean + , ignoreCase :: Boolean + , multiline :: Boolean + , sticky :: Boolean + , unicode :: Boolean + ) \ No newline at end of file diff --git a/test/Main.purs b/test/Main.purs index db30c75..04d4458 100644 --- a/test/Main.purs +++ b/test/Main.purs @@ -25,7 +25,7 @@ import Text.Parsing.Parser.Combinators (between, chainl, chainl1Rec, chainlRec, import Text.Parsing.Parser.Expr (Assoc(..), Operator(..), buildExprParser) import Text.Parsing.Parser.Language (haskellDef, haskellStyle, javaStyle) import Text.Parsing.Parser.Pos (Position(..), initialPos) -import Text.Parsing.Parser.String (anyChar, anyCodePoint, char, eof, noneOfCodePoints, oneOfCodePoints, rest, satisfy, string, takeN, whiteSpace) +import Text.Parsing.Parser.String (anyChar, anyCodePoint, char, eof, noneOfCodePoints, oneOfCodePoints, regex, rest, satisfy, string, takeN, whiteSpace) import Text.Parsing.Parser.String.Basic (intDecimal, number, letter) import Text.Parsing.Parser.Token (TokenParser, makeTokenParser, match, token, when) import Text.Parsing.Parser.Token as Parser.Token @@ -683,6 +683,18 @@ main = do parseTest "-300" (-300) intDecimal + parseTest "regex-" "regex" (regex {} "regex" <* char '-' <* eof) + parseTest "-regex" "regex" (char '-' *> regex {} "regex" <* eof) + parseTest "regexregex" "regexregex" (regex {} "(regex)*") + parseTest "regexregex" "regex" (regex {} "(^regex)*") + parseTest "ReGeX" "ReGeX" (regex { ignoreCase: true } "regex") + + -- Maybe it is nonsense to allow multiline regex. + -- Because an end-of-line regex pattern `$` will match but then the + -- newline character will not be consumed. + -- Also why does this test fail? I think it should succeed. + -- parseTest "regex\nregex\n" "regex\nregex\n" (regex {dotAll: false, multiline: true} "(^regex$)+") + stackSafeLoopsTest tokenParserIdentifierTest