Skip to content

Commit ff023be

Browse files
authored
Merge pull request #44 from purescript-contrib/both-units-and-points
Support CodeUnit and CodePoint parsing
2 parents 905c883 + ac984f7 commit ff023be

File tree

5 files changed

+364
-95
lines changed

5 files changed

+364
-95
lines changed

src/Text/Parsing/StringParser/String.purs renamed to src/Text/Parsing/StringParser/CodePoints.purs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1-
-- | Primitive parsers for strings.
2-
3-
module Text.Parsing.StringParser.String
1+
-- | Primitive parsers for strings, parsing based on code points.
2+
-- |
3+
-- | These functions will be much slower than the `CodeUnits` alternatives, but
4+
-- | will behave correctly in the presence of Unicode characters made up of
5+
-- | multiple code units.
6+
module Text.Parsing.StringParser.CodePoints
47
( eof
58
, anyChar
69
, anyDigit
@@ -27,8 +30,9 @@ import Data.Char (toCharCode)
2730
import Data.Either (Either(..))
2831
import Data.Foldable (class Foldable, foldMap, elem, notElem)
2932
import Data.Maybe (Maybe(..))
30-
import Data.String (Pattern(..), drop, length, indexOf', stripPrefix)
33+
import Data.String.CodePoints (drop, length, indexOf', stripPrefix)
3134
import Data.String.CodeUnits (charAt, singleton)
35+
import Data.String.Pattern (Pattern(..))
3236
import Data.String.Regex as Regex
3337
import Data.String.Regex.Flags (noFlags)
3438
import Text.Parsing.StringParser (Parser(..), ParseError(..), try, fail)
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
-- | Primitive parsers for strings, parsing based on code units.
2+
-- |
3+
-- | These functions will be much faster than the `CodePoints` alternatives, but
4+
-- | will behave incorrectly when dealing with Unicode characters that consist
5+
-- | of multiple code units.
6+
module Text.Parsing.StringParser.CodeUnits
7+
( eof
8+
, anyChar
9+
, anyDigit
10+
, string
11+
, satisfy
12+
, char
13+
, whiteSpace
14+
, skipSpaces
15+
, oneOf
16+
, noneOf
17+
, lowerCaseChar
18+
, upperCaseChar
19+
, anyLetter
20+
, alphaNum
21+
, regex
22+
) where
23+
24+
import Prelude
25+
26+
import Control.Alt ((<|>))
27+
import Data.Array ((..))
28+
import Data.Array.NonEmpty as NEA
29+
import Data.Char (toCharCode)
30+
import Data.Either (Either(..))
31+
import Data.Foldable (class Foldable, foldMap, elem, notElem)
32+
import Data.Maybe (Maybe(..))
33+
import Data.String.CodeUnits (charAt, singleton)
34+
import Data.String.CodeUnits as SCU
35+
import Data.String.Pattern (Pattern(..))
36+
import Data.String.Regex as Regex
37+
import Data.String.Regex.Flags (noFlags)
38+
import Text.Parsing.StringParser (Parser(..), ParseError(..), try, fail)
39+
import Text.Parsing.StringParser.Combinators (many, (<?>))
40+
41+
-- | Match the end of the file.
42+
eof :: Parser Unit
43+
eof = Parser \s ->
44+
case s of
45+
{ str, pos } | pos < SCU.length str -> Left { pos, error: ParseError "Expected EOF" }
46+
_ -> Right { result: unit, suffix: s }
47+
48+
-- | Match any character.
49+
anyChar :: Parser Char
50+
anyChar = Parser \{ str, pos } ->
51+
case charAt pos str of
52+
Just chr -> Right { result: chr, suffix: { str, pos: pos + 1 } }
53+
Nothing -> Left { pos, error: ParseError "Unexpected EOF" }
54+
55+
-- | Match any digit.
56+
anyDigit :: Parser Char
57+
anyDigit = try do
58+
c <- anyChar
59+
if c >= '0' && c <= '9'
60+
then pure c
61+
else fail $ "Character " <> show c <> " is not a digit"
62+
63+
-- | Match the specified string.
64+
string :: String -> Parser String
65+
string nt = Parser \s ->
66+
case s of
67+
{ str, pos } | SCU.indexOf' (Pattern nt) pos str == Just pos -> Right { result: nt, suffix: { str, pos: pos + SCU.length nt } }
68+
{ pos } -> Left { pos, error: ParseError ("Expected '" <> nt <> "'.") }
69+
70+
-- | Match a character satisfying the given predicate.
71+
satisfy :: (Char -> Boolean) -> Parser Char
72+
satisfy f = try do
73+
c <- anyChar
74+
if f c
75+
then pure c
76+
else fail $ "Character " <> show c <> " did not satisfy predicate"
77+
78+
-- | Match the specified character.
79+
char :: Char -> Parser Char
80+
char c = satisfy (_ == c) <?> "Could not match character " <> show c
81+
82+
-- | Match many whitespace characters.
83+
whiteSpace :: Parser String
84+
whiteSpace = do
85+
cs <- many (satisfy \ c -> c == '\n' || c == '\r' || c == ' ' || c == '\t')
86+
pure (foldMap singleton cs)
87+
88+
-- | Skip many whitespace characters.
89+
skipSpaces :: Parser Unit
90+
skipSpaces = void whiteSpace
91+
92+
-- | Match one of the characters in the foldable structure.
93+
oneOf :: forall f. Foldable f => f Char -> Parser Char
94+
oneOf = satisfy <<< flip elem
95+
96+
-- | Match any character not in the foldable structure.
97+
noneOf :: forall f. Foldable f => f Char -> Parser Char
98+
noneOf = satisfy <<< flip notElem
99+
100+
-- | Match any lower case character.
101+
lowerCaseChar :: Parser Char
102+
lowerCaseChar = try do
103+
c <- anyChar
104+
if toCharCode c `elem` (97 .. 122)
105+
then pure c
106+
else fail $ "Expected a lower case character but found " <> show c
107+
108+
-- | Match any upper case character.
109+
upperCaseChar :: Parser Char
110+
upperCaseChar = try do
111+
c <- anyChar
112+
if toCharCode c `elem` (65 .. 90)
113+
then pure c
114+
else fail $ "Expected an upper case character but found " <> show c
115+
116+
-- | Match any letter.
117+
anyLetter :: Parser Char
118+
anyLetter = lowerCaseChar <|> upperCaseChar <?> "Expected a letter"
119+
120+
-- | Match a letter or a number.
121+
alphaNum :: Parser Char
122+
alphaNum = anyLetter <|> anyDigit <?> "Expected a letter or a number"
123+
124+
-- | match the regular expression
125+
regex :: String -> Parser String
126+
regex pat =
127+
case Regex.regex pattern noFlags of
128+
Left _ ->
129+
fail $ "Text.Parsing.StringParser.String.regex': illegal regex " <> pat
130+
Right r ->
131+
matchRegex r
132+
where
133+
-- ensure the pattern only matches the current position in the parse
134+
pattern =
135+
case SCU.stripPrefix (Pattern "^") pat of
136+
Nothing ->
137+
"^" <> pat
138+
_ ->
139+
pat
140+
matchRegex :: Regex.Regex -> Parser String
141+
matchRegex r =
142+
Parser \{ str, pos } ->
143+
let
144+
remainder = SCU.drop pos str
145+
in
146+
case NEA.head <$> Regex.match r remainder of
147+
Just (Just matched) ->
148+
Right { result: matched, suffix: { str, pos: pos + SCU.length matched } }
149+
_ ->
150+
Left { pos, error: ParseError "no match" }

test/CodePoints.purs

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
module Test.CodePoints where
2+
3+
import Prelude hiding (between)
4+
5+
import Control.Alt ((<|>))
6+
import Data.Either (isLeft, isRight, Either(..))
7+
import Data.Foldable (fold)
8+
import Data.List (List(Nil), (:))
9+
import Data.List.Lazy (take, repeat)
10+
import Data.List.NonEmpty (NonEmptyList(..))
11+
import Data.NonEmpty ((:|))
12+
import Data.String.CodeUnits (singleton)
13+
import Data.String.Common as SC
14+
import Data.Unfoldable (replicate)
15+
import Effect (Effect)
16+
import Test.Assert (assert', assert)
17+
import Text.Parsing.StringParser (Parser, runParser, try)
18+
import Text.Parsing.StringParser.Combinators (many1, endBy1, sepBy1, optionMaybe, many, manyTill, many1Till, chainl, fix, between)
19+
import Text.Parsing.StringParser.Expr (Assoc(..), Operator(..), buildExprParser)
20+
import Text.Parsing.StringParser.CodePoints (anyDigit, eof, string, anyChar, regex)
21+
22+
parens :: forall a. Parser a -> Parser a
23+
parens = between (string "(") (string ")")
24+
25+
nested :: Parser Int
26+
nested = fix $ \p -> (do
27+
_ <- string "a"
28+
pure 0) <|> ((+) 1) <$> parens p
29+
30+
opTest :: Parser String
31+
opTest = chainl (singleton <$> anyChar) (string "+" $> append) ""
32+
33+
digit :: Parser Int
34+
digit = string "0" $> 0
35+
<|> string "1" $> 1
36+
<|> string "2" $> 2
37+
<|> string "3" $> 3
38+
<|> string "4" $> 4
39+
<|> string "5" $> 5
40+
<|> string "6" $> 6
41+
<|> string "7" $> 7
42+
<|> string "8" $> 8
43+
<|> string "9" $> 9
44+
45+
exprTest :: Parser Int
46+
exprTest = buildExprParser [ [Infix (string "/" >>= \_ -> pure div) AssocRight]
47+
, [Infix (string "*" >>= \_ -> pure mul) AssocRight]
48+
, [Infix (string "-" >>= \_ -> pure sub) AssocRight]
49+
, [Infix (string "+" >>= \_ -> pure add) AssocRight]
50+
] digit
51+
52+
tryTest :: Parser String
53+
-- reduce the possible array of matches to 0 or 1 elements to aid Array pattern matching
54+
tryTest =
55+
try (string "aa" <> string "bb") <|>
56+
(string "aa" <> string "cc")
57+
58+
canParse :: forall a. Parser a -> String -> Boolean
59+
canParse p input = isRight $ runParser p input
60+
61+
parseFail :: forall a. Parser a -> String -> Boolean
62+
parseFail p input = isLeft $ runParser p input
63+
64+
expectResult :: forall a. Eq a => a -> Parser a -> String -> Boolean
65+
expectResult res p input = runParser p input == Right res
66+
67+
testCodePoints :: Effect Unit
68+
testCodePoints = do
69+
assert' "many should not blow the stack" $ canParse (many (string "a")) (SC.joinWith "" $ replicate 100000 "a")
70+
assert' "many failing after" $ parseFail (do
71+
as <- many (string "a")
72+
eof
73+
pure as) (SC.joinWith "" (replicate 100000 "a") <> "b" )
74+
75+
assert $ expectResult 3 nested "(((a)))"
76+
assert $ expectResult ("a":"a":"a":Nil) (many (string "a")) "aaa"
77+
assert $ parseFail (many1 (string "a")) ""
78+
assert $ canParse (parens (do
79+
_ <- string "a"
80+
optionMaybe $ string "b")) "(ab)"
81+
assert $ expectResult (NonEmptyList ("a" :| "a":"a":Nil)) (string "a" `sepBy1` string ",") "a,a,a"
82+
assert $ canParse (do
83+
as <- string "a" `endBy1` string ","
84+
eof
85+
pure as) "a,a,a,"
86+
assert' "opTest" $ expectResult "abc" opTest "a+b+c"
87+
assert' "exprTest" $ expectResult (-3) exprTest "1*2+3/4-5"
88+
assert' "tryTest "$ canParse tryTest "aacc"
89+
assert $ expectResult (NonEmptyList ('0' :| '1':'2':'3':'4':Nil)) (many1 anyDigit) "01234/"
90+
assert $ expectResult (NonEmptyList ('5' :| '6':'7':'8':'9':Nil)) (many1 anyDigit) "56789:"
91+
assert $ expectResult "aaaa" (regex "a+") "aaaab"
92+
assert $ expectResult ("a":"a":"a":Nil) (manyTill (string "a") (string "b")) "aaab"
93+
assert $ expectResult Nil (manyTill (string "a") (string "b")) "b"
94+
assert $ expectResult (NonEmptyList ("a" :| "a":"a":Nil)) (many1Till (string "a") (string "b")) "aaab"
95+
assert $ parseFail (many1Till (string "a") (string "b")) "b"
96+
-- check against overflow
97+
assert $ canParse (many1Till (string "a") (string "and")) $ (fold <<< take 10000 $ repeat "a") <> "and"
98+
-- check correct order
99+
assert $ expectResult (NonEmptyList ('a' :| 'b':'c':Nil)) (many1Till anyChar (string "d")) "abcd"

test/CodeUnits.purs

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
module Test.CodeUnits where
2+
3+
import Prelude hiding (between)
4+
5+
import Control.Alt ((<|>))
6+
import Data.Either (isLeft, isRight, Either(..))
7+
import Data.Foldable (fold)
8+
import Data.List (List(Nil), (:))
9+
import Data.List.Lazy (take, repeat)
10+
import Data.List.NonEmpty (NonEmptyList(..))
11+
import Data.NonEmpty ((:|))
12+
import Data.String.CodeUnits (singleton)
13+
import Data.String.Common as SC
14+
import Data.Unfoldable (replicate)
15+
import Effect (Effect)
16+
import Test.Assert (assert', assert)
17+
import Text.Parsing.StringParser (Parser, runParser, try)
18+
import Text.Parsing.StringParser.Combinators (many1, endBy1, sepBy1, optionMaybe, many, manyTill, many1Till, chainl, fix, between)
19+
import Text.Parsing.StringParser.Expr (Assoc(..), Operator(..), buildExprParser)
20+
import Text.Parsing.StringParser.CodeUnits (anyDigit, eof, string, anyChar, regex)
21+
22+
parens :: forall a. Parser a -> Parser a
23+
parens = between (string "(") (string ")")
24+
25+
nested :: Parser Int
26+
nested = fix $ \p -> (do
27+
_ <- string "a"
28+
pure 0) <|> ((+) 1) <$> parens p
29+
30+
opTest :: Parser String
31+
opTest = chainl (singleton <$> anyChar) (string "+" $> append) ""
32+
33+
digit :: Parser Int
34+
digit = string "0" $> 0
35+
<|> string "1" $> 1
36+
<|> string "2" $> 2
37+
<|> string "3" $> 3
38+
<|> string "4" $> 4
39+
<|> string "5" $> 5
40+
<|> string "6" $> 6
41+
<|> string "7" $> 7
42+
<|> string "8" $> 8
43+
<|> string "9" $> 9
44+
45+
exprTest :: Parser Int
46+
exprTest = buildExprParser [ [Infix (string "/" >>= \_ -> pure div) AssocRight]
47+
, [Infix (string "*" >>= \_ -> pure mul) AssocRight]
48+
, [Infix (string "-" >>= \_ -> pure sub) AssocRight]
49+
, [Infix (string "+" >>= \_ -> pure add) AssocRight]
50+
] digit
51+
52+
tryTest :: Parser String
53+
-- reduce the possible array of matches to 0 or 1 elements to aid Array pattern matching
54+
tryTest =
55+
try (string "aa" <> string "bb") <|>
56+
(string "aa" <> string "cc")
57+
58+
canParse :: forall a. Parser a -> String -> Boolean
59+
canParse p input = isRight $ runParser p input
60+
61+
parseFail :: forall a. Parser a -> String -> Boolean
62+
parseFail p input = isLeft $ runParser p input
63+
64+
expectResult :: forall a. Eq a => a -> Parser a -> String -> Boolean
65+
expectResult res p input = runParser p input == Right res
66+
67+
testCodeUnits :: Effect Unit
68+
testCodeUnits = do
69+
assert' "many should not blow the stack" $ canParse (many (string "a")) (SC.joinWith "" $ replicate 100000 "a")
70+
assert' "many failing after" $ parseFail (do
71+
as <- many (string "a")
72+
eof
73+
pure as) (SC.joinWith "" (replicate 100000 "a") <> "b" )
74+
75+
assert $ expectResult 3 nested "(((a)))"
76+
assert $ expectResult ("a":"a":"a":Nil) (many (string "a")) "aaa"
77+
assert $ parseFail (many1 (string "a")) ""
78+
assert $ canParse (parens (do
79+
_ <- string "a"
80+
optionMaybe $ string "b")) "(ab)"
81+
assert $ expectResult (NonEmptyList ("a" :| "a":"a":Nil)) (string "a" `sepBy1` string ",") "a,a,a"
82+
assert $ canParse (do
83+
as <- string "a" `endBy1` string ","
84+
eof
85+
pure as) "a,a,a,"
86+
assert' "opTest" $ expectResult "abc" opTest "a+b+c"
87+
assert' "exprTest" $ expectResult (-3) exprTest "1*2+3/4-5"
88+
assert' "tryTest "$ canParse tryTest "aacc"
89+
assert $ expectResult (NonEmptyList ('0' :| '1':'2':'3':'4':Nil)) (many1 anyDigit) "01234/"
90+
assert $ expectResult (NonEmptyList ('5' :| '6':'7':'8':'9':Nil)) (many1 anyDigit) "56789:"
91+
assert $ expectResult "aaaa" (regex "a+") "aaaab"
92+
assert $ expectResult ("a":"a":"a":Nil) (manyTill (string "a") (string "b")) "aaab"
93+
assert $ expectResult Nil (manyTill (string "a") (string "b")) "b"
94+
assert $ expectResult (NonEmptyList ("a" :| "a":"a":Nil)) (many1Till (string "a") (string "b")) "aaab"
95+
assert $ parseFail (many1Till (string "a") (string "b")) "b"
96+
-- check against overflow
97+
assert $ canParse (many1Till (string "a") (string "and")) $ (fold <<< take 10000 $ repeat "a") <> "and"
98+
-- check correct order
99+
assert $ expectResult (NonEmptyList ('a' :| 'b':'c':Nil)) (many1Till anyChar (string "d")) "abcd"

0 commit comments

Comments
 (0)