justinwoo · justinwoo · Jun 12, 2018 · Jun 8, 2018
diff --git a/bower.json b/bower.json
@@ -14,7 +14,7 @@
   "dependencies": {
     "purescript-prelude": "^4.0.0",
     "purescript-console": "^4.1.0",
-    "purescript-string-parsers": "^4.0.0",
+    "purescript-string-parsers": "justinwoo/purescript-string-parsers#v4.0.1",
     "purescript-generics-rep": "^6.0.0"
   },
   "devDependencies": {

diff --git a/src/LenientHtmlParser.purs b/src/LenientHtmlParser.purs
@@ -3,40 +3,39 @@ module LenientHtmlParser where
 import Prelude
 
 import Control.Alt ((<|>))
-import Data.Array (fromFoldable)
-import Data.Either (Either)
-import Data.Foldable (class Foldable)
+import Data.Either (Either(..))
 import Data.Generic.Rep (class Generic)
 import Data.Generic.Rep.Show (genericShow)
-import Data.List (List, elem)
-import Data.String (trim)
-import Data.String.CodeUnits (dropRight, fromCharArray)
-import Text.Parsing.StringParser (Parser, ParseError, runParser, fail)
+import Data.List (List)
+import Data.Maybe (Maybe(..))
+import Data.String.CodeUnits as SCU
+import Data.String.Pattern (Pattern(..))
+import Text.Parsing.StringParser (ParseError(..), Parser(..), fail, runParser)
 import Text.Parsing.StringParser.Combinators (fix, many, many1, manyTill)
-import Text.Parsing.StringParser.String (anyChar, char, eof, noneOf, regex, satisfy, string)
+import Text.Parsing.StringParser.String (anyChar, char, regex, satisfy)
 
 newtype TagName = TagName String
 derive instance eqTagName :: Eq TagName
 derive instance genericRepTagName :: Generic TagName _
-instance showTagName :: Show TagName where show = genericShow
-
-type Attributes = List Attribute
+derive newtype instance showTagName :: Show TagName
 
 newtype Name = Name String
 derive instance eqName :: Eq Name
 derive instance genericRepName :: Generic Name _
-instance showName :: Show Name where show = genericShow
+derive newtype instance showName :: Show Name
 
 newtype Value = Value String
 derive instance eqValue :: Eq Value
 derive instance genericRepValue :: Generic Value _
-instance showValue :: Show Value where show = genericShow
+derive newtype instance showValue :: Show Value
 
 data Attribute = Attribute Name Value
 derive instance eqAttribute :: Eq Attribute
 derive instance genericRepAttribute :: Generic Attribute _
 instance showAttribute :: Show Attribute where show = genericShow
 
+type Attributes = List Attribute
+
 data Tag
   = TagOpen TagName Attributes
   | TagSingle TagName Attributes
@@ -47,20 +46,16 @@ derive instance eqTag :: Eq Tag
 derive instance genericRepTag :: Generic Tag _
 instance showTag :: Show Tag where show = genericShow
 
-flattenChars :: forall f. Foldable f => f Char -> String
-flattenChars = trim <<< fromCharArray <<< fromFoldable
-
 comment :: Parser Unit
 comment = do
-  _ <- string "<!--"
-  _ <- manyTill anyChar $ string "-->"
+  _ <- regex "<!--"
+  _ <- manyTill anyChar $ regex "-->"
   pure unit
 
 doctype :: Parser Unit
 doctype = do
-  _ <- string "<!DOCTYPE" <|> string "<!doctype"
-  _ <- regex "[^>]*"
-  _ <- char '>'
+  _ <- regex "<!DOCTYPE" <|> regex "<!doctype"
+  _ <- takeStringTill { end: ">", allowEof: true }
   pure unit
 
 skipSpace :: Parser Unit
@@ -81,26 +76,23 @@ lexeme p = p <* skipSpace
 
 validNameString :: Parser String
 validNameString =
-  flattenChars
-  <$> many1 (noneOf ['=', ' ', '<', '>', '/', '"'])
+  regex "[^= <>/\\\"]+"
 
 attribute :: Parser Attribute
 attribute = lexeme do
   name <- validNameString
-  value <- (flattenChars <$> getValue) <|> pure ""
+  value <- getValue <|> pure ""
   pure $ Attribute (Name name) (Value value)
   where
-    termini = ['"', '>', ' ']
     getValue = do
       _ <- char '='
       content <- withQuotes <|> withoutQuotes
       pure content
     withQuotes = do
       _ <- char '"'
-      manyTill anyChar $ void (char '"') <|> eof
+      takeStringTill { allowEof: true, end: "\"" }
     withoutQuotes = do
-      content <- many $ satisfy (not flip elem ['>', ' '])
-      _ <- void (char ' ') <|> eof <|> pure unit
+      content <- regex "[^> ]+"
       pure content
 
 tagOpenOrSingleOrClose :: Parser Tag
@@ -128,21 +120,51 @@ tagOpenOrSingle = lexeme do
     closeTagOpen f =
       char '>' *> pure (f TagOpen)
     closeTagSingle f =
-      string "/>" *> pure (f TagSingle)
+      regex "/>" *> pure (f TagSingle)
 
 tnode :: Parser Tag
 tnode = lexeme do
-  TNode <$> regex "[^<]+" <|> slow
-  where
-    slow = fix \_ ->
-      TNode <<< flattenChars <$> many1 (satisfy ((/=) '<'))
+  TNode <$> regex "[^<]+"
 
 scriptTag :: Parser Tag
 scriptTag = lexeme do
-  _ <- lexeme $ string "<script"
-  attrs <- manyTill attribute (char '>')
-  content <- dropRight 9 <$> regex "[\\s\\S]*</script>"
+  _ <- lexeme $ regex "<script"
+  attrs <- many attribute
+  content <- invalidSelfClosing <|> normal
   pure $ TScript attrs content
+  where
+    invalidSelfClosing = do
+      _ <- (regex "/>")
+      pure ""
+    normal = do
+      _ <- regex ">"
+      content <- takeStringTill { end: "</script>", allowEof: false } <|> pure ""
+      pure content
+
+takeStringTill ::
+  { allowEof :: Boolean
+  , end :: String
+  }
+  -> Parser String
+takeStringTill { end, allowEof } = Parser \{str, pos} ->
+  let
+    len = SCU.length end
+    idx = SCU.indexOf' (Pattern end) pos str
+  in
+    case idx of
+      Nothing -> if allowEof
+        then Right
+          { result: SCU.drop pos str
+          , suffix: { str, pos: SCU.length str }
+          }
+        else Left
+          { pos
+          , error: ParseError $ "Could not close with found character: " <> end
+          }
+      Just i -> Right
+        { result: SCU.take (i - pos) (SCU.drop pos str)
+        , suffix: { str, pos: i + len }
+        }
 
 tag :: Parser Tag
 tag = lexeme do

diff --git a/test/Main.purs b/test/Main.purs
@@ -2,16 +2,24 @@ module Test.Main where
 
 import Prelude
 
-import Data.Either (Either(Right, Left), either)
+import Control.Monad.Rec.Class (Step(..), tailRec)
+import Data.Array (find)
+import Data.Array as Array
+import Data.Either (Either(Right, Left))
+import Data.Foldable (traverse_)
 import Data.List (List(..), (:))
+import Data.List as List
+import Data.Maybe (Maybe(..))
+import Data.String as S
+import Data.String.CodeUnits as SCU
+import Data.Tuple (Tuple(..))
 import Effect (Effect)
 import Effect.Aff (Aff)
-import Global.Unsafe (unsafeStringify)
+import Effect.Class.Console (log, logShow)
 import LenientHtmlParser (Attribute(Attribute), Name(Name), Tag(..), TagName(TagName), Value(Value), attribute, parse, parseTags, tag, tags, tnode)
 import Node.Encoding (Encoding(..))
 import Node.FS.Aff (readTextFile)
 import Test.Unit (failure, success, suite, test)
-import Test.Unit.Assert (assert)
 import Test.Unit.Assert as Assert
 import Test.Unit.Main (runTest)
 import Text.Parsing.StringParser (Parser, unParser)
@@ -68,7 +76,7 @@ testParser :: forall a. Show a => Eq a =>
 testParser p s expected =
   case parse p s of
     Right x -> do
-      assert "parsing worked:" $ x == expected
+      Assert.shouldEqual x expected
     Left e ->
       failure $ "parsing failed: " <> show e
 
@@ -106,13 +114,58 @@ main = runTest do
     test "tag script with attribute" $
       testParser tag """<script src="test"></script>""" $
         TScript (pure (Attribute (Name "src") (Value "test"))) ""
+    test "tag script improper" $
+      testParser tag """<script src="test" >""" $
+        TScript (pure (Attribute (Name "src") (Value "test"))) ""
     test "parseTags" do
       expectTags testHtml expectedTestTags
     test "multiple comments" do
       expectTags testMultiCommentHtml expectedMultiCommentTestTags
+
     test "test fixtures/crap.html" do
       a <- readTextFile UTF8 "test/fixtures/crap.html"
-      either (failure <<< unsafeStringify) (const success) $ unParser tags {str: a, pos: 0}
+      case unParser tags {str: a, pos: 0} of
+        Left e ->
+          failure $ "Failed: " <> show e
+        Right tags -> do
+          -- traverse_ logShow tags.result
+          success
+
+    test "test fixtures/megacrap-formatted.html" do
+      a <- readTextFile UTF8 "test/fixtures/megacrap-formatted.html"
+      case unParser tags {str: a, pos: 0} of
+        Left e -> do
+          failure $ "Failed: " <> show e <> " from around " <> (SCU.take 40 $ SCU.drop (e.pos - 40) a)
+        Right tags -> do
+          -- traverse_ logShow tags.result
+          success
+
     test "test fixtures/megacrap.html" do
       a <- readTextFile UTF8 "test/fixtures/megacrap.html"
-      either (failure <<< unsafeStringify) (const success) $ unParser tags {str: a, pos: 0}
+      let tags = parseTags a
+      case getYTLinks <$> tags of
+        Left e ->
+          failure $ "Failed: " <> show e
+        Right (Tuple tags' List.Nil) -> do
+          traverse_ logShow tags'
+          failure "Unable to find items"
+        Right (Tuple _tags' xs) -> do
+          traverse_ logShow xs
+          success
+
+getYTLinks :: List Tag -> Tuple (List Tag) (List String)
+getYTLinks tags =
+  Tuple tags $ tailRec getLinks (Tuple mempty tags)
+  where
+    getLinks (Tuple acc (TagOpen (TagName "a") attrs : TNode tnode : TagClose (TagName "a") : xs))
+      | Just true <- S.contains (S.Pattern "yt-uix-tile-link") <$> (getAttr "class" attrs)
+      , title <- S.trim tnode
+      , Just (Just href) <- Array.head <<< S.split (S.Pattern "&") <$> getAttr "href" attrs
+      , link <- "https://www.youtube.com" <> href = Loop (Tuple (List.Cons (title <> ": " <> link) acc) xs)
+      | otherwise = Loop (Tuple acc xs)
+    getLinks (Tuple acc (_ : xs)) = Loop (Tuple acc xs)
+    getLinks (Tuple acc _) = Done acc
+    getAttr match xs = getValue <$> find matchName xs
+      where
+        matchName (Attribute (Name name) _) = match == name
+        getValue (Attribute _ (Value x)) = S.trim $ x