Skip to content

fix issues with incorrect and really slow parsing #3

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 12, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bower.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"dependencies": {
"purescript-prelude": "^4.0.0",
"purescript-console": "^4.1.0",
"purescript-string-parsers": "^4.0.0",
"purescript-string-parsers": "justinwoo/purescript-string-parsers#v4.0.1",
"purescript-generics-rep": "^6.0.0"
},
"devDependencies": {
Expand Down
94 changes: 58 additions & 36 deletions src/LenientHtmlParser.purs
Original file line number Diff line number Diff line change
Expand Up @@ -3,40 +3,39 @@ module LenientHtmlParser where
import Prelude

import Control.Alt ((<|>))
import Data.Array (fromFoldable)
import Data.Either (Either)
import Data.Foldable (class Foldable)
import Data.Either (Either(..))
import Data.Generic.Rep (class Generic)
import Data.Generic.Rep.Show (genericShow)
import Data.List (List, elem)
import Data.String (trim)
import Data.String.CodeUnits (dropRight, fromCharArray)
import Text.Parsing.StringParser (Parser, ParseError, runParser, fail)
import Data.List (List)
import Data.Maybe (Maybe(..))
import Data.String.CodeUnits as SCU
import Data.String.Pattern (Pattern(..))
import Text.Parsing.StringParser (ParseError(..), Parser(..), fail, runParser)
import Text.Parsing.StringParser.Combinators (fix, many, many1, manyTill)
import Text.Parsing.StringParser.String (anyChar, char, eof, noneOf, regex, satisfy, string)
import Text.Parsing.StringParser.String (anyChar, char, regex, satisfy)

newtype TagName = TagName String
derive instance eqTagName :: Eq TagName
derive instance genericRepTagName :: Generic TagName _
instance showTagName :: Show TagName where show = genericShow

type Attributes = List Attribute
derive newtype instance showTagName :: Show TagName

newtype Name = Name String
derive instance eqName :: Eq Name
derive instance genericRepName :: Generic Name _
instance showName :: Show Name where show = genericShow
derive newtype instance showName :: Show Name

newtype Value = Value String
derive instance eqValue :: Eq Value
derive instance genericRepValue :: Generic Value _
instance showValue :: Show Value where show = genericShow
derive newtype instance showValue :: Show Value

data Attribute = Attribute Name Value
derive instance eqAttribute :: Eq Attribute
derive instance genericRepAttribute :: Generic Attribute _
instance showAttribute :: Show Attribute where show = genericShow

type Attributes = List Attribute

data Tag
= TagOpen TagName Attributes
| TagSingle TagName Attributes
Expand All @@ -47,20 +46,16 @@ derive instance eqTag :: Eq Tag
derive instance genericRepTag :: Generic Tag _
instance showTag :: Show Tag where show = genericShow

flattenChars :: forall f. Foldable f => f Char -> String
flattenChars = trim <<< fromCharArray <<< fromFoldable

comment :: Parser Unit
comment = do
_ <- string "<!--"
_ <- manyTill anyChar $ string "-->"
_ <- regex "<!--"
_ <- manyTill anyChar $ regex "-->"
pure unit

doctype :: Parser Unit
doctype = do
_ <- string "<!DOCTYPE" <|> string "<!doctype"
_ <- regex "[^>]*"
_ <- char '>'
_ <- regex "<!DOCTYPE" <|> regex "<!doctype"
_ <- takeStringTill { end: ">", allowEof: true }
pure unit

skipSpace :: Parser Unit
Expand All @@ -81,26 +76,23 @@ lexeme p = p <* skipSpace

validNameString :: Parser String
validNameString =
flattenChars
<$> many1 (noneOf ['=', ' ', '<', '>', '/', '"'])
regex "[^= <>/\\\"]+"

attribute :: Parser Attribute
attribute = lexeme do
name <- validNameString
value <- (flattenChars <$> getValue) <|> pure ""
value <- getValue <|> pure ""
pure $ Attribute (Name name) (Value value)
where
termini = ['"', '>', ' ']
getValue = do
_ <- char '='
content <- withQuotes <|> withoutQuotes
pure content
withQuotes = do
_ <- char '"'
manyTill anyChar $ void (char '"') <|> eof
takeStringTill { allowEof: true, end: "\"" }
withoutQuotes = do
content <- many $ satisfy (not flip elem ['>', ' '])
_ <- void (char ' ') <|> eof <|> pure unit
content <- regex "[^> ]+"
pure content

tagOpenOrSingleOrClose :: Parser Tag
Expand Down Expand Up @@ -128,21 +120,51 @@ tagOpenOrSingle = lexeme do
closeTagOpen f =
char '>' *> pure (f TagOpen)
closeTagSingle f =
string "/>" *> pure (f TagSingle)
regex "/>" *> pure (f TagSingle)

tnode :: Parser Tag
tnode = lexeme do
TNode <$> regex "[^<]+" <|> slow
where
slow = fix \_ ->
TNode <<< flattenChars <$> many1 (satisfy ((/=) '<'))
TNode <$> regex "[^<]+"

scriptTag :: Parser Tag
scriptTag = lexeme do
_ <- lexeme $ string "<script"
attrs <- manyTill attribute (char '>')
content <- dropRight 9 <$> regex "[\\s\\S]*</script>"
_ <- lexeme $ regex "<script"
attrs <- many attribute
content <- invalidSelfClosing <|> normal
pure $ TScript attrs content
where
invalidSelfClosing = do
_ <- (regex "/>")
pure ""
normal = do
_ <- regex ">"
content <- takeStringTill { end: "</script>", allowEof: false } <|> pure ""
pure content

takeStringTill ::
{ allowEof :: Boolean
, end :: String
}
-> Parser String
takeStringTill { end, allowEof } = Parser \{str, pos} ->
let
len = SCU.length end
idx = SCU.indexOf' (Pattern end) pos str
in
case idx of
Nothing -> if allowEof
then Right
{ result: SCU.drop pos str
, suffix: { str, pos: SCU.length str }
}
else Left
{ pos
, error: ParseError $ "Could not close with found character: " <> end
}
Just i -> Right
{ result: SCU.take (i - pos) (SCU.drop pos str)
, suffix: { str, pos: i + len }
}

tag :: Parser Tag
tag = lexeme do
Expand Down
65 changes: 59 additions & 6 deletions test/Main.purs
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,24 @@ module Test.Main where

import Prelude

import Data.Either (Either(Right, Left), either)
import Control.Monad.Rec.Class (Step(..), tailRec)
import Data.Array (find)
import Data.Array as Array
import Data.Either (Either(Right, Left))
import Data.Foldable (traverse_)
import Data.List (List(..), (:))
import Data.List as List
import Data.Maybe (Maybe(..))
import Data.String as S
import Data.String.CodeUnits as SCU
import Data.Tuple (Tuple(..))
import Effect (Effect)
import Effect.Aff (Aff)
import Global.Unsafe (unsafeStringify)
import Effect.Class.Console (log, logShow)
import LenientHtmlParser (Attribute(Attribute), Name(Name), Tag(..), TagName(TagName), Value(Value), attribute, parse, parseTags, tag, tags, tnode)
import Node.Encoding (Encoding(..))
import Node.FS.Aff (readTextFile)
import Test.Unit (failure, success, suite, test)
import Test.Unit.Assert (assert)
import Test.Unit.Assert as Assert
import Test.Unit.Main (runTest)
import Text.Parsing.StringParser (Parser, unParser)
Expand Down Expand Up @@ -68,7 +76,7 @@ testParser :: forall a. Show a => Eq a =>
testParser p s expected =
case parse p s of
Right x -> do
assert "parsing worked:" $ x == expected
Assert.shouldEqual x expected
Left e ->
failure $ "parsing failed: " <> show e

Expand Down Expand Up @@ -106,13 +114,58 @@ main = runTest do
test "tag script with attribute" $
testParser tag """<script src="test"></script>""" $
TScript (pure (Attribute (Name "src") (Value "test"))) ""
test "tag script improper" $
testParser tag """<script src="test" >""" $
TScript (pure (Attribute (Name "src") (Value "test"))) ""
test "parseTags" do
expectTags testHtml expectedTestTags
test "multiple comments" do
expectTags testMultiCommentHtml expectedMultiCommentTestTags

test "test fixtures/crap.html" do
a <- readTextFile UTF8 "test/fixtures/crap.html"
either (failure <<< unsafeStringify) (const success) $ unParser tags {str: a, pos: 0}
case unParser tags {str: a, pos: 0} of
Left e ->
failure $ "Failed: " <> show e
Right tags -> do
-- traverse_ logShow tags.result
success

test "test fixtures/megacrap-formatted.html" do
a <- readTextFile UTF8 "test/fixtures/megacrap-formatted.html"
case unParser tags {str: a, pos: 0} of
Left e -> do
failure $ "Failed: " <> show e <> " from around " <> (SCU.take 40 $ SCU.drop (e.pos - 40) a)
Right tags -> do
-- traverse_ logShow tags.result
success

test "test fixtures/megacrap.html" do
a <- readTextFile UTF8 "test/fixtures/megacrap.html"
either (failure <<< unsafeStringify) (const success) $ unParser tags {str: a, pos: 0}
let tags = parseTags a
case getYTLinks <$> tags of
Left e ->
failure $ "Failed: " <> show e
Right (Tuple tags' List.Nil) -> do
traverse_ logShow tags'
failure "Unable to find items"
Right (Tuple _tags' xs) -> do
traverse_ logShow xs
success

getYTLinks :: List Tag -> Tuple (List Tag) (List String)
getYTLinks tags =
Tuple tags $ tailRec getLinks (Tuple mempty tags)
where
getLinks (Tuple acc (TagOpen (TagName "a") attrs : TNode tnode : TagClose (TagName "a") : xs))
| Just true <- S.contains (S.Pattern "yt-uix-tile-link") <$> (getAttr "class" attrs)
, title <- S.trim tnode
, Just (Just href) <- Array.head <<< S.split (S.Pattern "&") <$> getAttr "href" attrs
, link <- "https://www.youtube.com" <> href = Loop (Tuple (List.Cons (title <> ": " <> link) acc) xs)
| otherwise = Loop (Tuple acc xs)
getLinks (Tuple acc (_ : xs)) = Loop (Tuple acc xs)
getLinks (Tuple acc _) = Done acc
getAttr match xs = getValue <$> find matchName xs
where
matchName (Attribute (Name name) _) = match == name
getValue (Attribute _ (Value x)) = S.trim $ x
Loading