" port) + (write-children n port) + (display "
\n" port)) + ((block-quote-node? n) + (display "\n" port) + (write-children n port) + (display "\n" port)) + ((code-block-node? n) + (display "
" port)
+ (write-verbatim (node-children n) port)
+ (display "\n
\n" port))
+ ((fenced-code-node? n)
+ (display "" port)
+ (write-verbatim (node-children n) port)
+ (display "\n
\n" port))
+ ((heading-node? n)
+ (let ((tag (case (node-ref n 'level)
+ ((1) "h1")
+ ((2) "h2")
+ ((3) "h3")
+ ((4) "h4")
+ ((5) "h5")
+ ((6) "h6"))))
+ (format port "<~a>" tag)
+ (write-children n port)
+ (format port "~a>\n" tag)))
+ ((list-node? n)
+ (let ((tag (case (node-ref n 'type)
+ ((bullet) "ul")
+ (else "ol")))
+ (tight? (node-ref n 'tight)))
+ (format port "<~a>\n" tag)
+ (for-each (lambda (n)
+ (cond
+ ((item-node? n)
+ (display "" port)
+ (write-verbatim (node-children n) port)
+ (display "
" port))
+ ((softbreak-node? n)
+ (newline port))
+ ((hardbreak-node? n)
+ (display "|$)" regexp/icase)) +(define re-html-block-special-end (make-regexp "(|||)" regexp/icase)) +(define re-html-block-comment-start (make-regexp "^")) +(define re-html-block-template-start (make-regexp "^<\\?")) +(define re-html-block-template-end (make-regexp "\\?>")) +(define re-html-block-declaration-start (make-regexp "^")) +(define re-html-block-cdata-start (make-regexp "^")) +(define re-html-block-well-known-start (make-regexp "^(<|)(address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|h2|h3|h4|h5|h6|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|nav|noframes|ol|optgroup|option|p|param|search|section|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)( |\t|>|/>)" regexp/icase)) +(define re-html-tag-name (make-regexp "^[a-zA-Z][0-9a-zA-Z-]*")) +(define %html-whitespace "[ \t]") +(define %html-tag-name "([a-zA-Z][0-9a-zA-Z-]*)") +(define %html-unquoted-attribute-value "[^ \t\"'=<>`]+") +(define %html-single-quoted-attribute-value "'[^']*'") +(define %html-double-quoted-attribute-value "\"[^\"]*\"") +(define %html-attribute-value + (string-append "(" %html-unquoted-attribute-value "|" + %html-single-quoted-attribute-value "|" + %html-double-quoted-attribute-value ")")) +(define %html-attribute-value-spec + (string-append %html-whitespace "*=" + %html-whitespace "*" + %html-attribute-value)) +(define %html-attribute-name "[a-zA-Z_:][a-zA-Z0-9_.:-]*") +(define %html-attribute + (string-append %html-whitespace "+" %html-attribute-name + %html-attribute-value-spec)) +(define %html-open-tag + (string-append "<" %html-tag-name "(" %html-attribute ")*" + %html-whitespace "*/?>")) +(define %html-closing-tag + (string-append "" %html-tag-name %html-whitespace "*>")) +(define re-html-block-other-start + (make-regexp (string-append "^(" %html-open-tag "|" %html-closing-tag ")$"))) + + +(define (html-block parser) + (let ((str (parser-str parser)) + (pos (parser-pos parser))) + (cond ((regexp-exec re-html-block-special-start str pos) => + (lambda (match) (list 'special match))) + ((regexp-exec re-html-block-comment-start str pos) => + (lambda (match) (list 'comment match))) + ((regexp-exec re-html-block-template-start str pos) => + (lambda (match) (list 'template match))) + ((regexp-exec re-html-block-declaration-start str pos) => + (lambda (match) (list 'declaration match))) + ((regexp-exec re-html-block-cdata-start str pos) => + (lambda (match) (list 'cdata match))) + ((regexp-exec re-html-block-well-known-start str pos) => + (lambda (match) (list 'well-known match))) + ((regexp-exec re-html-block-other-start str pos) => + (lambda (match) + (let ((excluded-tags '("pre" "script" "style" "textarea"))) + ;; Match 2 is the tag name for an open tag, match 5 is + ;; the tag name for a closing tag. + (and (not (member (match:substring match 2) excluded-tags)) + (not (member (match:substring match 5) excluded-tags)) + (list 'other match))))) + (else #f)))) + +(define (html-block-end type parser) + (let ((str (parser-str parser)) + (pos (parser-pos parser))) + (match type + ('special (regexp-exec re-html-block-special-end str pos)) + ('comment (regexp-exec re-html-block-comment-end str pos)) + ('template (regexp-exec re-html-block-template-end str pos)) + ('declaration (regexp-exec re-html-block-declaration-end str pos)) + ('cdata (regexp-exec re-html-block-cdata-end str pos)) + ((or 'well-known 'other) (empty-line parser)) + (_ #f)))) + +(define (html-block-string match) + (substring (match:string match) (match:start match))) + +(define (html-tag-name parser) + (let ((match (regexp-exec re-html-tag-name (parser-str parser) (parser-pos parser)))) + (and match + (parser-advance parser (- (match:end match) (parser-pos parser)))))) (define (block-quote parser) (if (and (not (parser-end? parser)) (parser-char=? parser #\>)) diff --git a/tests/html.scm b/tests/html.scm new file mode 100644 index 0000000..156ef6f --- /dev/null +++ b/tests/html.scm @@ -0,0 +1,636 @@ +;; Copyright (C) 2024 David Thompson
++**Hello**, ++ |
++**Hello**, + +_world_. ++ |
+ hi + | +
okay.
+" + "+ hi + | +
Markdown
+bar
+" + "+foo + |
+foo + |
foo
+foo
+import Text.HTML.TagSoup
+
+main :: IO ()
+main = print $ parseTags tags
+
+okay
+" + "
+import Text.HTML.TagSoup
+
+main :: IO ()
+main = print $ parseTags tags
+
+okay")
+
+(test-html
+ "
+okay
+" +" +okay") + +(test-html + " +" + "") + +(test-html + " +okay
+" + " +okay") + +(test-html + " +foo
+" + " +*foo*") + +(test-html + "*bar* +baz
+" + "*bar* +*baz*") + +(test-html + "1. *bar* +" + "1. *bar*") + +(test-html + " +okay
+" + " +okay") + +(test-html + "'; + +?> +okay
+" + "'; + +?> +okay") + +(test-html "\n" "") + +(test-html + " +okay
+" + " +okay") + +(test-html + " +<!-- foo -->
+
+"
+ "
+
+ ")
+
+(test-html
+ " <div>
+
+"
+ " Foo
+Foo + +baz
+" +"Foo + +baz") + +(test-html + "Emphasized text.
++Hi + | +
+Hi + | + +
+ Hi + | + +
Foo
<33> <__>
+" + "<33> <__>") + +(test-html + "<a h*#ref="hi">
+" + "") + +(test-html + "<a href="hi'> <a href=hi'>
+" + " ") + +(test-html + "< a>< +foo><bar/ > +<foo bar=baz +bim!bop />
+" + "< a>< +foo><a href='bar'title=title>
+" + "") + +(test-html + " +" + "</a href="foo">
+" + "foo
+" + "foo ") + +(test-html + "foo foo -->
+foo foo -->
+" + "foo foo --> + +foo foo -->") + +(test-html + "foo
+" + "foo ") + +(test-html + "foo
+" + "foo ") + +(test-html + "foo &<]]>
+" + "foo &<]]>") + +(test-html + " +" + "foo ") + +(test-html + " +" + "foo ") + +(test-html + "<a href=""">
+" + "") + +(test-end)