diff --git a/Makefile.am b/Makefile.am index 84e06e1..b50baf7 100644 --- a/Makefile.am +++ b/Makefile.am @@ -8,6 +8,7 @@ SOURCES = \ commonmark/node.scm \ commonmark/blocks.scm \ commonmark/inlines.scm \ + commonmark/html.scm \ commonmark/sxml.scm \ commonmark.scm @@ -32,7 +33,8 @@ TESTS = \ tests/inlines/links.scm \ tests/inlines/images.scm \ tests/inlines/autolinks.scm \ - tests/inlines/entities.scm + tests/inlines/entities.scm \ + tests/html.scm TEST_EXTENSIONS = .scm diff --git a/commonmark/blocks.scm b/commonmark/blocks.scm index 2b6dd7a..8f8a8c8 100644 --- a/commonmark/blocks.scm +++ b/commonmark/blocks.scm @@ -15,7 +15,8 @@ ;; You should have received a copy of the GNU Lesser General Public License ;; along with guile-commonmark. If not, see . -(define-module (commonmark blocks) +(define-module (commonmark blocks) + #:use-module (ice-9 match) #:use-module (srfi srfi-1) #:use-module (srfi srfi-26) #:use-module (commonmark node) @@ -46,8 +47,21 @@ ((code-block-node? node) (parse-code-block node parser)) ((fenced-code-node? node) (parse-fenced-code node parser)) ((list-node? node) (parse-list node parser)) + ((html-block-node? node) (parse-html node parser)) ((paragraph-node? node) (parse-paragraph node parser)))) +(define (parse-html node parser) + (let ((type (html-block-node-type node)) + (str (parser-rest-str parser))) + (match (node-children node) + ((html) + (if (html-block-end type parser) + (close-node + (if (string-null? str) + node + (make-html-block-node type (string-append html "\n" str)))) + (make-html-block-node type (string-append html "\n" str))))))) + ;; Node Parser -> Node (define (parse-container-block node parser) (cond ((and (no-children? node) (empty-line parser)) ;; empty line @@ -60,7 +74,8 @@ (cond ((and (not (empty-line parser)) (node-closed? new-child) (not (fenced-code-node? new-child)) - (not (heading-node? new-child))) + (not (heading-node? new-child)) + (not (html-block-node? new-child))) (add-child-node (replace-last-child node new-child) (parse-line parser))) (else (replace-last-child node new-child))))))) @@ -93,6 +108,11 @@ (replace-last-child node (join-text-nodes (last-child node) (last-child parsed-line)))) ((code-block-node? parsed-line) (replace-last-child node (add-text (last-child node) (parser-rest-str parser)))) + ;; 1 of 7 types of HTML blocks is *not* allowed to interrupt + ;; a paragraph. + ((and (html-block-node? parsed-line) + (eq? (html-block-node-type parsed-line) 'other)) + (replace-last-child node (add-text (last-child node) (last-child parsed-line)))) (else (close-node node))))) (define (fence-start node) @@ -207,6 +227,9 @@ (let ((nonspace-parser (parser-advance-next-nonspace parser))) (cond ((empty-line nonspace-parser) (make-blank-node)) ((parser-indented? parser nonspace-parser) (make-code-block parser)) + ((html-block nonspace-parser) => (match-lambda + ((type match) + (make-html-block type parser)))) ((thematic-break nonspace-parser) (make-thematic-break)) ((block-quote nonspace-parser) => make-block-quote) ((atx-heading nonspace-parser) => make-atx-heading) @@ -216,6 +239,12 @@ (else (make-paragraph nonspace-parser))))) +(define (make-html-block type parser) + (let ((node (make-html-block-node type (parser-rest-str parser)))) + (if (html-block-end type parser) + (close-node node) + node))) + (define (make-thematic-break) (make-thematic-break-node)) diff --git a/commonmark/html.scm b/commonmark/html.scm new file mode 100644 index 0000000..6ae836a --- /dev/null +++ b/commonmark/html.scm @@ -0,0 +1,164 @@ +;; Copyright (C) 2024 David Thompson +;; +;; This file is part of guile-commonmark +;; +;; guile-commonmark is free software: you can redistribute it and/or +;; modify it under the terms of the GNU Lesser General Public License +;; as published by the Free Software Foundation, either version 3 of +;; the License, or (at your option) any later version. +;; +;; guile-commonmark is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU Lesser General Public License for more details. +;; +;; You should have received a copy of the GNU Lesser General Public License +;; along with guile-commonmark. If not, see . + +(define-module (commonmark html) + #:use-module (commonmark blocks) + #:use-module (commonmark inlines) + #:use-module (commonmark node) + #:use-module (srfi srfi-1) + #:export (commonmark->html)) + +(define* (commonmark->html #:optional (string-or-port (current-input-port))) + (define (node-ref n key) + (assq-ref (node-data n) key)) + (define (node->text n) + (if (text-node? n) + (node-children n) + (fold-text n))) + (define (fold-text n) + (fold (lambda (elem prev) + (append (node->text elem) prev)) + '() + (node-children n))) + (define (concat-text n) + (string-concatenate (fold-text n))) + (define (display/escaped str port) + (string-for-each (lambda (c) + (case c + ((#\") (display """ port)) + ((#\&) (display "&" port)) + ((#\<) (display "<" port)) + ((#\>) (display ">" port)) + (else (display c port)))) + str)) + (define (write-verbatim strs port) + (for-each (lambda (str) (display/escaped str port)) (reverse strs))) + (define (write-children n port) + (for-each (lambda (n) (write-node n port)) (reverse (node-children n)))) + (define (write-node n port) + (cond ((thematic-break-node? n) + (display "
" port)) + ((paragraph-node? n) + (display "

" port) + (write-children n port) + (display "

\n" port)) + ((block-quote-node? n) + (display "
\n" port) + (write-children n port) + (display "
\n" port)) + ((code-block-node? n) + (display "
" port)
+           (write-verbatim (node-children n) port)
+           (display "\n
\n" port)) + ((fenced-code-node? n) + (display "
" port)
+           (write-verbatim (node-children n) port)
+           (display "\n
\n" port)) + ((heading-node? n) + (let ((tag (case (node-ref n 'level) + ((1) "h1") + ((2) "h2") + ((3) "h3") + ((4) "h4") + ((5) "h5") + ((6) "h6")))) + (format port "<~a>" tag) + (write-children n port) + (format port "\n" tag))) + ((list-node? n) + (let ((tag (case (node-ref n 'type) + ((bullet) "ul") + (else "ol"))) + (tight? (node-ref n 'tight))) + (format port "<~a>\n" tag) + (for-each (lambda (n) + (cond + ((item-node? n) + (display "
  • " port) + (cond ((and tight? + (not (null? (node-children n))) + (paragraph-node? (last-child n))) + (write-children (last-child n) port)) + (else + (newline port) + (write-children n port))) + (display "
  • \n" port)) + (else (error "not an item" n)))) + (reverse (node-children n))) + (format port "\n" tag))) + ((text-node? n) + (display/escaped (last-child n) port)) + ((code-span-node? n) + (display "" port) + (write-verbatim (node-children n) port) + (display "" port)) + ((softbreak-node? n) + (newline port)) + ((hardbreak-node? n) + (display "
    " port)) + ((emphasis-node? n) + (let ((tag (case (node-ref n 'type) + ((em) 'em) + (else 'strong)))) + (format port "<~a>" tag) + (write-children n port) + (format port "" tag))) + ((link-node? n) + (let ((dest (node-ref n 'destination)) + (title (node-ref n 'title))) + (display "" port) + (write-children n port) + (display "" port))) + ((image-node? n) + (let ((dest (node-ref n 'destination)) + (title (node-ref n 'title)) + (alt (concat-text n))) + (display "\""" port))) + ((html-block-node? n) + (display (last-child n) port) + (newline port)) + ((inline-html-node? n) + (display (last-child n) port)) + (else (error "unknown node" n)))) + (define (document->html d) + (if (document-node? d) + (call-with-output-string + (lambda (port) + (write-children d port))) + (error "not a document node"))) + (let ((port (if (string? string-or-port) + (open-input-string string-or-port) + string-or-port))) + (document->html (parse-inlines (parse-blocks port))))) diff --git a/commonmark/inlines.scm b/commonmark/inlines.scm index 02cf031..e9bb018 100644 --- a/commonmark/inlines.scm +++ b/commonmark/inlines.scm @@ -43,6 +43,36 @@ "[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?" "(\\.[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>"))) +;; Regexps for HTML. It's complicated! See section 6.6 of the +;; CommonMark spec. +(define %html-whitespace "[ \t\n]") +(define %html-tag-name "([a-zA-Z][0-9a-zA-Z-]*)") +(define %html-unquoted-attribute-value "[^ \t\n\"'=<>`]+") +(define %html-single-quoted-attribute-value "'[^']*'") +(define %html-double-quoted-attribute-value "\"[^\"]*\"") +(define %html-attribute-value (string-append "(" %html-unquoted-attribute-value "|" + %html-single-quoted-attribute-value "|" + %html-double-quoted-attribute-value ")")) +(define %html-attribute-value-spec (string-append %html-whitespace "*=" + %html-whitespace "*" + %html-attribute-value)) +(define %html-attribute-name "[a-zA-Z_:][a-zA-Z0-9_.:-]*") +(define %html-attribute (string-append %html-whitespace "+" %html-attribute-name + "(" %html-attribute-value-spec ")?")) +(define re-html-open-tag + (make-regexp + (string-append "<" %html-tag-name "(" %html-attribute ")*" + %html-whitespace "*/?>"))) +(define re-html-closing-tag + (make-regexp + (string-append ""))) +(define re-html-comment (make-regexp "")) +(define re-html-end-comment (make-regexp "-->")) +(define re-html-processing (make-regexp "<\\?.*\\?>")) +(define re-html-end-processing (make-regexp "\\?>")) +(define re-html-declaration (make-regexp "")) +(define re-html-cdata (make-regexp "")) +(define re-html-end-cdata (make-regexp "\\]\\]>")) (define (start-ticks? text) (regexp-exec re-start-ticks (text-value text) (text-position text))) @@ -226,6 +256,7 @@ (define (parse-inner node) (cond ((not (node? node)) node) ((or (paragraph-node? node) (heading-node? node)) (parse-inline node ref-proc)) + ((html-block-node? node) node) (else (make-node (node-type node) (node-data node) (map parse-inner (node-children node)))))) (parse-inner node))) @@ -473,11 +504,44 @@ (text-move text (match:end email-match 0))) (values #f text)))))) +(define (parse-html text nodes delim-stack ref-proc) + (define (finish html-match) + (values (make-inline-html-node (match:substring html-match)) + (text-move text (match:end html-match)))) + ;; Comment, processing, and cdata regexps may match too much. For + ;; example, the comment regexp would match the entirety of "<----> + ;; foo -->" but we need to truncate the match to the first instance + ;; of "-->". + (define (finish/maybe-truncate html-match re) + (let* ((html-match (regexp-exec re (text-value text) (text-position text))) + (end (match:end html-match 0))) + (values (make-inline-html-node + (substring (text-value text) (text-position text) end)) + (text-move text end)))) + (define (regexp-exec* re) + (regexp-exec re (text-value text) (text-position text))) + (cond ((regexp-exec* re-html-open-tag) => finish) + ((regexp-exec* re-html-closing-tag) => finish) + ((regexp-exec* re-html-comment) => + (lambda (html-match) + (finish/maybe-truncate html-match re-html-end-comment))) + ((regexp-exec* re-html-processing) => + (lambda (html-match) + (finish/maybe-truncate html-match re-html-end-processing))) + ((regexp-exec* re-html-declaration) => finish) + ((regexp-exec* re-html-cdata) => + (lambda (html-match) + (finish/maybe-truncate html-match re-html-end-cdata))) + (else (values #f text)))) + (define (parse-autolink-or-html text nodes delim-stack ref-proc) (let-values (((autolink text) (parse-autolink text))) (if autolink (parse-char text (cons autolink nodes) delim-stack ref-proc) - (parse-char (text-advance text 1) (cons (make-text-node "<") nodes) delim-stack ref-proc)))) + (let-values (((html text) (parse-html text nodes delim-stack ref-proc))) + (if html + (parse-char text (cons html nodes) delim-stack ref-proc) + (parse-char (text-advance text 1) (cons (make-text-node "<") nodes) delim-stack ref-proc)))))) (define (parse-entity-numeric text nodes delim-stack ref-proc) diff --git a/commonmark/node.scm b/commonmark/node.scm index 3673ee4..8084b2e 100644 --- a/commonmark/node.scm +++ b/commonmark/node.scm @@ -64,6 +64,11 @@ link-node? make-image-node image-node? + make-html-block-node + html-block-node-type + html-block-node? + make-inline-html-node + inline-html-node? child-closed? close-node last-child @@ -93,6 +98,8 @@ ;; - 'code-span ;; - 'emphasis ;; - 'link +;; - 'html-block +;; - 'inline-html ;; interp. The type of CommonMark block node ;; Node is (make-node Node-Type Node-Data (listof Node)) @@ -305,6 +312,22 @@ (define (image-node? node) (node-type? node 'image)) +;; HTML nodes +(define (html-block-node? node) + (node-type? node 'html-block)) + +(define (make-html-block-node type str) + (make-node 'html-block `((type . ,type)) (list str))) + +(define (html-block-node-type node) + (node-get-data node 'type)) + +(define (inline-html-node? node) + (node-type? node 'inline-html)) + +(define (make-inline-html-node str) + (make-node 'inline-html #f (list str))) + (define (child-closed? n) (node-closed? (last-child n))) diff --git a/commonmark/parser.scm b/commonmark/parser.scm index 0456667..607e43c 100644 --- a/commonmark/parser.scm +++ b/commonmark/parser.scm @@ -20,6 +20,7 @@ #:use-module (srfi srfi-2) #:use-module (srfi srfi-9) #:use-module (srfi srfi-26) + #:use-module (ice-9 match) #:use-module (ice-9 regex) #:use-module (commonmark common) #:export (make-parser @@ -61,7 +62,11 @@ link-definition-rest link-definition-label link-definition-destination - link-definition-title)) + link-definition-title + html-block + html-block-continue + html-block-end + html-block-string)) (define-record-type (%make-parser str pos col) @@ -175,7 +180,88 @@ ")*)>"))) (define re-link-destination (make-regexp link-destination)) (define re-link-title (make-regexp link-title)) - +(define re-html-block-special-start (make-regexp "^(|$)" regexp/icase)) +(define re-html-block-special-end (make-regexp "(|||)" regexp/icase)) +(define re-html-block-comment-start (make-regexp "^")) +(define re-html-block-template-start (make-regexp "^<\\?")) +(define re-html-block-template-end (make-regexp "\\?>")) +(define re-html-block-declaration-start (make-regexp "^")) +(define re-html-block-cdata-start (make-regexp "^")) +(define re-html-block-well-known-start (make-regexp "^(<||/>)" regexp/icase)) +(define re-html-tag-name (make-regexp "^[a-zA-Z][0-9a-zA-Z-]*")) +(define %html-whitespace "[ \t]") +(define %html-tag-name "([a-zA-Z][0-9a-zA-Z-]*)") +(define %html-unquoted-attribute-value "[^ \t\"'=<>`]+") +(define %html-single-quoted-attribute-value "'[^']*'") +(define %html-double-quoted-attribute-value "\"[^\"]*\"") +(define %html-attribute-value + (string-append "(" %html-unquoted-attribute-value "|" + %html-single-quoted-attribute-value "|" + %html-double-quoted-attribute-value ")")) +(define %html-attribute-value-spec + (string-append %html-whitespace "*=" + %html-whitespace "*" + %html-attribute-value)) +(define %html-attribute-name "[a-zA-Z_:][a-zA-Z0-9_.:-]*") +(define %html-attribute + (string-append %html-whitespace "+" %html-attribute-name + %html-attribute-value-spec)) +(define %html-open-tag + (string-append "<" %html-tag-name "(" %html-attribute ")*" + %html-whitespace "*/?>")) +(define %html-closing-tag + (string-append "")) +(define re-html-block-other-start + (make-regexp (string-append "^(" %html-open-tag "|" %html-closing-tag ")$"))) + + +(define (html-block parser) + (let ((str (parser-str parser)) + (pos (parser-pos parser))) + (cond ((regexp-exec re-html-block-special-start str pos) => + (lambda (match) (list 'special match))) + ((regexp-exec re-html-block-comment-start str pos) => + (lambda (match) (list 'comment match))) + ((regexp-exec re-html-block-template-start str pos) => + (lambda (match) (list 'template match))) + ((regexp-exec re-html-block-declaration-start str pos) => + (lambda (match) (list 'declaration match))) + ((regexp-exec re-html-block-cdata-start str pos) => + (lambda (match) (list 'cdata match))) + ((regexp-exec re-html-block-well-known-start str pos) => + (lambda (match) (list 'well-known match))) + ((regexp-exec re-html-block-other-start str pos) => + (lambda (match) + (let ((excluded-tags '("pre" "script" "style" "textarea"))) + ;; Match 2 is the tag name for an open tag, match 5 is + ;; the tag name for a closing tag. + (and (not (member (match:substring match 2) excluded-tags)) + (not (member (match:substring match 5) excluded-tags)) + (list 'other match))))) + (else #f)))) + +(define (html-block-end type parser) + (let ((str (parser-str parser)) + (pos (parser-pos parser))) + (match type + ('special (regexp-exec re-html-block-special-end str pos)) + ('comment (regexp-exec re-html-block-comment-end str pos)) + ('template (regexp-exec re-html-block-template-end str pos)) + ('declaration (regexp-exec re-html-block-declaration-end str pos)) + ('cdata (regexp-exec re-html-block-cdata-end str pos)) + ((or 'well-known 'other) (empty-line parser)) + (_ #f)))) + +(define (html-block-string match) + (substring (match:string match) (match:start match))) + +(define (html-tag-name parser) + (let ((match (regexp-exec re-html-tag-name (parser-str parser) (parser-pos parser)))) + (and match + (parser-advance parser (- (match:end match) (parser-pos parser)))))) (define (block-quote parser) (if (and (not (parser-end? parser)) (parser-char=? parser #\>)) diff --git a/tests/html.scm b/tests/html.scm new file mode 100644 index 0000000..156ef6f --- /dev/null +++ b/tests/html.scm @@ -0,0 +1,636 @@ +;; Copyright (C) 2024 David Thompson +;; +;; This file is part of guile-commonmark +;; +;; guile-commonmark is free software: you can redistribute it and/or +;; modify it under the terms of the GNU Lesser General Public License +;; as published by the Free Software Foundation, either version 3 of +;; the License, or (at your option) any later version. +;; +;; guile-commonmark is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU Lesser General Public License for more details. +;; +;; You should have received a copy of the GNU Lesser General Public License +;; along with guile-commonmark. If not, see . + +(define-module (test-html) + #:use-module (srfi srfi-64) + #:use-module (commonmark html)) + +(define-syntax-rule (test-html html markdown) + (test-equal html (call-with-input-string markdown commonmark->html))) + +(test-begin "node") + +;; These tests are straight out of 4.6 of the CommonMark spec. + +(test-html + "
    +
    +**Hello**,
    +

    world. +

    +
    +" + "
    +
    +**Hello**,
    +
    +_world_.
    +
    +
    ") + +(test-html + " + + + +
    + hi +
    +

    okay.

    +" + " + + + +
    + hi +
    + +okay.") + +(test-html + " +*foo*") + +(test-html + "
    +

    Markdown

    +
    +" + "
    + +*Markdown* + +
    ") + +(test-html + "
    +
    +" + "
    +
    ") + +(test-html + "
    +
    +" + "
    +
    ") + +(test-html + "
    +*foo* +

    bar

    +" + "
    +*foo* + +*bar*") + +(test-html + " +" + "") + +(test-html + "
    +foo +
    +" + "
    +foo +
    ") + +(test-html + "
    +``` c +int x = 33; +``` +" + "
    +``` c +int x = 33; +```") + +(test-html + " +*bar* + +" + " +*bar* +") + +(test-html + " +*bar* + +" + " +*bar* +") + +(test-html + " +*bar* + +" + " +*bar* +") + +(test-html + " +*bar* +" + " +*bar*") + +(test-html + " +*foo* + +" + " +*foo* +") + +(test-html + " +

    foo

    +
    +" + " + +*foo* + +") + +(test-html + "

    foo

    +" + "*foo*") + +(test-html + "
    
    +import Text.HTML.TagSoup
    +
    +main :: IO ()
    +main = print $ parseTags tags
    +
    +

    okay

    +" + "
    
    +import Text.HTML.TagSoup
    +
    +main :: IO ()
    +main = print $ parseTags tags
    +
    +okay") + +(test-html + " +

    okay

    +" +" +okay") + +(test-html + " +" + "") + +(test-html + " +

    okay

    +" + " +okay") + +(test-html + " +

    foo

    +" + " +*foo*") + +(test-html + "*bar* +

    baz

    +" + "*bar* +*baz*") + +(test-html + "1. *bar* +" + "1. *bar*") + +(test-html + " +

    okay

    +" + " +okay") + +(test-html + "'; + +?> +

    okay

    +" + "'; + +?> +okay") + +(test-html "\n" "") + +(test-html + " +

    okay

    +" + " +okay") + +(test-html + " +
    <!-- foo -->
    +
    +" + " + + ") + +(test-html + "
    +
    <div>
    +
    +" + "
    + +
    ") + +(test-html + "

    Foo

    +
    +bar +
    +" + "Foo +
    +bar +
    ") + +(test-html + "
    +bar +
    +*foo* +" + "
    +bar +
    +*foo*") + +(test-html + "

    Foo + +baz

    +" +"Foo + +baz") + +(test-html + "
    +

    Emphasized text.

    +
    +" + "
    + +*Emphasized* text. + +
    ") + +(test-html + "
    +*Emphasized* text. +
    +" + "
    +*Emphasized* text. +
    ") + +(test-html + " + + + +
    +Hi +
    +" + " + + + + + + + +
    +Hi +
    ") + +(test-html + " + +
    <td>
    +  Hi
    +</td>
    +
    + +
    +" + " + + + + + + + +
    + Hi +
    ") + +;; Test cases from section 6.6 of the spec. + +(test-html + "

    +" + "") + +(test-html + "

    +" + "") + +(test-html + "

    +" + "") + +(test-html + "

    +" + "") + +(test-html + "

    Foo

    +" + "Foo ") + +(test-html + "

    <33> <__>

    +" + "<33> <__>") + +(test-html + "

    <a h*#ref="hi">

    +" + "
    ") + +(test-html + "

    <a href="hi'> <a href=hi'>

    +" + "
    ") + +(test-html + "

    < a>< +foo><bar/ > +<foo bar=baz +bim!bop />

    +" + "< a>< +foo> +") + +(test-html + "

    <a href='bar'title=title>

    +" + "
    ") + +(test-html + "

    +" + "") + +(test-html + "

    </a href="foo">

    +" + "") + +(test-html + "

    foo

    +" + "foo ") + +(test-html + "

    foo foo -->

    +

    foo foo -->

    +" + "foo foo --> + +foo foo -->") + +(test-html + "

    foo

    +" + "foo ") + +(test-html + "

    foo

    +" + "foo ") + +(test-html + "

    foo &<]]>

    +" + "foo &<]]>") + +(test-html + "

    foo

    +" + "foo ") + +(test-html + "

    foo

    +" + "foo ") + +(test-html + "

    <a href=""">

    +" + "
    ") + +(test-end)