Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add block/inline HTML parsing #23

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ SOURCES = \
commonmark/node.scm \
commonmark/blocks.scm \
commonmark/inlines.scm \
commonmark/html.scm \
commonmark/sxml.scm \
commonmark.scm

Expand All @@ -32,7 +33,8 @@ TESTS = \
tests/inlines/links.scm \
tests/inlines/images.scm \
tests/inlines/autolinks.scm \
tests/inlines/entities.scm
tests/inlines/entities.scm \
tests/html.scm

TEST_EXTENSIONS = .scm

Expand Down
33 changes: 31 additions & 2 deletions commonmark/blocks.scm
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
;; You should have received a copy of the GNU Lesser General Public License
;; along with guile-commonmark. If not, see <http://www.gnu.org/licenses/>.

(define-module (commonmark blocks)
(define-module (commonmark blocks)
#:use-module (ice-9 match)
#:use-module (srfi srfi-1)
#:use-module (srfi srfi-26)
#:use-module (commonmark node)
Expand Down Expand Up @@ -46,8 +47,21 @@
((code-block-node? node) (parse-code-block node parser))
((fenced-code-node? node) (parse-fenced-code node parser))
((list-node? node) (parse-list node parser))
((html-block-node? node) (parse-html node parser))
((paragraph-node? node) (parse-paragraph node parser))))

(define (parse-html node parser)
(let ((type (html-block-node-type node))
(str (parser-rest-str parser)))
(match (node-children node)
((html)
(if (html-block-end type parser)
(close-node
(if (string-null? str)
node
(make-html-block-node type (string-append html "\n" str))))
(make-html-block-node type (string-append html "\n" str)))))))

;; Node Parser -> Node
(define (parse-container-block node parser)
(cond ((and (no-children? node) (empty-line parser)) ;; empty line
Expand All @@ -60,7 +74,8 @@
(cond ((and (not (empty-line parser))
(node-closed? new-child)
(not (fenced-code-node? new-child))
(not (heading-node? new-child)))
(not (heading-node? new-child))
(not (html-block-node? new-child)))
(add-child-node (replace-last-child node new-child)
(parse-line parser)))
(else (replace-last-child node new-child)))))))
Expand Down Expand Up @@ -93,6 +108,11 @@
(replace-last-child node (join-text-nodes (last-child node) (last-child parsed-line))))
((code-block-node? parsed-line)
(replace-last-child node (add-text (last-child node) (parser-rest-str parser))))
;; 1 of 7 types of HTML blocks is *not* allowed to interrupt
;; a paragraph.
((and (html-block-node? parsed-line)
(eq? (html-block-node-type parsed-line) 'other))
(replace-last-child node (add-text (last-child node) (last-child parsed-line))))
(else (close-node node)))))

(define (fence-start node)
Expand Down Expand Up @@ -207,6 +227,9 @@
(let ((nonspace-parser (parser-advance-next-nonspace parser)))
(cond ((empty-line nonspace-parser) (make-blank-node))
((parser-indented? parser nonspace-parser) (make-code-block parser))
((html-block nonspace-parser) => (match-lambda
((type match)
(make-html-block type parser))))
((thematic-break nonspace-parser) (make-thematic-break))
((block-quote nonspace-parser) => make-block-quote)
((atx-heading nonspace-parser) => make-atx-heading)
Expand All @@ -216,6 +239,12 @@
(else (make-paragraph nonspace-parser)))))


(define (make-html-block type parser)
(let ((node (make-html-block-node type (parser-rest-str parser))))
(if (html-block-end type parser)
(close-node node)
node)))

(define (make-thematic-break)
(make-thematic-break-node))

Expand Down
164 changes: 164 additions & 0 deletions commonmark/html.scm
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
;; Copyright (C) 2024 David Thompson <dave@spritely.institute>
;;
;; This file is part of guile-commonmark
;;
;; guile-commonmark is free software: you can redistribute it and/or
;; modify it under the terms of the GNU Lesser General Public License
;; as published by the Free Software Foundation, either version 3 of
;; the License, or (at your option) any later version.
;;
;; guile-commonmark is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU Lesser General Public License for more details.
;;
;; You should have received a copy of the GNU Lesser General Public License
;; along with guile-commonmark. If not, see <http://www.gnu.org/licenses/>.

(define-module (commonmark html)
#:use-module (commonmark blocks)
#:use-module (commonmark inlines)
#:use-module (commonmark node)
#:use-module (srfi srfi-1)
#:export (commonmark->html))

(define* (commonmark->html #:optional (string-or-port (current-input-port)))
(define (node-ref n key)
(assq-ref (node-data n) key))
(define (node->text n)
(if (text-node? n)
(node-children n)
(fold-text n)))
(define (fold-text n)
(fold (lambda (elem prev)
(append (node->text elem) prev))
'()
(node-children n)))
(define (concat-text n)
(string-concatenate (fold-text n)))
(define (display/escaped str port)
(string-for-each (lambda (c)
(case c
((#\") (display "&quot;" port))
((#\&) (display "&amp;" port))
((#\<) (display "&lt;" port))
((#\>) (display "&gt;" port))
(else (display c port))))
str))
(define (write-verbatim strs port)
(for-each (lambda (str) (display/escaped str port)) (reverse strs)))
(define (write-children n port)
(for-each (lambda (n) (write-node n port)) (reverse (node-children n))))
(define (write-node n port)
(cond ((thematic-break-node? n)
(display "<hr>" port))
((paragraph-node? n)
(display "<p>" port)
(write-children n port)
(display "</p>\n" port))
((block-quote-node? n)
(display "<blockquote>\n" port)
(write-children n port)
(display "</blockquote>\n" port))
((code-block-node? n)
(display "<pre><code>" port)
(write-verbatim (node-children n) port)
(display "\n</code></pre>\n" port))
((fenced-code-node? n)
(display "<pre><code infostring=\"" port)
(display/escaped (node-ref n 'info-string) port)
(display "\">" port)
(write-verbatim (node-children n) port)
(display "\n</code></pre>\n" port))
((heading-node? n)
(let ((tag (case (node-ref n 'level)
((1) "h1")
((2) "h2")
((3) "h3")
((4) "h4")
((5) "h5")
((6) "h6"))))
(format port "<~a>" tag)
(write-children n port)
(format port "</~a>\n" tag)))
((list-node? n)
(let ((tag (case (node-ref n 'type)
((bullet) "ul")
(else "ol")))
(tight? (node-ref n 'tight)))
(format port "<~a>\n" tag)
(for-each (lambda (n)
(cond
((item-node? n)
(display "<li>" port)
(cond ((and tight?
(not (null? (node-children n)))
(paragraph-node? (last-child n)))
(write-children (last-child n) port))
(else
(newline port)
(write-children n port)))
(display "</li>\n" port))
(else (error "not an item" n))))
(reverse (node-children n)))
(format port "</~a>\n" tag)))
((text-node? n)
(display/escaped (last-child n) port))
((code-span-node? n)
(display "<code>" port)
(write-verbatim (node-children n) port)
(display "</code>" port))
((softbreak-node? n)
(newline port))
((hardbreak-node? n)
(display "<br>" port))
((emphasis-node? n)
(let ((tag (case (node-ref n 'type)
((em) 'em)
(else 'strong))))
(format port "<~a>" tag)
(write-children n port)
(format port "</~a>" tag)))
((link-node? n)
(let ((dest (node-ref n 'destination))
(title (node-ref n 'title)))
(display "<a href=\"" port)
(display/escaped dest port)
(display "\"" port)
(when title
(display " title=\"" port)
(display/escaped title port)
(display "\"" port))
(display ">" port)
(write-children n port)
(display "</a>" port)))
((image-node? n)
(let ((dest (node-ref n 'destination))
(title (node-ref n 'title))
(alt (concat-text n)))
(display "<img src=\"" port)
(display/escaped dest port)
(display "\" alt=\"" port)
(display/escaped alt port)
(display "\"" port)
(when title
(display " title=\"" port)
(display/escaped title port)
(display "\"" port))
(display " />" port)))
((html-block-node? n)
(display (last-child n) port)
(newline port))
((inline-html-node? n)
(display (last-child n) port))
(else (error "unknown node" n))))
(define (document->html d)
(if (document-node? d)
(call-with-output-string
(lambda (port)
(write-children d port)))
(error "not a document node")))
(let ((port (if (string? string-or-port)
(open-input-string string-or-port)
string-or-port)))
(document->html (parse-inlines (parse-blocks port)))))
66 changes: 65 additions & 1 deletion commonmark/inlines.scm
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,36 @@
"[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?"
"(\\.[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>")))

;; Regexps for HTML. It's complicated! See section 6.6 of the
;; CommonMark spec.
(define %html-whitespace "[ \t\n]")
(define %html-tag-name "([a-zA-Z][0-9a-zA-Z-]*)")
(define %html-unquoted-attribute-value "[^ \t\n\"'=<>`]+")
(define %html-single-quoted-attribute-value "'[^']*'")
(define %html-double-quoted-attribute-value "\"[^\"]*\"")
(define %html-attribute-value (string-append "(" %html-unquoted-attribute-value "|"
%html-single-quoted-attribute-value "|"
%html-double-quoted-attribute-value ")"))
(define %html-attribute-value-spec (string-append %html-whitespace "*="
%html-whitespace "*"
%html-attribute-value))
(define %html-attribute-name "[a-zA-Z_:][a-zA-Z0-9_.:-]*")
(define %html-attribute (string-append %html-whitespace "+" %html-attribute-name
"(" %html-attribute-value-spec ")?"))
(define re-html-open-tag
(make-regexp
(string-append "<" %html-tag-name "(" %html-attribute ")*"
%html-whitespace "*/?>")))
(define re-html-closing-tag
(make-regexp
(string-append "</" %html-tag-name %html-whitespace "*>")))
(define re-html-comment (make-regexp "<!--.*-->"))
(define re-html-end-comment (make-regexp "-->"))
(define re-html-processing (make-regexp "<\\?.*\\?>"))
(define re-html-end-processing (make-regexp "\\?>"))
(define re-html-declaration (make-regexp "<![a-zA-Z].*>"))
(define re-html-cdata (make-regexp "<!\\[CDATA\\[.*\\]\\]>"))
(define re-html-end-cdata (make-regexp "\\]\\]>"))

(define (start-ticks? text)
(regexp-exec re-start-ticks (text-value text) (text-position text)))
Expand Down Expand Up @@ -226,6 +256,7 @@
(define (parse-inner node)
(cond ((not (node? node)) node)
((or (paragraph-node? node) (heading-node? node)) (parse-inline node ref-proc))
((html-block-node? node) node)
(else (make-node (node-type node) (node-data node) (map parse-inner (node-children node))))))
(parse-inner node)))

Expand Down Expand Up @@ -473,11 +504,44 @@
(text-move text (match:end email-match 0)))
(values #f text))))))

(define (parse-html text nodes delim-stack ref-proc)
(define (finish html-match)
(values (make-inline-html-node (match:substring html-match))
(text-move text (match:end html-match))))
;; Comment, processing, and cdata regexps may match too much. For
;; example, the comment regexp would match the entirety of "<---->
;; foo -->" but we need to truncate the match to the first instance
;; of "-->".
(define (finish/maybe-truncate html-match re)
(let* ((html-match (regexp-exec re (text-value text) (text-position text)))
(end (match:end html-match 0)))
(values (make-inline-html-node
(substring (text-value text) (text-position text) end))
(text-move text end))))
(define (regexp-exec* re)
(regexp-exec re (text-value text) (text-position text)))
(cond ((regexp-exec* re-html-open-tag) => finish)
((regexp-exec* re-html-closing-tag) => finish)
((regexp-exec* re-html-comment) =>
(lambda (html-match)
(finish/maybe-truncate html-match re-html-end-comment)))
((regexp-exec* re-html-processing) =>
(lambda (html-match)
(finish/maybe-truncate html-match re-html-end-processing)))
((regexp-exec* re-html-declaration) => finish)
((regexp-exec* re-html-cdata) =>
(lambda (html-match)
(finish/maybe-truncate html-match re-html-end-cdata)))
(else (values #f text))))

(define (parse-autolink-or-html text nodes delim-stack ref-proc)
(let-values (((autolink text) (parse-autolink text)))
(if autolink
(parse-char text (cons autolink nodes) delim-stack ref-proc)
(parse-char (text-advance text 1) (cons (make-text-node "<") nodes) delim-stack ref-proc))))
(let-values (((html text) (parse-html text nodes delim-stack ref-proc)))
(if html
(parse-char text (cons html nodes) delim-stack ref-proc)
(parse-char (text-advance text 1) (cons (make-text-node "<") nodes) delim-stack ref-proc))))))


(define (parse-entity-numeric text nodes delim-stack ref-proc)
Expand Down
23 changes: 23 additions & 0 deletions commonmark/node.scm
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@
link-node?
make-image-node
image-node?
make-html-block-node
html-block-node-type
html-block-node?
make-inline-html-node
inline-html-node?
child-closed?
close-node
last-child
Expand Down Expand Up @@ -93,6 +98,8 @@
;; - 'code-span
;; - 'emphasis
;; - 'link
;; - 'html-block
;; - 'inline-html
;; interp. The type of CommonMark block node

;; Node is (make-node Node-Type Node-Data (listof Node))
Expand Down Expand Up @@ -305,6 +312,22 @@
(define (image-node? node)
(node-type? node 'image))

;; HTML nodes
(define (html-block-node? node)
(node-type? node 'html-block))

(define (make-html-block-node type str)
(make-node 'html-block `((type . ,type)) (list str)))

(define (html-block-node-type node)
(node-get-data node 'type))

(define (inline-html-node? node)
(node-type? node 'inline-html))

(define (make-inline-html-node str)
(make-node 'inline-html #f (list str)))

(define (child-closed? n)
(node-closed? (last-child n)))

Expand Down
Loading