From 5991e9bdce79cd69b01cb98c9af895e7fe8c6fbf Mon Sep 17 00:00:00 2001 From: Darcy Shen Date: Tue, 24 Sep 2024 18:56:44 +0800 Subject: [PATCH] Support copy math content from Kimi Chat --- progs/convert/data/css.scm | 72 ++++ progs/convert/data/sxhtml.scm | 275 +++++++++++++ progs/convert/data/sxml.scm | 199 +++++++++ progs/convert/data/xmltm.scm | 686 ++++++++++++++++++++++++++++++++ progs/convert/html/htmltm.scm | 38 +- progs/convert/html/tmhtml.scm | 6 +- progs/convert/mathml/mathtm.scm | 4 +- tests/htmltm-test.scm | 6 +- 8 files changed, 1266 insertions(+), 20 deletions(-) create mode 100644 progs/convert/data/css.scm create mode 100644 progs/convert/data/sxhtml.scm create mode 100644 progs/convert/data/sxml.scm create mode 100644 progs/convert/data/xmltm.scm diff --git a/progs/convert/data/css.scm b/progs/convert/data/css.scm new file mode 100644 index 0000000..ae45554 --- /dev/null +++ b/progs/convert/data/css.scm @@ -0,0 +1,72 @@ + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; MODULE : css.scm +;; DESCRIPTION : tools for manipulation of CSS attributes +;; COPYRIGHT : (C) 2020 Joris van der Hoeven +;; +;; This software falls under the GNU general public license version 3 or later. +;; It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE +;; in the root directory or . +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(texmacs-module (convert data css) + (:use (convert data sxml))) + +(define (css-style-item->pair s) + (with l (string-tokenize-by-char s #\:) + (and (== (length l) 2) + (cons (tm-string-trim-both (car l)) + (tm-string-trim-both (cadr l)))))) + +(define (css-style->assoc s) + (let* ((l1 (string-tokenize-by-char s #\;)) + (l2 (map css-style-item->pair l1))) + (list-filter l2 (lambda (x) x)))) + +(define (pair->css-style-item p) + (string-append (car p) ": " (cdr p))) + +(define (assoc->css-style a) + (with l (map pair->css-style-item a) + (string-recompose l "; "))) + +(define (css-max l1 l2) + (if (and (length? l1) (length? l2)) + (length-max l1 l2) + l2)) + +(define (css-add l1 l2) + (if (and (length? l1) (length? l2)) + (length-add l1 l2) + l2)) + +(define (assoc-change a k v) + (cond ((null? a) a) + ((== (caar a) k) (cons (cons k v) (cdr a))) + (else (cons (car a) (assoc-change (cdr a) k v))))) + +(define (css-merge-attrs a1 a2) + (cond ((null? a1) a2) + ((not (assoc-ref a2 (caar a1))) + (cons (car a1) (css-merge-attrs (cdr a1) a2))) + ((in? (caar a1) '("margin-top" "margin-bottom" + "padding-top" "padding-bottom")) + (let* ((k (caar a1)) + (v (css-max (assoc-ref a1 k) (assoc-ref a2 k))) + (a2* (assoc-change a2 k v))) + (css-merge-attrs (cdr a1) a2*))) + ((in? (caar a1) '("margin-left" "margin-right" + "padding-left" "padding-right")) + (let* ((k (caar a1)) + (v (css-add (assoc-ref a1 k) (assoc-ref a2 k))) + (a2* (assoc-change a2 k v))) + (css-merge-attrs (cdr a1) a2*))) + (else (cons (car a1) (css-merge-attrs (cdr a1) a2))))) + +(tm-define (css-merge-styles s1 s2) + (let* ((a1 (css-style->assoc s1)) + (a2 (css-style->assoc s2)) + (a (css-merge-attrs a1 a2))) + (assoc->css-style a))) diff --git a/progs/convert/data/sxhtml.scm b/progs/convert/data/sxhtml.scm new file mode 100644 index 0000000..863426a --- /dev/null +++ b/progs/convert/data/sxhtml.scm @@ -0,0 +1,275 @@ + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; MODULE : sxhtml.scm +;; DESCRIPTION : Utilities for HTML in SXML format +;; COPYRIGHT : (C) 2002 David Allouche +;; +;; This software falls under the GNU general public license version 3 or later. +;; It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE +;; in the root directory or . +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(texmacs-module (convert data sxhtml) + (:use (convert data sxml))) + +;; TODO: rewrite the predicates using DRD +;; TODO: consolidate with htmltm dispatch DRD + +;; Is the node x a HTML element whose name is a given set? +;; (tm-define sxhtml-heading? (ntype-names?? '(h:h1 h:h2 h:h3 h:h4 h:h5 h:h6))) +;; NOTE: outcommented in order to keep inline nature of h5 and h6 +(tm-define sxhtml-heading? (ntype-names?? '(h:h1 h:h2 h:h3 h:h4))) +(tm-define sxhtml-list? (ntype-names?? '(h:ul h:ol h:dl))) +(tm-define sxhtml-table? (ntype-names?? '(h:table))) + +(tm-define (sxhtml-label? x) + ;; Is the node x a h:a element with an id attribute? + (and (sxml-element? x) + (eq? 'h:a (sxml-name x)) + (sxml-attr x 'id))) + +(tm-define (sxhtml-glue-label x label) + ;; Set the id attribute of element x from the id of element label. + (sxml-set-attr x (list 'id (sxml-attr label 'id)))) + +(define table-kid? + (ntype-names?? '(h:tr h:td h:th h:col h:colgroup h:tbody h:thead h:tfoot))) +(define row-group-kid? (ntype-names?? '(h:tr h:td h:th))) +(define row-group? (ntype-names?? '(h:tbody h:thead h:tfoot))) +(define tfoot? (ntype-names?? '(h:tfoot))) +(define col-data? (ntype-names?? '(h:col h:colgroup))) +(define row? (ntype-names?? '(h:tr))) +(define cell? (ntype-names?? '(h:td h:th))) + +(define (shtml-attr-number as name) + (and-let* ((x (shtml-attr-non-null as name))) + (string->number x))) + +(define (shtml-attr-positive-integer as name) + (and-let* ((n (shtml-attr-number as name))) + (if (< n 0) #f (inexact->exact n)))) + +(tm-define (shtml-decode-span as name) + ;; FIXME: zero spans (until end of group) are not supported + (let ((n (shtml-attr-positive-integer as name))) + (cond ((not n) 1) ((zero? n) 1) (else n)))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Correct invalid element nesting in HTML tables +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; A correct TABLE contains only COL, COLGROUP, THEAD, TFOOT and TBODY +;; elements. THEAD, TFOOT and TBODY (row groups) contains only TR elements. TR +;; elements contain only TD elements. +;; +;; Correcting a table is inferring TBODY and TR elements when they are missing +;; and filtering illegal nodes out. +;; +;; TODO: filter contents of COLGROUP elements. + +(tm-define (sxhtml-correct-table x) + ;; Correct all TABLEs in postorder in the sxml tree @x. + (let sub ((x x)) + (cond ((sxhtml-table? x) + (correct-table (sxml-set-content x (map sub (sxml-content x))))) + ((sxml-element? x) + (sxml-set-content x (map sub (sxml-content x)))) + (else x)))) + +(define (correct-table e) + ;; @e must be a h:table element (ns-prefixes must be normalized) + ;; Make TBODY elements explicit. + ;; Collect lone TD elements in TR. + ;; Drop table data not in TD. + (correct-table-content e table-kid? + (lambda (tr tbody) (cons `(h:tr ,@tr) tbody)) + (lambda (tbody kdr) (cons `(h:tbody ,@tbody) kdr)))) + +(define (correct-row-group e) + ;; Collect lone TD elements in TR inside of @e. + ;; Drop table data not in TD. + (correct-table-content e row-group-kid? + (lambda (tr tbody) (cons `(h:tr ,@tr) tbody)) + (lambda (tbody kdr) tbody))) + +(define (correct-row e) + ;; Drop everything but TD elements. + (sxml-set-content e (list-filter (sxml-content e) cell? ))) + +(define (correct-table-content e accept? make-tr make-tbody) + (sxml-set-content + e ((cut <> #f) (list-fold-right + (lambda (kar kdr) (kdr kar)) + (cut correct-table-content/step + accept? make-tr make-tbody <> '() '() '()) + (sxml-content e))))) + +(define (correct-table-content/step accept? cons-tr cons-tbody + kar tr tbody kdr) + (define (state kar tr tbody kdr) + (correct-table-content/step accept? cons-tr cons-tbody kar tr tbody kdr)) + (define (flush-tr curry-tbody-kdr) + ;; curry-tbody-kdr: (proc tbody -> state) + ;; curries @proc like (cut proc <> tbody kdr) with an updated @tbody. + (curry-tbody-kdr (cut state <> '() <> <>) + (if (null? tr) tbody (cons-tr tr tbody)))) + (define (flush-tbody curry-kdr) + ;; curry-kdr: (proc kdr -> state) + ;; curries @proc like (cut proc <> kdr) with an updated @kdr. + (flush-tr + (lambda (proc tbody) + (curry-kdr (cut proc <> '() <>) + (if (null? tbody) kdr (cons-tbody tbody kdr)))))) + (cond ((not kar) (flush-tbody (lambda (proc kdr) kdr))) + ((not (accept? kar)) (cut state <> tr tbody kdr)) + ((cell? kar) (cut state <> (cons kar tr) tbody kdr)) + ((row? kar) + (flush-tr (lambda (proc tbody) + (cut proc <> (cons (correct-row kar) tbody) kdr)))) + ((row-group? kar) + (flush-tbody (lambda (proc kdr) + (cut proc <> (cons (correct-row-group kar) kdr))))) + ((col-data? kar) + (flush-tbody (lambda (proc kdr) (cut proc <> (cons kar kdr))))) + ;; no "else" clause needed (assuming @accept is correct) + )) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Table iterator +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Iterating over cells of a table while maintaining the coordinates of the +;; current cell is a complex operation because table positions which are +;; covered by spanned cells are not represented by cell elements in HTML. +;; Also, the TFOOT element is present before TBODY element but is logically +;; located at the end of the table. + +;;;; MISSING FEATURE ;;;; +;; HTML-4.01 says that ROWSPAN=0 or COLSPAN=0 spans the cell to the end of its +;; containing group (colgroup, tbody, thead, tfoot). However, neither +;; Mozilla-1.4 nor Konqueror 3.1.3 implement this part of the specification. +;; Since this feature also requires a significant added complexity, it is left +;; out. + +(tm-define (sxhtml-table-fold kons knil table) + ;; Fundamental HTML table content iterator. + ;; knil: T + ;; kons: symbol (int @i) (int @j) sxml T -> T + ;; table: sxml + ;; Fold @kons over elements of @table, with (@i @j) the cell position. + ;; + ;; @kons is applied in the following modes: + ;; Entering a row-group : (kons :in-row-group i #f row-group kdr) + ;; Entering a row : (kons :in-row i #f row kdr) + ;; (possible extension: entering a colgroup and iterating over a col) + ;; Leaving a row-group : (kons :out-row-group nrows #f #f kdr) + ;; Leaving a row : (kons :out-row #f ncols #f kdr) + ;; Iterating over a cell : (kons :cell i j cell kdr) + ((cut <> #f) (list-fold (lambda (kar kdr) (kdr kar)) + (cut table-fold/table <> 0 '() '() kons knil) + (sxml-content table)))) + +(define (table-fold/table kar i rowspans footers kons kdr) + ;; @kar is a child element of the TABLE element. + (cond ((not kar) + (if (not footers) kdr + ((cut <> #f) + (list-fold (lambda (kar kdr) (kdr kar)) + (cut table-fold/table <> i rowspans #f kons kdr) + (reverse! footers))))) + ((and footers (tfoot? kar)) + (cut table-fold/table <> i rowspans (cons kar footers) kons kdr)) + ((row-group? kar) + ((cut <> #f) + (list-fold (lambda (kar kdr) (kdr kar)) + (cut table-fold/group <> i i rowspans footers kons + (kons :in-row-group i #f kar kdr)) + (sxml-content kar)))) + ;; ELSE clause for col-data elements. + ;; NOTE: could be extended to support parsing of col-data + (else (cut table-fold/table <> i rowspans footers kons kdr)))) + +(define (table-fold/group kar i0 i rowspans footers kons kdr) + ;; @kar is a child element of a THEAD, TBODY or TFOOT element. + (cond ((not kar) (cut table-fold/table <> i rowspans footers kons + (kons :out-row-group (- i i0) #f #f kdr))) + ((row? kar) + ((cut <> #f) + (list-fold (lambda (kar kdr) (kdr kar)) + (cut table-fold/row <> i0 i 0 rowspans footers kons + (kons :in-row i #f kar kdr)) + (sxml-content kar)))) + ;; ELSE clause should never be reached (the table is corrected). + (else (cut table-fold/group <> i0 i rowspans footers kons kdr)))) + +(define (table-fold/row kar i0 i j rowspans footers kons kdr) + ;; @kar is a child element of a TR element. + (cond ((not kar) + (cut table-fold/group <> i0 (1+ i) (next-rowspans rowspans) footers + kons (kons :out-row #f (skip-spanned-cols j rowspans) #f kdr))) + ((cell? kar) + (let ((j (skip-spanned-cols j rowspans)) + (a (sxml-attr-list kar))) + (let ((rspan (shtml-decode-span a 'rowspan)) + (cspan (shtml-decode-span a 'colspan))) + (cut table-fold/row <> i0 i (+ j cspan) + (if (= 1 rspan) rowspans + (add-rowspan rowspans j rspan cspan)) + footers kons (kons :cell i j kar kdr))))) + ;; ELSE clause should never be reached (the table is corrected). + (else (cut table-fold/row <> i0 i j rowspans footers kons kdr)))) + +;(set-trace-level! sxhtml-table-fold +; table-fold/table table-fold/group table-fold/row) + +;; Columns on which a cell is spanned are remember in a sorted ROWSPANS list. +;; Items of ROWSPANS are lists (J SPAN) where: +;; J : column number (zero-based). ROWSPANS is sorted by ascending J. +;; SPAN : count of additional rows (incl. the current row) where this column +;; is occupied by a spanned cell. + +(define (skip-spanned-cols j rowspans) + (let next ((j j) (rowspans rowspans)) + (cond ((null? rowspans) j) + ((< j (first (car rowspans))) j) + ((= j (first (car rowspans))) (next (1+ j) (cdr rowspans))) + (else (next j (cdr rowspans)))))) + +(define (add-rowspan rowspans j rspan cspan) + (let next ((rowspans rowspans) (j j) (cspan cspan)) + (cond ((zero? cspan) rowspans) + ((null? rowspans) + (cons (list j rspan) (next '() (1+ j) (1- cspan)))) + ((< j (first (car rowspans))) + (cons (list j rspan) (next rowspans (1+ j) (1- cspan)))) + ((= j (first (car rowspans))) + ;; This can only happen with some very vicious incorrect HTML. + (cons (list j (max rspan (second (car rowspans)))) + (next (cdr rowspans) (1+ j) (1- cspan)))) + (else (cons (car rowspans) + (next (cdr rowspans) (1+ j) (1- cspan))))))) + +(define (next-rowspans rowspans) + (reverse! (list-fold (lambda (kar kdr) + (with (col old-span) kar + (let ((span (1- old-span))) + (if (zero? span) + kdr + (cons (list col span) kdr))))) + '() rowspans))) + +;(set-trace-level! skip-spanned-cols add-rowspan next-rowspans) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Table iterator applications +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(tm-define (sxhtml-table-dimension table) + (sxhtml-table-fold + (lambda (msg i j kar kdr) + (with (nrows ncols) kdr + (list (if (eq? msg :out-row-group) (+ nrows i) nrows) + (if (eq? msg :out-row) (max ncols j) ncols)))) + '(0 0) table)) diff --git a/progs/convert/data/sxml.scm b/progs/convert/data/sxml.scm new file mode 100644 index 0000000..c069b3a --- /dev/null +++ b/progs/convert/data/sxml.scm @@ -0,0 +1,199 @@ + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; MODULE : sxml.scm +;; DESCRIPTION : XML data as S-expressions +;; COPYRIGHT : (C) 2002 David Allouche +;; +;; This software falls under the GNU general public license version 3 or later. +;; It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE +;; in the root directory or . +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(texmacs-module (convert data sxml)) + +(define (as-string s) + (cond ((symbol? s) (symbol->string s)) + ((number? s) (number->string s)) + ((string? s) s) + (else ""))) + +;; Fundamental acessors +(tm-define sxml-name car) +(define sxml-attr-list! cdadr) + +(tm-define (sxml-has-attr-list? e) + ;; Has the element e an attribute node? + (and (pair? (cdr e)) + (pair? (cadr e)) + (eq? '@ (caadr e)))) + +(tm-define (sxml-element-head e) + ;; Element name and attributes (if present). + (if (sxml-has-attr-list? e) + (list (car e) (cadr e)) + (list (car e)))) + +(tm-define (sxml-content e) + ;; Complement function of sxml-element-head. + (if (sxml-has-attr-list? e) (cddr e) (cdr e))) + +(tm-define (sxml-set-name e name) + ;; Set the name of the sxml element e. + ;; Name is the new element name as a string. + (cons (string->symbol name) (cdr e))) + +(tm-define (sxml-set-content e content) + ;; Replace the subnodes of e with the node-set content. + (append (sxml-element-head e) content)) + +(tm-define (sxml-prepend e l) + ;; Prepend a node set l to the content of an element e. + (append (sxml-element-head e) l + (sxml-content e))) + +(define (sxml-attr-list? e) + ;; Return the attribute list of element e or #f. + (and (sxml-has-attr-list? e) + (sxml-attr-list! e))) + +(tm-define (sxml-attr-list e) + ;; Return the attribute list of element e or empty list. + (or (sxml-attr-list? e) '())) + +(tm-define (shtml-attr-non-null as att) + ;; Get an HTML attribute or false if the attribute is absent, is not set, or + ;; is set to the empty string. + ;; FIXME: this is ugly + (and-let* ((l (assoc att as)) + ((list-length=2? l)) + ((not (string-null? (second l))))) + (second l))) + +(define (sxml-named-attr obj attr-name) + ;; Named attribute of element e or #f. + (and-let* ((l (sxml-attr-list? obj))) + (assq attr-name l))) + +(tm-define (sxml-attr obj attr-name) + ;; Value of a named attribute of element e or #f. + (and-let* ((x (sxml-named-attr obj attr-name))) + (cadr x))) + +(tm-define (sxml-set-attr e attr) + ;; Set an attribute of an element e. Attr is a list (symbol? string?). + ;; Create the attribute list or the attribute if necessary. + (let ((attr-name (car attr))) + `(,(sxml-name e) + (@ ,attr ,@(list-filter (sxml-attr-list e) + (lambda (x) (not (eq? x attr-name))))) + ,@(sxml-content e)))) + +(tm-define (sxml-set-attrs e attrs) + ;; Set several attributes of an element e. Attrs is a list of attributes. + ;; Create the attribute list or attributes if necessary. + (let rec ((e e) (attrs attrs)) + (if (null? attrs) e + (rec (sxml-set-attr e (car attrs)) (cdr attrs))))) + +(tm-define (sxml-set-attrlist e attrs) + ;; Replace the attribute list of @obj by @attrs. + ;; If @attrs is #f, remove the attribute node. + `(,(sxml-name e) + ,@(if attrs `((@ ,@attrs)) '()) + ,@(sxml-content e))) + +(define-macro (sxml-find-name-separator len) + ;; optimized (string-rindex name #\:) + ;; returns position of a separator between namespace-id and LocalName + ;; (copied from sxml-tools) + `(let rpt ((pos (1- ,len))) + (cond + ((negative? pos) #f) + ((char=? #\: (string-ref name pos)) pos) + (else (rpt (1- pos)))))) + +(tm-define (sxml-ncname obj) + ;; Returns Local Part of Qualified Name (Namespaces in XML production [6]) + ;; for given obj, which is ":"-separated suffix of its Qualified Name + ;; If a name of a node given is NCName (Namespaces in XML production [4]), + ;; then it is returned as is. + ;; Please note that while SXML name is a symbol this function returns a + ;; string. + ;; (copied from sxml-tools) + (sxml-name->ncname (sxml-name obj))) + +(tm-define (sxml-name->ncname sxml-name) + (let* ((name (as-string sxml-name)) + (len (string-length name))) + (cond + ((sxml-find-name-separator len) + => (lambda (pos) + (substring name (+ pos 1) len))) + (else name)))) + +(tm-define (sxml-name->ns-id sxml-name) + ;; Returns namespace-id part of given name, or #f if it's LocalName + ;; (copied from sxml-tools) + (let* ((name (as-string sxml-name))) + (cond + ((sxml-find-name-separator (string-length name)) + => (lambda (pos) + (substring name 0 pos))) + (else #f)))) + +(tm-define (sxml-split-name sxml-name) + (let* ((name (as-string sxml-name)) + (len (string-length name))) + (cond + ((sxml-find-name-separator len) + => (lambda (pos) + (values (substring name 0 pos) + (substring name (+ pos 1) len)))) + (else (values #f name))))) + +(tm-define (sxml-strip-ns-prefix prefix x) + ;; Remove a given namespace prefix wherever it appears in element names. + ;; Prefix must be the ns-prefix as a string, and x a document fragment. + (let rec ((x x)) + (if (string? x) x + (sxml-set-content + (if (== prefix (sxml-name->ns-id (sxml-name x))) + (sxml-set-name x (sxml-ncname x)) + x) + (map rec (sxml-content x)))))) + +(tm-define (sxml-set-ns-prefix p x) + (sxml-set-name x (string-append p ":" (sxml-ncname x)))) + +; Predicate which returns #t if is SXML element, otherwise returns #f. +; NOTE: *TOP* is a special element. All element operations are applicable. +(tm-define (sxml-element? obj) + (and (pair? obj) + (symbol? (car obj)) + (not (memq (car obj) + '(@ @@ *PI* *COMMENT* *ENTITY*))))) + +; The function ntype-names?? takes a list of acceptable node names as a +; criterion and returns a function, which, when applied to a node, +; will return #t if the node name is present in criterion list and #f +; othervise. +; ntype-names?? :: ListOfNames -> Node -> Boolean +(tm-define (ntype-names?? crit) + (lambda(node) + (and (pair? node) + (memq (car node) crit)))) + +(tm-define (sxml-control-node? x) + (and (nstring? x) + (let ((name (as-string (sxml-name x)))) + (and (string-starts? name "*") + (string-ends? name "*"))))) + +(tm-define (sxml-top-node? x) + (and (nstring? x) + (== '*TOP* (car x)))) + +(tm-define (sxml-filter-element-content l) + (list-filter l nstring?)) diff --git a/progs/convert/data/xmltm.scm b/progs/convert/data/xmltm.scm new file mode 100644 index 0000000..89aad77 --- /dev/null +++ b/progs/convert/data/xmltm.scm @@ -0,0 +1,686 @@ + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; MODULE : xmltm.scm +;; DESCRIPTION : Common tools to import XML data. +;; COPYRIGHT : (C) 2002-2003 David Allouche +;; +;; This software falls under the GNU general public license version 3 or later. +;; It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE +;; in the root directory or . +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(texmacs-module (convert data xmltm) + (:use (convert tools stm) (convert data sxml) + (convert tools environment) (convert tools tmconcat))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; XML namespace normalization +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The built-in parser xml/html parser is not namespace aware. So we need to +;; perform namespace-prefix normalization ourselves after the parser. Only +;; namespace-normalized xml trees can be reliably queried. +;; +;; Namespace-normalization deletes all "xmlns" and "xmlns:*" attributes and +;; change the name of all elements and attributes to a fixed set of normalized +;; namespace prefixes. +;; +;; Normalized namespace prefixes are: +;; x -- XML - http://www.w3.org/XML/1998/namespace +;; Reserved prefix (e.g. xml:space). +;; h -- XHTML - http://www.w3.org/1999/xhtml +;; Any HTML or HTML-like data. +;; m -- MathML - http://www.w3.org/1998/Math/MathML +;; +;;Non-Normalized namespace prefixes are: +;; g -- Gallina language. +;; c -- Coq XML format (we named it CoqML). +;; +;; Since the parser is designed to be used for conversion to STM data format, +;; no provisions are made to preserve the namespace prefixes used in the +;; orginial sxml tree. Namespace normalization is not reversible. + +(define xmlns-uri-xml "http://www.w3.org/XML/1998/namespace") +(define xmlns-uri-xhtml "http://www.w3.org/1999/xhtml") +(define xmlns-uri-mathml "http://www.w3.org/1998/Math/MathML") +(define xmlns-uri-gallina "Gallina") +(define xmlns-uri-coqml "CoqML") + +;;; Building the namespace bindings environment + +(define-macro (with-xmltm-environment env ns . body) + `(let ((,env (environment))) + (with-environment ,env ((*default* ,ns) + (xml ,xmlns-uri-xml)) + ,@body))) + +(define (consume-xmlns env attrs proc) + (receive (attrs f+bindings ) + ((cut list-partition <> first) + ((cut map <> attrs) + (lambda (attr) + (if (eq? 'xmlns (first attr)) + `(#f *default* ,(second attr)) + (receive (ns-id ncname) (sxml-split-name (first attr)) + (if (== "xmlns" ns-id) + (list #f (string->symbol ncname) (second attr)) + attr)))))) + ;; Either bindings or attributes are prefixed by #f to allow easy + ;; partition. Regular attributes are (likely) more numerous than xmlns + ;; attributes, so bindings are prefixed to reduce prefix-stripping cost. + (if (null? f+bindings) (proc env attrs) + (with-environment* env (map cdr f+bindings) + (cute proc <> attrs))))) + +;;; Converting nodes + +(tm-define (coqml-parse s) + (xmltm-parse xmlns-uri-coqml parse-xml s)) + +(tm-define (gallinatm-parse s) + (xmltm-parse xmlns-uri-gallina parse-xml s)) + +(tm-define (htmltm-parse s) + (xmltm-parse xmlns-uri-xhtml parse-html s)) + +(tm-define (xmltm-parse default-ns parser s) + (with-xmltm-environment + env default-ns + (let sub ((env env) + (t (parser s))) + (cond ((string? t) t) + ((sxml-top-node? t) `(*TOP* ,@(map (cut sub env <>) (cdr t)))) + ((sxml-control-node? t) t) + (else (ns-import-element sub env t)))))) + +(define (null->false x) (if (null? x) #f x)) + +(define (ns-import-element sub env t) + (consume-xmlns + env (sxml-attr-list t) + (lambda (env attrs) + ((cut sxml-set-name <> (ns-import-name env #t (sxml-name t))) + ((cut sxml-set-attrlist <> (null->false (ns-import-attrs env attrs))) + (sxml-set-content t (map (cut sub env <>) + (sxml-content t)))))))) + +(define (ns-import-attrs env attrs) + ((cut map <> attrs) + (lambda (attr) + ;; handles correctly the pseudo-sxml produced by enumerated html + ;; attributes without left hand side. (e.g. ) + (cons (string->symbol (ns-import-name env #f (first attr))) + (cdr attr))))) + +(define (ns-import-name env use-default? name) + (receive (ns-id ncname) (sxml-split-name name) + (let ((ns-uri (cond (ns-id (environment-ref* env (string->symbol ns-id))) + (use-default? (environment-ref env *default*)) + (else "")))) + (string-append + ;; FIXME: user namespace prefix list should be extensible + (cond ((== ns-uri xmlns-uri-xhtml) "h:") + ((== ns-uri xmlns-uri-mathml) "m:") + ((== ns-uri xmlns-uri-gallina) "g:") + ((== ns-uri xmlns-uri-coqml) "c:") + ((== ns-uri xmlns-uri-xml) "x:") + ((string-null? ns-uri) "") + (else (string-append ns-uri ":"))) + ncname)))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; htmltm environment +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(tm-define (initialize-htmltm env proc) + (with-environment* env '((preserve-space? #f)) proc)) + +(tm-define (htmltm-preserve-space? env) + (environment-ref env preserve-space?)) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; String conversion +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(tm-define (xmltm-text s) + (cork-grave->backquote (utf8->cork s))) + +(tm-define (xmltm-url-text s) + ;; (cork-grave->backquote (utf8->cork (url-decode s))) + ;; NOTE: don't decode URL names, or also implement a corresponding + ;; routine for the encoding, when we click on a hyperlink + (string->tmstring s)) + +;; Conversion of Cork GRAVE ACCENT to LEFT SINGLE QUOTATION MARK + +(define cork-grave-char #\nul) +(define cork-grave (list->string '(#\nul))) +(define cork-backquote (list->string '(#\`))) ;; that is GRAVE ACCENT in ASCII + +(define (cork-grave->backquote s) + (if (string-index s cork-grave-char) + (string-replace s cork-grave cork-backquote) + s)) + +;; Decoding URL strings +;; +;; url-decode uses a state-machine. +;; State is represented as a procedure of type P = ((:or char #f) -> P). +;; The next state is produced by applying the state to the next char. +;; Applying the state to #f yields the inverted list of chars of the decoded +;; string. + +(define (url-decode s) + ;; Decode a URL-encoded (rfc-2141) UTF-8 string. + (url-decode/finish (string-fold (lambda (kar kdr) (kdr kar)) + (cut url-decode/trans <> '()) + s))) + +(define (url-decode/finish proc) + (reverse-list->string (proc #f))) + +(define (url-decode/trans kar cs) + ;; State when the next character is not escaped. + (if (not kar) cs + (if (char=? #\% kar) + (cut url-decode/hex-1 <> cs) + (cut url-decode/trans <> (cons kar cs))))) + +(define (url-decode/hex-1 kar cs) + ;; State when the previous character was a % (escape character). + ;; When a % is not followed by a pair of hexadecimal digits (that is an + ;; error), then % and its following chars are preserved. + (if (not kar) (cons #\% cs) + (if (char-hexadecimal? kar) + (cut url-decode/hex-2 <> cs kar) + (cut url-decode/trans <> (cons* kar #\% cs))))) + +(define (url-decode/hex-2 kar cs hex1) + ;; State when the last two characters were a % and an hexadecimal digit + (if (not kar) (cons* hex1 #\% cs) + (if (char-hexadecimal? kar) + (cut url-decode/trans <> (cons (hexadecimal->char hex1 kar) cs)) + (cut url-decode/trans <> (cons* kar hex1 #\% cs))))) + +(tm-define (char-hexadecimal? c) + ;; Is @c an hexadecimal digit. + (char-in-string? c "01234567890ABCDEFabcdef")) + +(define (hexadecimal->char hex1 hex2) + ;; Convert two hexadecimal digits @hex1 and @hex2 to a single character. + (integer->char (+ (* 16 (hexadecimal-digit->integer hex1)) + (hexadecimal-digit->integer hex2)))) + +(tm-define (hexadecimal-digit->integer c) + (cond ((char-numeric? c) (- (char->integer c) 48)) + ((char-in-string? c "ABCDEF") (- (char->integer c) 55)) + ((char-in-string? c "abcdef") (- (char->integer c) 87)))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Label constructors +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(tm-define (xmltm-attr->label a name) + (let ((val (shtml-attr-non-null a name))) + (and val `(label ,(xmltm-url-text val))))) + +(tm-define (xmltm-label-decorate a name t) + (let ((label (xmltm-attr->label a name))) + (if label + (stm-insert-first-data t label) + t))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Whitespace handling +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define (htmltm-space-element env l) + ;; remove string nodes in the sxml node-list @l. + (sxml-filter-element-content l)) + +(define (htmltm-collapse-spaces s) + ;; normalize (convert to #\space) and collapse whitespace in s. + (list->string (string-fold-right htmltm-collapse-spaces/kons '() s))) + +(define (htmltm-collapse-spaces/kons kar kdr) + (cond ((not (tm-char-whitespace? kar)) (cons kar kdr)) + ((null? kdr) (list #\space)) + ((tm-char-whitespace? (first kdr)) kdr) + (else (cons #\space kdr)))) + +(tm-define (htmltm-space-collapse env l) + ;; Collapses whitespaces in sxml node list @l. Correctly merges consecutive + ;; string nodes in @l. + (cond ((null? l) '()) + ((htmltm-preserve-space? env) (htmltm-space-preserve l)) + (else (let ((l2 (list-fold-right htmltm-space-collapse/kons #f l))) + (if (string? (first l2)) + (cons (htmltm-collapse-spaces (car l2)) (cdr l2)) + l2))))) + +(tm-define (htmltm-space-mixed env l) + ;; remove heading and trailing spaces, and collapses whitespaces in sxml node + ;; list @l. Correctly merges consecutive string nodes in @l. + (cond ((null? l) '()) + ((htmltm-preserve-space? env) (htmltm-space-preserve l)) + (else (let ((l2 (list-fold-right htmltm-space-mixed/kons #f l))) + (if (string? (first l2)) + (cons (htmltm-collapse-spaces (tm-string-trim (car l2))) + (cdr l2)) + l2))))) + +(define (htmltm-space-collapse/kons kar kdr) + (cond ((not kdr) ; kar is last node + (list kar)) + ((string? kar) + (if (string? (first kdr)) + (cons (string-append kar (car kdr)) (cdr kdr)) + (cons kar kdr))) + ((string? (first kdr)) + (cons kar (cons (htmltm-collapse-spaces (car kdr)) (cdr kdr)))) + (else + (cons kar kdr)))) + +(define (htmltm-space-mixed/kons kar kdr) + (cond ((not kdr) ; kar is last node + (if (string? kar) + (list (tm-string-trim-right kar)) + (list kar))) + ((string? kar) + (if (string? (first kdr)) + (cons (string-append kar (car kdr)) (cdr kdr)) + (cons kar kdr))) + ((string? (first kdr)) + (cons kar (cons (htmltm-collapse-spaces (car kdr)) (cdr kdr)))) + (else + (cons kar kdr)))) + +(define (htmltm-space-preformatted env l) + ;; Drop newline at start and end of @l if present. + ;; Assumes the parser did newline normalization. + ;; Convert other newlines to

elements. + (cond ((null? l) '()) + ((htmltm-preserve-space? env) (htmltm-space-preserve l)) + (else (htmltm-space-preserve (htmltm-space-pre-first + (htmltm-space-pre-last l)))))) + +(define (htmltm-space-pre-first l) + (let ((x (first l))) + (if (and (string? x) (string-starts? x "\n")) ; WARNING: \n is not R5RS + (cons (string-drop x 1) (cdr l)) + l))) + +(define (htmltm-space-pre-last l) + ;; If @l ends with newline, drop newline. + ;; + ;; WARNING: non-standard (Joris wants to guess the editor's intent). + ;; If @l ends with a line containing only spaces, drop the line. + (let ((x (last l))) + (cond ((nstring? x) l) + ;; Standard compliant case. Supersed by non-standard case. + ;; ((string-ends? x "\n") ; WARNING: \n is not R5RS + ;; (rcons (but-last l) (string-drop-right x 1))) + ((do ((i (1- (string-length x)) (1- i))) + ((!= #\space (string-ref x i)) + (and (== #\newline (string-ref x i)) i))) + => (lambda (n) (rcons (but-last l) (string-take x n)))) + (else l)))) + +(define (htmltm-space-preserve l) + ;; Convert newlines to

elements. + (append-map htmltm-space-preserve/sub l)) + +(define (htmltm-space-preserve/sub x) + (if (string? x) + (let ((l (string-split-lines x))) + (if (null? (cdr l)) l (map (lambda (x) `(h:p ,x)) l))) + (list x))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Producing handlers for dispatch table +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(tm-define (htmltm-handler model kind method args->serial) + ;; Produce a handler for the htmltm-methods% table + ;; model: content model category, for whitespace handling + ;; :empty -- element is defined to be empty, do not change contents. + ;; :element -- text nodes are ignored + ;; TODO: might fallback to 'mixed' if some text is present + ;; :mixed -- drop heading and trailing whitespaces, normalize and + ;; collapse internal whitespaces. + ;; :collapse -- normalize and collapse whitespaces. Preserve heading + ;; and trailing whitespaces. + ;; :pre -- drop newlines at ends and switch to preserved spaces + ;; mode. + ;; kind: either :block or :inline, how is the element rendered + ;; method: to convert the element content to a node-list. + ;; @method will be passed the contents of html element. + ;; name of a unary macro to contain the converted contents + ;; ( ...) use contents converted to a serial as the last + ;; element of this structure. + ;; convert to this literal, ignoring element contents. + ;; args->serial: used iff @method matches or ( ...). + ;; Function to convert a list of sxml nodes to a stm serial. + ;; + ;; The handler takes care of whitespace cleaning and (except when an + ;; method is provided) insert label for id attributes. + ;; + ;; WARNING: If a list element has an id attribute, the converted result will + ;; not be correctly converted back by tmhtml because the first item marker + ;; will be preceded by a label. + ;; + ;; *Preconditions* + ;; List environments must have :block kind. + ;; *Postconditions* + ;; The handler must return a texmacs node list given a sxhtml node list. + ;; Its result may be the empty list, a list of line-structures, or a list + ;; containing a single document node. + ;; *Invariants* + ;; If @kind is :block, the handler will always return a single 'document'. + ;; If @kind is :inline and @method is or (with ...), the handler + ;; will return a 'document' iff the serial built from the converted contents + ;; of the html element is a 'document'. + ;; + ;; WARNING: 2003-08-22. The htmltm-handler mechanism makes some on-the-fly + ;; transformation on the tree while traversing. The subtrees stored as + ;; children of xpath-parent and xpath-root are not processed. So the + ;; descendence of various ancestors might not be consistent. + (let ((clean (cond ((eq? model :empty) (lambda (env c) c)) + ((eq? model :element) htmltm-space-element) + ((eq? model :collapse) htmltm-space-collapse) + ((eq? model :mixed) htmltm-space-mixed) + ((eq? model :pre) htmltm-space-preformatted) + (else (error "Bad model: " model)))) + (proc-alist + (cond ((eq? kind :inline) + `((:procedure . ,htmltm-handler/procedure/inline) + (:environment . ,htmltm-handler/environment/inline) + (:literal . ,htmltm-handler/literal/inline))) + ((eq? kind :block) + `((:procedure . ,htmltm-handler/procedure/block) + (:environment . ,htmltm-handler/environment/block) + (:literal . ,htmltm-handler/literal/block))) + (error "Bad kind: " kind)))) + + (define (proc key) (cdr (assq key proc-alist))) + (define (make-handler proc . extra) + (lambda (env a c) + (let ((cc (clean env c))) + (with-environment env + ((preserve-space? (or (eq? model :pre) + (htmltm-preserve-space? env)))) + (apply proc env a cc extra))))) + + (cond ((procedure? method) + (make-handler (proc :procedure) method)) + ((or (string? method) (symbol? (first method))) + (make-handler (proc :environment) + (if (string? method) ; this is ugly, should be removed + (if (stm-primitive? (string->symbol method)) + `(expand ,method) + `(,(string->symbol method))) + method) + args->serial)) + (else (cut (proc :literal) <> <> <> (stm-serial method)))))) + +(define (htmltm-handler/procedure/inline env a c proc) + (proc env a c)) + +(define (htmltm-handler/procedure/block env a c proc) + (list (stm-unary-document (htmltm-serial + (htmltm-preserve-space? env) + (proc env a c))))) + +(define (htmltm-handler/literal/inline env a c literal) + (list (xmltm-label-decorate a 'id literal))) + +(define (htmltm-handler/literal/block env a c literal) + (list (xmltm-label-decorate a 'id (stm-unary-document literal)))) + +(define (htmltm-handler/environment/inline env a c head args->serial) + (list (let ((t (xmltm-label-decorate a 'id (args->serial env c)))) + (if (stm-document? t) + `(document (,@head ,(stm-remove-unary-document t))) + `(,@head ,t))))) + +(define (htmltm-handler/environment/block env a c head args->serial) + (let ((make-block (if (stm-block-environment? head) + stm-unary-document noop)) + (make-list (if (stm-list-environment? head) + htmltm-list-glue noop)) + (make-label (if (stm-section-environment? head) + (lambda (x) + (xmltm-label-decorate a 'id (rcons head x))) + (lambda (x) + (rcons head (xmltm-label-decorate a 'id x)))))) + (list `(document ,(make-label (make-list (make-block + (args->serial env c)))))))) + +(define (htmltm-list-glue x) + ;; assert (stm-document? x) + (cons 'document + (receive (first kdr) + (car+cdr (list-fold-right htmltm-list-glue/kons '(#f . ()) (cdr x))) + (htmltm-list-glue/flush first kdr)))) + +(define (htmltm-list-glue/kons kar next+kdr) + (cons kar (receive (next kdr) (car+cdr next+kdr) + (if (stm-list-marker? (stm-first-data kar)) + (cond ((not next) (list kar)) + ((stm-list-marker? (stm-first-data next)) + (cons kar kdr)) + (else + (cons + ; WARNING: no extra whitespace cleanup here + (htmltm-serial #t (list kar next)) kdr))) + ;; previous item may be a marker, do not stack kar + ;; but if next is not a marker, stack it now + (htmltm-list-glue/flush next kdr))))) + +(define (htmltm-list-glue/flush line stack) + (cond ((not line) stack) + ((stm-list-marker? (stm-first-data line)) stack) + (else (cons line stack)))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Producing mathml handlers for dispatch table +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(tm-define (mathtm-handler model method . amethod) + ;; model: content model category + ;; :empty -- element defined to be empty + ;; :element -- text node are ignored + ;; TODO: MathML error if actual content do not match model. + ;; :mixed -- drop heading and trailing whitespaces, normalize and + ;; collapse internal whitespaces. + ;; method: to convert the element content to a node-list. + ;; amethod to process global attributes + (if (not (in? model '(:empty :element :mixed))) + (error "Bad model: " model)) + (if (not (procedure? method)) + (error "Bad method: " method)) + (let ((clean (cond ((eq? model :empty) (lambda (env c) c)) + ((eq? model :element) htmltm-space-element) + ((eq? model :mixed) htmltm-space-mixed)))) + ;(let ((proc method)) + (lambda (env a c) + (mathtm-handler/procedure + env a (clean env c) method amethod))));) + +(define (mathtm-handler/procedure env a c proc aproc) + (if (null? aproc) + (proc env a c) + ((car aproc) env a c proc))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Producing gallina handlers for dispatch table +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define gallinatm-raw htmltm-space-preformatted) +(define gallinatm-terms htmltm-space-element) +(define gallinatm-vernac htmltm-space-element) +(define gallinatm-toplvl htmltm-space-element) +(define gallinatm-ltac htmltm-space-element) + +(define (gallinatm-handler/inline env a c proc) + (proc env a c)) + +(define (gallinatm-handler/bloc env a c proc) + `((document ,@(proc env a c)))) + +(tm-define (gallinatm-handler model method) + ;; model: content model category + ;; :toplvl -- text node are ignored + ;; :terms -- text node are ignored + ;; :vernac -- text node are ignored + ;; :ltac -- text node are ignored + ;; :raw -- drop heading and trailing whitespaces, normalize and + ;; collapse internal whitespaces. + ;; method: to convert the element content to a node-list. + (if (not (in? model '(:raw :terms :vernac :toplvl :ltac))) + (error "Bad model: " model)) + (if (not (procedure? method)) + (error "Bad method: " method)) + (let ((clean (cond ((eq? model :raw) gallinatm-raw) + ((eq? model :terms) gallinatm-terms) + ((eq? model :toplvl) gallinatm-toplvl) + ((eq? model :vernac) gallinatm-vernac) + ((eq? model :ltac) gallinatm-ltac))) + (para (cond ((eq? model :raw) gallinatm-handler/inline) + ((eq? model :terms) gallinatm-handler/inline) + ((eq? model :ltac) gallinatm-handler/inline) + ((eq? model :toplvl) gallinatm-handler/bloc) + ((eq? model :vernac) gallinatm-handler/bloc)))) + (let ((proc method)) + (lambda (env a c) + (para env a (clean env c) proc))))) + +(tm-define (gallinatm-serial p? l) + (if p? (stm-serial l stm-document?) + (stm-serial l stm-document? htmltm-make-line htmltm-make-concat))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Producing coqml handlers for dispatch table +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(tm-define (blank? s) + (:synopsis "does @s contain only whitespace?") + (list-and (map tm-char-whitespace? (string->list s)))) + +(define (trim-newlines s) + (letrec ((nl? (lambda (c) (== c #\newline))) + (trim-right (lambda (l) + (if (and (list>0? l) (nl? (car l))) + (trim-right (cdr l)) l))) + (trim-left (lambda (l) + (if (and (list>0? l) (nl? (cAr l))) + (trim-left (cDr l)) l)))) + (list->string (trim-right (trim-left (string->list s)))))) + +(define (coqml-space-cleaning env l) + ;; Drop blank lines. Trim newlines at begin and end of strings. + ;; Conserve spaces. Put text in string tags. + (set! l (filter (lambda (x) (or (nstring? x) + (not (blank? x)))) l)) + (if (and (nnull? l) (null? (filter nstring? l))) + (list (trim-newlines (apply string-append l))) + (map (lambda (x) (if (string? x) `(c:string ,(trim-newlines x)) x)) l))) + +(define coqml-pre coqml-space-cleaning) +(define coqml-elem htmltm-space-element) + +(tm-define (coqml-handler model method) + ;; model: content model category + ;; :element -- text nodes are ignored + ;; :pre -- Drop blank lines. Trim newlines at beginning and ending + ;; of strings. Conserve spaces. Put text in string tags. + ;; method: to convert the element content to a node-list. + (if (not (in? model '(:pre :elem))) + (error "Bad model: " model)) + (if (not (procedure? method)) + (error "Bad method: " method)) + (let ((clean (cond ((eq? model :pre) coqml-pre) + ((eq? model :elem) coqml-elem)))) + (let ((proc method)) + (lambda (env a c) + (proc env a (clean env c)))))) + +(tm-define (coqml-serial p? l) + (if p? (stm-serial l stm-document?) + (stm-serial l stm-document? htmltm-make-line htmltm-make-concat))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Generic XML dispatcher +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define (sxml-meta-logic-ref ns-id ncname) + (cond ((== ns-id "h") (logic-ref htmltm-methods% ncname)) + ((== ns-id "m") (logic-ref mathtm-methods% ncname)) + ((== ns-id "g") (logic-ref gallinatm-methods% ncname)) + ((== ns-id "c") (logic-ref coqml-methods% ncname)) + (else #f))) + +(tm-define (sxml-dispatch x-string x-pass env t) + ;; Generic xml dispatcher + ;; @x-string (string env -> node-list) used to convert strings + ;; @x-pass (method) pass method + ;; NOTE: method == (env attrlist content -> node-list) + (cond ((string? t) (x-string env t)) + ((sxml-top-node? t) (x-pass env '() (sxml-content t))) + ((sxml-control-node? t) '()) + (else + (receive (ns-id ncname) (sxml-split-name (sxml-name t)) + (cond ((sxml-meta-logic-ref ns-id (string->symbol ncname)) + => (cut <> env (sxml-attr-list t) (sxml-content t))) + (else (x-pass env (sxml-attr-list t) (sxml-content t)))))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Special serial constructors +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; For performance, we maintain the invariant that block structures are always +;; wrapped in an unary 'document'. +;; +;; To maintain this invariant, it is required that htmltm-methods which +;; introduce block structures be declared of :block type. +;; +;; htmltm-handler must also take care that 'expand' and 'with' methods which +;; take a block-structure as input have their result wrapped in a 'document'. + +(tm-define (htmltm-serial p? l) + (if p? (stm-serial l stm-document?) + (stm-serial l stm-document? htmltm-make-line htmltm-make-concat))) + +(define (htmltm-make-line l) + ;; Trim whitespaces at ends of lines. + ;; Drop lines which are empty after simplification. + (stm-concat l htmltm-make-line/concat)) + +(define (htmltm-make-line/concat l) + ;; TODO: preserve empty lines if creator is TeXmacs. + (let ((x (stm-list->concat (stm-line-trim-both l)))) + (if (== "" x) #f x))) + +(define (htmltm-make-concat l) + ;; Trim whitespace around line-breaks. + ;; Invisible nodes between line-breaks and whitespaces are moved. + ;; + ;; TODO: remove whitespaces surrounding invisibles in the middle of concat + (stm-list->concat + (if (null? l) '() + (let ((line-lists (stm-parse-lines l))) + (if (null? (cdr line-lists)) l + (stm-unparse-lines + `(,(stm-line-trim-right (first line-lists)) + ,@(map stm-line-trim-both (cDdr line-lists)) + ,(stm-line-trim (last line-lists))))))))) + +(tm-define (mathtm-serial env l) + ;; Except for the top-level math element, MathML produce only inlines. + ;; Collapse whitespaces. + ;; TODO: consolidate with htmltm-serial + (with c (apply tmconcat l) + (if (not (func? c 'concat)) c + (stm-concat (cdr c) htmltm-make-concat)))) diff --git a/progs/convert/html/htmltm.scm b/progs/convert/html/htmltm.scm index 6abce8f..48bb68b 100644 --- a/progs/convert/html/htmltm.scm +++ b/progs/convert/html/htmltm.scm @@ -13,11 +13,15 @@ (texmacs-module (convert html htmltm) (:use - (convert tools tmlength) (convert tools tmcolor) - (convert tools old-tmtable) (convert tools stm) - (convert tools sxml) (convert tools sxhtml) + (convert tools tmlength) + (convert tools tmcolor) + (convert tools old-tmtable) + (convert tools stm) + (convert data sxml) + (convert data sxhtml) (convert tools environment) - (convert tools xmltm) (convert mathml mathtm))) + (convert data xmltm) + (convert mathml mathtm))) (define (assoc-string-ci key alist) (list-find alist (lambda (pair) (string-ci=? key (car pair))))) @@ -393,14 +397,22 @@ (htmltm-tex-image (shtml-attr-non-null a 'alt)) (htmltm-image env a c))) -(define (htmltm-wikipedia-span env a c) - (cond ((== (shtml-attr-non-null a 'class) "mwe-math-element") - (if (and (pair? c) (func? (car c) 'h:span)) - (htmltm env (car c)) - (htmltm-pass env a c))) - ((== (shtml-attr-non-null a 'class) "texhtml") - (list `(math ,(htmltm-args-serial env c)))) - (else (htmltm-pass env a c)))) +(define (htmltm-span env a c) + (with class-value (shtml-attr-non-null a 'class) + (cond ((== class-value "mwe-math-element") + (if (and (pair? c) (func? (car c) 'h:span)) + (htmltm env (car c)) + (htmltm-pass env a c))) + ((== class-value "texhtml") + (list `(math ,(htmltm-args-serial env c)))) + ((and (== class-value "katex") + (pair? c) + (func? (car c) 'h:span) + (sxml-has-attr-list? (car c)) + (== (shtml-attr-non-null (sxml-attr-list (car c)) 'class) + "katex-mathml")) + (htmltm env (first c))) + (else (htmltm-pass env a c))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Special rules for improving Scilab documentation rendering @@ -462,7 +474,7 @@ ;; Grouping (div (handler :mixed :block htmltm-pass)) ;; TODO: convert 'align' attributes in div, p and headings - (span (handler :collapse :inline htmltm-wikipedia-span)) + (span (handler :collapse :inline htmltm-span)) ;; Headings (h1 (handler :mixed :block "chapter*")) diff --git a/progs/convert/html/tmhtml.scm b/progs/convert/html/tmhtml.scm index 6f513e8..7302cca 100644 --- a/progs/convert/html/tmhtml.scm +++ b/progs/convert/html/tmhtml.scm @@ -18,9 +18,9 @@ (convert tools tmlength) (convert tools tmtable) (convert tools old-tmtable) - (convert tools sxml) - (convert tools sxhtml) - (convert tools css) + (convert data sxml) + (convert data sxhtml) + (convert data css) (convert html htmlout))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/progs/convert/mathml/mathtm.scm b/progs/convert/mathml/mathtm.scm index e759e36..dd090b7 100644 --- a/progs/convert/mathml/mathtm.scm +++ b/progs/convert/mathml/mathtm.scm @@ -34,8 +34,8 @@ (texmacs-module (convert mathml mathtm) (:use (convert tools tmtable) - (convert tools sxml) - (convert tools xmltm) + (convert data sxml) + (convert data xmltm) (convert mathml mathml-drd))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/tests/htmltm-test.scm b/tests/htmltm-test.scm index b360c60..d8d773e 100644 --- a/tests/htmltm-test.scm +++ b/tests/htmltm-test.scm @@ -15,8 +15,10 @@ (import (liii check)) (texmacs-module (convert html htmltm-test) - (:use (convert html htmltm) (convert tools xmltm) - (convert tools sxml) (convert tools sxhtml))) + (:use (convert html htmltm) + (convert data xmltm) + (convert data sxml) + (convert data sxhtml))) (define (sxml-postorder t proc) (let sub ((t t))