From c553abc79e335033891670416977a570fb1dc0c7 Mon Sep 17 00:00:00 2001 From: Vincenzo Mantova Date: Sun, 1 Aug 2021 12:57:32 +0100 Subject: [PATCH] accept xml input in latexmlc --- lib/LaTeXML.pm | 187 +++++++++++++++++++---------------- lib/LaTeXML/Common/Config.pm | 10 +- 2 files changed, 112 insertions(+), 85 deletions(-) diff --git a/lib/LaTeXML.pm b/lib/LaTeXML.pm index 0be19d1bb..5b008318a 100644 --- a/lib/LaTeXML.pm +++ b/lib/LaTeXML.pm @@ -220,89 +220,106 @@ sub convert { removeMathFormat($opts, 'svg'); maybeAddMathFormat($opts, 'pmml'); } - # 1.5 Prepare a daemon frame - my $latexml = $$self{latexml}; - $latexml->withState(sub { - my ($state) = @_; # Sandbox state - $$state{status} = {}; - my $stomach = $$state{stomach}; - delete $$stomach{rescued_boxes} if $$stomach{rescued_boxes}; - $state->pushDaemonFrame; - $state->assignValue('_authlist', $$opts{authlist}, 'global'); - $state->assignValue('REMOTE_REQUEST', (!$$opts{local}), 'global'); - }); - - # 2 Beginning Core conversion - digest the source: - my ($digested, $dom, $serialized) = (undef, undef, undef); - eval { - alarm($$runtime{TTL}); - my $mode = ($$opts{type} eq 'auto') ? 'TeX' : $$opts{type}; - $digested = $latexml->digestFile($source, preamble => $current_preamble, - postamble => $current_postamble, - mode => $mode, - noinitialize => 1); - $$runtime{TTL} = alarm(0); }; - my $eval_report = $@; - if (!$digested && $eval_report) { - # We can retry finishing digestion if hit a Fatal, - # sometimes there are leftover boxes we can accept. - eval { - alarm($$runtime{TTL}); - $digested = $latexml->withState(sub { - return $latexml->finishDigestion; }); - $$runtime{TTL} = alarm(0); }; - $eval_report .= $@ if $@; } - # 2.1 Now, convert to DOM and output, if desired. - my $core_target = $$opts{format}; - # Default Core target is XML - if ($core_target ne 'tex' and $core_target ne 'box') { - $core_target = 'xml'; } - if ($digested) { + my ($dom, $serialized, $eval_report) = (undef, undef, undef); + if ($$opts{type} eq 'XML') { + if (pathname_is_literaldata($source)) { + $source =~ m/^literal:(.*)$/s; + eval { $dom = LaTeXML::Common::XML::Parser->new()->parseString($1); }; } + else { + eval { $dom = LaTeXML::Common::XML::Parser->new()->parseFile($source); }; } + if (!$dom) { + local $@ = 'Fatal:conversion:unknown XML Parsing failed! (Unknown Reason)' if (!$@); + $eval_report = $@; + $$runtime{status} = colorizeString('XML parsing failed', 'error'); + $$runtime{status_code} = 3; } + else { + $$runtime{status} = colorizeString('No obvious problems', 'success'); + $$runtime{status_code} = 0; } } + else { + # 1.5 Prepare a daemon frame + my $latexml = $$self{latexml}; + $latexml->withState(sub { + my ($state) = @_; # Sandbox state + $$state{status} = {}; + my $stomach = $$state{stomach}; + delete $$stomach{rescued_boxes} if $$stomach{rescued_boxes}; + $state->pushDaemonFrame; + $state->assignValue('_authlist', $$opts{authlist}, 'global'); + $state->assignValue('REMOTE_REQUEST', (!$$opts{local}), 'global'); + }); + + # 2 Beginning Core conversion - digest the source: + my $digested = undef; eval { alarm($$runtime{TTL}); - $latexml->withState(sub { - if ($core_target eq 'tex') { - $serialized = LaTeXML::Core::Token::UnTeX($digested); } - elsif ($core_target eq 'box') { - $serialized = ($$opts{verbosity} > 0 ? $digested->stringify : $digested->toString); } - elsif ($core_target eq 'xml') { - $dom = $latexml->convertDocument($digested); } }); + my $mode = ($$opts{type} eq 'auto') ? 'TeX' : $$opts{type}; + $digested = $latexml->digestFile($source, preamble => $current_preamble, + postamble => $current_postamble, + mode => $mode, + noinitialize => 1); $$runtime{TTL} = alarm(0); }; - $eval_report .= $@ if $@; - # Try to rescue the document if e.g. math parsing hit a Fatal error - if (!$dom && $@ && $core_target eq 'xml') { - $dom = $latexml->withState(sub { - my ($state) = @_; - my $rescued = $$state{rescued_document}; - $rescued->finalize() if $rescued; - return $rescued; }); } } - $$runtime{status} = $latexml->getStatusMessage; - $$runtime{status_code} = $latexml->getStatusCode; - # 2.2 Bookkeeping in case in-eval perl die() deaths occurred - if ($eval_report) { - $$runtime{status} .= "\n" . $eval_report . "\n"; - $$runtime{status_code} = 3; } - - # End daemon run, by popping frame: - $latexml->withState(sub { - my ($state) = @_; # Remove current state frame - ## TODO: This section of option preparations can be factored out as a subroutine if it grows further - ## the general idea is that right before the "pop" of the daemon frame, we have access to all meaningful - ## global state values, and we can preserve the relevant ones for the post-processing stage - ## BEGIN POST-PROCESSING-PREP - $$opts{searchpaths} = $state->lookupValue('SEARCHPATHS'); # save the searchpaths for post-processing - if ($state->lookupValue('LEXEMATIZE_MATH')) { # save potential request for serializing math lexemes - $$opts{math_formats} ||= []; - push @{ $$opts{math_formats} }, 'lexemes'; - # recheck need for parallel - $$opts{parallelmath} = 1 if (@{ $$opts{math_formats} } > 1); } - ## END POST-PROCESSING-PREP - $state->popDaemonFrame; - }); - if ($LaTeXML::UNSAFE_FATAL) { - # If the conversion hit an unsafe fatal, we need to reinitialize - $LaTeXML::UNSAFE_FATAL = 0; - $$self{ready} = 0; + $eval_report = $@; + if (!$digested && $eval_report) { + # We can retry finishing digestion if hit a Fatal, + # sometimes there are leftover boxes we can accept. + eval { + alarm($$runtime{TTL}); + $digested = $latexml->withState(sub { + return $latexml->finishDigestion; }); + $$runtime{TTL} = alarm(0); }; + $eval_report .= $@ if $@; } + # 2.1 Now, convert to DOM and output, if desired. + my $core_target = $$opts{format}; + # Default Core target is XML + if ($core_target ne 'tex' and $core_target ne 'box') { + $core_target = 'xml'; } + if ($digested) { + eval { + alarm($$runtime{TTL}); + $latexml->withState(sub { + if ($core_target eq 'tex') { + $serialized = LaTeXML::Core::Token::UnTeX($digested); } + elsif ($core_target eq 'box') { + $serialized = ($$opts{verbosity} > 0 ? $digested->stringify : $digested->toString); } + elsif ($core_target eq 'xml') { + $dom = $latexml->convertDocument($digested); } }); + $$runtime{TTL} = alarm(0); }; + $eval_report .= $@ if $@; + # Try to rescue the document if e.g. math parsing hit a Fatal error + if (!$dom && $@ && $core_target eq 'xml') { + $dom = $latexml->withState(sub { + my ($state) = @_; + my $rescued = $$state{rescued_document}; + $rescued->finalize() if $rescued; + return $rescued; }); } } + $$runtime{status} = $latexml->getStatusMessage; + $$runtime{status_code} = $latexml->getStatusCode; + # 2.2 Bookkeeping in case in-eval perl die() deaths occurred + if ($eval_report) { + $$runtime{status} .= "\n" . $eval_report . "\n"; + $$runtime{status_code} = 3; } + + # End daemon run, by popping frame: + $latexml->withState(sub { + my ($state) = @_; # Remove current state frame + ## TODO: This section of option preparations can be factored out as a subroutine if it grows further + ## the general idea is that right before the "pop" of the daemon frame, we have access to all meaningful + ## global state values, and we can preserve the relevant ones for the post-processing stage + ## BEGIN POST-PROCESSING-PREP + $$opts{searchpaths} = $state->lookupValue('SEARCHPATHS'); # save the searchpaths for post-processing + if ($state->lookupValue('LEXEMATIZE_MATH')) { # save potential request for serializing math lexemes + $$opts{math_formats} ||= []; + push @{ $$opts{math_formats} }, 'lexemes'; + # recheck need for parallel + $$opts{parallelmath} = 1 if (@{ $$opts{math_formats} } > 1); } + ## END POST-PROCESSING-PREP + $state->popDaemonFrame; + }); + if ($LaTeXML::UNSAFE_FATAL) { + # If the conversion hit an unsafe fatal, we need to reinitialize + $LaTeXML::UNSAFE_FATAL = 0; + $$self{ready} = 0; + } } Note(($$opts{recursive} ? "recursive " : "") . "Conversion complete: " . $$runtime{status}); @@ -362,7 +379,7 @@ sub convert { if ($ref_result =~ /Document$/) { $serialized = $result->toString(1); $serialized = Encode::encode('UTF-8', $serialized) if $serialized; - } else { # fragment case + } else { # fragment case $serialized = $result->toString(1, 1); } } elsif ($$opts{format} =~ /^html/) { @@ -370,7 +387,7 @@ sub convert { # Needs explicit encode call, toStringHTML returns Perl byte strings $serialized = $result->getDocument->toStringHTML; $serialized = Encode::encode('UTF-8', $serialized) if $serialized; } - else { # fragment case + else { # fragment case local $XML::LibXML::setTagCompression = 1; $serialized = $result->toString(1, 1); } } } # Compressed/archive/other case, just pass on @@ -444,6 +461,10 @@ sub convert_post { my $DOCUMENT = LaTeXML::Post::Document->new($dom, %PostOPS); my @procs = (); + + if ($$opts{type} eq 'XML' && $$opts{validate}) { + $DOCUMENT->validate; } + #TODO: Add support for the following: my $dbfile = $$opts{dbfile}; if (defined $dbfile && !-f $dbfile) { @@ -473,7 +494,7 @@ sub convert_post { if ($$opts{crossref}) { require LaTeXML::Post::CrossRef; push(@procs, LaTeXML::Post::CrossRef->new( - db => $DB, urlstyle => $$opts{urlstyle}, + db => $DB, urlstyle => $$opts{urlstyle}, extension => $$opts{extension}, ($$opts{numbersections} ? (number_sections => 1) : ()), ($$opts{navtoc} ? (navigation_toc => $$opts{navtoc}) : ()), diff --git a/lib/LaTeXML/Common/Config.pm b/lib/LaTeXML/Common/Config.pm index cab4c3464..7a26714cb 100644 --- a/lib/LaTeXML/Common/Config.pm +++ b/lib/LaTeXML/Common/Config.pm @@ -24,6 +24,7 @@ use Data::Dumper; our $PROFILES_DB = {}; # Class-wide, caches all profiles that get used while the server is alive our $is_bibtex = qr/(^literal\:\s*\@)|(\.bib$)/; our $is_archive = qr/(^literal\:PK)|(\.zip$)/; +our $is_xml = qr/(^literal\:\x{FEFF}?<\?xml(\x20|\x09|\x0D|\x0A))|(\.xml$)/; use base qw(Exporter); our @EXPORT = (qw(addMathFormat removeMathFormat maybeAddMathFormat)); @@ -61,6 +62,7 @@ sub getopt_specification { "tex" => sub { $$opts{format} = 'tex'; }, "box" => sub { $$opts{format} = 'box'; }, "bibtex" => sub { $$opts{type} = 'BibTeX'; }, + "xmlinput" => sub { $$opts{type} = 'XML'; }, "noparse" => sub { $$opts{mathparse} = 'no'; }, "format=s" => \$$opts{format}, "parse=s" => \$$opts{mathparse}, @@ -210,8 +212,11 @@ sub read { $$opts{source} = $ARGV[0] unless $$opts{source}; # Special source-based guessing needs to happen here, # as we won't have access to the source file/literal/resource later on: - if (!$$opts{type} || ($$opts{type} eq 'auto')) { - $$opts{type} = 'BibTeX' if ($$opts{source} && ($$opts{source} =~ /$is_bibtex/)); } + if ((!$$opts{type} || ($$opts{type} eq 'auto')) && $$opts{source}) { + if ($$opts{source} =~ /$is_bibtex/) { + $$opts{type} = 'BibTeX'; } + elsif ($$opts{source} =~ /$is_xml/) { + $$opts{type} = 'XML'; } } if (!$$opts{whatsin}) { $$opts{whatsin} = 'archive' if ($$opts{source} && ($$opts{source} =~ /$is_archive/)); } return $getOptions_success; @@ -791,6 +796,7 @@ latexmlc [options] --verbose more informative output (can repeat) --strict makes latexml less forgiving of errors --bibtex processes a BibTeX bibliography. + --xmlinput processes an xml file generated by "latexml". --xml requests xml output (default). --tex requests TeX output after expansion. --box requests box output after expansion