Skip to content

Commit

Permalink
accept xml input in latexmlc
Browse files Browse the repository at this point in the history
  • Loading branch information
xworld21 committed Aug 1, 2021
1 parent 9c84912 commit 17c698b
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 85 deletions.
183 changes: 100 additions & 83 deletions lib/LaTeXML.pm
Original file line number Diff line number Diff line change
Expand Up @@ -220,89 +220,102 @@ sub convert {
removeMathFormat($opts, 'svg');
maybeAddMathFormat($opts, 'pmml'); }

# 1.5 Prepare a daemon frame
my $latexml = $$self{latexml};
$latexml->withState(sub {
my ($state) = @_; # Sandbox state
$$state{status} = {};
my $stomach = $$state{stomach};
delete $$stomach{rescued_boxes} if $$stomach{rescued_boxes};
$state->pushDaemonFrame;
$state->assignValue('_authlist', $$opts{authlist}, 'global');
$state->assignValue('REMOTE_REQUEST', (!$$opts{local}), 'global');
});

# 2 Beginning Core conversion - digest the source:
my ($digested, $dom, $serialized) = (undef, undef, undef);
eval {
alarm($$runtime{TTL});
my $mode = ($$opts{type} eq 'auto') ? 'TeX' : $$opts{type};
$digested = $latexml->digestFile($source, preamble => $current_preamble,
postamble => $current_postamble,
mode => $mode,
noinitialize => 1);
$$runtime{TTL} = alarm(0); };
my $eval_report = $@;
if (!$digested && $eval_report) {
# We can retry finishing digestion if hit a Fatal,
# sometimes there are leftover boxes we can accept.
eval {
alarm($$runtime{TTL});
$digested = $latexml->withState(sub {
return $latexml->finishDigestion; });
$$runtime{TTL} = alarm(0); };
$eval_report .= $@ if $@; }
# 2.1 Now, convert to DOM and output, if desired.
my $core_target = $$opts{format};
# Default Core target is XML
if ($core_target ne 'tex' and $core_target ne 'box') {
$core_target = 'xml'; }
if ($digested) {
my ($dom, $serialized, $eval_report) = (undef, undef, undef);
if ($$opts{type} eq 'XML') {
eval { $dom = LaTeXML::Common::XML::Parser->new()->parseFile($source); };
if (!$dom) {
local $@ = 'Fatal:conversion:unknown XML Parsing failed! (Unknown Reason)' if (!$@);
$eval_report = $@;
$$runtime{status} = colorizeString('XML parsing failed', 'error');
$$runtime{status_code} = 3; }
else {
$$runtime{status} = colorizeString('No obvious problems', 'success');
$$runtime{status_code} = 0; } }
else {
# 1.5 Prepare a daemon frame
my $latexml = $$self{latexml};
$latexml->withState(sub {
my ($state) = @_; # Sandbox state
$$state{status} = {};
my $stomach = $$state{stomach};
delete $$stomach{rescued_boxes} if $$stomach{rescued_boxes};
$state->pushDaemonFrame;
$state->assignValue('_authlist', $$opts{authlist}, 'global');
$state->assignValue('REMOTE_REQUEST', (!$$opts{local}), 'global');
});

# 2 Beginning Core conversion - digest the source:
my $digested = undef;
eval {
alarm($$runtime{TTL});
$latexml->withState(sub {
if ($core_target eq 'tex') {
$serialized = LaTeXML::Core::Token::UnTeX($digested); }
elsif ($core_target eq 'box') {
$serialized = ($$opts{verbosity} > 0 ? $digested->stringify : $digested->toString); }
elsif ($core_target eq 'xml') {
$dom = $latexml->convertDocument($digested); } });
my $mode = ($$opts{type} eq 'auto') ? 'TeX' : $$opts{type};
$digested = $latexml->digestFile($source, preamble => $current_preamble,
postamble => $current_postamble,
mode => $mode,
noinitialize => 1);
$$runtime{TTL} = alarm(0); };
$eval_report .= $@ if $@;
# Try to rescue the document if e.g. math parsing hit a Fatal error
if (!$dom && $@ && $core_target eq 'xml') {
$dom = $latexml->withState(sub {
my ($state) = @_;
my $rescued = $$state{rescued_document};
$rescued->finalize() if $rescued;
return $rescued; }); } }
$$runtime{status} = $latexml->getStatusMessage;
$$runtime{status_code} = $latexml->getStatusCode;
# 2.2 Bookkeeping in case in-eval perl die() deaths occurred
if ($eval_report) {
$$runtime{status} .= "\n" . $eval_report . "\n";
$$runtime{status_code} = 3; }

# End daemon run, by popping frame:
$latexml->withState(sub {
my ($state) = @_; # Remove current state frame
## TODO: This section of option preparations can be factored out as a subroutine if it grows further
## the general idea is that right before the "pop" of the daemon frame, we have access to all meaningful
## global state values, and we can preserve the relevant ones for the post-processing stage
## BEGIN POST-PROCESSING-PREP
$$opts{searchpaths} = $state->lookupValue('SEARCHPATHS'); # save the searchpaths for post-processing
if ($state->lookupValue('LEXEMATIZE_MATH')) { # save potential request for serializing math lexemes
$$opts{math_formats} ||= [];
push @{ $$opts{math_formats} }, 'lexemes';
# recheck need for parallel
$$opts{parallelmath} = 1 if (@{ $$opts{math_formats} } > 1); }
## END POST-PROCESSING-PREP
$state->popDaemonFrame;
});
if ($LaTeXML::UNSAFE_FATAL) {
# If the conversion hit an unsafe fatal, we need to reinitialize
$LaTeXML::UNSAFE_FATAL = 0;
$$self{ready} = 0;
$eval_report = $@;
if (!$digested && $eval_report) {
# We can retry finishing digestion if hit a Fatal,
# sometimes there are leftover boxes we can accept.
eval {
alarm($$runtime{TTL});
$digested = $latexml->withState(sub {
return $latexml->finishDigestion; });
$$runtime{TTL} = alarm(0); };
$eval_report .= $@ if $@; }
# 2.1 Now, convert to DOM and output, if desired.
my $core_target = $$opts{format};
# Default Core target is XML
if ($core_target ne 'tex' and $core_target ne 'box') {
$core_target = 'xml'; }
if ($digested) {
eval {
alarm($$runtime{TTL});
$latexml->withState(sub {
if ($core_target eq 'tex') {
$serialized = LaTeXML::Core::Token::UnTeX($digested); }
elsif ($core_target eq 'box') {
$serialized = ($$opts{verbosity} > 0 ? $digested->stringify : $digested->toString); }
elsif ($core_target eq 'xml') {
$dom = $latexml->convertDocument($digested); } });
$$runtime{TTL} = alarm(0); };
$eval_report .= $@ if $@;
# Try to rescue the document if e.g. math parsing hit a Fatal error
if (!$dom && $@ && $core_target eq 'xml') {
$dom = $latexml->withState(sub {
my ($state) = @_;
my $rescued = $$state{rescued_document};
$rescued->finalize() if $rescued;
return $rescued; }); } }
$$runtime{status} = $latexml->getStatusMessage;
$$runtime{status_code} = $latexml->getStatusCode;
# 2.2 Bookkeeping in case in-eval perl die() deaths occurred
if ($eval_report) {
$$runtime{status} .= "\n" . $eval_report . "\n";
$$runtime{status_code} = 3; }

# End daemon run, by popping frame:
$latexml->withState(sub {
my ($state) = @_; # Remove current state frame
## TODO: This section of option preparations can be factored out as a subroutine if it grows further
## the general idea is that right before the "pop" of the daemon frame, we have access to all meaningful
## global state values, and we can preserve the relevant ones for the post-processing stage
## BEGIN POST-PROCESSING-PREP
$$opts{searchpaths} = $state->lookupValue('SEARCHPATHS'); # save the searchpaths for post-processing
if ($state->lookupValue('LEXEMATIZE_MATH')) { # save potential request for serializing math lexemes
$$opts{math_formats} ||= [];
push @{ $$opts{math_formats} }, 'lexemes';
# recheck need for parallel
$$opts{parallelmath} = 1 if (@{ $$opts{math_formats} } > 1); }
## END POST-PROCESSING-PREP
$state->popDaemonFrame;
});
if ($LaTeXML::UNSAFE_FATAL) {
# If the conversion hit an unsafe fatal, we need to reinitialize
$LaTeXML::UNSAFE_FATAL = 0;
$$self{ready} = 0;
}
}
Note(($$opts{recursive} ? "recursive " : "") . "Conversion complete: " . $$runtime{status});

Expand Down Expand Up @@ -362,15 +375,15 @@ sub convert {
if ($ref_result =~ /Document$/) {
$serialized = $result->toString(1);
$serialized = Encode::encode('UTF-8', $serialized) if $serialized;
} else { # fragment case
} else { # fragment case
$serialized = $result->toString(1, 1);
} }
elsif ($$opts{format} =~ /^html/) {
if (ref($result) =~ /^LaTeXML::(Post::)?Document$/) {
# Needs explicit encode call, toStringHTML returns Perl byte strings
$serialized = $result->getDocument->toStringHTML;
$serialized = Encode::encode('UTF-8', $serialized) if $serialized; }
else { # fragment case
else { # fragment case
local $XML::LibXML::setTagCompression = 1;
$serialized = $result->toString(1, 1); } } }
# Compressed/archive/other case, just pass on
Expand Down Expand Up @@ -444,6 +457,10 @@ sub convert_post {

my $DOCUMENT = LaTeXML::Post::Document->new($dom, %PostOPS);
my @procs = ();

if ($$opts{type} eq 'XML' && $$opts{validate}) {
$DOCUMENT->validate; }

#TODO: Add support for the following:
my $dbfile = $$opts{dbfile};
if (defined $dbfile && !-f $dbfile) {
Expand Down Expand Up @@ -473,7 +490,7 @@ sub convert_post {
if ($$opts{crossref}) {
require LaTeXML::Post::CrossRef;
push(@procs, LaTeXML::Post::CrossRef->new(
db => $DB, urlstyle => $$opts{urlstyle},
db => $DB, urlstyle => $$opts{urlstyle},
extension => $$opts{extension},
($$opts{numbersections} ? (number_sections => 1) : ()),
($$opts{navtoc} ? (navigation_toc => $$opts{navtoc}) : ()),
Expand Down
10 changes: 8 additions & 2 deletions lib/LaTeXML/Common/Config.pm
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use Data::Dumper;
our $PROFILES_DB = {}; # Class-wide, caches all profiles that get used while the server is alive
# Source-sniffing patterns used to guess the input type from the source string.
# Each one matches either the start of an inline "literal:" payload or a file extension.
our $is_bibtex = qr/(^literal\:\s*\@)|(\.bib$)/;
our $is_archive = qr/(^literal\:PK)|(\.zip$)/;
# The '?' of the XML declaration must be escaped: unescaped, "<?" means "an optional '<'",
# so the pattern could never match a payload that really starts with "<?xml".
# Leading whitespace after "literal:" is tolerated, as in $is_bibtex.
our $is_xml = qr/(^literal\:\s*<\?xml)|(\.xml$)/;

use base qw(Exporter);
our @EXPORT = (qw(addMathFormat removeMathFormat maybeAddMathFormat));
Expand Down Expand Up @@ -61,6 +62,7 @@ sub getopt_specification {
"tex" => sub { $$opts{format} = 'tex'; },
"box" => sub { $$opts{format} = 'box'; },
"bibtex" => sub { $$opts{type} = 'BibTeX'; },
"xmlinput" => sub { $$opts{type} = 'XML'; },
"noparse" => sub { $$opts{mathparse} = 'no'; },
"format=s" => \$$opts{format},
"parse=s" => \$$opts{mathparse},
Expand Down Expand Up @@ -210,8 +212,11 @@ sub read {
$$opts{source} = $ARGV[0] unless $$opts{source};
# Special source-based guessing needs to happen here,
# as we won't have access to the source file/literal/resource later on:
if (!$$opts{type} || ($$opts{type} eq 'auto')) {
$$opts{type} = 'BibTeX' if ($$opts{source} && ($$opts{source} =~ /$is_bibtex/)); }
if ((!$$opts{type} || ($$opts{type} eq 'auto')) && $$opts{source}) {
if ($$opts{source} =~ /$is_bibtex/) {
$$opts{type} = 'BibTeX'; }
elsif ($$opts{source} =~ /$is_xml/) {
$$opts{type} = 'XML'; } }
if (!$$opts{whatsin}) {
$$opts{whatsin} = 'archive' if ($$opts{source} && ($$opts{source} =~ /$is_archive/)); }
return $getOptions_success;
Expand Down Expand Up @@ -791,6 +796,7 @@ latexmlc [options]
--verbose more informative output (can repeat)
--strict makes latexml less forgiving of errors
--bibtex processes a BibTeX bibliography.
--xmlinput processes an xml file generated by "latexml".
--xml requests xml output (default).
--tex requests TeX output after expansion.
--box requests box output after expansion
Expand Down

0 comments on commit 17c698b

Please sign in to comment.