Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

accept xml input in latexmlc #1497

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 104 additions & 83 deletions lib/LaTeXML.pm
Original file line number Diff line number Diff line change
Expand Up @@ -220,89 +220,106 @@ sub convert {
removeMathFormat($opts, 'svg');
maybeAddMathFormat($opts, 'pmml'); }

# 1.5 Prepare a daemon frame
my $latexml = $$self{latexml};
$latexml->withState(sub {
my ($state) = @_; # Sandbox state
$$state{status} = {};
my $stomach = $$state{stomach};
delete $$stomach{rescued_boxes} if $$stomach{rescued_boxes};
$state->pushDaemonFrame;
$state->assignValue('_authlist', $$opts{authlist}, 'global');
$state->assignValue('REMOTE_REQUEST', (!$$opts{local}), 'global');
});

# 2 Beginning Core conversion - digest the source:
my ($digested, $dom, $serialized) = (undef, undef, undef);
eval {
alarm($$runtime{TTL});
my $mode = ($$opts{type} eq 'auto') ? 'TeX' : $$opts{type};
$digested = $latexml->digestFile($source, preamble => $current_preamble,
postamble => $current_postamble,
mode => $mode,
noinitialize => 1);
$$runtime{TTL} = alarm(0); };
my $eval_report = $@;
if (!$digested && $eval_report) {
# We can retry finishing digestion if hit a Fatal,
# sometimes there are leftover boxes we can accept.
eval {
alarm($$runtime{TTL});
$digested = $latexml->withState(sub {
return $latexml->finishDigestion; });
$$runtime{TTL} = alarm(0); };
$eval_report .= $@ if $@; }
# 2.1 Now, convert to DOM and output, if desired.
my $core_target = $$opts{format};
# Default Core target is XML
if ($core_target ne 'tex' and $core_target ne 'box') {
$core_target = 'xml'; }
if ($digested) {
my ($dom, $serialized, $eval_report) = (undef, undef, undef);
if ($$opts{type} eq 'XML') {
if (pathname_is_literaldata($source)) {
$source =~ m/^literal:(.*)$/s;
eval { $dom = LaTeXML::Common::XML::Parser->new()->parseString($1); }; }
else {
eval { $dom = LaTeXML::Common::XML::Parser->new()->parseFile($source); }; }
if (!$dom) {
local $@ = 'Fatal:conversion:unknown XML Parsing failed! (Unknown Reason)' if (!$@);
$eval_report = $@;
$$runtime{status} = colorizeString('XML parsing failed', 'error');
$$runtime{status_code} = 3; }
else {
$$runtime{status} = colorizeString('No obvious problems', 'success');
$$runtime{status_code} = 0; } }
else {
# 1.5 Prepare a daemon frame
my $latexml = $$self{latexml};
$latexml->withState(sub {
my ($state) = @_; # Sandbox state
$$state{status} = {};
my $stomach = $$state{stomach};
delete $$stomach{rescued_boxes} if $$stomach{rescued_boxes};
$state->pushDaemonFrame;
$state->assignValue('_authlist', $$opts{authlist}, 'global');
$state->assignValue('REMOTE_REQUEST', (!$$opts{local}), 'global');
});

# 2 Beginning Core conversion - digest the source:
my $digested = undef;
eval {
alarm($$runtime{TTL});
$latexml->withState(sub {
if ($core_target eq 'tex') {
$serialized = LaTeXML::Core::Token::UnTeX($digested); }
elsif ($core_target eq 'box') {
$serialized = ($$opts{verbosity} > 0 ? $digested->stringify : $digested->toString); }
elsif ($core_target eq 'xml') {
$dom = $latexml->convertDocument($digested); } });
my $mode = ($$opts{type} eq 'auto') ? 'TeX' : $$opts{type};
$digested = $latexml->digestFile($source, preamble => $current_preamble,
postamble => $current_postamble,
mode => $mode,
noinitialize => 1);
$$runtime{TTL} = alarm(0); };
$eval_report .= $@ if $@;
# Try to rescue the document if e.g. math parsing hit a Fatal error
if (!$dom && $@ && $core_target eq 'xml') {
$dom = $latexml->withState(sub {
my ($state) = @_;
my $rescued = $$state{rescued_document};
$rescued->finalize() if $rescued;
return $rescued; }); } }
$$runtime{status} = $latexml->getStatusMessage;
$$runtime{status_code} = $latexml->getStatusCode;
# 2.2 Bookkeeping in case in-eval perl die() deaths occurred
if ($eval_report) {
$$runtime{status} .= "\n" . $eval_report . "\n";
$$runtime{status_code} = 3; }

# End daemon run, by popping frame:
$latexml->withState(sub {
my ($state) = @_; # Remove current state frame
## TODO: This section of option preparations can be factored out as a subroutine if it grows further
## the general idea is that right before the "pop" of the daemon frame, we have access to all meaningful
## global state values, and we can preserve the relevant ones for the post-processing stage
## BEGIN POST-PROCESSING-PREP
$$opts{searchpaths} = $state->lookupValue('SEARCHPATHS'); # save the searchpaths for post-processing
if ($state->lookupValue('LEXEMATIZE_MATH')) { # save potential request for serializing math lexemes
$$opts{math_formats} ||= [];
push @{ $$opts{math_formats} }, 'lexemes';
# recheck need for parallel
$$opts{parallelmath} = 1 if (@{ $$opts{math_formats} } > 1); }
## END POST-PROCESSING-PREP
$state->popDaemonFrame;
});
if ($LaTeXML::UNSAFE_FATAL) {
# If the conversion hit an unsafe fatal, we need to reinitialize
$LaTeXML::UNSAFE_FATAL = 0;
$$self{ready} = 0;
$eval_report = $@;
if (!$digested && $eval_report) {
# We can retry finishing digestion if hit a Fatal,
# sometimes there are leftover boxes we can accept.
eval {
alarm($$runtime{TTL});
$digested = $latexml->withState(sub {
return $latexml->finishDigestion; });
$$runtime{TTL} = alarm(0); };
$eval_report .= $@ if $@; }
# 2.1 Now, convert to DOM and output, if desired.
my $core_target = $$opts{format};
# Default Core target is XML
if ($core_target ne 'tex' and $core_target ne 'box') {
$core_target = 'xml'; }
if ($digested) {
eval {
alarm($$runtime{TTL});
$latexml->withState(sub {
if ($core_target eq 'tex') {
$serialized = LaTeXML::Core::Token::UnTeX($digested); }
elsif ($core_target eq 'box') {
$serialized = ($$opts{verbosity} > 0 ? $digested->stringify : $digested->toString); }
elsif ($core_target eq 'xml') {
$dom = $latexml->convertDocument($digested); } });
$$runtime{TTL} = alarm(0); };
$eval_report .= $@ if $@;
# Try to rescue the document if e.g. math parsing hit a Fatal error
if (!$dom && $@ && $core_target eq 'xml') {
$dom = $latexml->withState(sub {
my ($state) = @_;
my $rescued = $$state{rescued_document};
$rescued->finalize() if $rescued;
return $rescued; }); } }
$$runtime{status} = $latexml->getStatusMessage;
$$runtime{status_code} = $latexml->getStatusCode;
# 2.2 Bookkeeping in case in-eval perl die() deaths occurred
if ($eval_report) {
$$runtime{status} .= "\n" . $eval_report . "\n";
$$runtime{status_code} = 3; }

# End daemon run, by popping frame:
$latexml->withState(sub {
my ($state) = @_; # Remove current state frame
## TODO: This section of option preparations can be factored out as a subroutine if it grows further
## the general idea is that right before the "pop" of the daemon frame, we have access to all meaningful
## global state values, and we can preserve the relevant ones for the post-processing stage
## BEGIN POST-PROCESSING-PREP
$$opts{searchpaths} = $state->lookupValue('SEARCHPATHS'); # save the searchpaths for post-processing
if ($state->lookupValue('LEXEMATIZE_MATH')) { # save potential request for serializing math lexemes
$$opts{math_formats} ||= [];
push @{ $$opts{math_formats} }, 'lexemes';
# recheck need for parallel
$$opts{parallelmath} = 1 if (@{ $$opts{math_formats} } > 1); }
## END POST-PROCESSING-PREP
$state->popDaemonFrame;
});
if ($LaTeXML::UNSAFE_FATAL) {
# If the conversion hit an unsafe fatal, we need to reinitialize
$LaTeXML::UNSAFE_FATAL = 0;
$$self{ready} = 0;
}
}
Note(($$opts{recursive} ? "recursive " : "") . "Conversion complete: " . $$runtime{status});

Expand Down Expand Up @@ -362,15 +379,15 @@ sub convert {
if ($ref_result =~ /Document$/) {
$serialized = $result->toString(1);
$serialized = Encode::encode('UTF-8', $serialized) if $serialized;
} else { # fragment case
} else { # fragment case
$serialized = $result->toString(1, 1);
} }
elsif ($$opts{format} =~ /^html/) {
if (ref($result) =~ /^LaTeXML::(Post::)?Document$/) {
# Needs explicit encode call, toStringHTML returns Perl byte strings
$serialized = $result->getDocument->toStringHTML;
$serialized = Encode::encode('UTF-8', $serialized) if $serialized; }
else { # fragment case
else { # fragment case
local $XML::LibXML::setTagCompression = 1;
$serialized = $result->toString(1, 1); } } }
# Compressed/archive/other case, just pass on
Expand Down Expand Up @@ -444,6 +461,10 @@ sub convert_post {

my $DOCUMENT = LaTeXML::Post::Document->new($dom, %PostOPS);
my @procs = ();

if ($$opts{type} eq 'XML' && $$opts{validate}) {
$DOCUMENT->validate; }

#TODO: Add support for the following:
my $dbfile = $$opts{dbfile};
if (defined $dbfile && !-f $dbfile) {
Expand Down Expand Up @@ -473,7 +494,7 @@ sub convert_post {
if ($$opts{crossref}) {
require LaTeXML::Post::CrossRef;
push(@procs, LaTeXML::Post::CrossRef->new(
db => $DB, urlstyle => $$opts{urlstyle},
db => $DB, urlstyle => $$opts{urlstyle},
extension => $$opts{extension},
($$opts{numbersections} ? (number_sections => 1) : ()),
($$opts{navtoc} ? (navigation_toc => $$opts{navtoc}) : ()),
Expand Down
10 changes: 8 additions & 2 deletions lib/LaTeXML/Common/Config.pm
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use Data::Dumper;
our $PROFILES_DB = {}; # Class-wide, caches all profiles that get used while the server is alive
our $is_bibtex = qr/(^literal\:\s*\@)|(\.bib$)/;
our $is_archive = qr/(^literal\:PK)|(\.zip$)/;
our $is_xml = qr/(^literal\:\x{FEFF}?<\?xml(\x20|\x09|\x0D|\x0A))|(\.xml$)/;

use base qw(Exporter);
our @EXPORT = (qw(addMathFormat removeMathFormat maybeAddMathFormat));
Expand Down Expand Up @@ -61,6 +62,7 @@ sub getopt_specification {
"tex" => sub { $$opts{format} = 'tex'; },
"box" => sub { $$opts{format} = 'box'; },
"bibtex" => sub { $$opts{type} = 'BibTeX'; },
"xmlinput" => sub { $$opts{type} = 'XML'; },
"noparse" => sub { $$opts{mathparse} = 'no'; },
"format=s" => \$$opts{format},
"parse=s" => \$$opts{mathparse},
Expand Down Expand Up @@ -210,8 +212,11 @@ sub read {
$$opts{source} = $ARGV[0] unless $$opts{source};
# Special source-based guessing needs to happen here,
# as we won't have access to the source file/literal/resource later on:
if (!$$opts{type} || ($$opts{type} eq 'auto')) {
$$opts{type} = 'BibTeX' if ($$opts{source} && ($$opts{source} =~ /$is_bibtex/)); }
if ((!$$opts{type} || ($$opts{type} eq 'auto')) && $$opts{source}) {
if ($$opts{source} =~ /$is_bibtex/) {
$$opts{type} = 'BibTeX'; }
elsif ($$opts{source} =~ /$is_xml/) {
$$opts{type} = 'XML'; } }
if (!$$opts{whatsin}) {
$$opts{whatsin} = 'archive' if ($$opts{source} && ($$opts{source} =~ /$is_archive/)); }
return $getOptions_success;
Expand Down Expand Up @@ -791,6 +796,7 @@ latexmlc [options]
--verbose more informative output (can repeat)
--strict makes latexml less forgiving of errors
--bibtex processes a BibTeX bibliography.
--xmlinput processes an xml file generated by "latexml".
--xml requests xml output (default).
--tex requests TeX output after expansion.
--box requests box output after expansion
Expand Down