-
Notifications
You must be signed in to change notification settings - Fork 30
/
pandoc_converter.ts
333 lines (285 loc) · 12 KB
/
pandoc_converter.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
/*
* pandoc_converter.ts
*
* Copyright (C) 2022 by Posit Software, PBC
*
* Unless you have received this program directly from Posit Software pursuant
* to the terms of a commercial license agreement with Posit Software, then
* this program is licensed to you under the terms of version 3 of the
* GNU Affero General Public License. This program is distributed WITHOUT
* ANY EXPRESS OR IMPLIED WARRANTY, INCLUDING THOSE OF NON-INFRINGEMENT,
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Please refer to the
* AGPL (http://www.gnu.org/licenses/agpl-3.0.txt) for more details.
*
*/
import * as semver from "semver";
import { Schema, Node as ProsemirrorNode } from 'prosemirror-model';
import { findChildren } from 'prosemirror-utils';
import {
PandocServer,
PandocTokenReader,
PandocNodeWriter,
PandocMarkWriter,
PandocPreprocessorFn,
PandocBlockReaderFn,
PandocPostprocessorFn,
PandocInlineHTMLReaderFn,
PandocTokensFilterFn,
PandocMarkdownPostProcessorFn,
} from '../api/pandoc';
import { haveTableCellsWithInlineRcode } from '../api/rmd';
import { pandocFormatWith, kGfmFormat, kCommonmarkFormat } from '../api/pandoc_format';
import { PandocCapabilities } from '../api/pandoc_capabilities';
import { PandocBlockCapsuleFilter, pandocMarkdownWithBlockCapsules } from '../api/pandoc_capsule';
import { ExtensionManager } from '../editor/editor-extensions';
import { pandocToProsemirror } from './pandoc_to_prosemirror';
import { pandocFromProsemirror } from './pandoc_from_prosemirror';
import { isParagraphNode } from '../api/paragraph';
import { PandocFormat, PandocWriterOptions } from '../api/pandoc-types';
import { escapeRegExpCharacters, lines, normalizeNewlines } from 'core';
// Line wrapping mode reported for a markdown document (see the `line_wrapping`
// field of PandocToProsemirrorResult).
// NOTE(review): the meanings of the values (no wrapping / wrap at a column /
// one sentence per line) are inferred from their names -- confirm against the reader.
export type PandocLineWrapping = 'none' | 'column' | 'sentence';
// Result of PandocConverter.toProsemirror (produced by pandocToProsemirror and then
// run through the registered post-processors, which may replace `doc`).
export interface PandocToProsemirrorResult {
// the prosemirror document
doc: ProsemirrorNode;
// NOTE(review): presumably the line wrapping style detected in the source -- confirm
line_wrapping: PandocLineWrapping;
// NOTE(review): presumably names of pandoc tokens that no reader handled -- confirm
unrecognized: string[];
// NOTE(review): presumably whether the document contains example lists -- confirm
example_lists: boolean;
// NOTE(review): presumably document metadata passed through without being parsed -- confirm
unparsed_meta: { [key: string]: unknown };
}
// Converts between markdown source and prosemirror documents by way of a pandoc
// server, applying the readers, writers, filters, and pre/post-processors that are
// contributed by the editor extensions.
export class PandocConverter {
  private readonly schema: Schema;
  private readonly preprocessors: readonly PandocPreprocessorFn[];
  private readonly postprocessors: readonly PandocPostprocessorFn[];
  private readonly readers: readonly PandocTokenReader[];
  private readonly tokensFilters: readonly PandocTokensFilterFn[];
  private readonly blockReaders: readonly PandocBlockReaderFn[];
  private readonly inlineHTMLReaders: readonly PandocInlineHTMLReaderFn[];
  private readonly blockCapsuleFilters: readonly PandocBlockCapsuleFilter[];
  private readonly nodeWriters: readonly PandocNodeWriter[];
  private readonly markWriters: readonly PandocMarkWriter[];
  private readonly markdownPostProcessors: readonly PandocMarkdownPostProcessorFn[];
  private readonly pandoc: PandocServer;
  private readonly pandocCapabilities: PandocCapabilities;

  // Capture the schema, the pandoc server/capabilities, and a snapshot of every
  // pandoc-related hook registered with the extension manager.
  constructor(
    schema: Schema,
    extensions: ExtensionManager,
    pandoc: PandocServer,
    pandocCapabilities: PandocCapabilities,
  ) {
    this.schema = schema;

    this.preprocessors = extensions.pandocPreprocessors();
    this.postprocessors = extensions.pandocPostprocessors();
    this.readers = extensions.pandocReaders();
    this.tokensFilters = extensions.pandocTokensFilters();
    this.blockReaders = extensions.pandocBlockReaders();
    this.inlineHTMLReaders = extensions.pandocInlineHTMLReaders();
    this.blockCapsuleFilters = extensions.pandocBlockCapsuleFilters();
    this.nodeWriters = extensions.pandocNodeWriters();
    this.markWriters = extensions.pandocMarkWriters();
    this.markdownPostProcessors = extensions.pandocMarkdownPostProcessors();

    this.pandoc = pandoc;
    this.pandocCapabilities = pandocCapabilities;
  }

  // Parse markdown into a prosemirror document.
  //
  // markdown: source text (newlines are normalized before processing)
  // format:   the document's pandoc format; its extensions are passed to the reader
  //
  // Resolves to the result of pandocToProsemirror with `doc` run through every
  // registered post-processor. Rejects if the pandoc server call rejects.
  public async toProsemirror(markdown: string, format: PandocFormat): Promise<PandocToProsemirrorResult> {
    // normalize newlines (for regex)
    markdown = normalizeNewlines(markdown);

    // save original markdown (for aligning capsule positions)
    const original = markdown;

    // adjust format. we always need to *read* raw_html, raw_attribute, and backtick_code_blocks b/c
    // that's how preprocessors hoist content through pandoc into our prosemirror token parser.
    // we always need to read with auto_identifiers so we can catch any auto-generated ids
    // required to fulfill links inside the document (we will strip out heading ids that
    // aren't explicit or a link target using the heading_ids returned with the ast).
    //
    // we always read all forms of tables (since they can always be written back out as raw_html)
    //
    // we also always read math (since it can always be output as 'asciimath')

    // determine type of auto_ids
    const autoIds = format.extensions.gfm_auto_identifiers ? 'gfm_auto_identifiers' : 'auto_identifiers';
    const targetFormat = adjustedFormat(
      format.fullName,
      ['raw_html', 'raw_attribute', 'backtick_code_blocks', autoIds,
       'grid_tables', 'pipe_tables', 'multiline_tables', 'simple_tables',
       'tex_math_dollars'],
      ['smart'],
    );

    // run preprocessors
    this.preprocessors.forEach(preprocessor => {
      markdown = preprocessor(markdown);
    });

    // create source capsules (each filter sees the output of the previous one, plus
    // the original text)
    this.blockCapsuleFilters.forEach(filter => {
      markdown = pandocMarkdownWithBlockCapsules(original, markdown, filter);
    });

    const ast = await this.pandoc.markdownToAst(markdown, targetFormat, []);
    const result = pandocToProsemirror(
      ast,
      this.schema,
      format.extensions,
      this.readers,
      this.tokensFilters,
      this.blockReaders,
      this.inlineHTMLReaders,
      this.blockCapsuleFilters,
    );

    // run post-processors
    this.postprocessors.forEach(postprocessor => {
      result.doc = postprocessor(result.doc);
    });

    // return the doc
    return result;
  }

  // NOTE: For a plain markdown file, this is the closest we can come to canonicalizing w/ just pandoc:
  //
  //   pandoc MANUAL.md --to markdown-auto_identifiers-smart -o MANUAL.md --self-contained --atx-headers --wrap=none
  //
  // For R Markdown files, we would need to pull out the Rmd chunks before sending to pandoc.
  //
  // Serialize a prosemirror document to markdown.
  //
  // doc:                the document to write
  // pandocFormat:       the target pandoc format
  // pandocCapabilities: used to determine the pandoc version for the --id-prefix
  //                     fixup below (the ast api version comes from the capabilities
  //                     passed to the constructor)
  // options:            writer options (headings style, dpi, references, wrapping)
  //
  // Resolves to the markdown text with '\n' newlines, after all markdown
  // post-processors have run. Rejects if the pandoc server call rejects.
  public async fromProsemirror(
    doc: ProsemirrorNode,
    pandocFormat: PandocFormat,
    pandocCapabilities: PandocCapabilities,
    options: PandocWriterOptions,
  ): Promise<string> {
    // generate pandoc ast
    const output = pandocFromProsemirror(
      doc,
      this.pandocCapabilities.api_version,
      pandocFormat,
      this.nodeWriters,
      this.markWriters,
    );

    // adjust format. we always need to be able to write raw_attribute b/c that's how preprocessors
    // hoist content through pandoc into our prosemirror token parser. since we open this door when
    // reading, users could end up writing raw inlines, and in that case we want them to echo back
    // to the source document just the way they came in. for writing markdown from pm we don't
    // ever want to generate auto identifiers so we disable them here.
    let format = adjustedFormat(
      pandocFormat.fullName,
      ['raw_html', 'raw_attribute'], // always enable
      ['auto_identifiers', 'gfm_auto_identifiers', 'smart'], // always disable
    );

    // disable selected format options
    format = pandocFormatWith(format, disabledFormatOptions(format, pandocFormat, doc), '');

    // prepare pandoc options
    let pandocOptions: string[] = [];
    if (!options.atxHeaders) {
      pandocOptions.push('--markdown-headings=setext');
    }
    if (options.dpi) {
      // pandoc's --dpi option takes a value, so pass the configured dpi along with
      // it (a bare '--dpi' drops the value and leaves the argument list malformed)
      pandocOptions.push(`--dpi=${options.dpi}`);
    }

    // default to block level references (validate known types)
    let referenceLocation = 'block';
    if (options.references?.location) {
      referenceLocation = ['block', 'section', 'document'].includes(options.references.location)
        ? options.references.location
        : 'block';
    }
    pandocOptions.push(`--reference-location=${referenceLocation}`);

    // references prefix (if any)
    if (options.references?.prefix) {
      pandocOptions.push('--id-prefix', options.references.prefix);
    }

    // reference links
    if (options.references?.links) {
      pandocOptions.push('--reference-links');
    }

    // provide wrap options
    pandocOptions = pandocOptions.concat(wrapOptions(options));

    // render to markdown
    let markdown = await this.pandoc.astToMarkdown(output.ast, format, pandocOptions);

    // normalize newlines (don't know if pandoc uses \r\n on windows)
    markdown = markdown.replace(/\r\n|\n\r|\r/g, '\n');

    // if the pandoc version is >= 3.1.4 then fix IDs that were broken by
    // the change in --id-prefix behavior (strip the prefix from every "{#<prefix>")
    if (options.references?.prefix) {
      const pandocVersion = pandocSemver(pandocCapabilities);
      if (pandocVersion && semver.gte(pandocVersion, "3.1.4")) {
        const regex = new RegExp(escapeRegExpCharacters(`{#${options.references.prefix}`), "g");
        markdown = markdown.replace(regex, "{#");
      }
    }

    // run post-processors
    this.markdownPostProcessors.forEach(postprocessor => {
      markdown = postprocessor(markdown);
    });

    // return
    return markdown;
  }
}
// Derive a semver value from the pandoc version output held in the capabilities.
// Only the first line of the output is considered; the "pandoc " label is dropped
// and the remainder is handed to semver.coerce (whose result is returned as-is).
function pandocSemver(capabilities: PandocCapabilities) {
  const [firstLine] = lines(capabilities.version);
  const versionText = firstLine.replace("pandoc ", "").trim();
  return semver.coerce(versionText);
}
// Adjust the specified format so that every extension in `extensions` is enabled
// and every extension in `disabled` is turned off.
//
// format:     pandoc format string (base format plus any +ext/-ext options)
// extensions: extension names to force on (appended as "+ext")
// disabled:   extension names to force off (appended as "-ext")
//
// Returns the adjusted format string.
function adjustedFormat(format: string, extensions: string[], disabled: string[]) {
  const enableOptions = extensions.map(ext => `+${ext}`).join('');
  const disableOptions = disabled.map(ext => `-${ext}`).join('');
  let newFormat = pandocFormatWith(format, '', enableOptions + disableOptions);

  // any extension specified needs to not have a - anywhere in the format. note that
  // String.replace with a string pattern only removes the first match, so we
  // split/join to strip every "-ext" occurrence
  extensions.forEach(ext => {
    newFormat = newFormat.split('-' + ext).join('');
  });

  return newFormat;
}
// Compute the "-ext" options for the table extensions that should be turned off
// when writing `doc` using `format`. Returns an empty string for gfm and commonmark
// based formats.
function disabledFormatOptions(format: string, pandocFormat: PandocFormat, doc: ProsemirrorNode) {
  // (prefer pipe and grid tables). users can still force the availability of these by
  // adding those format flags but all known markdown variants that support tables also
  // support pipe tables so this seems unlikely to ever be required.
  const disabled = ['-simple_tables', '-multiline_tables'];

  // grid tables are dropped when tables contain inline R code (the inline R code will
  // mess up the column boundaries) or when nothing in the document needs them and
  // pipe tables are available
  const pipeTablesSuffice = () => !gridTablesRequired(doc) && pandocFormat.extensions.pipe_tables;
  if (haveTableCellsWithInlineRcode(doc) || pipeTablesSuffice()) {
    disabled.push('-grid_tables');
  }

  // gfm and commonmark variants don't allow simple/multiline/grid tables (just pipe tables)
  // and it's an error to even include these in the markdown format specifier -- so for
  // these modes we just nix the disabling
  const pipeTablesOnly = format.startsWith(kGfmFormat) || format.startsWith(kCommonmarkFormat);
  return pipeTablesOnly ? '' : disabled.join('');
}
// Check whether any table cell (or table header cell) in the document has content
// that requires a grid table (basically anything that requires embedded newlines).
function gridTablesRequired(doc: ProsemirrorNode) {
  const { table_cell, table_header, hard_break } = doc.type.schema.nodes;
  const cells = findChildren(doc, node => node.type === table_cell || node.type === table_header);

  for (const { node: cell } of cells) {
    // more than one block in the cell
    if (cell.childCount > 1) {
      return true;
    }

    // the single block isn't a paragraph
    const onlyChild = cell.firstChild;
    if (!isParagraphNode(onlyChild)) {
      return true;
    }

    // the paragraph contains a hard break
    const hardBreaks = findChildren(onlyChild!, node => node.type === hard_break);
    if (hardBreaks.length > 0) {
      return true;
    }
  }

  return false;
}
// Translate the writer's `wrap` option into pandoc command line arguments.
//
// - unset, 'none', or 'sentence': ['--wrap=none']
// - a string that parses (base 10) to a positive column: ['--wrap=auto', '--columns=<column>']
// - anything else (non-numeric, zero, or negative): ['--wrap=none']
function wrapOptions(options: PandocWriterOptions) {
  const wrap = options.wrap;
  if (wrap && wrap !== 'none' && wrap !== 'sentence') {
    // parseInt yields NaN for non-numeric input; NaN, 0, and negative values all
    // fail the > 0 test so we never hand pandoc a nonsensical column count
    const column = parseInt(wrap, 10);
    if (column > 0) {
      return ['--wrap=auto', `--columns=${column}`];
    }
  }
  return ['--wrap=none'];
}