-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdemo-parselov.js
329 lines (288 loc) · 14.3 KB
/
demo-parselov.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
var fs = require('fs');
var zlib = require('zlib');
var util = require('util');
var as_json = process.argv[4] == "-json";
var input = fs.readFileSync(process.argv[3], {
"encoding": "utf-8"
});
zlib.gunzip(fs.readFileSync(process.argv[2]), function(err, buf) {
var g = JSON.parse(buf);
///////////////////////////////////////////////////////////////////
// Typical grammars do not distinguish between all characters in
// their alphabet, or the alphabet of the input to be parsed, like
// all of Unicode. So in order to save space in the transition
// tables, input symbol are mapped into a smaller set of symbols.
///////////////////////////////////////////////////////////////////
var s = [].map.call(input, function(ch) {
return g.input_to_symbol[ ch.charCodeAt(0) ] || 0
});
///////////////////////////////////////////////////////////////////
// The mapped input symbols are then fed to a deterministic finite
// state automaton. The sequence of states is stored for later use.
// The initial state of the automaton is always `1` by convention.
///////////////////////////////////////////////////////////////////
var fstate = 1;
var forwards = [fstate].concat(s.map(function(i) {
return fstate = g.forwards[fstate].transitions[i] || 0;
}));
///////////////////////////////////////////////////////////////////
// An input string does not necessarily match what the parser is
// expecting. When the whole input is read, and the automaton is
// not in an accepting state, then either there is an error in the
// input, or the input is incomplete. The converse does not necess-
// arily hold. For recursive grammars the automaton might be in an
// accepting state even though the input does not match it.
///////////////////////////////////////////////////////////////////
if (g.forwards[fstate].accepts == "0") {
process.stderr.write("failed around " + forwards.indexOf('0'));
return;
}
///////////////////////////////////////////////////////////////////
// The output of the first deterministic finite state transducer is
// then passed through a second one. At the end of the string it
// knows exactly which paths through the graph have led to a match
// and can now trace them back to eliminate matches that failed.
// The output of the second deterministic finite state transducer
// is a concatenation of edges to be added to the parse graph. As
// before, it starts in state `1` by convention.
///////////////////////////////////////////////////////////////////
var bstate = 1;
var edges = forwards.reverse().map(function(i) {
return bstate = g.backwards[bstate].transitions[i] || 0;
}).reverse();
if (as_json) {
var tree = generate_json_formatted_parse_tree(g, edges);
process.stdout.write(tree + "\n");
return;
}
///////////////////////////////////////////////////////////////////
// The `edges` list is just a list of integers, each identifying a
// set of edges. This is useful for post-processing operations, but
// typical applications will need to resolve them to build a graph
// for traversal. As an example, this function will print out the
// whole parse graph as a GraphViz `dot` file that can be rendered.
///////////////////////////////////////////////////////////////////
write_edges_in_graphviz_dot_format(g, edges);
});
function write_edges_in_graphviz_dot_format(g, edges) {
///////////////////////////////////////////////////////////////////
// An edge consists of two vertices and every vertex can have some
// properties. Among them are a type and a label. Important types
// include "start" and "final" vertices. They signify the beginning
// and end of named captures, and pairs of them correspond to non-
// terminal symbols in a grammar. This function combines type and
// label (the name of the non-terminal) into a vertex label.
///////////////////////////////////////////////////////////////////
var print_label = function(offset, v) {
process.stdout.write(util.format('"%s,%s"[label="%s %s"];\n',
offset,
v,
g.vertices[v].type || "",
g.vertices[v].text || v
));
};
process.stdout.write("digraph {\n");
edges.forEach(function(id, ix) {
/////////////////////////////////////////////////////////////////
// There are two kinds of edges associated with edge identifiers.
// "Null" edges represent transitions that do not consume input
// symbols. They are needed to support nesting of non-terminals,
// non-terminals that match the empty string, among other things.
/////////////////////////////////////////////////////////////////
g.null_edges[id].forEach(function(e) {
process.stdout.write(util.format('"%s,%s" -> "%s,%s";\n',
ix, e[0], ix, e[1]));
print_label(ix, e[0]);
print_label(ix, e[1]);
});
/////////////////////////////////////////////////////////////////
// "Char" edges represent transitions that go outside of a set of
// edges (and into the next) because an input symbol has to be
// read to continue on their path. In other words, they are what
// connects the individual edge sets to one another.
/////////////////////////////////////////////////////////////////
g.char_edges[id].forEach(function(e) {
process.stdout.write(util.format('"%s,%s" -> "%s,%s";\n',
ix, e[0], ix + 1, e[1]));
print_label(ix, e[0]);
print_label(ix + 1, e[1]);
});
});
///////////////////////////////////////////////////////////////////
// If there are cycles in the parse graph at the first or the last
// position, it may be necessary to know which of the vertices
// stands for the start symbol of the grammar. They are avaiable:
///////////////////////////////////////////////////////////////////
var start_vertex = g.start_vertex;
var final_vertex = g.final_vertex;
process.stdout.write("}\n");
}
function generate_json_formatted_parse_tree(g, edges) {
var parsers = [{
output: "",
offset: 0,
vertex: g.start_vertex,
stack: []
}];
///////////////////////////////////////////////////////////////////
// To recap, the result of the initial parse is a list of edge sets
// each of which contains two different kinds of sets of edges. The
// vertices encoded therein can have multiple successors. They come
// from ambiguity and recursion in the input grammar. In order to
// exhaustively search for a parse tree within the parse graph, it
// may be necessary to explore all alternative successors of a ver-
// tex. So whenever alternatives are encountered, they are recorded
// in the `parsers` array, and the following algorithm contines un-
// til either a parse tree has been found or until all alternatives
// are exhausted. So the `while` loop takes care of the latter.
///////////////////////////////////////////////////////////////////
while (parsers.length) {
var p = parsers[0];
if (g.vertices[p.vertex].type == "start") {
///////////////////////////////////////////////////////////////
// Finding a parse tree within a parse graph requires matching
// all starting points of non-terminal symbols to corresponding
// end points so boundaries of a match are properly balanced.
// When a starting point is found, it is pushed on to a stack.
///////////////////////////////////////////////////////////////
p.stack.push({"vertex": p.vertex, "offset": p.offset});
var indent = p.stack.map(function(){ return ' '; }).join("");
p.output += "\n" + indent;
p.output += '[' + JSON.stringify(g.vertices[p.vertex].text)
.replace(/,/g, '\\u002c') + ', [';
}
if (g.vertices[p.vertex].type == "final") {
///////////////////////////////////////////////////////////////
// When there is an opportunity to close the match that is on
// the top of the stack, i.e., when a `final` vertex is found
// on the path that is currently being explored, the vertex can
// be compared to the stack's top element, and if they match,
// we can move on to a successor vertex. On the other hand, if
// the stack is empty, the code has taken a wrong turn. It may
// be better to catch this condition using a sentinel value on
// the stack; vertex `0` is reserved for such uses.
///////////////////////////////////////////////////////////////
if (p.stack.length == 0) {
parsers.shift();
continue;
}
var top = p.stack.pop();
///////////////////////////////////////////////////////////////
// The `start` vertices know which `final` vertex they match
// with, and if the top of the stack is not it, then the whole
// parser is dropped, and the loop will try an alternative that
// has been recorded earlier, if any.
///////////////////////////////////////////////////////////////
if (p.vertex != g.vertices[top.vertex]["with"]) {
parsers.shift();
continue;
}
p.output += '], ' + top.offset + ', ' + p.offset + '],';
}
/////////////////////////////////////////////////////////////////
// Actually, `start` and `final` vertices do not mark entrance
// and exit points of recursions, rather they are re-used if more
// than one such pair of points exists. They are distinguished by
// guarding pairs of `prefix` and `suffix` nodes. They actually
// need to be mapped to stack operations; it is optional but very
// convenient for the `start` and `final` vertices.
/////////////////////////////////////////////////////////////////
if (g.vertices[p.vertex].type == "prefix") {
p.stack.push({"vertex": p.vertex, "offset": p.offset});
}
if (g.vertices[p.vertex].type == "suffix") {
if (p.stack.length == 0) {
parsers.shift();
continue;
}
var top = p.stack.pop();
if (p.vertex != g.vertices[top.vertex]["with"]) {
parsers.shift();
continue;
}
}
/////////////////////////////////////////////////////////////////
// For a successfull match of the whole input, there are three
// conditions to be met: the parser must have reached the end of
// the list of edges, which corresponds to the end of the input;
// there must not be open matches left on the stack, and the ver-
// at the end has to be the final vertex of the whole graph. It
// is possible that there is still a loop around the final vertex
// matching the empty string, but we ignore them here.
/////////////////////////////////////////////////////////////////
if (g.final_vertex == p.vertex) {
if (p.offset + 1 >= edges.length)
if (p.stack.length == 0)
return p.output.replace(/,\]/g, ']').replace(/,$/, '');
}
if (p.offset >= edges.length) {
parsers.shift();
continue;
}
/////////////////////////////////////////////////////////////////
// Without a match and without a parsing failure, the path under
// consideration can be explored further. For that the successors
// of the current vertex have to be retrieved from static data.
/////////////////////////////////////////////////////////////////
var cs = g.char_edges[ edges[p.offset] ].filter(function(e) {
return e && e[0] == p.vertex;
}).map(function(e) {
return { successor: e[1], type: "char" };
});
var ns = g.null_edges[ edges[p.offset] ].filter(function(e) {
return e && e[0] == p.vertex;
}).map(function(e) {
return { successor: e[1], type: "null" };
});
var successors = ns.concat(cs);
/////////////////////////////////////////////////////////////////
// Vertices can have an associated `sort_key` to guide the choice
// among alternative successors. A common disambiguation strategy
// is to pick the "first", "left-most" alternative, in which case
// the `sort_key` corresponds the position of grammar constructs
// in the grammar the parsing data is based on. There are other,
// possibly more complex, strategies that can be used instead.
/////////////////////////////////////////////////////////////////
successors.sort(function(a, b) {
return (g.vertices[a.successor].sort_key || 0) -
(g.vertices[b.successor].sort_key || 0);
});
/////////////////////////////////////////////////////////////////
// It is possible that a vertex has no successors at this point,
// even if there are no errors in the parsing data. In such cases
// this parser has failed to match and alternatives are explored.
/////////////////////////////////////////////////////////////////
if (successors.length < 1) {
parsers.shift();
continue;
}
/////////////////////////////////////////////////////////////////
// Sorting based on the `sort_key` leaves the best successor at
// the first position. The current parser will continue with it.
/////////////////////////////////////////////////////////////////
var chosen = successors.shift();
/////////////////////////////////////////////////////////////////
// All other successors, if there are any, are turned into start
// positions for additional parsers, that may be used instead of
// the current one in case it ultimately fails to match.
/////////////////////////////////////////////////////////////////
successors.forEach(function(s) {
parsers.push({
output: p.output,
offset: s.type == "char" ? p.offset + 1 : p.offset,
vertex: s.successor,
stack: p.stack.slice()
});
});
/////////////////////////////////////////////////////////////////
// Finally, if the successor vertex is taken from `char_edges`,
// meaning an input symbol has been consumed to reach it, the
// parser can move on to the next edge and process the successor.
/////////////////////////////////////////////////////////////////
if (chosen.type == "char") {
p.offset += 1;
}
p.vertex = chosen.successor;
}
return "bad";
}