from typing import Dict, Tuple, List, NamedTuple, Any
import torch
from torch.nn.modules.linear import Linear
from nltk import Tree
from allennlp.common.checks import check_dimensions_match, ConfigurationError
from allennlp.data import TextFieldTensors, Vocabulary
from allennlp.modules import Seq2SeqEncoder, TimeDistributed, TextFieldEmbedder, FeedForward
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.span_extractors.span_extractor import SpanExtractor
from allennlp.models.model import Model
from allennlp.nn import InitializerApplicator
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.nn.util import masked_softmax, get_lengths_from_binary_sequence_mask
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.training.metrics import EvalbBracketingScorer, DEFAULT_EVALB_DIR


class SpanInformation(NamedTuple):
"""
A helper namedtuple for handling decoding information.
# Parameters
start : `int`
The start index of the span.
end : `int`
The exclusive end index of the span.
no_label_prob : `float`
The probability of this span being assigned the `NO-LABEL` label.
    label_prob : `float`
        The probability of the most likely label.
    label_index : `int`
        The index of the most likely label in the label vocabulary.
    """
start: int
end: int
label_prob: float
no_label_prob: float
    label_index: int


@Model.register("constituency_parser")
class SpanConstituencyParser(Model):
"""
This `SpanConstituencyParser` simply encodes a sequence of text
with a stacked `Seq2SeqEncoder`, extracts span representations using a
`SpanExtractor`, and then predicts a label for each span in the sequence.
These labels are non-terminal nodes in a constituency parse tree, which we then
greedily reconstruct.
# Parameters
vocab : `Vocabulary`, required
A Vocabulary, required in order to compute sizes for input/output projections.
text_field_embedder : `TextFieldEmbedder`, required
Used to embed the `tokens` `TextField` we get as input to the model.
span_extractor : `SpanExtractor`, required.
The method used to extract the spans from the encoded sequence.
encoder : `Seq2SeqEncoder`, required.
The encoder that we will use in between embedding tokens and
generating span representations.
feedforward : `FeedForward`, required.
The FeedForward layer that we will use in between the encoder and the linear
projection to a distribution over span labels.
pos_tag_embedding : `Embedding`, optional.
Used to embed the `pos_tags` `SequenceLabelField` we get as input to the model.
initializer : `InitializerApplicator`, optional (default=`InitializerApplicator()`)
Used to initialize the model parameters.
evalb_directory_path : `str`, optional (default=`DEFAULT_EVALB_DIR`)
The path to the directory containing the EVALB executable used to score
bracketed parses. By default, will use the EVALB included with allennlp,
which is located at allennlp/tools/EVALB . If `None`, EVALB scoring
is not used.
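
    # Example

    A minimal instantiation sketch (illustrative only: the dimensions and
    vocabulary entries below are arbitrary toy values, not part of this file,
    and `evalb_directory_path=None` disables EVALB scoring):

    ```python
    from allennlp.data import Vocabulary
    from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
    from allennlp.modules.span_extractors import EndpointSpanExtractor
    from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
    from allennlp.modules.token_embedders import Embedding
    import torch

    vocab = Vocabulary()
    vocab.add_tokens_to_namespace(["NO-LABEL", "NP", "VP", "S"], namespace="labels")
    vocab.add_tokens_to_namespace(["the", "dog", "barks"], namespace="tokens")
    parser = SpanConstituencyParser(
        vocab=vocab,
        text_field_embedder=BasicTextFieldEmbedder(
            {"tokens": Embedding(embedding_dim=100,
                                 num_embeddings=vocab.get_vocab_size("tokens"))}
        ),
        # A bidirectional LSTM with hidden size 250 outputs 500 dims per token.
        encoder=PytorchSeq2SeqWrapper(
            torch.nn.LSTM(100, 250, batch_first=True, bidirectional=True)
        ),
        # The default "x,y" combination concatenates both endpoints, so the
        # extractor consumes the encoder's 500-dim outputs.
        span_extractor=EndpointSpanExtractor(input_dim=500),
        evalb_directory_path=None,
    )
    ```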
"""
def __init__(
self,
vocab: Vocabulary,
text_field_embedder: TextFieldEmbedder,
span_extractor: SpanExtractor,
encoder: Seq2SeqEncoder,
feedforward: FeedForward = None,
pos_tag_embedding: Embedding = None,
initializer: InitializerApplicator = InitializerApplicator(),
evalb_directory_path: str = DEFAULT_EVALB_DIR,
**kwargs,
) -> None:
super().__init__(vocab, **kwargs)
self.text_field_embedder = text_field_embedder
self.span_extractor = span_extractor
self.num_classes = self.vocab.get_vocab_size("labels")
self.encoder = encoder
self.feedforward_layer = TimeDistributed(feedforward) if feedforward else None
self.pos_tag_embedding = pos_tag_embedding or None
if feedforward is not None:
output_dim = feedforward.get_output_dim()
else:
output_dim = span_extractor.get_output_dim()
self.tag_projection_layer = TimeDistributed(Linear(output_dim, self.num_classes))
representation_dim = text_field_embedder.get_output_dim()
if pos_tag_embedding is not None:
representation_dim += pos_tag_embedding.get_output_dim()
check_dimensions_match(
representation_dim,
encoder.get_input_dim(),
"representation dim (tokens + optional POS tags)",
"encoder input dim",
)
check_dimensions_match(
encoder.get_output_dim(),
span_extractor.get_input_dim(),
"encoder input dim",
"span extractor input dim",
)
if feedforward is not None:
check_dimensions_match(
span_extractor.get_output_dim(),
feedforward.get_input_dim(),
"span extractor output dim",
"feedforward input dim",
)
self.tag_accuracy = CategoricalAccuracy()
if evalb_directory_path is not None:
self._evalb_score = EvalbBracketingScorer(evalb_directory_path)
else:
self._evalb_score = None
        initializer(self)

def forward(
self, # type: ignore
tokens: TextFieldTensors,
spans: torch.LongTensor,
metadata: List[Dict[str, Any]],
pos_tags: TextFieldTensors = None,
span_labels: torch.LongTensor = None,
) -> Dict[str, torch.Tensor]:
"""
# Parameters
tokens : `TextFieldTensors`, required
The output of `TextField.as_array()`, which should typically be passed directly to a
`TextFieldEmbedder`. This output is a dictionary mapping keys to `TokenIndexer`
            tensors. At its most basic, using a `SingleIdTokenIndexer`, this is: `{"tokens":
Tensor(batch_size, num_tokens)}`. This dictionary will have the same keys as were used
for the `TokenIndexers` when you created the `TextField` representing your
sequence. The dictionary is designed to be passed directly to a `TextFieldEmbedder`,
which knows how to combine different word representations into a single vector per
token in your input.
spans : `torch.LongTensor`, required.
A tensor of shape `(batch_size, num_spans, 2)` representing the
inclusive start and end indices of all possible spans in the sentence.
        metadata : `List[Dict[str, Any]]`, required.
            A list of metadata dictionaries, one per batch element, each with keys:
tokens : `List[str]`, required.
The original string tokens in the sentence.
gold_tree : `nltk.Tree`, optional (default = `None`)
Gold NLTK trees for use in evaluation.
pos_tags : `List[str]`, optional.
The POS tags for the sentence. These can be used in the
model as embedded features, but they are passed here
in addition for use in constructing the tree.
pos_tags : `torch.LongTensor`, optional (default = `None`)
The output of a `SequenceLabelField` containing POS tags.
span_labels : `torch.LongTensor`, optional (default = `None`)
A torch tensor representing the integer gold class labels for all possible
spans, of shape `(batch_size, num_spans)`.
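
        As a concrete illustration of the span format: for a 3-token sentence, the
        six possible spans with inclusive ends are
        `[0, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 2]`; padded spans use negative
        indices, which is what the `spans[:, :, 0] >= 0` mask in the body checks for.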
# Returns
An output dictionary consisting of:
class_probabilities : `torch.FloatTensor`
A tensor of shape `(batch_size, num_spans, span_label_vocab_size)`
representing a distribution over the label classes per span.
spans : `torch.LongTensor`
The original spans tensor.
tokens : `List[List[str]]`, required.
A list of tokens in the sentence for each element in the batch.
pos_tags : `List[List[str]]`, required.
A list of POS tags in the sentence for each element in the batch.
num_spans : `torch.LongTensor`, required.
A tensor of shape (batch_size), representing the lengths of non-padded spans
in `enumerated_spans`.
loss : `torch.FloatTensor`, optional
A scalar loss to be optimised.
"""
embedded_text_input = self.text_field_embedder(tokens)
if pos_tags is not None and self.pos_tag_embedding is not None:
embedded_pos_tags = self.pos_tag_embedding(pos_tags)
embedded_text_input = torch.cat([embedded_text_input, embedded_pos_tags], -1)
elif self.pos_tag_embedding is not None:
raise ConfigurationError("Model uses a POS embedding, but no POS tags were passed.")
mask = get_text_field_mask(tokens)
# Looking at the span start index is enough to know if
# this is padding or not. Shape: (batch_size, num_spans)
span_mask = (spans[:, :, 0] >= 0).squeeze(-1)
if span_mask.dim() == 1:
            # This happens if you use batch_size 1 and encounter
            # a length 1 sentence in PTB; those do exist. -.-
span_mask = span_mask.unsqueeze(-1)
if span_labels is not None and span_labels.dim() == 1:
span_labels = span_labels.unsqueeze(-1)
num_spans = get_lengths_from_binary_sequence_mask(span_mask)
encoded_text = self.encoder(embedded_text_input, mask)
span_representations = self.span_extractor(encoded_text, spans, mask, span_mask)
if self.feedforward_layer is not None:
span_representations = self.feedforward_layer(span_representations)
logits = self.tag_projection_layer(span_representations)
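        # masked_softmax gives each real span a distribution over labels and
        # zeroes out the rows that correspond to padded spans.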
class_probabilities = masked_softmax(logits, span_mask.unsqueeze(-1))
output_dict = {
"class_probabilities": class_probabilities,
"spans": spans,
"tokens": [meta["tokens"] for meta in metadata],
"pos_tags": [meta.get("pos_tags") for meta in metadata],
"num_spans": num_spans,
}
if span_labels is not None:
loss = sequence_cross_entropy_with_logits(logits, span_labels, span_mask)
self.tag_accuracy(class_probabilities, span_labels, span_mask)
output_dict["loss"] = loss
# The evalb score is expensive to compute, so we only compute
# it for the validation and test sets.
batch_gold_trees = [meta.get("gold_tree") for meta in metadata]
if all(batch_gold_trees) and self._evalb_score is not None and not self.training:
gold_pos_tags: List[List[str]] = [
list(zip(*tree.pos()))[1] for tree in batch_gold_trees
]
predicted_trees = self.construct_trees(
class_probabilities.cpu().data,
spans.cpu().data,
num_spans.data,
output_dict["tokens"],
gold_pos_tags,
)
self._evalb_score(predicted_trees, batch_gold_trees)
        return output_dict

def make_output_human_readable(
self, output_dict: Dict[str, torch.Tensor]
) -> Dict[str, torch.Tensor]:
"""
Constructs an NLTK `Tree` given the scored spans. We also switch to exclusive
span ends when constructing the tree representation, because it makes indexing
into lists cleaner for ranges of text, rather than individual indices.
Finally, for batch prediction, we will have padded spans and class probabilities.
In order to make this less confusing, we remove all the padded spans and
distributions from `spans` and `class_probabilities` respectively.
"""
all_predictions = output_dict["class_probabilities"].cpu().data
all_spans = output_dict["spans"].cpu().data
all_sentences = output_dict["tokens"]
all_pos_tags = output_dict["pos_tags"] if all(output_dict["pos_tags"]) else None
num_spans = output_dict["num_spans"].data
trees = self.construct_trees(
all_predictions, all_spans, num_spans, all_sentences, all_pos_tags
)
batch_size = all_predictions.size(0)
output_dict["spans"] = [all_spans[i, : num_spans[i]] for i in range(batch_size)]
output_dict["class_probabilities"] = [
all_predictions[i, : num_spans[i], :] for i in range(batch_size)
]
output_dict["trees"] = trees
        return output_dict

def construct_trees(
self,
predictions: torch.FloatTensor,
all_spans: torch.LongTensor,
num_spans: torch.LongTensor,
sentences: List[List[str]],
pos_tags: List[List[str]] = None,
) -> List[Tree]:
"""
        Construct an `nltk.Tree` for each batch element by greedily nesting spans.
The trees use exclusive end indices, which contrasts with how spans are
represented in the rest of the model.
# Parameters
predictions : `torch.FloatTensor`, required.
A tensor of shape `(batch_size, num_spans, span_label_vocab_size)`
representing a distribution over the label classes per span.
all_spans : `torch.LongTensor`, required.
A tensor of shape (batch_size, num_spans, 2), representing the span
indices we scored.
num_spans : `torch.LongTensor`, required.
A tensor of shape (batch_size), representing the lengths of non-padded spans
in `enumerated_spans`.
sentences : `List[List[str]]`, required.
A list of tokens in the sentence for each element in the batch.
pos_tags : `List[List[str]]`, optional (default = `None`).
A list of POS tags for each word in the sentence for each element
in the batch.
# Returns
A `List[Tree]` containing the decoded trees for each element in the batch.
"""
# Switch to using exclusive end spans.
exclusive_end_spans = all_spans.clone()
exclusive_end_spans[:, :, -1] += 1
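        # e.g. an inclusive span [2, 4] (covering tokens 2, 3 and 4) becomes [2, 5).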
no_label_id = self.vocab.get_token_index("NO-LABEL", "labels")
trees: List[Tree] = []
for batch_index, (scored_spans, spans, sentence) in enumerate(
zip(predictions, exclusive_end_spans, sentences)
):
selected_spans = []
for prediction, span in zip(
scored_spans[: num_spans[batch_index]], spans[: num_spans[batch_index]]
):
start, end = span
no_label_prob = prediction[no_label_id]
label_prob, label_index = torch.max(prediction, -1)
# Does the span have a label != NO-LABEL or is it the root node?
# If so, include it in the spans that we consider.
if int(label_index) != no_label_id or (start == 0 and end == len(sentence)):
selected_spans.append(
SpanInformation(
start=int(start),
end=int(end),
label_prob=float(label_prob),
no_label_prob=float(no_label_prob),
label_index=int(label_index),
)
)
# The spans we've selected might overlap, which causes problems when we try
# to construct the tree as they won't nest properly.
consistent_spans = self.resolve_overlap_conflicts_greedily(selected_spans)
spans_to_labels = {
(span.start, span.end): self.vocab.get_token_from_index(span.label_index, "labels")
for span in consistent_spans
}
sentence_pos = pos_tags[batch_index] if pos_tags is not None else None
trees.append(self.construct_tree_from_spans(spans_to_labels, sentence, sentence_pos))
        return trees

@staticmethod
def resolve_overlap_conflicts_greedily(spans: List[SpanInformation]) -> List[SpanInformation]:
"""
Given a set of spans, removes spans which overlap by evaluating the difference
in probability between one being labeled and the other explicitly having no label
        and vice versa. The worst-case time complexity of this method is `O(k * n^4)`,
        where `n` is the length of the sentence from which the spans were enumerated
        (equivalently, `O(k * m^2)` in the number of spans `m`, since `m = O(n^2)`)
        and `k` is the number of conflicts. In practice, however, there are very few
        conflicts. Hopefully.
This function modifies `spans` to remove overlapping spans.
# Parameters
spans : `List[SpanInformation]`, required.
A list of spans, where each span is a `namedtuple` containing the
following attributes:
start : `int`
The start index of the span.
end : `int`
The exclusive end index of the span.
no_label_prob : `float`
The probability of this span being assigned the `NO-LABEL` label.
label_prob : `float`
The probability of the most likely label.
# Returns
A modified list of `spans`, with the conflicts resolved by considering local
differences between pairs of spans and removing one of the two spans.
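
        For example, if spans `(1, 4)` and `(2, 6)` overlap, with
        `label_prob=0.9, no_label_prob=0.05` for the first and
        `label_prob=0.6, no_label_prob=0.3` for the second, then keeping the first
        and dropping the second scores `0.9 + 0.3 = 1.2`, while the reverse scores
        `0.6 + 0.05 = 0.65`, so the second span is removed.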
"""
conflicts_exist = True
while conflicts_exist:
conflicts_exist = False
for span1_index, span1 in enumerate(spans):
for span2_index, span2 in list(enumerate(spans))[span1_index + 1 :]:
if (
span1.start < span2.start < span1.end < span2.end
or span2.start < span1.start < span2.end < span1.end
):
# The spans overlap.
conflicts_exist = True
                        # Which is more likely: that span1 was labeled and span2
                        # was unlabeled, or that span2 was labeled and span1 was
                        # unlabeled? In the first case we delete span2 from the
                        # set of spans used to form the tree; in the second case
                        # we delete span1.
if (
span1.no_label_prob + span2.label_prob
< span2.no_label_prob + span1.label_prob
):
spans.pop(span2_index)
else:
spans.pop(span1_index)
break
        return spans

@staticmethod
def construct_tree_from_spans(
spans_to_labels: Dict[Tuple[int, int], str], sentence: List[str], pos_tags: List[str] = None
) -> Tree:
"""
# Parameters
spans_to_labels : `Dict[Tuple[int, int], str]`, required.
A mapping from spans to constituency labels.
sentence : `List[str]`, required.
A list of tokens forming the sentence to be parsed.
pos_tags : `List[str]`, optional (default = `None`)
A list of the pos tags for the words in the sentence, if they
were either predicted or taken as input to the model.
# Returns
An `nltk.Tree` constructed from the labelled spans.
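
        # Example

        For `sentence = ["the", "dog", "barks"]`, `pos_tags = ["DT", "NN", "VBZ"]`
        (toy values) and `spans_to_labels = {(0, 2): "NP", (2, 3): "VP", (0, 3): "S"}`,
        this returns the tree `(S (NP (DT the) (NN dog)) (VP (VBZ barks)))`.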
"""
def assemble_subtree(start: int, end: int):
if (start, end) in spans_to_labels:
                # Some labels contain nested spans, e.g. `S-VP`.
# We actually want to create (S (VP ...)) nodes
# for these labels, so we split them up here.
labels: List[str] = spans_to_labels[(start, end)].split("-")
else:
labels = None
# This node is a leaf.
if end - start == 1:
word = sentence[start]
pos_tag = pos_tags[start] if pos_tags is not None else "XX"
tree = Tree(pos_tag, [word])
if labels is not None and pos_tags is not None:
# If POS tags were passed explicitly,
# they are added as pre-terminal nodes.
while labels:
tree = Tree(labels.pop(), [tree])
                elif labels is not None:
                    # Otherwise, no POS tags were passed, so the innermost
                    # label attaches directly to the word.
tree = Tree(labels.pop(), [word])
while labels:
tree = Tree(labels.pop(), [tree])
return [tree]
argmax_split = start + 1
            # Find the largest left-hand subspan (start, split) that was
            # predicted to be a constituent; default to splitting after
            # the first word.
for split in range(end - 1, start, -1):
if (start, split) in spans_to_labels:
argmax_split = split
break
left_trees = assemble_subtree(start, argmax_split)
right_trees = assemble_subtree(argmax_split, end)
children = left_trees + right_trees
if labels is not None:
while labels:
children = [Tree(labels.pop(), children)]
return children
tree = assemble_subtree(0, len(sentence))
        return tree[0]

def get_metrics(self, reset: bool = False) -> Dict[str, float]:
all_metrics = {}
all_metrics["tag_accuracy"] = self.tag_accuracy.get_metric(reset=reset)
if self._evalb_score is not None:
evalb_metrics = self._evalb_score.get_metric(reset=reset)
all_metrics.update(evalb_metrics)
        return all_metrics

default_predictor = "constituency_parser"
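

# A minimal, self-contained sketch (an illustration added here, not part of the
# model itself) exercising the two static decoding helpers above on toy values.
# It assumes only that allennlp and nltk are installed so this module imports.
if __name__ == "__main__":
    toy_spans = [
        SpanInformation(start=1, end=4, label_prob=0.9, no_label_prob=0.05, label_index=2),
        SpanInformation(start=2, end=6, label_prob=0.6, no_label_prob=0.3, label_index=3),
    ]
    # The spans overlap (1 < 2 < 4 < 6) and 0.05 + 0.6 < 0.3 + 0.9, so the
    # (2, 6) span is dropped by the greedy conflict resolution.
    kept = SpanConstituencyParser.resolve_overlap_conflicts_greedily(toy_spans)
    assert [(span.start, span.end) for span in kept] == [(1, 4)]

    tree = SpanConstituencyParser.construct_tree_from_spans(
        spans_to_labels={(0, 2): "NP", (2, 3): "VP", (0, 3): "S"},
        sentence=["the", "dog", "barks"],
        pos_tags=["DT", "NN", "VBZ"],
    )
    print(tree)  # (S (NP (DT the) (NN dog)) (VP (VBZ barks)))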