-
Notifications
You must be signed in to change notification settings - Fork 4
/
markup_convert.py
executable file
·882 lines (837 loc) · 35.1 KB
/
markup_convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
#!/usr/bin/env python3
# Convert using a markedup-style file
import sys, re
import trees, category, rule
# Stream that receives log output; convert() rebinds it to its `log` argument.
log_out = sys.stdout
# Reported (negated) as the first element of convert()'s result.
# NOTE(review): no visible code ever sets this to True - confirm intent.
contains_bs = False
# Switch consulted by verbose_print().  The second assignment wins at import
# time, and convert() overwrites the value from its argv on every call.
VERBOSE = False
VERBOSE = True
def verbose_print(text):
    """Write ``text`` to the current log stream, but only in verbose mode."""
    if not VERBOSE:
        return
    print(text, file=log_out)
markup_info = {}


def read_markup(markup_file):
    """Load markedup entries from ``markup_file`` into ``markup_info``.

    ``markup_file`` is any iterable of text lines.  Entries are separated by
    blank lines and look like::

        (NP\\NP)/NP
          2 ((NP{Y}\\NP{Y}<1>){_}/NP{Z}<2>){_}
          (PP 0 2)
          (NP 1 0)

    The first line of an entry is the key; the remaining (stripped) lines
    are stored as its value.  Lines starting with ``#`` are ignored.  The
    data is only read once: if ``markup_info`` already has content the call
    does nothing.
    """
    global markup_info
    # Only read the markup info once
    if len(markup_info) > 0:
        return

    def store(entry):
        # An entry with no lines at all (consecutive blank lines) is skipped.
        if entry:
            markup_info[entry[0]] = entry[1:]

    cur = []
    for line in markup_file:
        line = line.strip()
        if not line:
            store(cur)
            cur = []
        elif line[0] != '#':
            cur.append(line)
    # The final entry used to be lost when the input did not end with a
    # blank line; flush whatever is still pending.
    store(cur)
def get_balanced_point(text, start_index, deeper, shallower):
    """Find the bracket that matches the one at ``text[start_index]``.

    Scans forward from ``start_index`` counting ``deeper`` characters up and
    ``shallower`` characters down, and returns the first index at which the
    count is zero.  Returns -1 if the end of ``text`` is reached first.
    """
    nesting = 0
    index = start_index
    limit = len(text)
    while index < limit:
        char = text[index]
        if char == deeper:
            nesting += 1
        if char == shallower:
            nesting -= 1
        if nesting == 0:
            return index
        index += 1
    return -1
# Next number handed out when placeholder numbers are made unique.
UNIQUE_ID = 100


class Schema:
    """A partially built bracketed tree described by one schema line.

    A schema such as ``(NP 0 1)`` has a label and a list of children.  A
    child is a nested Schema, a piece of finished tree text (a string that
    contains ``(``), a numbered placeholder that still has to be filled, or
    an empty string left behind by insert().  ``incomplete`` maps every
    placeholder number to the ``(placeholder_text, owning_schema)`` pairs
    that use it.  ``parent`` holds the schema lines that follow this one.
    A schema written as ``{(...)}`` is marked ``delete_on_adoption``: its
    children are spliced into whatever it is inserted into.
    """

    # Labels found in this string (substring test) are treated as punctuation.
    _PUNCTUATION = "~!@#$%^&*()_+{}|:<>?,./;'[]\\=-`"
    _BRACKET_LABELS = ['LRB', 'RRB']

    def __init__(self, lines, uniqued=False, argument=None, source_node=None):
        """Parse a schema.

        ``lines`` is either the schema text itself or a list whose first
        item is the schema and whose remaining items become ``parent``.  An
        empty list yields the default ``(TEMP 0)`` schema.  When the first
        line carries an ``arg`` condition, the line to use is chosen by
        comparing the conditions with ``argument``.  Unless ``uniqued`` is
        true, every number in the text is replaced by a fresh unique ID.
        """
        self.source = source_node
        text = '(TEMP 0)'
        self.parent = []
        self.children = []
        self.rule = 'unk'
        self.get_label_from_argument = False
        if isinstance(lines, str):
            text = lines
        elif isinstance(lines, list) and lines:
            text = lines[0]
            # only one parent, which is the schema this will insert into
            self.parent = lines[1:]
            if 'arg' in text:
                text = self._choose_by_argument(lines, argument)
        if text[-1] not in ')}':
            # Drop the trailing condition (e.g. "POS:..." or "arg:...:").
            text = ' '.join(text.split(':')[0].split()[:-1])

        # change numbers in text to be a unique ID
        text = text.strip()
        self.zero = None
        if not uniqued:
            text = self._assign_unique_ids(text)
        self.schema = text

        # determine if this node is to be deleted
        self.delete_on_adoption = (
            self.schema.startswith('{(') and self.schema.endswith(')}'))
        self.label = self.schema.split()[0].strip('{(')
        if '*' in self.label:
            self.get_label_from_argument = True
            self.label = self.label[:-1]

        self.children = []    # the tree
        self.incomplete = {}  # elements somewhere in the tree still to fill
        # Text between the first '(' and the last ')'.
        tschema = ')'.join('('.join(self.schema.split('(')[1:]).split(')')[:-1])
        pos = len(tschema.split()[0])  # jump to after the label
        while pos < len(tschema):
            if tschema[pos] == '(':
                # Create a subtree for this bracket set
                balance = get_balanced_point(tschema, pos, '(', ')')
                subschema = Schema(tschema[pos:balance + 1], uniqued=True,
                                   source_node=self.source)
                self.children.append(subschema)
                for key in subschema.incomplete:
                    if key not in self.incomplete:
                        self.incomplete[key] = []
                    self.incomplete[key] += subschema.incomplete[key]
                pos = balance
            elif tschema[pos] == ' ':
                if tschema[pos + 1] != '(':
                    left = pos + 1
                    right = left
                    while right < len(tschema) and tschema[right] in '1234567890{}<>':
                        right += 1
                    right -= 1
                    placeholder = tschema[left:right + 1]
                    self.children.append(placeholder)
                    num = int(placeholder.strip('{}<>'))
                    if num not in self.incomplete:
                        self.incomplete[num] = []
                    self.incomplete[num].append((placeholder, self))
                    pos = right
            pos += 1

    def _choose_by_argument(self, lines, argument):
        """Pick the schema line whose ``arg:`` condition fits ``argument``.

        Lines are scanned in order; a later matching line replaces an
        earlier one.  The ``arg:default`` line is used only if nothing
        matched before it, and every line after it becomes ``self.parent``.
        Returns None when no line was selected.
        """
        chosen = None
        self.parent = []
        to_parent = False
        for line in lines:
            if to_parent:
                self.parent.append(line)
            elif 'arg:default' in line:
                if chosen is None:
                    chosen = line
                to_parent = True
            elif argument is not None:
                constraint = line.split('arg:')[1].split(':')[0]
                if self._constraint_matches(constraint, argument):
                    chosen = line
        return chosen

    @staticmethod
    def _constraint_matches(constraint, argument):
        """Return True if ``argument`` satisfies one ``arg:`` constraint.

        A plain constraint is compared with the argument's label.  A
        bracketed constraint lists child labels, optionally with a leading
        or trailing ``...``, and is compared with the argument's children
        after punctuation children are skipped.
        """
        if '(' not in constraint:
            if isinstance(argument, Schema):
                return constraint == argument.label
            if isinstance(argument, str):
                return argument[1:].split()[0] == constraint
            return False
        if not isinstance(argument, Schema):
            return False

        labels = constraint[1:-1].split()
        children = []
        for child in argument.children:
            if isinstance(child, Schema):
                if child.label not in ":,.;":
                    children.append(child.label)
            elif isinstance(child, str):
                # child[1:2] is '' for empty placeholders, which are skipped.
                if child[1:2] not in ":,.;":
                    children.append(child[1:].split()[0])
            else:
                children.append(child.label)

        if '...' in labels:
            fixed = len(labels) - 1
            if fixed > len(children):
                return False
            if labels[0] == '...':
                return labels[1:] == children[len(children) - fixed:]
            if labels[-1] == '...':
                return labels[:-1] == children[:fixed]
            print('... in the middle of arguments is not yet supported')
            return False
        return labels == children

    def _assign_unique_ids(self, text):
        """Replace each number in ``text`` by a fresh ID and record ``zero``.

        Equal numbers within one schema map to the same ID.  ``self.zero``
        becomes the ID that replaced 0, or stays None if 0 does not occur.
        """
        mapping = {}

        def renumber(match):
            global UNIQUE_ID
            num = int(match.group())
            if num not in mapping:
                mapping[num] = UNIQUE_ID
                UNIQUE_ID += 1
            return str(mapping[num])

        # A regex scan cannot run past the end of the text, unlike the
        # former hand-written digit loop.
        text = re.sub('[0-9]+', renumber, text)
        self.zero = mapping.get(0)
        return text

    @classmethod
    def _is_punctuation_label(cls, label):
        """Return True for labels treated as punctuation or brackets."""
        return label in cls._PUNCTUATION or label in cls._BRACKET_LABELS

    @staticmethod
    def _normalise_N(schema):
        """Mark or relabel N-like nodes before they take part in a conj."""
        if schema.label in ['Nslash', 'Nnum']:
            schema.delete_on_adoption = True
        if schema.label == 'N':
            if len(schema.children) > 1:
                schema.label = 'NX'
            else:
                schema.delete_on_adoption = True

    @staticmethod
    def _flatten_NX(schema):
        """Replace each direct NX child of ``schema`` by that child's children."""
        pos = 0
        while pos < len(schema.children):
            child = schema.children[pos]
            if isinstance(child, Schema) and child.label == 'NX':
                schema.children = (schema.children[:pos] + child.children
                                   + schema.children[pos + 1:])
                pos += len(child.children) - 1
            pos += 1

    def PTB_tree(self):
        """Return the bracketed text of the filled part of this schema.

        Unfilled placeholders are left out.  Returns '' when nothing is
        filled; a ``delete_on_adoption`` node contributes only its children.
        """
        child_texts = []
        for child in self.children:
            if not isinstance(child, str):
                child_texts.append(child.PTB_tree())
            elif '(' in child:
                child_texts.append(child)
        if len(child_texts) == 0:
            return ''
        body = ' '.join(child_texts)
        if self.delete_on_adoption:
            return body
        return '(' + self.label + ' ' + body + ')'

    def __repr__(self):
        child_ans = []
        for child in self.children:
            if isinstance(child, str):
                child_ans.append(child)
            else:
                child_ans.append('obj')
        ans = ' schema: ' + self.schema + ' cur: '
        if self.delete_on_adoption:
            ans += '{'
        ans += '(' + self.label + ' ' + ' '.join(child_ans) + ')'
        if self.delete_on_adoption:
            ans += '}'
        ans += ' incomplete:'
        for thing in self.incomplete:
            entries = self.incomplete[thing]
            if not entries:
                # conj_part2 can leave an empty list behind; skip it rather
                # than fail while producing debug output.
                continue
            ans += ' ('
            ans += str(entries[0][0])
            if entries[0][1] == self:
                ans += ', self)'
            else:
                ans += ', other)'
        for schema in self.parent:
            ans += '\n' + schema
        return ans

    def insert(self, ID, value):
        """Fill every placeholder numbered ``ID`` with ``value``.

        Placeholder decorations control how ``value`` is used: ``{n}`` takes
        the children of ``value`` instead of ``value`` itself; a leading
        ``>`` keeps only the first item on its first occurrence and drops
        the first item on its second; a trailing ``<`` drops the last item
        on its first occurrence and keeps only the last item on its second.
        Returns ``self``, or None (after reporting) when ``ID`` is None.
        """
        if ID is None:
            print("Insert with None ID requested", file=log_out)
            print("Insert with None ID requested", file=sys.stderr)
            return None

        if ID != self.zero and self.get_label_from_argument:
            try:
                if not isinstance(value, str):
                    if not value.delete_on_adoption:
                        self.label = value.label
            except (AttributeError, TypeError):
                # value is not schema-like (e.g. a list); keep the label
                pass

        original = value
        keep_left = False
        delete_left = False
        keep_right = False
        delete_right = False
        stop = False
        entries = self.incomplete.pop(ID)
        for entry in entries:
            value = original
            text = entry[0]
            parent = entry[1]

            # find the position
            index = 0
            while index < len(parent.children):
                if parent.children[index] == text:
                    break
                index += 1
            del parent.children[index]

            if text[0] == '>':
                if not keep_left:
                    keep_left = True
                    delete_left = False
                else:
                    keep_left = False
                    delete_left = True
                text = text[1:]
            if text[-1] == '<':
                if not delete_right:
                    delete_right = True
                    keep_right = False
                else:
                    delete_right = False
                    keep_right = True
                text = text[:-1]
            if text[0] == '{' and text[-1] == '}':
                try:
                    if len(value.children) > 0:
                        value = value.children
                except (AttributeError, TypeError):
                    # doesn't have sub-parts, ignore deletion {}
                    # can happen if we have a list, or a string
                    pass
                text = text[1:-1]
            if isinstance(value, Schema) and value.delete_on_adoption:
                value = value.children

            if stop:
                parent.children.insert(index, '')
            elif not isinstance(value, list):
                parent.children.insert(index, value)
                if keep_left or delete_left or keep_right or delete_right:
                    # The whole value went into the first slot; later
                    # slots for this ID only get an empty marker.
                    stop = True
            elif keep_left:
                parent.children.insert(index, value[0])
            elif delete_left:
                parent.children = (parent.children[:index] + value[1:]
                                   + parent.children[index:])
            elif keep_right:
                parent.children.insert(index, value[-1])
            elif delete_right:
                parent.children = (parent.children[:index] + value[:-1]
                                   + parent.children[index:])
            else:
                parent.children = (parent.children[:index] + value
                                   + parent.children[index:])
        # When complete pass self to parent
        return self

    def set_zero(self, thing):
        """Fill the placeholder that was written as 0 and return ``self``."""
        self.insert(self.zero, thing)
        return self

    def get_argument_key(self, key_no=0):
        """Return the ``key_no``-th unfilled placeholder ID, or None.

        A message is written to the log and to stderr when the schema has
        no unfilled placeholders.
        """
        if len(self.incomplete) == 0:
            print("Trying to insert into a complete schema!", file=log_out)
            print("Trying to insert into a complete schema!", file=sys.stderr)
        else:
            for val in self.incomplete:
                if key_no == 0:
                    return val
                key_no -= 1
        return None

    # fa.f and fa.b - Function application
    def fa(self, argument, combinator):
        """Apply this schema to ``argument``.

        The first unfilled placeholder receives ``argument``.  When nothing
        is left to fill, the two are glommed together instead, in the order
        given by ``combinator`` ('fa.f' keeps ``self`` on the left).
        """
        key = self.get_argument_key()
        if key is None:
            if combinator == 'fa.f':
                return self.glom(argument)
            return argument.glom(self)
        # fill the incomplete argument with the argument
        self.insert(key, argument)
        if 'conj1' == argument.rule:
            self._flatten_NX(self)
        return self

    # fc.f and fc.b - Function composition
    def fc(self, argument):
        """Compose with ``argument`` and take over its unfilled placeholders."""
        # fill the incomplete argument with the argument
        self.insert(self.get_argument_key(), argument)

        # add the unfilled arguments of the argument to the incomplete
        # arguments of self
        for key in argument.incomplete:
            self.incomplete[key] = []
            for entry in argument.incomplete[key]:
                used = False
                for child in self.children:
                    if child == entry[0]:
                        used = True
                        self.incomplete[key].append((entry[0], self))
                        break
                if not used:
                    self.incomplete[key].append((entry[0], entry[1]))
        argument.incomplete = {}
        return self

    # bs.f and bs.b - Crossed substitution
    def bs(self, argument):
        """Crossed substitution is not implemented.

        Raises NotImplementedError after reporting the call.  The previous
        code ended in ``return nlevel``, an undefined name, so every call
        failed with a NameError.
        """
        print('bs is not implemented - this should not have been called')
        print('bs is not implemented - this should not have been called',
              file=sys.stderr)
        raise NotImplementedError('bs (crossed substitution) is not implemented')

    def is_empty(self):
        """Return True if no finished tree text exists below this schema."""
        for child in self.children:
            if isinstance(child, Schema):
                if not child.is_empty():
                    return False
            elif child[:1] == '(':
                # [:1] rather than [0]: insert() can leave '' children.
                return False
        return True

    # cc.b - Backwards crossed composition
    def back_cross(self, argument):
        """Attach ``argument`` after the last filled part of this schema."""
        left = get_next_incomplete_schema(self, argument)
        pos, children = left.get_last_partial_subtree()
        if pos < 0:
            pos = 0
            children = left.children
        argument = get_next_incomplete_schema(argument, None)
        left.parent = argument.parent
        non_empty_children = []
        for child in argument.children:
            if isinstance(child, Schema):
                if not child.is_empty():
                    non_empty_children.append(child)
            elif child[:1] == '(':
                non_empty_children.append(child)
        if len(non_empty_children) == 1:
            argument = non_empty_children[0]
        children.insert(pos, argument)
        return left

    # Type raising
    def tr(self, child):
        """Insert ``child`` as element zero of this type-raised schema.

        When both labels start with the same character and this schema is
        kept, ``child`` is marked for deletion first.
        """
        if self.label[0] == child.label[0] and not self.delete_on_adoption:
            child.delete_on_adoption = True
        self.set_zero(child)
        return self

    # one of the special binary combination rules defined in rule.py
    def special_binary(self, right, new_schemas):
        """Place ``self`` and ``right`` into ``new_schemas`` and return it."""
        new_schemas.set_zero(self)
        new_schemas.insert(new_schemas.get_argument_key(), right)
        return new_schemas

    # one of the special unary combination rules defined in rule.py
    def special_unary(self, unary_schema):
        """Place ``self`` into ``unary_schema`` and return it."""
        unary_schema.set_zero(self)
        return unary_schema

    def conj_part1(self, right):
        """Combine a conjunction (``self``) with its right conjunct.

        Creates a new node labelled like ``right`` with these two as
        children and returns it with ``rule`` set to 'conj1'.
        """
        self._normalise_N(right)

        left = self
        if len(left.children) == 1:
            left = left.children[0]

        # detect a list and set right to be deleted
        is_list = False
        if len(right.children) > 2:
            middle = right.children[1]
            if type(middle) == type(left):
                if middle == left or (left == '(, ,)' and 'CC' in middle):
                    first = right.children[0]
                    third = right.children[2]
                    if isinstance(first, Schema) and isinstance(third, Schema):
                        if first.label == third.label:
                            is_list = True
        if is_list:
            right.delete_on_adoption = True

        nlevel = Schema(['(%s 0 1)' % right.label] + right.parent,
                        source_node=right.source)
        nlevel.set_zero(left)
        nlevel.insert(nlevel.get_argument_key(), right)
        if nlevel.label == 'TEMP':
            nlevel.delete_on_adoption = True

        # move unfilled arguments
        for key in right.incomplete:
            nlevel.incomplete[key] = []
            for entry in right.incomplete[key]:
                text = entry[0]
                parent = entry[1]
                if text in nlevel.children:
                    nlevel.children.remove(text)
                if text == parent.children[-1]:
                    nlevel.children.append(text)
                else:
                    nlevel.children.insert(0, text)
                nlevel.incomplete[key].append((text, nlevel))
        nlevel.rule = 'conj1'
        return nlevel

    def conj_part2(self, right):
        """Combine the left conjunct (``self``) with a conj1 result.

        Punctuation is glommed on instead.  Otherwise a new node is created
        (labelled UCP when the two labels differ) and returned with ``rule``
        set to 'conj2'.
        """
        if self._is_punctuation_label(self.label):
            # glom self on instead
            return self.glom(right)

        # check labels
        self._normalise_N(self)
        if self.label != 'NX':
            self._flatten_NX(right)

        nlabel = self.label
        if nlabel != right.label:
            nlabel = 'UCP'

        # check for VPs that are being conjed
        # NOTE: exceptions raised by the probes below (missing children,
        # Schema objects where strings are expected) deliberately abandon
        # the whole check, so the statements are kept exactly as they were.
        try:
            remove_VPs = False
            print(self.label, self.children[0], file=log_out)
            if self.label == 'VP' and 'VB' in self.children[0]:
                all_empty = True
                print(self.children[1:], file=log_out)
                for child in self.children[1:]:
                    if type(child) != type('') or child[0] == '(':
                        all_empty = False
                if all_empty:
                    print(right.label, right.children[1].label,
                          right.children[1].children[0], file=log_out)
                    if (right.label == 'VP'
                            and right.children[1].label == 'VP'
                            and 'VB' in right.children[1].children[0]):
                        all_empty = True
                        print(right.children[1].children[1:], file=log_out)
                        for child in right.children[1].children[1:]:
                            if type(child) != type('') or child[0] == '(':
                                all_empty = False
                        if all_empty:
                            remove_VPs = True
            if remove_VPs:
                self.delete_on_adoption = True
                right.children[1] = right.children[1].children[0]
        except Exception:
            pass

        nlevel = Schema(['(%s 0 {1})' % nlabel] + self.parent,
                        source_node=self.source)
        nlevel.set_zero(self)
        nlevel.insert(nlevel.get_argument_key(), right)
        if nlevel.label == 'TEMP':
            nlevel.delete_on_adoption = True

        # move unfilled arguments
        for key in self.incomplete:
            nlevel.incomplete[key] = []
            for entry in self.incomplete[key]:
                text = entry[0]
                parent = entry[1]
                if text in nlevel.children:
                    nlevel.children.remove(text)
                if text == parent.children[-1]:
                    nlevel.children.append(text)
                elif text == parent.children[0]:
                    nlevel.children.insert(0, text)
                else:
                    # placeholders from the middle are dropped
                    continue
                nlevel.incomplete[key].append((text, nlevel))
        nlevel.rule = 'conj2'
        return nlevel

    def get_first_partial_subtree(self):
        """Locate the first finished tree text, searching left to right.

        Returns ``(index, children_list)`` such that inserting at ``index``
        places an item just before that text, ``(0, [])`` when this schema
        has no children, or ``(-1, [])`` when nothing is filled yet.
        """
        if len(self.children) == 0:
            return (0, [])
        first = self.children[0]
        if isinstance(first, str) and first[:1] == '(':
            return (0, self.children)
        for i, child in enumerate(self.children):
            if isinstance(child, Schema):
                pos, children = child.get_first_partial_subtree()
                if pos > 0:
                    return (pos, children)
                elif pos == 0:
                    return (i, self.children)
            elif isinstance(child, str) and child[:1] == '(':
                return (i, self.children)
        return (-1, [])

    def get_last_partial_subtree(self):
        """Locate the last finished tree text, searching right to left.

        Returns ``(index, children_list)`` such that inserting at ``index``
        places an item just after that text, ``(0, [])`` when this schema
        has no children, or ``(-1, [])`` when nothing is filled yet.
        """
        if len(self.children) == 0:
            return (0, [])
        last = self.children[-1]
        if isinstance(last, str) and last[:1] == '(':
            return (len(self.children), self.children)
        for i in range(len(self.children) - 1, -1, -1):
            child = self.children[i]
            if isinstance(child, Schema):
                pos, children = child.get_last_partial_subtree()
                if 0 < pos < len(children):
                    return (pos, children)
                elif pos == len(children):
                    return (i + 1, self.children)
            elif isinstance(child, str) and child[:1] == '(':
                return (i + 1, self.children)
        return (-1, [])

    # misc - Just glom on the random stuff
    def glom(self, right, keep_right=None):
        """Attach one of ``self``/``right`` to the other without a new node.

        With ``keep_right`` true, ``self`` is placed before the first filled
        part of ``right`` and ``right`` is returned; otherwise ``right`` is
        placed after the last filled part of ``self`` and ``self`` is
        returned.  When ``keep_right`` is None it is true exactly when the
        label of ``self`` is punctuation.
        """
        left = self
        if keep_right is None:
            keep_right = self._is_punctuation_label(left.label)
        if keep_right:
            # glom left on to left of right
            if len(left.children) == 1:
                left = left.children[0]
            pos, children = right.get_first_partial_subtree()
            if pos < 0:
                pos = 0
                children = right.children
            children.insert(pos, left)
            return right

        # glom right on to right of left
        if len(right.children) == 1:
            right = right.children[0]
        if len(left.incomplete) != 0:
            pos, children = left.get_last_partial_subtree()
            if pos < 0:
                pos = 0
                children = left.children
            children.insert(pos, right)
        else:
            left.children.append(right)
        return left
def fallback_schema(cat):
    """Build default schema lines for a category that has no annotation.

    Starts with ``{(TEMP 0)}`` and adds one NP rule per slash while peeling
    arguments off ``cat`` with category.divide(): ``(NP 0 1)`` for a forward
    slash, ``(NP 1 0)`` otherwise.  If the remaining category (as is, or
    after category.strip_square_brackets) is in ``markup_info`` and the
    first of its schema lines contains no slash, those lines are appended.
    """
    rules = ['{(TEMP 0)}']
    remaining = cat
    while '/' in remaining or '\\' in remaining:
        parts = category.divide(remaining)
        rules.append("(NP 0 1)" if parts[1] == '/' else "(NP 1 0)")
        remaining = parts[0]

    lookup = remaining
    if lookup not in markup_info:
        lookup = category.strip_square_brackets(remaining)
    if lookup not in markup_info:
        return rules

    # The first stored line is skipped; the rest are the schema lines.
    markup_lines = markup_info[lookup][1:]
    head = markup_lines[0]
    if '/' not in head and '\\' not in head:
        rules += markup_lines
    return rules
ANGLE_RE = re.compile('<[^>]*>')


def markup_to_schemas(lines, cat=None, source=None):
    """Turn the markup lines of one category into a Schema.

    ``lines[0]`` is skipped; the remaining lines are schema lines that may
    end in ``POS:``, ``Word:`` or ``arg`` conditions.  If ``lines`` is empty
    or one of the schema lines still contains a slash, the category is
    reported as unannotated and fallback_schema(cat) supplies the lines.
    Conditions are checked against ``source.pos`` and ``source.word`` and the
    surviving lines are passed to Schema.
    """
    unannotated = lines == []
    if not unannotated:
        for candidate in lines[1:]:
            if '\\' in candidate or '/' in candidate:
                cat_to_print = lines[0].strip().split()[1]
                cat_to_print = category.strip_braces(cat_to_print)
                cat_to_print = ''.join(cat_to_print.split('[X]'))
                cat_to_print = ANGLE_RE.sub('', cat_to_print)
                cat_to_print = category.remove_extra_brackets(cat_to_print)
                for stream in (log_out, sys.stderr):
                    print('Unannotated category:', cat_to_print, file=stream)
                unannotated = True
                break
    if unannotated:
        lines = fallback_schema(cat)

    pos = None
    word = None
    if source is not None:
        pos = source.pos
        word = source.word

    # `used` records that one alternative of the current group of
    # conditional lines has already been selected.
    used = False
    selected = []
    for raw in lines[1:]:
        line = raw.strip()
        if line[-1] in ')}':
            # unconditional line: always kept, and it ends any group
            selected.append(line)
            used = False
        else:
            use = True
            if 'POS' in line:
                if pos is None or pos not in line.split('POS:')[1].split()[0].split(','):
                    use = False
                if not used and 'POS:default' in line:
                    use = True
            if 'Word' in line:
                if word is None or word not in line.split('Word:')[1].split()[0].split(','):
                    use = False
                if not used and 'Word:default' in line:
                    use = True
            if use:
                selected.append(line)
                if 'arg' not in line or 'arg:default:' in line:
                    used = True
        is_default = 'POS:default' in line or 'Word:default' in line
        if is_default and ('arg' not in line or 'arg:default:' in line):
            # a default line closes its group of alternatives
            used = False
    return Schema(selected, source_node=source)
def get_next_incomplete_schema(schema, arg):
    """Wrap ``schema`` in its parent schemas until one has an open slot.

    While the current schema has nothing left to fill but still lists parent
    lines, a Schema is built from those lines (with ``arg`` as the argument
    used for selection), the current schema becomes its element zero, and
    the search continues from the new schema.
    """
    current = schema
    while not current.incomplete and current.parent:
        wrapper = Schema(current.parent, argument=arg, source_node=current.source)
        wrapper.set_zero(current)
        current = wrapper
    return current
def apply_markup(source, markup, top=True):
    """Convert the CCG derivation rooted at ``source`` into a Schema.

    Works bottom up: the subtrees are converted first and the results are
    then combined according to ``source.rule``.  ``markup`` is only passed
    on to the recursive calls.  ``top`` is False for every node below the
    root and is used to decide whether a ``misc`` node may become a PRN.

    Raises ValueError when no rule produced a result (an unknown combinator,
    or a ``misc`` node that does not have exactly two subtrees).  Previously
    that case failed with an AttributeError on None.

    NOTE(review): ``contains_bs`` is not set here, not even for 'bs.f' and
    'bs.b' - confirm whether convert() is expected to report those.
    """
    # Bottom up, so get the results from below
    children = [apply_markup(subtree, markup, False)
                for subtree in source.subtrees]
    combinator = source.rule
    result = None

    verbose_print('using %s combiantor rule' % combinator)
    if VERBOSE:
        # Only build the debug strings when they are going to be printed.
        for child in children:
            verbose_print('%s' % child.PTB_tree())
            verbose_print(repr(child))

    if combinator == 'lex' or combinator == 'type':
        source_category = source.category
        if source_category not in markup_info:
            source_category = category.strip_square_brackets(source.category)
        schema_text = []
        if source_category not in markup_info:
            print("Missing category:", source.category, "asked for by", combinator, file=log_out)
            print("Missing category:", source.category, "asked for by", combinator, file=sys.stderr)
        else:
            schema_text = markup_info[source_category]
        schema = markup_to_schemas(schema_text, source.category, source)
        if combinator == 'lex':
            result = schema.set_zero("(%s %s)" % (source.pos, source.word))
        elif combinator == 'type':
            verbose_print("Type schema:")
            verbose_print(repr(schema))
            result = schema.tr(children[0])
    elif combinator == 'conj1':
        result = children[0].conj_part1(children[1])
    elif combinator == 'conj2':
        result = children[0].conj_part2(children[1])
    elif combinator == 'unary':
        unary_rule = rule.get_unary(source.subtrees[0].category, source.category, markup_info)
        if unary_rule is None:
            unary_rule = fallback_schema(source.category)
        # markup_to_schemas skips the first line, hence the dummy entry
        schemas = markup_to_schemas(['None'] + unary_rule, source=source)
        verbose_print("Unary schema:")
        verbose_print(repr(schemas))
        result = children[0].special_unary(schemas)
    elif combinator in ['binary', 'bs.f', 'bs.b']:
        binary_rule = rule.get_binary_for_markedup(
            source.subtrees[0].category, source.subtrees[1].category,
            source.category, markup_info)
        if binary_rule is None:
            binary_rule = ['(VP 0 1)'] + fallback_schema(source.category)
        schemas = markup_to_schemas(['None'] + binary_rule, source=source)
        verbose_print("Binary schema:")
        verbose_print(repr(schemas))
        control = get_next_incomplete_schema(children[0], children[1])
        result = control.special_binary(children[1], schemas)
    elif combinator == 'fa.f':
        control = get_next_incomplete_schema(children[0], children[1])
        result = control.fa(children[1], combinator)
    elif combinator == 'fa.b':
        control = get_next_incomplete_schema(children[1], children[0])
        result = control.fa(children[0], combinator)
    elif combinator == 'fc.f':
        control = get_next_incomplete_schema(children[0], children[1])
        argument = get_next_incomplete_schema(children[1], None)
        result = control.fc(argument)
    elif combinator == 'fc.b':
        control = get_next_incomplete_schema(children[1], children[0])
        argument = get_next_incomplete_schema(children[0], None)
        result = control.fc(argument)
    elif combinator == 'cc.b':
        control = get_next_incomplete_schema(children[0], children[1])
        result = control.back_cross(children[1])
    elif combinator == 'misc':
        if len(source.subtrees) == 2:
            cur = category.strip_square_brackets(source.category)
            left = category.strip_square_brackets(source.subtrees[0].category)
            right = category.strip_square_brackets(source.subtrees[1].category)
            if cur != left and cur != right:
                print("miscing an unknown category:", source.category, end=" ", file=log_out)
                print("from", source.subtrees[0].category, "and", source.subtrees[1].category, file=log_out)
                print("miscing an unknown category:", source.category, end=" ", file=sys.stderr)
                print("from", source.subtrees[0].category, "and", source.subtrees[1].category, file=sys.stderr)
                binary_rule = fallback_schema(source.category)
                schemas = markup_to_schemas(['None','(NP 0 1)'] + binary_rule, source=source)
                verbose_print("Misc Binary schema:")
                verbose_print(repr(schemas))
                result = children[0].special_binary(children[1], schemas)
            else:
                # check if this forms a PRN
                words = source.all_word_yield()[1].split()
                left_word = words[0]
                right_word = words[-1]
                verbose_print(left_word + ' ' + right_word)
                use_PRN = False
                if not top:
                    if left_word == ',' and right_word == ',':
                        use_PRN = True
                    elif left_word == '--' and right_word == '--':
                        use_PRN = True
                    elif left_word == '-LRB-' and right_word == '-RRB-':
                        use_PRN = True
                result = children[0].glom(children[1], cur == right)
                if use_PRN:
                    old_label = result.label
                    result.label = 'PRN'
                    result.delete_on_adoption = False
                    nlevel = Schema(['(%s 0)' % old_label] + result.parent, source_node=source)
                    if old_label == 'TEMP':
                        nlevel = Schema(['{(%s 0)}' % old_label] + result.parent, source_node=source)
                    nlevel.set_zero(result)
                    nlevel.incomplete = result.incomplete
                    result = nlevel
        else:
            print('misc combinator is not handled', file=sys.stderr)

    if result is None:
        raise ValueError(
            'no conversion produced for combinator %r with %d subtree(s)'
            % (combinator, len(source.subtrees)))

    if VERBOSE:
        verbose_print('resolved: %s' % result.PTB_tree())
        verbose_print(repr(result))
        verbose_print('')
    return result
def remove_N(tree):
    """Remove every node labelled N, Nslash or Nnum below ``tree``.

    The children of a removed node take its place in the parent's
    ``subtrees``.  Returns ``tree`` itself, or the list of its subtrees when
    ``tree`` carries one of the removed labels.
    """
    flattened = []
    for subtree in tree.subtrees:
        cleaned = remove_N(subtree)
        if type(cleaned) == type([]):
            flattened.extend(cleaned)
        else:
            flattened.append(cleaned)
    tree.subtrees = flattened
    if tree.label in ('N', 'Nslash', 'Nnum'):
        return tree.subtrees
    return tree
def remove_repetition(tree):
    """Drop a node whose label is repeated further down a unary chain.

    The subtrees are processed first.  Then, following single-child links
    down from ``tree``, if a node with the same label as ``tree`` is found,
    the case is written to the log and the only child of ``tree`` is
    returned in its place; otherwise ``tree`` is returned.
    """
    # recurse and update subtrees
    if tree.subtrees:
        tree.subtrees = [remove_repetition(subtree) for subtree in tree.subtrees]

    # look down and remove this if it is repeated
    label = tree.label
    descendant = tree
    while len(descendant.subtrees) == 1:
        descendant = descendant.subtrees[0]
        if descendant.label == label:
            break
    else:
        # the chain ended without meeting the same label again
        return tree

    print('duplicate!', file=log_out)
    print(tree.one_line_repr(), file=log_out)
    print(descendant.one_line_repr(), file=log_out)
    return tree.subtrees[0]
def convert(source, argv, log=sys.stdout):
    """Convert one CCG tree using the markup file named in ``argv``.

    ``argv`` is searched for ``-verbose`` and for the markup file name,
    which is taken as the second whitespace-separated token after
    `` -method`` (e.g. ``-method_info <markup_file>``).  ``log`` becomes
    the module's log stream.

    Returns ``(not contains_bs, auto_ptb, auto_schema)``: the converted tree
    after clean-up and the Schema it was built from.
    """
    global markup_info, contains_bs, log_out, VERBOSE
    log_out = log
    VERBOSE = '-verbose' in ' '.join(argv)
    filename = ' '.join(argv).split(' -method')[1].split()[1]
    # The file used to be opened on every call and never closed, leaking a
    # handle per converted tree; read_markup() itself only parses once.
    with open(filename) as markup_file:
        read_markup(markup_file)

    contains_bs = False
    auto_schema = apply_markup(source, markup_info)

    ###################
    # Extra cleanup
    # i.e. hacks that don't fit within the main architecture
    ###################
    auto_ptb = trees.PTB_Tree('(ROOT ' + auto_schema.PTB_tree() + ')')
    verbose_print('before cleaning: %s' % auto_ptb)

    # remove remaining N
    auto_ptb = remove_N(auto_ptb)

    # collapse repetitions
    auto_ptb = remove_repetition(auto_ptb)

    verbose_print('cleaned: %s' % auto_ptb)
    verbose_print('')
    return (not contains_bs, auto_ptb, auto_schema)
if __name__ == '__main__':
    # Command-line driver: reads one CCG tree per line from standard input
    # and prints the result of convert() for each of them.
    if len(sys.argv) < 2:
        print("Usage:\n%s -method_info <markup_file>" % sys.argv[0])
        sys.exit(1)
    print("Please enter CCG trees:")
    for raw_line in sys.stdin:
        ccg_tree = trees.CCG_Tree(raw_line.strip())
        print(convert(ccg_tree, sys.argv))