-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
b877e0b
commit a4db4a3
Showing
7 changed files
with
1,523 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,3 +4,4 @@ | |
.settings | ||
*.pyc | ||
out.rdf | ||
.DS_Store |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,293 @@ | ||
#!/usr/bin/env python | ||
""" | ||
disagree_btwn_sents.py | ||
(Derived from AMRICA/disagree.py) | ||
A tool for inspecting AMR data to id patterns of inter-annotator disagreement | ||
or semantic inequivalence. | ||
AMR input file expected in format where comments above each annotation indicate | ||
the sentence like so: | ||
# ::id DF-170-181103-888_2097.1 ::date 2013-09-16T07:15:31 ::annotator ANON-01 ::preferred | ||
# ::tok This is a sentence . | ||
(this / file | ||
:is (an / AMR)) | ||
For monolingual disagreement, all annotations of some sentence should occur | ||
consecutively in the monolingual annotation file. For bilingual, annotations | ||
should be in the same order of sentences between the two files. | ||
For bilingual disagreement, you can include a ::alignments field from jamr to help with | ||
AMR-sentence alignment. | ||
""" | ||
|
||
import argparse | ||
import argparse_config | ||
import codecs | ||
import networkx as nx | ||
from networkx.readwrite import json_graph | ||
import json | ||
import os | ||
import pygraphviz as pgz | ||
|
||
# internal libraries | ||
from compare_smatch import amr_metadata | ||
from compare_smatch import smatch_graph | ||
from compare_smatch.amr_alignment import Amr2AmrAligner | ||
from compare_smatch.amr_alignment import default_aligner | ||
from compare_smatch.smatch_graph import SmatchGraph | ||
from smatch import smatch | ||
|
||
cur_sent_id = 0 | ||
|
||
def hilight_disagreement(test_amrs, gold_amr, iter_num, aligner=default_aligner, gold_aligned_fh=None): | ||
""" | ||
Input: | ||
gold_amr: gold AMR object | ||
test_amrs: list of AMRs to compare to | ||
Returns list of disagreement graphs for each gold-test AMR pair. | ||
""" | ||
|
||
amr_graphs = [] | ||
smatchgraphs = [] | ||
gold_label=u'b' | ||
gold_amr.rename_node(gold_label) | ||
(gold_inst, gold_rel1, gold_rel2) = gold_amr.get_triples2() | ||
(gold_inst_t, gold_rel1_t, gold_rel2_t) = smatch_graph.amr2dict(gold_inst, gold_rel1, gold_rel2) | ||
|
||
for a in test_amrs: | ||
aligner.set_amrs(a, gold_amr) | ||
test_label=u'a' | ||
a.rename_node(test_label) | ||
(test_inst, test_rel1, test_rel2) = a.get_triples2() | ||
if gold_aligned_fh: | ||
best_match = get_next_gold_alignments(gold_aligned_fh) | ||
best_match_num = -1.0 | ||
else: | ||
(best_match, best_match_num) = smatch.get_fh(test_inst, test_rel1, test_rel2, | ||
gold_inst, gold_rel1, gold_rel2, | ||
test_label, gold_label, | ||
node_weight_fn=aligner.node_weight_fn, edge_weight_fn=aligner.edge_weight_fn, | ||
iter_num=iter_num) | ||
|
||
disagreement = SmatchGraph(test_inst, test_rel1, test_rel2, \ | ||
gold_inst_t, gold_rel1_t, gold_rel2_t, \ | ||
best_match, const_map_fn=aligner.const_map_fn) | ||
amr_graphs.append((disagreement.smatch2graph(node_weight_fn=aligner.node_weight_fn, | ||
edge_weight_fn=aligner.edge_weight_fn), | ||
best_match_num)) | ||
smatchgraphs.append(disagreement) | ||
return (amr_graphs, smatchgraphs) | ||
|
||
|
||
def open_output_files(args): | ||
json_fh = None | ||
if args.json_out: | ||
json_fh = codecs.open(args.json_out, 'w', encoding='utf8') | ||
align_fh = None | ||
if args.align_out: | ||
align_fh = codecs.open(args.align_out, 'w', encoding='utf8') | ||
return (json_fh, align_fh) | ||
|
||
|
||
def close_output_files(json_fh, align_fh): | ||
json_fh and json_fh.close() | ||
align_fh and align_fh.close() | ||
|
||
|
||
def get_next_gold_alignments(gold_aligned_fh): | ||
match_hash = {} | ||
line = gold_aligned_fh.readline().strip() | ||
while (line): | ||
if line.startswith('#'): # comment line | ||
line = gold_aligned_fh.readline().strip() | ||
continue | ||
align = line.split('\t') | ||
test_ind = int(align[0]) | ||
gold_ind = int(align[3]) | ||
if test_ind >= 0: | ||
match_hash[test_ind] = gold_ind | ||
line = gold_aligned_fh.readline().strip() | ||
|
||
match = [] | ||
for (i, (k, v)) in enumerate(sorted(match_hash.items(), key=lambda x: x[0])): | ||
assert i == k | ||
match.append(v) | ||
return match | ||
|
||
|
||
def get_sent_info(metadata, dflt_id=None): | ||
""" Return ID, sentence if available, and change metadata to reflect """ | ||
(sent_id, sent) = (None, None) | ||
if 'tok' in metadata: | ||
sent = metadata['tok'] | ||
else: | ||
sent = metadata['snt'] | ||
|
||
if 'id' in metadata: | ||
sent_id = metadata['id'] | ||
elif dflt_id is not None: | ||
sent_id = dflt_id | ||
else: | ||
sent_id = "%d" % cur_sent_id | ||
cur_sent_id += 1 | ||
|
||
(metadata['id'], metadata['tok']) = \ | ||
(sent_id, sent) | ||
|
||
return (sent_id, sent) | ||
|
||
|
||
def monolingual_main(args): | ||
infile = codecs.open(args.infile, encoding='utf8') | ||
gold_aligned_fh = None | ||
if args.align_in: | ||
gold_aligned_fh = codecs.open(args.align_in, encoding='utf8') | ||
(json_fh, align_fh) = open_output_files(args) | ||
|
||
amrs_same_sent = [] | ||
cur_id = "" | ||
while True: | ||
(amr_line, comments) = amr_metadata.get_amr_line(infile) | ||
cur_amr = None | ||
if amr_line: | ||
cur_amr = amr_metadata.AmrMeta.from_parse(amr_line, comments) | ||
get_sent_info(cur_amr.metadata) | ||
if 'annotator' not in cur_amr.metadata: | ||
cur_amr.metadata['annotator'] = '' | ||
if not cur_id: | ||
cur_id = cur_amr.metadata['id'] | ||
|
||
if cur_amr is None or cur_id != cur_amr.metadata['id']: | ||
gold_amr = amrs_same_sent[0] | ||
test_amrs = amrs_same_sent[1:] | ||
if len(test_amrs) == 0: | ||
test_amrs = [gold_amr] # single AMR view case | ||
args.num_restarts = 1 # TODO make single AMR view more efficient | ||
(amr_graphs, smatchgraphs) = hilight_disagreement(test_amrs, gold_amr, args.num_restarts) | ||
|
||
gold_anno = gold_amr.metadata['annotator'] | ||
sent = gold_amr.metadata['tok'] | ||
|
||
if (args.verbose): | ||
print("ID: %s\n Sentence: %s\n gold anno: %s" % (cur_id, sent, gold_anno)) | ||
|
||
for (a, (g, score)) in zip(test_amrs, amr_graphs): | ||
test_anno = a.metadata['annotator'] | ||
if json_fh: | ||
json_fh.write(json.dumps(g) + '\n') | ||
if align_fh: | ||
for sg in smatchgraphs: | ||
align_fh.write("""# ::id %s\n# ::tok %s\n# ::gold_anno %s\n# ::test_anno %s""" % \ | ||
(cur_id, sent, gold_anno, test_anno)) | ||
align_fh.write('\n'.join(sg.get_text_alignments()) + '\n\n') | ||
if (args.verbose): | ||
print(" annotator %s score: %d" % (test_anno, score)) | ||
|
||
ag = nx.to_agraph(g) | ||
ag.graph_attr['label'] = sent | ||
ag.layout(prog=args.layout) | ||
ag.draw('%s/%s_annotated_%s_%s.png' % (args.outdir, cur_id, gold_anno, test_anno)) | ||
|
||
amrs_same_sent = [] | ||
if cur_amr is not None: | ||
cur_id = cur_amr.metadata['id'] | ||
else: | ||
break | ||
|
||
amrs_same_sent.append(cur_amr) | ||
|
||
infile.close() | ||
gold_aligned_fh and gold_aligned_fh.close() | ||
close_output_files(json_fh, align_fh) | ||
|
||
|
||
def xlang_main(args): | ||
""" Disagreement graphs for aligned cross-language language. """ | ||
src_amr_fh = codecs.open(args.src_amr, encoding='utf8') | ||
tgt_amr_fh = codecs.open(args.tgt_amr, encoding='utf8') | ||
gold_aligned_fh = None | ||
if args.align_in: | ||
gold_aligned_fh = codecs.open(args.align_in, encoding='utf8') | ||
(json_fh, align_fh) = open_output_files(args) | ||
|
||
amrs_same_sent = [] | ||
aligner = Amr2AmrAligner(num_best=args.num_align_read, num_best_in_file=args.num_aligned_in_file) | ||
while True: | ||
(src_amr_line, src_comments) = amr_metadata.get_amr_line(src_amr_fh) | ||
if src_amr_line == "": | ||
break | ||
(tgt_amr_line, tgt_comments) = amr_metadata.get_amr_line(tgt_amr_fh) | ||
src_amr = amr_metadata.AmrMeta.from_parse(src_amr_line, src_comments, xlang=True) | ||
tgt_amr = amr_metadata.AmrMeta.from_parse(tgt_amr_line, tgt_comments, xlang=True) | ||
(cur_id, src_sent) = get_sent_info(src_amr.metadata) | ||
(tgt_id, tgt_sent) = get_sent_info(tgt_amr.metadata, dflt_id=cur_id) | ||
assert cur_id == tgt_id | ||
|
||
(amr_graphs, smatchgraphs) = hilight_disagreement([tgt_amr], src_amr, args.num_restarts, aligner=aligner, gold_aligned_fh=gold_aligned_fh) | ||
if json_fh: | ||
json_fh.write(json.dumps(amr_graphs[0]) + '\n') | ||
if align_fh: | ||
align_fh.write("""# ::id %s\n# ::src_snt %s\n# ::tgt_snt %s\n""" % (cur_id, src_sent, tgt_sent)) | ||
align_fh.write('\n'.join(smatchgraphs[0].get_text_alignments()) + '\n\n') | ||
if (args.verbose): | ||
print("ID: %s\n Sentence: %s\n Sentence: %s\n Score: %f" % (cur_id, src_sent, tgt_sent, amr_graphs[0][1])) | ||
#raw_input("Press enter to continue: ") | ||
|
||
ag = nx.to_agraph(amr_graphs[0][0]) | ||
ag.graph_attr['label'] = "%s\n%s" % (src_sent, tgt_sent) | ||
ag.layout(prog=args.layout) | ||
ag.draw('%s/%s.png' % (args.outdir, cur_id)) | ||
|
||
src_amr_fh.close() | ||
tgt_amr_fh.close() | ||
gold_aligned_fh and gold_aligned_fh.close() | ||
close_output_files(json_fh, align_fh) | ||
|
||
|
||
if __name__ == '__main__': | ||
parser = argparse.ArgumentParser() | ||
parser.add_argument("-c", "--conf_file", help="Specify config file") | ||
parser.add_argument('-i', '--infile', help='amr input file') | ||
parser.add_argument('-o', '--outdir', help='image output directory') | ||
parser.add_argument('-v', '--verbose', action='store_true') | ||
parser.add_argument('-s', '--src_amr', | ||
help='In bitext mode, source language AMR file.') | ||
parser.add_argument('-t', '--tgt_amr', | ||
help='In bitext mode, target language AMR file.') | ||
parser.add_argument('--align_src2tgt', | ||
help='In bitext mode, GIZA alignment .NBEST file (see GIZA++ -nbestalignments opt) with source as vcb1.') | ||
parser.add_argument('--align_tgt2src', | ||
help='In bitext mode, GIZA alignment .NBEST file (see GIZA++ -nbestalignments opt) with target as vcb1.') | ||
parser.add_argument('--num_align_read', type=int, | ||
help='N to read from GIZA NBEST file.') | ||
parser.add_argument('--num_aligned_in_file', type=int, default=1, | ||
help='N printed to GIZA NBEST file.') | ||
parser.add_argument('-j', '--json_out', | ||
help='File to dump json graphs to.') | ||
parser.add_argument('--num_restarts', type=int, default=5, | ||
help='Number of random restarts to execute during hill-climbing algorithm.') | ||
parser.add_argument('--align_out', | ||
help="Human-readable alignments output file") | ||
parser.add_argument('--align_in', | ||
help="Alignments from human-editable text file, as from align_out") | ||
parser.add_argument('--layout', default='dot', | ||
help='Graphviz output layout') | ||
# TODO make interactive option and option to process a specific range | ||
|
||
args_conf = parser.parse_args() | ||
if args_conf.conf_file: | ||
argparse_config.read_config_file(parser, args_conf.conf_file) | ||
|
||
args = parser.parse_args() | ||
if args.no_verbose: | ||
args.verbose = False | ||
if not args.num_align_read: | ||
args.num_align_read = args.num_aligned_in_file | ||
|
||
if not os.path.exists(args.outdir): | ||
os.makedirs(args.outdir) | ||
|
||
xlang_main(args) | ||
|
Oops, something went wrong.