Latest version for ISWC paper

BMKEG · Apr 29, 2016 · a4db4a3 · a4db4a3
1 parent b877e0b
commit a4db4a3
Show file tree

Hide file tree

Showing 7 changed files with 1,523 additions and 17 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,4 @@
 .settings
 *.pyc
 out.rdf
+.DS_Store
diff --git a/amr_to_jsonld.py b/amr_to_jsonld.py
@@ -2,7 +2,7 @@
 """
 amr_to_jsonld.py
 
-Note, this is derived from the source code to AMRICA's disagree.py script by Naomi Saphra (nsaphra@jhu.edu)
+Note, this is derived from the source code to AMRICA's disagree_btwn_sents.py script by Naomi Saphra (nsaphra@jhu.edu)
 Copyright(c) 2015. All rights reserved.
 
 """

diff --git a/amr_to_rdf.py b/amr_to_rdf.py
@@ -2,7 +2,7 @@
 """
 amr_to_rdf.py
 
-Note, this is derived from the source code to AMRICA's disagree.py script by Naomi Saphra (nsaphra@jhu.edu)
+Note, this is derived from the source code to AMRICA's disagree_btwn_sents.py script by Naomi Saphra (nsaphra@jhu.edu)
 Copyright(c) 2015. All rights reserved.
 
 """
@@ -30,19 +30,40 @@ def strip_word_alignments(str, patt):
 
  return str
 
-def run_main(args):
def run_main(args):
    """
    Convert one AMR file, or every '.txt' file of a directory, to RDF.

    Input:
      args: parsed command-line namespace. args.inPath is either a file or a
            directory; args.outPath is the output file (file mode) or the
            output directory (directory mode).

    In file mode run_main_on_file(args) is called once with args untouched.
    In directory mode the output directory is created when missing, and
    run_main_on_file is called once per regular file whose name ends in
    '.txt', writing to '<outPath>/<name>.rdf'. Sub-directories are not
    descended into.
    """
    inPath = args.inPath
    outPath = args.outPath

    # Single file: nothing to rewrite on args, just delegate.
    if os.path.isfile(inPath):
        run_main_on_file(args)
        return

    if not os.path.exists(outPath):
        os.makedirs(outPath)

    try:
        for fn in os.listdir(inPath):
            src = os.path.join(inPath, fn)
            if os.path.isfile(src) and fn.endswith(".txt"):
                # run_main_on_file reads its paths from args, so point args
                # at the current file for the duration of the call.
                args.inPath = src
                args.outPath = os.path.join(outPath, fn + ".rdf")
                run_main_on_file(args)
    finally:
        # Hand args back to the caller the way it was received.
        args.inPath = inPath
        args.outPath = outPath
+
+def run_main_on_file(args):
+
  try:
  import rdflib
  except ImportError:
  raise ImportError('requires rdflib')
-
- infile = codecs.open(args.infile, encoding='utf8')
- outfile = open(args.outfile, 'w')
+ 
+ infile = codecs.open(args.inPath, encoding='utf8')
+ outfile = open(args.outPath, 'w')
 
  pBankRoles = True
  if( not(args.pbankRoles == u'1') ):
  pBankRoles = False
-
+  
  # create the basic RDF data structure
  g = rdflib.Graph()
 
@@ -349,9 +370,11 @@ def run_main(args):
 
 if __name__ == '__main__':
  parser = argparse.ArgumentParser()
- parser.add_argument('-i', '--infile', help='amr input file')
+
+ parser.add_argument('-i', '--inPath', help='AMR input file or directory')
+ parser.add_argument('-o', '--outPath', help='RDF output file or directory')
+
  parser.add_argument('-pbr', '--pbankRoles', default='1', help='Do we include PropBank Roles?')
- parser.add_argument('-o', '--outfile', help='RDF output file')
  parser.add_argument('-v', '--verbose', action='store_true')
  parser.add_argument('-f', '--format', nargs='?', default='nt',
  help="RDF Format: xml, n3, nt, trix, rdfa")

diff --git a/disagree_btwn_sents.py b/disagree_btwn_sents.py
@@ -0,0 +1,293 @@
+#!/usr/bin/env python
+"""
+disagree_btwn_sents.py
+(Derived from AMRICA/disagree.py)
+
+A tool for inspecting AMR data to id patterns of inter-annotator disagreement
+or semantic inequivalence.
+
+AMR input file expected in format where comments above each annotation indicate
+the sentence like so:
+
+# ::id DF-170-181103-888_2097.1 ::date 2013-09-16T07:15:31 ::annotator ANON-01 ::preferred
+# ::tok This is a sentence .
+(this / file
+ :is (an / AMR))
+
+For monolingual disagreement, all annotations of some sentence should occur
+consecutively in the monolingual annotation file. For bilingual, annotations
+should be in the same order of sentences between the two files.
+
+For bilingual disagreement, you can include a ::alignments field from jamr to help with
+AMR-sentence alignment.
+"""
+
+import argparse
+import argparse_config
+import codecs
+import networkx as nx
+from networkx.readwrite import json_graph
+import json
+import os
+import pygraphviz as pgz
+
+# internal libraries
+from compare_smatch import amr_metadata
+from compare_smatch import smatch_graph
+from compare_smatch.amr_alignment import Amr2AmrAligner
+from compare_smatch.amr_alignment import default_aligner
+from compare_smatch.smatch_graph import SmatchGraph
+from smatch import smatch
+
+cur_sent_id = 0
+
def hilight_disagreement(test_amrs, gold_amr, iter_num, aligner=default_aligner, gold_aligned_fh=None):
    """
    Build one disagreement graph per test AMR, each compared to the gold AMR.

    Input:
      test_amrs: list of AMRs to compare to the gold AMR
      gold_amr: gold AMR object
      iter_num: forwarded to smatch.get_fh as iter_num
      aligner: provides set_amrs, node_weight_fn, edge_weight_fn, const_map_fn
      gold_aligned_fh: optional open alignment file; when given, each match is
                       read from it instead of being computed, and the score
                       is recorded as -1.0
    Returns (amr_graphs, smatchgraphs): a list of (graph, score) pairs and the
    list of SmatchGraph objects they were built from, both in test_amrs order.
    """
    gold_label = u'b'
    test_label = u'a'

    gold_amr.rename_node(gold_label)
    gold_inst, gold_rel1, gold_rel2 = gold_amr.get_triples2()
    gold_inst_t, gold_rel1_t, gold_rel2_t = smatch_graph.amr2dict(
        gold_inst, gold_rel1, gold_rel2)

    scored_graphs = []
    smatch_objs = []
    for test_amr in test_amrs:
        aligner.set_amrs(test_amr, gold_amr)
        test_amr.rename_node(test_label)
        test_inst, test_rel1, test_rel2 = test_amr.get_triples2()

        if not gold_aligned_fh:
            # No hand-edited alignment supplied: let smatch search for one.
            best_match, best_match_num = smatch.get_fh(
                test_inst, test_rel1, test_rel2,
                gold_inst, gold_rel1, gold_rel2,
                test_label, gold_label,
                node_weight_fn=aligner.node_weight_fn,
                edge_weight_fn=aligner.edge_weight_fn,
                iter_num=iter_num)
        else:
            best_match = get_next_gold_alignments(gold_aligned_fh)
            best_match_num = -1.0

        disagreement = SmatchGraph(
            test_inst, test_rel1, test_rel2,
            gold_inst_t, gold_rel1_t, gold_rel2_t,
            best_match, const_map_fn=aligner.const_map_fn)
        graph = disagreement.smatch2graph(
            node_weight_fn=aligner.node_weight_fn,
            edge_weight_fn=aligner.edge_weight_fn)

        scored_graphs.append((graph, best_match_num))
        smatch_objs.append(disagreement)

    return (scored_graphs, smatch_objs)
+
+
def open_output_files(args):
    """
    Open the optional JSON and alignment output files for writing as UTF-8.

    Returns (json_fh, align_fh); an entry is None when the corresponding
    args.json_out / args.align_out value is empty or None.
    """
    def _writer_or_none(path):
        # Only touch the filesystem when a path was actually supplied.
        return codecs.open(path, 'w', encoding='utf8') if path else None

    return (_writer_or_none(args.json_out), _writer_or_none(args.align_out))
+
+
def close_output_files(json_fh, align_fh):
    """ Close whichever of the two output handles is set; falsy handles are ignored. """
    for handle in (json_fh, align_fh):
        if handle:
            handle.close()
+
+
def get_next_gold_alignments(gold_aligned_fh):
    """
    Read the next block of alignment rows from gold_aligned_fh.

    Rows are consumed until a blank line or end of file. Lines starting with
    '#' are skipped. Every other row is tab-separated, with the test index in
    column 0 and the gold index in column 3; rows with a negative test index
    are ignored.

    Returns a list whose i-th entry is the gold index aligned to test index i.
    The test indices read must be exactly 0..n-1 (enforced with an assert).
    """
    gold_by_test = {}
    while True:
        row = gold_aligned_fh.readline().strip()
        if not row:
            # Blank line or EOF closes the current block.
            break
        if row.startswith('#'):  # comment line
            continue
        fields = row.split('\t')
        test_ind = int(fields[0])
        gold_ind = int(fields[3])
        if test_ind >= 0:
            gold_by_test[test_ind] = gold_ind

    match = []
    for expected, test_ind in enumerate(sorted(gold_by_test)):
        assert expected == test_ind
        match.append(gold_by_test[test_ind])
    return match
+
+
def get_sent_info(metadata, dflt_id=None):
    """
    Return (sent_id, sent) for an AMR metadata dict and normalize the dict.

    Input:
      metadata: dict of AMR comment fields; modified in place so that it
                always ends up with both an 'id' and a 'tok' entry.
      dflt_id: id to use when metadata has no 'id' field.

    The sentence is metadata['tok'] when present, otherwise metadata['snt']
    (KeyError if neither exists). The id is metadata['id'], else dflt_id, else
    the current value of the module-level counter cur_sent_id rendered as a
    string, after which the counter is incremented.
    """
    # The counter is rebound below; without this declaration the name would be
    # local to the function and reading it would raise UnboundLocalError.
    global cur_sent_id

    if 'tok' in metadata:
        sent = metadata['tok']
    else:
        sent = metadata['snt']

    if 'id' in metadata:
        sent_id = metadata['id']
    elif dflt_id is not None:
        sent_id = dflt_id
    else:
        sent_id = "%d" % cur_sent_id
        cur_sent_id += 1

    metadata['id'] = sent_id
    metadata['tok'] = sent

    return (sent_id, sent)
+
+
def _emit_monolingual_group(args, cur_id, amrs_same_sent, json_fh, align_fh):
    """ Compare the first AMR of a same-id group against the rest and write the requested outputs. """
    gold_amr = amrs_same_sent[0]
    test_amrs = amrs_same_sent[1:]
    # Kept local so that one single-AMR group does not lower the restart count
    # for every group that follows.
    num_restarts = args.num_restarts
    if len(test_amrs) == 0:
        test_amrs = [gold_amr]  # single AMR view case
        num_restarts = 1  # TODO make single AMR view more efficient
    (amr_graphs, smatchgraphs) = hilight_disagreement(test_amrs, gold_amr, num_restarts)

    gold_anno = gold_amr.metadata['annotator']
    sent = gold_amr.metadata['tok']

    if (args.verbose):
        print("ID: %s\n Sentence: %s\n gold anno: %s" % (cur_id, sent, gold_anno))

    # hilight_disagreement returns both lists in test_amrs order, so the three
    # sequences can be walked in lockstep: each annotator is paired with its
    # own alignment rather than with every alignment of the group.
    for (a, (g, score), sg) in zip(test_amrs, amr_graphs, smatchgraphs):
        test_anno = a.metadata['annotator']
        if json_fh:
            json_fh.write(json.dumps(g) + '\n')
        if align_fh:
            # The header ends with a newline so the first alignment row starts
            # on its own line instead of being fused onto the '#' comment.
            align_fh.write("""# ::id %s\n# ::tok %s\n# ::gold_anno %s\n# ::test_anno %s\n""" %
                           (cur_id, sent, gold_anno, test_anno))
            align_fh.write('\n'.join(sg.get_text_alignments()) + '\n\n')
        if (args.verbose):
            print(" annotator %s score: %d" % (test_anno, score))

        ag = nx.to_agraph(g)
        ag.graph_attr['label'] = sent
        ag.layout(prog=args.layout)
        ag.draw('%s/%s_annotated_%s_%s.png' % (args.outdir, cur_id, gold_anno, test_anno))


def monolingual_main(args):
    """
    Disagreement graphs for several annotations of the same sentence.

    Reads AMRs from args.infile. Consecutive AMRs that share an id form one
    group: the first is treated as gold and each of the others is compared to
    it (a group of one is compared to itself). For every pair a PNG is drawn
    into args.outdir; JSON and alignment text are written when args.json_out /
    args.align_out are set. An input without any AMR produces no output.
    """
    infile = codecs.open(args.infile, encoding='utf8')
    gold_aligned_fh = None
    json_fh = None
    align_fh = None
    try:
        # NOTE(review): this handle is opened but never passed on to
        # hilight_disagreement here - confirm whether that is intended.
        if args.align_in:
            gold_aligned_fh = codecs.open(args.align_in, encoding='utf8')
        (json_fh, align_fh) = open_output_files(args)

        amrs_same_sent = []
        cur_id = ""
        while True:
            (amr_line, comments) = amr_metadata.get_amr_line(infile)
            cur_amr = None
            if amr_line:
                cur_amr = amr_metadata.AmrMeta.from_parse(amr_line, comments)
                get_sent_info(cur_amr.metadata)
                if 'annotator' not in cur_amr.metadata:
                    cur_amr.metadata['annotator'] = ''
                if not cur_id:
                    cur_id = cur_amr.metadata['id']

            # End of input or a new sentence id: flush the finished group.
            if cur_amr is None or cur_id != cur_amr.metadata['id']:
                if amrs_same_sent:  # empty only when the input held no AMR
                    _emit_monolingual_group(args, cur_id, amrs_same_sent, json_fh, align_fh)
                amrs_same_sent = []
                if cur_amr is None:
                    break
                cur_id = cur_amr.metadata['id']

            amrs_same_sent.append(cur_amr)
    finally:
        # Release every handle even when parsing or drawing fails.
        infile.close()
        if gold_aligned_fh:
            gold_aligned_fh.close()
        close_output_files(json_fh, align_fh)
+
+
def xlang_main(args):
    """
    Disagreement graphs for aligned cross-language AMR pairs.

    Reads args.src_amr and args.tgt_amr in lockstep until the source file
    yields an empty AMR line. Each source AMR is used as gold and the matching
    target AMR as test. For every pair a PNG is drawn into args.outdir; JSON
    and alignment text are written when args.json_out / args.align_out are
    set. When args.align_in is set, alignments are read from that file instead
    of being computed.
    """
    src_amr_fh = codecs.open(args.src_amr, encoding='utf8')
    tgt_amr_fh = None
    gold_aligned_fh = None
    json_fh = None
    align_fh = None
    try:
        tgt_amr_fh = codecs.open(args.tgt_amr, encoding='utf8')
        if args.align_in:
            gold_aligned_fh = codecs.open(args.align_in, encoding='utf8')
        (json_fh, align_fh) = open_output_files(args)

        aligner = Amr2AmrAligner(num_best=args.num_align_read, num_best_in_file=args.num_aligned_in_file)
        while True:
            (src_amr_line, src_comments) = amr_metadata.get_amr_line(src_amr_fh)
            if src_amr_line == "":
                break
            (tgt_amr_line, tgt_comments) = amr_metadata.get_amr_line(tgt_amr_fh)
            src_amr = amr_metadata.AmrMeta.from_parse(src_amr_line, src_comments, xlang=True)
            tgt_amr = amr_metadata.AmrMeta.from_parse(tgt_amr_line, tgt_comments, xlang=True)
            (cur_id, src_sent) = get_sent_info(src_amr.metadata)
            # The target inherits the source id when it carries none of its own.
            (tgt_id, tgt_sent) = get_sent_info(tgt_amr.metadata, dflt_id=cur_id)
            assert cur_id == tgt_id

            (amr_graphs, smatchgraphs) = hilight_disagreement(
                [tgt_amr], src_amr, args.num_restarts,
                aligner=aligner, gold_aligned_fh=gold_aligned_fh)
            if json_fh:
                json_fh.write(json.dumps(amr_graphs[0]) + '\n')
            if align_fh:
                align_fh.write("""# ::id %s\n# ::src_snt %s\n# ::tgt_snt %s\n""" % (cur_id, src_sent, tgt_sent))
                align_fh.write('\n'.join(smatchgraphs[0].get_text_alignments()) + '\n\n')
            if (args.verbose):
                print("ID: %s\n Sentence: %s\n Sentence: %s\n Score: %f" % (cur_id, src_sent, tgt_sent, amr_graphs[0][1]))

            ag = nx.to_agraph(amr_graphs[0][0])
            ag.graph_attr['label'] = "%s\n%s" % (src_sent, tgt_sent)
            ag.layout(prog=args.layout)
            ag.draw('%s/%s.png' % (args.outdir, cur_id))
    finally:
        # Release every handle even when parsing or drawing fails.
        src_amr_fh.close()
        if tgt_amr_fh:
            tgt_amr_fh.close()
        if gold_aligned_fh:
            gold_aligned_fh.close()
        close_output_files(json_fh, align_fh)
+
if __name__ == '__main__':
    # Command-line entry point: parse options (optionally seeded from a config
    # file), make sure the image directory exists, then run the bitext
    # comparison.
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--conf_file", help="Specify config file")
    parser.add_argument('-i', '--infile', help='amr input file')
    parser.add_argument('-o', '--outdir', help='image output directory')
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument('-s', '--src_amr',
                        help='In bitext mode, source language AMR file.')
    parser.add_argument('-t', '--tgt_amr',
                        help='In bitext mode, target language AMR file.')
    parser.add_argument('--align_src2tgt',
                        help='In bitext mode, GIZA alignment .NBEST file (see GIZA++ -nbestalignments opt) with source as vcb1.')
    parser.add_argument('--align_tgt2src',
                        help='In bitext mode, GIZA alignment .NBEST file (see GIZA++ -nbestalignments opt) with target as vcb1.')
    parser.add_argument('--num_align_read', type=int,
                        help='N to read from GIZA NBEST file.')
    parser.add_argument('--num_aligned_in_file', type=int, default=1,
                        help='N printed to GIZA NBEST file.')
    parser.add_argument('-j', '--json_out',
                        help='File to dump json graphs to.')
    parser.add_argument('--num_restarts', type=int, default=5,
                        help='Number of random restarts to execute during hill-climbing algorithm.')
    parser.add_argument('--align_out',
                        help="Human-readable alignments output file")
    parser.add_argument('--align_in',
                        help="Alignments from human-editable text file, as from align_out")
    parser.add_argument('--layout', default='dot',
                        help='Graphviz output layout')
    # TODO make interactive option and option to process a specific range

    # First pass only discovers the config file; the second pass is the one
    # whose values are used.
    args_conf = parser.parse_args()
    if args_conf.conf_file:
        argparse_config.read_config_file(parser, args_conf.conf_file)

    args = parser.parse_args()
    # No 'no_verbose' option is declared above, so read it defensively.
    # NOTE(review): presumably argparse_config can add it when a config file
    # is loaded - confirm against that library.
    if getattr(args, 'no_verbose', False):
        args.verbose = False
    if not args.num_align_read:
        args.num_align_read = args.num_aligned_in_file

    # xlang_main is the only mode run from here and needs all three of these;
    # report a usage error rather than failing later on a None path.
    # NOTE(review): --infile is accepted but monolingual_main is never called
    # from this script - confirm that is intended.
    missing = [flag for (flag, value) in (('--outdir', args.outdir),
                                          ('--src_amr', args.src_amr),
                                          ('--tgt_amr', args.tgt_amr))
               if not value]
    if missing:
        parser.error('missing required option(s): %s' % ', '.join(missing))

    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    xlang_main(args)
+