Skip to content

Commit

Permalink
Latest version for ISWC paper
Browse files Browse the repository at this point in the history
  • Loading branch information
GullyAPCBurns committed Apr 29, 2016
1 parent b877e0b commit a4db4a3
Show file tree
Hide file tree
Showing 7 changed files with 1,523 additions and 17 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
.settings
*.pyc
out.rdf
.DS_Store
2 changes: 1 addition & 1 deletion amr_to_jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""
amr_to_jsonld.py
Note, this is derived from the source code to AMRICA's disagree.py script by Naomi Saphra (nsaphra@jhu.edu)
Note, this is derived from the source code to AMRICA's disagree_btwn_sents.py script by Naomi Saphra (nsaphra@jhu.edu)
Copyright(c) 2015. All rights reserved.
"""
Expand Down
39 changes: 31 additions & 8 deletions amr_to_rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""
amr_to_rdf.py
Note, this is derived from the source code to AMRICA's disagree.py script by Naomi Saphra (nsaphra@jhu.edu)
Note, this is derived from the source code to AMRICA's disagree_btwn_sents.py script by Naomi Saphra (nsaphra@jhu.edu)
Copyright(c) 2015. All rights reserved.
"""
Expand Down Expand Up @@ -30,19 +30,40 @@ def strip_word_alignments(str, patt):

return str

def run_main(args):
def run_main(args):
    """Convert one AMR file, or every ``.txt`` file of a directory, to RDF.

    If ``args.inPath`` names a regular file, ``run_main_on_file`` is called
    once with ``args`` untouched.  Otherwise ``args.inPath`` is listed as a
    directory: ``args.outPath`` is created when it does not exist yet, and
    for every regular file whose name ends in ``.txt`` the ``inPath`` and
    ``outPath`` attributes of ``args`` are overwritten in place (the output
    name is the input name with ``.rdf`` appended) before
    ``run_main_on_file`` is called again.
    """
    src_root = args.inPath
    dst_root = args.outPath

    # Single-file mode: hand the arguments over unchanged.
    if os.path.isfile(src_root):
        run_main_on_file(args)
        return

    # Directory mode: the output directory has to exist before writing.
    if not os.path.exists(dst_root):
        os.makedirs(dst_root)

    for entry in os.listdir(src_root):
        candidate = src_root + "/" + entry
        if not (os.path.isfile(candidate) and entry.endswith(".txt")):
            continue
        # args is reused as the per-file argument holder.
        args.inPath = candidate
        args.outPath = dst_root + "/" + entry + ".rdf"
        run_main_on_file(args)

def run_main_on_file(args):

try:
import rdflib
except ImportError:
raise ImportError('requires rdflib')

infile = codecs.open(args.infile, encoding='utf8')
outfile = open(args.outfile, 'w')
infile = codecs.open(args.inPath, encoding='utf8')
outfile = open(args.outPath, 'w')

pBankRoles = True
if( not(args.pbankRoles == u'1') ):
pBankRoles = False

# create the basic RDF data structure
g = rdflib.Graph()

Expand Down Expand Up @@ -349,9 +370,11 @@ def run_main(args):

if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--infile', help='amr input file')

parser.add_argument('-i', '--inPath', help='AMR input file or directory')
parser.add_argument('-o', '--outPath', help='RDF output file or directory')

parser.add_argument('-pbr', '--pbankRoles', default='1', help='Do we include PropBank Roles?')
parser.add_argument('-o', '--outfile', help='RDF output file')
parser.add_argument('-v', '--verbose', action='store_true')
parser.add_argument('-f', '--format', nargs='?', default='nt',
help="RDF Format: xml, n3, nt, trix, rdfa")
Expand Down
293 changes: 293 additions & 0 deletions disagree_btwn_sents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,293 @@
#!/usr/bin/env python
"""
disagree_btwn_sents.py
(Derived from AMRICA/disagree.py)
A tool for inspecting AMR data to identify patterns of inter-annotator disagreement
or semantic inequivalence.
AMR input file expected in format where comments above each annotation indicate
the sentence like so:
# ::id DF-170-181103-888_2097.1 ::date 2013-09-16T07:15:31 ::annotator ANON-01 ::preferred
# ::tok This is a sentence .
(this / file
:is (an / AMR))
For monolingual disagreement, all annotations of some sentence should occur
consecutively in the monolingual annotation file. For bilingual, annotations
should be in the same order of sentences between the two files.
For bilingual disagreement, you can include a ::alignments field from jamr to help with
AMR-sentence alignment.
"""

import argparse
import argparse_config
import codecs
import networkx as nx
from networkx.readwrite import json_graph
import json
import os
import pygraphviz as pgz

# internal libraries
from compare_smatch import amr_metadata
from compare_smatch import smatch_graph
from compare_smatch.amr_alignment import Amr2AmrAligner
from compare_smatch.amr_alignment import default_aligner
from compare_smatch.smatch_graph import SmatchGraph
from smatch import smatch

cur_sent_id = 0

def hilight_disagreement(test_amrs, gold_amr, iter_num, aligner=default_aligner, gold_aligned_fh=None):
    """
    Build one disagreement graph per (test AMR, gold AMR) pair.

    Input:
      test_amrs: list of AMRs to compare to the gold AMR
      gold_amr: gold AMR object
      iter_num: passed through to smatch.get_fh as ``iter_num``
      aligner: object providing set_amrs, node_weight_fn, edge_weight_fn
        and const_map_fn
      gold_aligned_fh: optional file handle; when truthy, each match is read
        from it with get_next_gold_alignments instead of being computed, and
        the reported score is -1.0
    Returns a pair (amr_graphs, smatchgraphs): amr_graphs holds one
    (graph, score) tuple per test AMR, smatchgraphs the corresponding
    SmatchGraph objects, both in the order of test_amrs.
    """
    gold_label = u'b'
    test_label = u'a'

    # The gold side is prepared once and shared by every comparison.
    gold_amr.rename_node(gold_label)
    (gold_inst, gold_rel1, gold_rel2) = gold_amr.get_triples2()
    (gold_inst_t, gold_rel1_t, gold_rel2_t) = smatch_graph.amr2dict(gold_inst, gold_rel1, gold_rel2)

    amr_graphs = []
    smatchgraphs = []
    for test_amr in test_amrs:
        # The aligner sees the AMR before its nodes are renamed.
        aligner.set_amrs(test_amr, gold_amr)
        test_amr.rename_node(test_label)
        (test_inst, test_rel1, test_rel2) = test_amr.get_triples2()

        if not gold_aligned_fh:
            (best_match, best_match_num) = smatch.get_fh(
                test_inst, test_rel1, test_rel2,
                gold_inst, gold_rel1, gold_rel2,
                test_label, gold_label,
                node_weight_fn=aligner.node_weight_fn,
                edge_weight_fn=aligner.edge_weight_fn,
                iter_num=iter_num)
        else:
            # Alignment supplied from a file: no score is computed.
            best_match = get_next_gold_alignments(gold_aligned_fh)
            best_match_num = -1.0

        disagreement = SmatchGraph(test_inst, test_rel1, test_rel2,
                                   gold_inst_t, gold_rel1_t, gold_rel2_t,
                                   best_match, const_map_fn=aligner.const_map_fn)
        graph = disagreement.smatch2graph(node_weight_fn=aligner.node_weight_fn,
                                          edge_weight_fn=aligner.edge_weight_fn)
        amr_graphs.append((graph, best_match_num))
        smatchgraphs.append(disagreement)

    return (amr_graphs, smatchgraphs)


def open_output_files(args):
    """Open the optional JSON and alignment output files.

    Returns a pair (json_fh, align_fh).  Each element is a UTF-8 writer
    opened on ``args.json_out`` / ``args.align_out`` respectively, or None
    when the corresponding attribute is empty or None.
    """
    def _utf8_writer(path):
        # A falsy path means that output was not requested.
        return codecs.open(path, 'w', encoding='utf8') if path else None

    return (_utf8_writer(args.json_out), _utf8_writer(args.align_out))


def close_output_files(json_fh, align_fh):
    """Close whichever of the two output handles is set (truthy)."""
    for handle in (json_fh, align_fh):
        if handle:
            handle.close()


def get_next_gold_alignments(gold_aligned_fh):
    """Read the next block of alignments from a text alignment file.

    Lines are consumed from ``gold_aligned_fh`` until a blank line (or end
    of file) is reached.  Lines starting with ``#`` are skipped.  Every
    other line is split on tabs; column 0 is the test index and column 3
    the gold index.  Rows with a negative test index are ignored.

    Returns a list whose position i holds the gold index aligned to test
    index i.  The test indices must be exactly 0..n-1 (checked with an
    assert).
    """
    test_to_gold = {}
    while True:
        row = gold_aligned_fh.readline().strip()
        if not row:
            # Blank line or EOF terminates the block.
            break
        if row.startswith('#'):  # comment line
            continue
        fields = row.split('\t')
        test_ind = int(fields[0])
        gold_ind = int(fields[3])
        if test_ind >= 0:
            test_to_gold[test_ind] = gold_ind

    match = []
    for (position, test_ind) in enumerate(sorted(test_to_gold)):
        # The test indices are expected to be contiguous from zero.
        assert position == test_ind
        match.append(test_to_gold[test_ind])
    return match


def get_sent_info(metadata, dflt_id=None):
    """ Return ID, sentence if available, and change metadata to reflect

    Input:
      metadata: dict of AMR comment fields; the sentence is taken from
        'tok' when present, otherwise from 'snt'
      dflt_id: ID to use when metadata has no 'id' entry
    Returns (sent_id, sent) and stores the same values back into
    metadata['id'] and metadata['tok'].
    Raises KeyError when metadata has neither 'tok' nor 'snt'.
    """
    # The fallback ID is a module-level running counter.  Without this
    # declaration the augmented assignment below makes the name local and
    # the fallback branch fails with UnboundLocalError.
    global cur_sent_id

    if 'tok' in metadata:
        sent = metadata['tok']
    else:
        sent = metadata['snt']

    if 'id' in metadata:
        sent_id = metadata['id']
    elif dflt_id is not None:
        sent_id = dflt_id
    else:
        sent_id = "%d" % cur_sent_id
        cur_sent_id += 1

    metadata['id'] = sent_id
    metadata['tok'] = sent

    return (sent_id, sent)


def _emit_sentence_group(args, sent_id, amrs, json_fh, align_fh):
    """Compare the annotations of one sentence and write all requested outputs."""
    gold_amr = amrs[0]
    test_amrs = amrs[1:]
    # Kept in a local so that a single-annotation sentence does not lower
    # the restart count for every sentence that follows it.
    num_restarts = args.num_restarts
    if len(test_amrs) == 0:
        test_amrs = [gold_amr]  # single AMR view case
        num_restarts = 1  # TODO make single AMR view more efficient
    (amr_graphs, smatchgraphs) = hilight_disagreement(test_amrs, gold_amr, num_restarts)

    gold_anno = gold_amr.metadata['annotator']
    sent = gold_amr.metadata['tok']

    if (args.verbose):
        print("ID: %s\n Sentence: %s\n gold anno: %s" % (sent_id, sent, gold_anno))

    # hilight_disagreement returns one graph and one SmatchGraph per test
    # AMR, in order, so the three sequences are walked in lockstep.
    for (a, (g, score), sg) in zip(test_amrs, amr_graphs, smatchgraphs):
        test_anno = a.metadata['annotator']
        if json_fh:
            json_fh.write(json.dumps(g) + '\n')
        if align_fh:
            # The header ends with a newline (as in xlang_main) so that the
            # first alignment row does not end up on a '#' comment line.
            align_fh.write("""# ::id %s\n# ::tok %s\n# ::gold_anno %s\n# ::test_anno %s\n""" % \
                           (sent_id, sent, gold_anno, test_anno))
            align_fh.write('\n'.join(sg.get_text_alignments()) + '\n\n')
        if (args.verbose):
            print(" annotator %s score: %d" % (test_anno, score))

        ag = nx.to_agraph(g)
        ag.graph_attr['label'] = sent
        ag.layout(prog=args.layout)
        ag.draw('%s/%s_annotated_%s_%s.png' % (args.outdir, sent_id, gold_anno, test_anno))


def monolingual_main(args):
    """ Disagreement graphs for several annotations of the same sentences.

    Reads AMRs from args.infile.  Consecutive AMRs sharing the same ID are
    treated as annotations of one sentence: the first is the gold AMR, the
    rest are compared against it (a lone AMR is compared with itself).  For
    each comparison a PNG is drawn into args.outdir and, when requested via
    args.json_out / args.align_out, the graph and the text alignments are
    written too.  An input without any AMR produces no output.
    """
    infile = codecs.open(args.infile, encoding='utf8')
    gold_aligned_fh = None
    json_fh = None
    align_fh = None
    try:
        # NOTE(review): this handle is opened but never passed to
        # hilight_disagreement here (xlang_main does pass it) -- confirm
        # whether --align_in is meant to be honoured in monolingual mode.
        if args.align_in:
            gold_aligned_fh = codecs.open(args.align_in, encoding='utf8')
        (json_fh, align_fh) = open_output_files(args)

        amrs_same_sent = []
        cur_id = ""
        while True:
            (amr_line, comments) = amr_metadata.get_amr_line(infile)
            cur_amr = None
            if amr_line:
                cur_amr = amr_metadata.AmrMeta.from_parse(amr_line, comments)
                get_sent_info(cur_amr.metadata)
                if 'annotator' not in cur_amr.metadata:
                    cur_amr.metadata['annotator'] = ''
                if not cur_id:
                    cur_id = cur_amr.metadata['id']

            # End of input or a new sentence ID closes the current group.
            if cur_amr is None or cur_id != cur_amr.metadata['id']:
                if amrs_same_sent:  # empty only when the input holds no AMR
                    _emit_sentence_group(args, cur_id, amrs_same_sent, json_fh, align_fh)
                amrs_same_sent = []
                if cur_amr is None:
                    break
                cur_id = cur_amr.metadata['id']

            amrs_same_sent.append(cur_amr)
    finally:
        # Release every handle even when parsing or drawing fails.
        infile.close()
        if gold_aligned_fh:
            gold_aligned_fh.close()
        close_output_files(json_fh, align_fh)


def xlang_main(args):
    """ Disagreement graphs for aligned cross-language AMR pairs.

    Reads args.src_amr and args.tgt_amr in lockstep; the n-th AMR of each
    file is taken to describe the same sentence.  Processing stops when the
    source file yields an empty AMR line.  For every pair a PNG named after
    the sentence ID is drawn into args.outdir and, when requested via
    args.json_out / args.align_out, the graph and the text alignments are
    written too.  When args.align_in is set, alignments are read from that
    file instead of being computed.
    """
    src_amr_fh = codecs.open(args.src_amr, encoding='utf8')
    tgt_amr_fh = None
    gold_aligned_fh = None
    json_fh = None
    align_fh = None
    try:
        tgt_amr_fh = codecs.open(args.tgt_amr, encoding='utf8')
        if args.align_in:
            gold_aligned_fh = codecs.open(args.align_in, encoding='utf8')
        (json_fh, align_fh) = open_output_files(args)

        aligner = Amr2AmrAligner(num_best=args.num_align_read, num_best_in_file=args.num_aligned_in_file)
        while True:
            (src_amr_line, src_comments) = amr_metadata.get_amr_line(src_amr_fh)
            if src_amr_line == "":
                break
            (tgt_amr_line, tgt_comments) = amr_metadata.get_amr_line(tgt_amr_fh)
            src_amr = amr_metadata.AmrMeta.from_parse(src_amr_line, src_comments, xlang=True)
            tgt_amr = amr_metadata.AmrMeta.from_parse(tgt_amr_line, tgt_comments, xlang=True)
            (cur_id, src_sent) = get_sent_info(src_amr.metadata)
            # The target falls back to the source ID when it carries none.
            (tgt_id, tgt_sent) = get_sent_info(tgt_amr.metadata, dflt_id=cur_id)
            assert cur_id == tgt_id

            # The source AMR plays the gold role, the target the test role.
            (amr_graphs, smatchgraphs) = hilight_disagreement([tgt_amr], src_amr, args.num_restarts, aligner=aligner, gold_aligned_fh=gold_aligned_fh)
            if json_fh:
                json_fh.write(json.dumps(amr_graphs[0]) + '\n')
            if align_fh:
                align_fh.write("""# ::id %s\n# ::src_snt %s\n# ::tgt_snt %s\n""" % (cur_id, src_sent, tgt_sent))
                align_fh.write('\n'.join(smatchgraphs[0].get_text_alignments()) + '\n\n')
            if (args.verbose):
                print("ID: %s\n Sentence: %s\n Sentence: %s\n Score: %f" % (cur_id, src_sent, tgt_sent, amr_graphs[0][1]))

            ag = nx.to_agraph(amr_graphs[0][0])
            ag.graph_attr['label'] = "%s\n%s" % (src_sent, tgt_sent)
            ag.layout(prog=args.layout)
            ag.draw('%s/%s.png' % (args.outdir, cur_id))
    finally:
        # Release every handle even when parsing or drawing fails.
        src_amr_fh.close()
        if tgt_amr_fh:
            tgt_amr_fh.close()
        if gold_aligned_fh:
            gold_aligned_fh.close()
        close_output_files(json_fh, align_fh)


if __name__ == '__main__':
    # Command-line entry point: parse options (optionally seeded from a
    # config file), make sure the output directory exists, then run the
    # bitext or the monolingual comparison.
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--conf_file", help="Specify config file")
    parser.add_argument('-i', '--infile', help='amr input file')
    parser.add_argument('-o', '--outdir', help='image output directory')
    parser.add_argument('-v', '--verbose', action='store_true')
    # The code below reads args.no_verbose; without this option that
    # attribute does not exist and the script dies with AttributeError.
    parser.add_argument('--no-verbose', action='store_true',
                        help='Turn verbose output off, overriding -v/--verbose.')
    parser.add_argument('-s', '--src_amr',
                        help='In bitext mode, source language AMR file.')
    parser.add_argument('-t', '--tgt_amr',
                        help='In bitext mode, target language AMR file.')
    parser.add_argument('--align_src2tgt',
                        help='In bitext mode, GIZA alignment .NBEST file (see GIZA++ -nbestalignments opt) with source as vcb1.')
    parser.add_argument('--align_tgt2src',
                        help='In bitext mode, GIZA alignment .NBEST file (see GIZA++ -nbestalignments opt) with target as vcb1.')
    parser.add_argument('--num_align_read', type=int,
                        help='N to read from GIZA NBEST file.')
    parser.add_argument('--num_aligned_in_file', type=int, default=1,
                        help='N printed to GIZA NBEST file.')
    parser.add_argument('-j', '--json_out',
                        help='File to dump json graphs to.')
    parser.add_argument('--num_restarts', type=int, default=5,
                        help='Number of random restarts to execute during hill-climbing algorithm.')
    parser.add_argument('--align_out',
                        help="Human-readable alignments output file")
    parser.add_argument('--align_in',
                        help="Alignments from human-editable text file, as from align_out")
    parser.add_argument('--layout', default='dot',
                        help='Graphviz output layout')
    # TODO make interactive option and option to process a specific range

    # First pass only to discover the config file; the command line is
    # parsed again afterwards.
    # NOTE(review): presumably read_config_file installs the config values
    # as parser defaults -- confirm against argparse_config.
    args_conf = parser.parse_args()
    if args_conf.conf_file:
        argparse_config.read_config_file(parser, args_conf.conf_file)

    args = parser.parse_args()
    if args.no_verbose:
        args.verbose = False
    if not args.num_align_read:
        args.num_align_read = args.num_aligned_in_file

    # Fail with a usage message instead of a TypeError in os.path.exists.
    if not args.outdir:
        parser.error('an image output directory is required (-o/--outdir)')
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    if args.src_amr and args.tgt_amr:
        xlang_main(args)
    elif args.infile:
        # Only a single AMR file was given: compare its annotations with
        # each other instead of failing on the missing bitext files.
        monolingual_main(args)
    else:
        parser.error('give either -s/--src_amr with -t/--tgt_amr, or -i/--infile')

Loading

0 comments on commit a4db4a3

Please sign in to comment.