diff --git a/source/boxer_graph_module.py b/source/boxer_graph_module.py
new file mode 100644
index 0000000..71e2900
--- /dev/null
+++ b/source/boxer_graph_module.py
@@ -0,0 +1,599 @@
+#!/usr/bin/env python
+#===================================================================================
+#title          : boxer_graph_module.py                                            =
+#description    : Define the boxer graph class                                     =
+#author         : Shashi Narayan, shashi.narayan(at){ed.ac.uk,loria.fr,gmail.com}  =
+#date           : Created in 2014, later revised in April 2016.                    =
+#version        : 0.1                                                              =
+#===================================================================================
+
+import itertools
+import math
+import xml.etree.ElementTree as ET
+
+class Boxer_Graph:
+    def __init__(self):
+        '''
+        self.nodes[symbol] = {"positions":[], "predicates":[(predsym, locations)]}
+        self.relations[symbol] = {"positions":[], "predicates":""}
+        self.edges = [(par, dep, lab)]
+        '''
+        self.nodes = {}
+        self.relations = {}
+        self.edges = []
+
+    def isEmpty(self):
+        return len(self.nodes) == 0
+
+    def get_nodeset(self):
+        nodeset = self.nodes.keys()
+        nodeset.sort()
+        return nodeset
+
+    # @@@@@@@@@@@@@@@@@@@@@ Features extractor : Supporter functions @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+    def extract_oodword(self, oodnode, main_sent_dict):
+        # OOD nodes always carry exactly one position
+        position = self.nodes[oodnode]["positions"][0]
+        oodnode_word = main_sent_dict[position][0]
+        return oodnode_word
+
+    def extract_relword(self, relnode, main_sent_dict):
+        positions = self.relations[relnode]["positions"]
+        unique_pos = list(set(positions))
+
+        if len(unique_pos) == 0: # nn relation
+            # extract the positions from the dependent subgraph
+            depnode = -1
+            for edge in self.edges:
+                if edge[2] == relnode:
+                    depnode = edge[1]
+            if depnode == -1:
+                # no dependent found, so there are no words to recover
+                return ""
+            else:
+                subgraph_nodeset = self.extract_subgraph_nodeset([depnode], [])
+                unique_pos = self.extract_sentence_positions(subgraph_nodeset)
+
+        words = [main_sent_dict[pos][0] for pos in unique_pos if pos in main_sent_dict]
+        rel_string = " ".join(words)
+        return rel_string
+
+    def extract_relation_phrase(self, relnode, nodeset, main_sent_dict, filtered_mod_pos):
+        relation_span = self.extract_span_for_nodeset_with_rel(relnode, nodeset)
+        unique_pos = list(set(relation_span))
+        unique_valid_pos = [item for item in unique_pos if item not in filtered_mod_pos]
+        unique_valid_pos.sort()
+
+        words = [main_sent_dict[pos][0] for pos in unique_valid_pos if pos in main_sent_dict]
+        rel_phrase = " ".join(words)
+        return rel_phrase
+
+    def calculate_iLength(self, parent_sentence, child_sentence_list):
+        # Counts are done at the word level (whitespace split)
+        length_complex = len(parent_sentence.split())
+
+        avg_simple_sentlen = 0
+        for sent in child_sentence_list:
+            avg_simple_sentlen += len(sent.split())
+        avg_simple_sentlen = float(avg_simple_sentlen)/len(child_sentence_list)
+        iLength = int(math.ceil(length_complex/avg_simple_sentlen))
+        return iLength
+
+    def get_pattern_4_split_candidate(self, split_tuple):
+        pattern_list = []
+        for node in split_tuple:
+            rel_pattern = []
+            for edge in self.edges:
+                if edge[0] == node:
+                    relnode = edge[2]
+                    relpred = self.relations[relnode]["predicates"]
+                    rel_pattern.append(relpred)
+            rel_pattern.sort()
+            pattern_list.append(rel_pattern)
+        pattern_list.sort()
+        pattern = ""
+        for item in pattern_list:
+            if len(item) == 0:
+                pattern += "NULL_"
+            else:
+                pattern += ("-".join(item)+"_")
+        pattern = pattern[:-1]
+        return pattern
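+
+    # Illustrative sketch (added for exposition, not part of the original
+    # module): a toy graph for "John ate an apple ." might be populated as
+    # below, with positions indexing main_sent_dict entries assumed to be
+    # (word, tag) pairs. The symbols x1/e2/x3/r1/r2 are hypothetical.
+    #
+    #   g = Boxer_Graph()
+    #   g.nodes["x1"] = {"positions": [0], "predicates": [("john", [0])]}
+    #   g.nodes["e2"] = {"positions": [1], "predicates": [("event", [1]), ("eat", [1])]}
+    #   g.nodes["x3"] = {"positions": [3], "predicates": [("apple", [3])]}
+    #   g.relations["r1"] = {"positions": [], "predicates": "agent"}
+    #   g.relations["r2"] = {"positions": [], "predicates": "patient"}
+    #   g.edges = [("e2", "x1", "r1"), ("e2", "x3", "r2")]
+    #
+    #   g.get_pattern_4_split_candidate(("e2",))  # -> "agent-patient"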
+
+    # @@@@@@@@@@@@@@@@@@@@@ Candidates extractor @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+    def extract_split_candidate_tuples(self, nodeset, MAX_SPLIT_PAIR_SIZE):
+        # Collect event nodes that are parents and distinct
+        parent_event_nodes = []
+        # Extract all children nodes
+        children_nodes = [edge[1] for edge in self.edges]
+        for node in nodeset:
+            preds = [item[0] for item in self.nodes[node]["predicates"]]
+            if "event" in preds:
+                # Check for parent nodes
+                if node not in children_nodes:
+                    # Keep only nodes with at least one agent, theme, eq or patient dependent relation
+                    rel_pattern = []
+                    for edge in self.edges:
+                        if edge[0] == node:
+                            relnode = edge[2]
+                            relpred = self.relations[relnode]["predicates"]
+                            rel_pattern.append(relpred)
+                    if ("agent" in rel_pattern) or ("theme" in rel_pattern) or ("eq" in rel_pattern) or ("patient" in rel_pattern):
+                        parent_event_nodes.append(node)
+
+        parent_distinct_event_nodes_span = []
+        # Remove homomorphic pairs (nodes whose subgraphs cover the same span)
+        for node in parent_event_nodes:
+            subgraph_nodeset = self.extract_subgraph_nodeset([node], [])
+            subgraph_nodeset_filtered = [item for item in subgraph_nodeset if item in nodeset]
+            span = self.extract_span_for_nodeset(subgraph_nodeset_filtered)
+            flag = False
+            for tnode_span in parent_distinct_event_nodes_span:
+                if span == tnode_span[1]:
+                    flag = True
+                    break
+            if not flag:
+                parent_distinct_event_nodes_span.append((node, span))
+        parent_distinct_event_nodes = [item[0] for item in parent_distinct_event_nodes_span]
+        parent_distinct_event_nodes.sort()
+
+        split_candidate_tuples = []
+        for splitsize in range(2, MAX_SPLIT_PAIR_SIZE+1):
+            split_candidate_tuples += list(itertools.combinations(parent_distinct_event_nodes, splitsize))
+        return split_candidate_tuples
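+
+    # For exposition (hypothetical symbols): with distinct parent event nodes
+    # ["e2", "e7", "e9"] and MAX_SPLIT_PAIR_SIZE = 3, the candidates are all
+    # unordered tuples of size 2 and 3:
+    #   [("e2", "e7"), ("e2", "e9"), ("e7", "e9"), ("e2", "e7", "e9")]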
+
+    def extract_drop_rel_candidates(self, nodeset, RESTRICTED_DROP_REL, processed_relnode):
+        # potential edges
+        potential_edges = []
+        for edge in self.edges:
+            parentnode = edge[0]
+            depnode = edge[1]
+            if (parentnode in nodeset) and (depnode in nodeset):
+                potential_edges.append(edge)
+        # Extract all children nodes
+        children_nodes = [edge[1] for edge in potential_edges]
+        # Select all parents in the nodeset
+        nodeset_to_process = []
+        depthset_to_process = []
+        for node in nodeset:
+            # Check for parent nodes
+            if node not in children_nodes:
+                nodeset_to_process.append(node)
+                depthset_to_process.append(0)
+        # Find relation nodes with their depth
+        relation_depth = self.extract_relationnode_depth(nodeset_to_process, depthset_to_process, [], [], potential_edges)
+        # Sort relations bottom-up (deepest first): edit distance prefers to
+        # drop the longer span, so try the smaller one first.
+        relation_depth.sort(reverse=True)
+
+        # Filtering out RESTRICTED_DROP_REL and processed_relnode
+        relcand_set = []
+        for item in relation_depth:
+            relnode = item[1]
+            relpred = self.relations[relnode]["predicates"]
+            if (relpred not in RESTRICTED_DROP_REL) and (relnode not in processed_relnode):
+                relcand_set.append(relnode)
+
+        # Removing relnodes whose dependents are connected by non-droppable relations
+        relcand_set_filtered = []
+        for relnode in relcand_set:
+            # Find the dependent nodeset
+            dep_node = -1
+            for edge in potential_edges:
+                if edge[2] == relnode:
+                    dep_node = edge[1]
+
+            subgraph_nodeset = self.extract_subgraph_nodeset([dep_node], [])
+            subgraph_nodeset_filtered = [item for item in subgraph_nodeset if item in nodeset]
+            edges_connecting_subgraph_nodeset = self.extract_edges_super_subgraph(nodeset, subgraph_nodeset_filtered)
+
+            flag = True
+            for edge in edges_connecting_subgraph_nodeset:
+                if self.relations[edge[2]]["predicates"] in RESTRICTED_DROP_REL:
+                    flag = False
+                    break
+            if flag:
+                relcand_set_filtered.append(relnode)
+
+        # Removing homomorphic relations (relations covering an identical span)
+        relcand_span_uniq = []
+        for relcand in relcand_set_filtered:
+            relcand_span = self.extract_span_for_nodeset_with_rel(relcand, nodeset)
+            flag = False
+            for trelcand_span_tuple in relcand_span_uniq:
+                if relcand_span == trelcand_span_tuple[1]:
+                    flag = True
+                    break
+            if not flag:
+                relcand_span_uniq.append((relcand, relcand_span))
+
+        relcand_uniq = [item[0] for item in relcand_span_uniq]
+        return relcand_uniq
+
+    def extract_drop_mod_candidates(self, nodeset, main_sent_dict, ALLOWED_DROP_MOD, processed_mod_pos):
+        modcand_set = []
+
+        local_processed_mod_pos = [] # two homomorphic nodes can share a position; consider it only once
+
+        for node in nodeset:
+            positions = self.nodes[node]["positions"]
+            for position in positions:
+                if (position not in processed_mod_pos) and (position not in local_processed_mod_pos):
+                    if main_sent_dict[position][1] in ALLOWED_DROP_MOD:
+                        modcand_set.append((position, node))
+                        local_processed_mod_pos.append(position)
+                        #print main_sent_dict[position]
+        return modcand_set
+
+    def extract_ood_candidates(self, nodeset, processed_oodnodes):
+        oodnode_set = [itemnode_name for itemnode_name in nodeset if itemnode_name.startswith("OOD") and itemnode_name not in processed_oodnodes]
+        oodnode_set.sort()
+        return oodnode_set
+
+    # @@@@@@@@@@@@@@@@@@@@@ Boxer Graph Processing Functions @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+    def extract_relationnode_depth(self, nodeset_to_process, depthset_to_process, relation_depth, nodes_processed, edges):
+        if len(nodeset_to_process) == 0:
+            return relation_depth
+
+        node = nodeset_to_process[0]
+        depth = depthset_to_process[0]
+        nodes_processed.append(node)
+
+        for edge in edges:
+            parent = edge[0]
+            dependent = edge[1]
+            relnode = edge[2]
+            if parent == node:
+                relation_depth.append((depth, relnode))
+                if (dependent not in nodeset_to_process) and (dependent not in nodes_processed):
+                    nodeset_to_process.append(dependent)
+                    depthset_to_process.append(depth+1)
+        relation_depth = self.extract_relationnode_depth(nodeset_to_process[1:], depthset_to_process[1:], relation_depth, nodes_processed, edges)
+        return relation_depth
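+
+    # For exposition (hypothetical symbols): a chain e2 -r1-> x3 -r4-> x5
+    # explored from the root e2 yields [(0, "r1"), (1, "r4")]; after the
+    # sort(reverse=True) in extract_drop_rel_candidates, the deepest
+    # relation "r4" is considered first.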
+
+    def extract_span_for_nodeset_with_rel(self, rel_node, nodeset):
+        span = self.relations[rel_node]["positions"][:]
+        dep_node = -1
+        for edge in self.edges:
+            if edge[2] == rel_node:
+                dep_node = edge[1]
+        if dep_node != -1:
+            subgraph_nodeset = self.extract_subgraph_nodeset([dep_node], [])
+            subgraph_nodeset_filtered = [item for item in subgraph_nodeset if item in nodeset]
+            span += self.extract_span_for_nodeset(subgraph_nodeset_filtered)
+        unique_pos = list(set(span))
+        unique_pos.sort()
+        return unique_pos
+
+    def extract_span_for_nodeset(self, nodeset):
+        span = []
+        for node in nodeset:
+            positions = self.nodes[node]["positions"]
+            span += positions
+        for edge in self.edges:
+            rel = edge[2]
+            parnode = edge[0]
+            depnode = edge[1]
+            if (parnode in nodeset) and (depnode in nodeset):
+                positions = self.relations[rel]["positions"]
+                span += positions
+        unique_pos = list(set(span))
+        unique_pos.sort()
+        return unique_pos
+
+    def extract_parent_subgraph_nodeset_dict(self):
+        # Calculate parents
+        parents_subgraph_nodeset_dict = {}
+        # Extract all children nodes
+        children_nodes = [edge[1] for edge in self.edges]
+        for node in self.nodes:
+            # Check for parent nodes
+            if node not in children_nodes:
+                parent_node = node
+                subgraph_nodeset = self.extract_subgraph_nodeset([parent_node], [])
+                parents_subgraph_nodeset_dict[parent_node] = subgraph_nodeset
+        return parents_subgraph_nodeset_dict
+
+    def extract_subgraph_nodeset(self, node_2_process_set, subgraph_nodeset):
+        if len(node_2_process_set) == 0:
+            return subgraph_nodeset
+        else:
+            nodename = node_2_process_set[0]
+            subgraph_nodeset.append(nodename)
+            for edge in self.edges:
+                if edge[0] == nodename:
+                    depnode = edge[1]
+                    if (depnode not in node_2_process_set) and (depnode not in subgraph_nodeset):
+                        node_2_process_set.append(depnode)
+            subgraph_nodeset = self.extract_subgraph_nodeset(node_2_process_set[1:], subgraph_nodeset)
+            return subgraph_nodeset
+
+    def extract_main_sentence(self, nodeset, main_sent_dict, filtered_mod_pos):
+        span = []
+        for node in nodeset:
+            positions = self.nodes[node]["positions"]
+            span += positions
+        for edge in self.edges:
+            rel = edge[2]
+            parnode = edge[0]
+            depnode = edge[1]
+            if (parnode in nodeset) and (depnode in nodeset):
+                positions = self.relations[rel]["positions"]
+                span += positions
+        unique_pos = list(set(span))
+        unique_valid_pos = [item for item in unique_pos if item not in filtered_mod_pos]
+        unique_valid_pos.sort()
+
+        words = [main_sent_dict[pos][0] for pos in unique_valid_pos if pos in main_sent_dict]
+        main_sentence = " ".join(words)
+        return main_sentence
+
+    def extract_span_min_max(self, nodeset):
+        span = []
+        for node in nodeset:
+            positions = self.nodes[node]["positions"]
+            span += positions
+        for edge in self.edges:
+            rel = edge[2]
+            parnode = edge[0]
+            depnode = edge[1]
+            if (parnode in nodeset) and (depnode in nodeset):
+                positions = self.relations[rel]["positions"]
+                span += positions
+        unique_pos = list(set(span))
+        unique_pos.sort()
+
+        if len(unique_pos) == 0:
+            return (-1, -1)
+        else:
+            return (unique_pos[0], unique_pos[-1])
+
+    def extract_sentence_positions(self, nodeset):
+        span = []
+        for node in nodeset:
+            positions = self.nodes[node]["positions"]
+            span += positions
+        for edge in self.edges:
+            rel = edge[2]
+            parnode = edge[0]
+            depnode = edge[1]
+            if (parnode in nodeset) and (depnode in nodeset):
+                positions = self.relations[rel]["positions"]
+                span += positions
+        unique_pos = list(set(span))
+        return unique_pos
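+
+    # For exposition (hypothetical symbols): with edges
+    # [("e2", "x1", "r1"), ("e2", "x3", "r2"), ("x3", "x5", "r6")],
+    # extract_subgraph_nodeset(["e2"], []) visits breadth-first and returns
+    # ["e2", "x1", "x3", "x5"].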
+
+    def extract_edges_super_subgraph(self, super_nodeset, sub_nodeset):
+        # Edges crossing from the rest of the super-graph into the subgraph
+        connecting_edges = []
+        for edge in self.edges:
+            rel = edge[2]
+            parnode = edge[0]
+            depnode = edge[1]
+            if (parnode in super_nodeset) and (parnode not in sub_nodeset) and (depnode in super_nodeset) and (depnode in sub_nodeset):
+                connecting_edges.append(edge)
+        return connecting_edges
+
+    # @@@@@@@@@@@@@@@@@@@@@@ Node set changing operations @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+    def partition_drs_for_successful_candidate(self, split_candidate, parent_subgraph_nodeset_dict):
+        node_subgraph_nodeset_dict = {}
+        node_span_dict = {}
+        for node in split_candidate:
+            node_subgraph_nodeset_dict[node] = parent_subgraph_nodeset_dict[node][:]
+            node_span_dict[node] = self.extract_span_min_max(parent_subgraph_nodeset_dict[node])
+        # print "node_span_dict : "+str(node_span_dict)
+
+        # Normal nodes attachment with their increasing span
+        span_normalnodes = [(self.extract_span_min_max(parent_subgraph_nodeset_dict[nodename]), nodename)
+                            for nodename in parent_subgraph_nodeset_dict if nodename.startswith("x") and nodename not in split_candidate]
+        span_normalnodes.sort()
+        for item in span_normalnodes:
+            span_subgraph = item[0]
+            parent_subgraph = item[1]
+            self.attach_a_subgraph(node_subgraph_nodeset_dict, node_span_dict, parent_subgraph, span_subgraph, parent_subgraph_nodeset_dict)
+
+        # Extra nodes attachment with their increasing span
+        span_extranodes = [(self.extract_span_min_max(parent_subgraph_nodeset_dict[nodename]), nodename)
+                           for nodename in parent_subgraph_nodeset_dict if nodename.startswith("E") and nodename not in split_candidate]
+        span_extranodes.sort()
+        for item in span_extranodes:
+            span_subgraph = item[0]
+            parent_subgraph = item[1]
+            self.attach_a_subgraph(node_subgraph_nodeset_dict, node_span_dict, parent_subgraph, span_subgraph, parent_subgraph_nodeset_dict)
+
+        # OOD (out of discourse) nodes attachment with their increasing span
+        span_oodnodes = [(self.extract_span_min_max(parent_subgraph_nodeset_dict[nodename]), nodename)
+                         for nodename in parent_subgraph_nodeset_dict if nodename.startswith("OOD") and nodename not in split_candidate]
+        span_oodnodes.sort()
+        for item in span_oodnodes:
+            span_subgraph = item[0]
+            parent_subgraph = item[1]
+            self.attach_a_subgraph(node_subgraph_nodeset_dict, node_span_dict, parent_subgraph, span_subgraph, parent_subgraph_nodeset_dict)
+
+        return node_subgraph_nodeset_dict, node_span_dict
+
+    def attach_a_subgraph(self, node_subgraph_nodeset_dict, node_span_dict, parent_subgraph, span_subgraph, parent_subgraph_nodeset_dict):
+        # Finding the closest split node to attach to (by mean span position)
+        mean_subgraph = float(span_subgraph[0]+span_subgraph[1])/2
+        mean_nodes = [(float(node_span_dict[node][0]+node_span_dict[node][1])/2, node) for node in node_span_dict]
+        distance_from_nodes = [(abs(item[0]-mean_subgraph), item[1]) for item in mean_nodes]
+        distance_from_nodes.sort()
+        required_node = distance_from_nodes[0][1]
+
+        # Updating nodeset and span
+        node_subgraph_nodeset_dict[required_node] = list(set(node_subgraph_nodeset_dict[required_node]+parent_subgraph_nodeset_dict[parent_subgraph]))
+        node_span_dict[required_node] = self.extract_span_min_max(node_subgraph_nodeset_dict[required_node])
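+
+    # For exposition (hypothetical values): if the split nodes cover spans
+    # (0, 7) and (9, 15), their means are 3.5 and 12.0. An unattached
+    # subgraph spanning (10, 12) has mean 11.0, so |11.0-12.0| = 1.0 beats
+    # |11.0-3.5| = 7.5 and the subgraph is merged into the second split node.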
+
+    def drop_relation(self, nodeset, relnode_to_process, filtered_mod_pos):
+        nodeset_to_drop = []
+        filtered_mod_pos_new = filtered_mod_pos[:]
+
+        depnode = -1
+        for edge in self.edges:
+            if edge[2] == relnode_to_process:
+                depnode = edge[1]
+        if depnode != -1:
+            subgraph_nodeset = self.extract_subgraph_nodeset([depnode], [])
+            nodeset_to_drop += subgraph_nodeset[:]
+
+        # Span
+        relnode_span = self.extract_span_for_nodeset_with_rel(relnode_to_process, nodeset)
+
+        # filtering out positions
+        filtered_mod_pos_new += relnode_span[:]
+        filtered_mod_pos_final = list(set(filtered_mod_pos_new))
+        filtered_mod_pos_final.sort()
+
+        # Drop all homomorphic relations as well
+        for edge in self.edges:
+            trelnode = edge[2]
+            parent = edge[0]
+            dependent = edge[1]
+            if (trelnode != relnode_to_process) and (parent in nodeset) and (dependent in nodeset):
+                trelnode_span = self.extract_span_for_nodeset_with_rel(trelnode, nodeset)
+                if trelnode_span == relnode_span:
+                    # homomorphic
+                    subgraph_nodeset = self.extract_subgraph_nodeset([dependent], [])
+                    nodeset_to_drop += subgraph_nodeset[:]
+
+        filtered_nodeset = [node for node in nodeset if node not in nodeset_to_drop]
+        filtered_nodeset.sort()
+
+        return filtered_nodeset, filtered_mod_pos_final
+
+    # @@@@@@@@@@@@@@@@@@@@@@ Boxer Graph -> Elementary Tree @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+    def convert_to_elementarytree(self):
+        # Writing Discourse Data : nodes, relations, edges
+        boxer = ET.Element("box")
+
+        nodes = ET.SubElement(boxer, "nodes")
+        for node in self.nodes:
+            bnode = ET.SubElement(nodes, "node")
+            bnode.attrib = {"sym":node}
+
+            # Span positions
+            span = ET.SubElement(bnode, "span")
+            positions = self.nodes[node]["positions"]
+            positions.sort()
+            for pos in positions:
+                locelt = ET.SubElement(span, "loc")
+                locelt.attrib = {"id":str(pos)}
+
+            # Predicates
+            predicates = self.nodes[node]["predicates"]
+            predselt = ET.SubElement(bnode, "preds")
+            for predtuple in predicates:
+                predname = predtuple[0]
+                predelt = ET.SubElement(predselt, "pred")
+                predelt.attrib = {"sym":predname}
+
+                predpositions = predtuple[1]
+                predpositions.sort()
+                for predpos in predpositions:
+                    predlocelt = ET.SubElement(predelt, "loc")
+                    predlocelt.attrib = {"id":str(predpos)}
+
+        rels = ET.SubElement(boxer, "rels")
+        for rel in self.relations:
+            brel = ET.SubElement(rels, "rel")
+            brel.attrib = {"sym":rel}
+
+            relname = self.relations[rel]["predicates"]
+            predelt = ET.SubElement(brel, "pred")
+            predelt.attrib = {"sym":relname}
+
+            relpositions = self.relations[rel]["positions"]
+            relpositions.sort()
+            span = ET.SubElement(brel, "span")
+            for relpos in relpositions:
+                rellocelt = ET.SubElement(span, "loc")
+                rellocelt.attrib = {"id":str(relpos)}
+
+        edges = ET.SubElement(boxer, "edges")
+        for edge in self.edges:
+            edgeelt = ET.SubElement(edges, "edge")
+            edgeelt.attrib = {"lab":edge[2], "par":edge[0], "dep":edge[1]}
+
+        return boxer
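+
+    # For exposition: serialising the toy graph sketched above would yield
+    # XML of roughly this shape (attribute order may differ):
+    #
+    #   <box>
+    #     <nodes>
+    #       <node sym="x1"><span><loc id="0"/></span>
+    #         <preds><pred sym="john"><loc id="0"/></pred></preds></node>
+    #       ...
+    #     </nodes>
+    #     <rels><rel sym="r1"><pred sym="agent"/><span/></rel>...</rels>
+    #     <edges><edge lab="r1" par="e2" dep="x1"/>...</edges>
+    #   </box>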
+
+    # @@@@@@@@@@@@@@@@@@@@@@ Boxer Graph -> Dot Node @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+    def convert_to_dotstring(self, sentid, main_sentence, main_sent_dict, simple_sentences):
+        dot_string = "digraph boxer{\n"
+
+        # Creating root node
+        nodename = 0
+        textdot_root, nodename = self.textdot_root_node(nodename, sentid, main_sentence, main_sent_dict, simple_sentences)
+        dot_string += textdot_root+"\n"
+        # Creating all boxer nodes
+        node_graph_dict = {}
+        for node in self.nodes:
+            textdot_node, nodename = self.textdot_node(nodename, node, self.nodes[node]["positions"], self.nodes[node]["predicates"])
+            node_graph_dict[node] = "struct"+str(nodename)
+            dot_string += textdot_node+"\n"
+        # Creating edges
+        for edge in self.edges:
+            reldata = edge[2]+"-"+self.relations[edge[2]]["predicates"]+"-"+str(self.relations[edge[2]]["positions"])
+            par_boxergraph = node_graph_dict[edge[0]]
+            dep_boxergraph = node_graph_dict[edge[1]]
+            dot_string += par_boxergraph+" -> "+dep_boxergraph+"[label=\""+reldata+"\"];\n"
+
+        # Extracting parents
+        parents_subgraph_nodeset_dict = self.extract_parent_subgraph_nodeset_dict()
+        #print parents_subgraph_nodeset_dict
+
+        # Connect all parents to root
+        for parent in parents_subgraph_nodeset_dict:
+            par_boxergraph = node_graph_dict[parent]
+            dot_string += "struct1 -> "+par_boxergraph+";\n"
+        dot_string += "}"
+        return dot_string
+
+    def textdot_root_node(self, nodename, sentid, main_sentence, main_sent_dict, simple_sentences):
+        textdot_root = "struct"+str(nodename+1)+" [shape=record,label=\"{"
+        textdot_root += "sentId: "+sentid+"|"
+        textdot_root += self.processtext("main: "+main_sentence)+"|"
+        for simple_sent in simple_sentences:
+            textdot_root += self.processtext("simple: "+simple_sent)+"|"
+
+        main_sent_dict_text = ""
+        positions = main_sent_dict.keys()
+        positions.sort()
+        for pos in positions:
+            main_sent_dict_text += str(pos)+":("+main_sent_dict[pos][0]+","+main_sent_dict[pos][1]+") "
+        textdot_root += self.processtext(main_sent_dict_text)
+        textdot_root += "}\"];"
+        return textdot_root, nodename+1
+
+    def textdot_node(self, nodename, node, positions, predicates):
+        textdot_node = "struct"+str(nodename+1)+" [shape=record,label=\"{"
+        textdot_node += "node: "+node+"|"
+        textdot_node += self.processtext(str(positions))+"|"
+        index = 0
+        for predicate_info in predicates:
+            textdot_node += predicate_info[0]+" "+self.processtext(str(predicate_info[1]))
+            index += 1
+            if index < len(predicates):
+                textdot_node += "|"
+        textdot_node += "}\"];"
+        return textdot_node, nodename+1
+
+    def processtext(self, inputstring):
+        # Hard-wrap long labels every 100 characters for dot record nodes
+        linesize = 100
+        outputstring = ""
+        index = 0
+        substr = inputstring[index*linesize:(index+1)*linesize]
+        while (substr != ""):
+            outputstring += substr
+            index += 1
+            substr = inputstring[index*linesize:(index+1)*linesize]
+            if substr != "":
+                outputstring += "\\n"
+        return outputstring
+
+    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ Done @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
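+
+# Usage sketch (added for exposition; the file names are hypothetical):
+# the dot output can be rendered with Graphviz, e.g.
+#
+#   g = Boxer_Graph()
+#   # ... populate g ...
+#   with open("sent1.dot", "w") as f:
+#       f.write(g.convert_to_dotstring("1", main_sentence, main_sent_dict, simple_sentences))
+#   # then: dot -Tpng sent1.dot -o sent1.png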
diff --git a/source/explore_training_graph.py b/source/explore_training_graph.py
new file mode 100644
index 0000000..44a1551
--- /dev/null
+++ b/source/explore_training_graph.py
@@ -0,0 +1,309 @@
+#!/usr/bin/env python
+#===================================================================================
+#title          : explore_training_graph.py                                        =
+#description    : Training graph explorer                                          =
+#author         : Shashi Narayan, shashi.narayan(at){ed.ac.uk,loria.fr,gmail.com}  =
+#date           : Created in 2014, later revised in April 2016.                    =
+#version        : 0.1                                                              =
+#===================================================================================
+
+
+from training_graph_module import Training_Graph
+import function_select_methods
+import functions_prepare_elementtree_dot
+
+class Explore_Training_Graph:
+    def __init__(self, output_stream, DISCOURSE_SENTENCE_MODEL, MAX_SPLIT_PAIR_SIZE,
+                 RESTRICTED_DROP_REL, ALLOWED_DROP_MOD, METHOD_TRAINING_GRAPH):
+        self.output_stream = output_stream
+
+        self.DISCOURSE_SENTENCE_MODEL = DISCOURSE_SENTENCE_MODEL
+        self.MAX_SPLIT_PAIR_SIZE = MAX_SPLIT_PAIR_SIZE
+        self.RESTRICTED_DROP_REL = RESTRICTED_DROP_REL
+        self.ALLOWED_DROP_MOD = ALLOWED_DROP_MOD
+        self.METHOD_TRAINING_GRAPH = METHOD_TRAINING_GRAPH
+
+        self.method_training_graph = function_select_methods.select_training_graph_method(self.METHOD_TRAINING_GRAPH)
+
+    def explore_training_graph(self, sentid, main_sentence, main_sent_dict, simple_sentences, boxer_graph):
+        # Start a training graph
+        training_graph = Training_Graph()
+        nodes_2_process = []
+
+        # Check if discourse information is available
+        if boxer_graph.isEmpty():
+            # Adding the finishing major node
+            nodeset = boxer_graph.get_nodeset()
+            filtered_mod_pos = []
+            majornode_data = ("fin", nodeset, simple_sentences, filtered_mod_pos)
+
+            # Creating major node
+            majornode_name, isNew = training_graph.create_majornode(majornode_data)
+            nodes_2_process.append(majornode_name) # isNew = True
+        else:
+            # DRS data is available for the main sentence
+            # Check to add the starting node
+            nodeset = boxer_graph.get_nodeset()
+            majornode_name, isNew = self.addition_major_node(main_sent_dict, simple_sentences, boxer_graph, training_graph, "split", nodeset, [], [])
+            nodes_2_process.append(majornode_name) # isNew = True
+
+        # Start expanding the training graph
+        self.expand_training_graph(nodes_2_process[:], main_sent_dict, boxer_graph, training_graph)
+
+        # Writing sentence element
+        functions_prepare_elementtree_dot.prepare_write_sentence_element(self.output_stream, sentid, main_sentence, main_sent_dict, simple_sentences, boxer_graph, training_graph)
+
+        # # Check to create visual representation
+        # if int(sentid) <= 100:
+        #     functions_prepare_elementtree_dot.run_visual_graph_creator(sentid, main_sentence, main_sent_dict, simple_sentences, boxer_graph, training_graph)
+
+    def expand_training_graph(self, nodes_2_process, main_sent_dict, boxer_graph, training_graph):
+        #print nodes_2_process
+        if len(nodes_2_process) == 0:
+            return
+
+        node_name = nodes_2_process[0]
+        operreq = training_graph.get_majornode_type(node_name)
+        nodeset = training_graph.get_majornode_nodeset(node_name)[:]
+        simple_sentences = training_graph.get_majornode_simple_sentences(node_name)[:]
+        oper_candidates = training_graph.get_majornode_oper_candidates(node_name)[:]
+        processed_oper_candidates = training_graph.get_majornode_processed_oper_candidates(node_name)[:]
+        # NB: the "postions" spelling matches the accessor name defined in training_graph_module
+        filtered_positions = training_graph.get_majornode_filtered_postions(node_name)[:]
+
+        if operreq == "split":
+            split_candidate_tuples = oper_candidates
+            nodes_2_process = self.process_split_node_training_graph(node_name, nodeset, simple_sentences, split_candidate_tuples,
+                                                                     nodes_2_process, main_sent_dict, boxer_graph, training_graph)
+
+        if operreq == "drop-rel":
+            relnode_candidates = oper_candidates
+            processed_relnode_candidates = processed_oper_candidates
+            filtered_mod_pos = filtered_positions
+            nodes_2_process = self.process_droprel_node_training_graph(node_name, nodeset, simple_sentences, relnode_candidates, processed_relnode_candidates, filtered_mod_pos,
+                                                                       nodes_2_process, main_sent_dict, boxer_graph, training_graph)
+
+        if operreq == "drop-mod":
+            mod_candidates = oper_candidates
+            processed_mod_pos = processed_oper_candidates
+            filtered_mod_pos = filtered_positions
+            nodes_2_process = self.process_dropmod_node_training_graph(node_name, nodeset, simple_sentences, mod_candidates, processed_mod_pos, filtered_mod_pos,
+                                                                       nodes_2_process, main_sent_dict, boxer_graph, training_graph)
+
+        if operreq == "drop-ood":
+            oodnode_candidates = oper_candidates
+            processed_oodnode_candidates = processed_oper_candidates
+            filtered_mod_pos = filtered_positions
+            nodes_2_process = self.process_dropood_node_training_graph(node_name, nodeset, simple_sentences, oodnode_candidates, processed_oodnode_candidates, filtered_mod_pos,
+                                                                       nodes_2_process, main_sent_dict, boxer_graph, training_graph)
+
+        self.expand_training_graph(nodes_2_process[1:], main_sent_dict, boxer_graph, training_graph)
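+
+    # For exposition (hypothetical symbols): major nodes hold a state
+    # (nodeset, simple sentences, remaining candidates) and operation nodes
+    # record one decision. The expansion alternates between them, e.g.
+    #   MN1(split) --op:split(e2,e7)--> MN2(drop-rel) --op:drop-rel(r4)--> ...
+    # until a "fin" major node with no remaining candidates is reached.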
+
+    def process_split_node_training_graph(self, node_name, nodeset, simple_sentences, split_candidate_tuples, nodes_2_process, main_sent_dict, boxer_graph, training_graph):
+        split_candidate_results = []
+        splitAchieved = False
+        for split_candidate in split_candidate_tuples:
+            isValidSplit, split_results = self.method_training_graph.process_split_candidate_for_split(split_candidate, simple_sentences, main_sent_dict, boxer_graph)
+            # print "split_candidate : "+str(split_candidate) + " : " + str(isValidSplit)
+            split_candidate_results.append((isValidSplit, split_results))
+            if isValidSplit:
+                splitAchieved = True
+
+        if splitAchieved:
+            # At least one split candidate succeeded
+            for split_candidate, results_tuple in zip(split_candidate_tuples, split_candidate_results):
+                if results_tuple[0]:
+                    # Adding the operation node
+                    not_applied_cands = [item for item in split_candidate_tuples if item is not split_candidate]
+                    opernode_data = ("split", split_candidate, not_applied_cands)
+                    opernode_name = training_graph.create_opernode(opernode_data)
+                    training_graph.create_edge((node_name, opernode_name, split_candidate))
+
+                    # Adding children major nodes
+                    for item in results_tuple[1]:
+                        child_nodeset = item[1]
+                        child_nodeset.sort()
+                        parent_child_nodeset = item[2]
+                        simple_sentence = item[3]
+
+                        # Check for adding drop-rel or subsequent nodes
+                        child_majornode_name, isNew = self.addition_major_node(main_sent_dict, [simple_sentence], boxer_graph, training_graph, "drop-rel", child_nodeset, [], [])
+                        if isNew:
+                            nodes_2_process.append(child_majornode_name)
+                        training_graph.create_edge((opernode_name, child_majornode_name, parent_child_nodeset))
+
+        else:
+            # None of the split candidates succeeded; adding the operation node
+            not_applied_cands = split_candidate_tuples[:]
+            opernode_data = ("split", None, not_applied_cands)
+            opernode_name = training_graph.create_opernode(opernode_data)
+            training_graph.create_edge((node_name, opernode_name, None))
+
+            # Check for adding drop-rel, drop-mod or fin nodes
+            child_nodeset = nodeset
+            child_majornode_name, isNew = self.addition_major_node(main_sent_dict, simple_sentences, boxer_graph, training_graph, "drop-rel", child_nodeset, [], [])
+            if isNew:
+                nodes_2_process.append(child_majornode_name)
+            training_graph.create_edge((opernode_name, child_majornode_name, None))
+
+        return nodes_2_process
+
+    def process_droprel_node_training_graph(self, node_name, nodeset, simple_sentences, relnode_set, processed_relnode, filtered_mod_pos, nodes_2_process, main_sent_dict, boxer_graph, training_graph):
+        relnode_to_process = relnode_set[0]
+        processed_relnode.append(relnode_to_process)
+
+        isValidDrop = self.method_training_graph.process_rel_candidate_for_drop(relnode_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph)
+        if isValidDrop:
+            # Drop this rel node; adding the operation node
+            opernode_data = ("drop-rel", relnode_to_process, "True")
+            opernode_name = training_graph.create_opernode(opernode_data)
+            training_graph.create_edge((node_name, opernode_name, relnode_to_process))
+
+            # Check for adding rel or subsequent nodes (nodeset is changed)
+            child_nodeset, child_filtered_mod_pos = boxer_graph.drop_relation(nodeset, relnode_to_process, filtered_mod_pos)
+            child_majornode_name, isNew = self.addition_major_node(main_sent_dict, simple_sentences, boxer_graph, training_graph, "drop-rel", child_nodeset, processed_relnode, child_filtered_mod_pos)
+            if isNew:
+                nodes_2_process.append(child_majornode_name)
+            training_graph.create_edge((opernode_name, child_majornode_name, "True"))
+        else:
+            # Don't drop this rel node; adding the operation node
+            opernode_data = ("drop-rel", relnode_to_process, "False")
+            opernode_name = training_graph.create_opernode(opernode_data)
+            training_graph.create_edge((node_name, opernode_name, relnode_to_process))
+
+            # Check for adding rel or subsequent nodes (nodeset is unchanged)
+            child_nodeset = nodeset
+            child_filtered_mod_pos = filtered_mod_pos
+            child_majornode_name, isNew = self.addition_major_node(main_sent_dict, simple_sentences, boxer_graph, training_graph, "drop-rel", child_nodeset, processed_relnode, child_filtered_mod_pos)
+            if isNew:
+                nodes_2_process.append(child_majornode_name)
+            training_graph.create_edge((opernode_name, child_majornode_name, "False"))
+
+        return nodes_2_process
+
+    def process_dropmod_node_training_graph(self, node_name, nodeset, simple_sentences, modcand_set, processed_mod_pos, filtered_mod_pos, nodes_2_process, main_sent_dict, boxer_graph, training_graph):
+        modcand_to_process = modcand_set[0]
+        modcand_position_to_process = modcand_to_process[0]
+        processed_mod_pos.append(modcand_position_to_process)
+
+        isValidDrop = self.method_training_graph.process_mod_candidate_for_drop(modcand_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph)
+        if isValidDrop:
+            # Drop this mod position; adding the operation node
+            opernode_data = ("drop-mod", modcand_to_process, "True")
+            opernode_name = training_graph.create_opernode(opernode_data)
+            training_graph.create_edge((node_name, opernode_name, modcand_to_process))
+
+            # Check for adding mod or subsequent nodes (nodeset is unchanged)
+            child_nodeset = nodeset
+            filtered_mod_pos.append(modcand_position_to_process)
+            child_majornode_name, isNew = self.addition_major_node(main_sent_dict, simple_sentences, boxer_graph, training_graph, "drop-mod", child_nodeset, processed_mod_pos, filtered_mod_pos)
+            if isNew:
+                nodes_2_process.append(child_majornode_name)
+            training_graph.create_edge((opernode_name, child_majornode_name, "True"))
+        else:
+            # Don't drop this position; adding the operation node
+            opernode_data = ("drop-mod", modcand_to_process, "False")
+            opernode_name = training_graph.create_opernode(opernode_data)
+            training_graph.create_edge((node_name, opernode_name, modcand_to_process))
+
+            # Check for adding mod or subsequent nodes (nodeset is unchanged)
+            child_nodeset = nodeset
+            child_majornode_name, isNew = self.addition_major_node(main_sent_dict, simple_sentences, boxer_graph, training_graph, "drop-mod", child_nodeset, processed_mod_pos, filtered_mod_pos)
+            if isNew:
+                nodes_2_process.append(child_majornode_name)
+            training_graph.create_edge((opernode_name, child_majornode_name, "False"))
+        return nodes_2_process
+
+    def process_dropood_node_training_graph(self, node_name, nodeset, simple_sentences, oodnode_set, processed_oodnode, filtered_mod_pos, nodes_2_process, main_sent_dict, boxer_graph, training_graph):
+        oodnode_to_process = oodnode_set[0]
+        processed_oodnode.append(oodnode_to_process)
+
+        isValidDrop = self.method_training_graph.process_ood_candidate_for_drop(oodnode_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph)
+        if isValidDrop:
+            # Drop this OOD node; adding the operation node
+            opernode_data = ("drop-ood", oodnode_to_process, "True")
+            opernode_name = training_graph.create_opernode(opernode_data)
+            training_graph.create_edge((node_name, opernode_name, oodnode_to_process))
+
+            # Check for adding OOD or subsequent nodes (nodeset is changed;
+            # copy first to avoid mutating the caller's list)
+            child_nodeset = nodeset[:]
+            child_nodeset.remove(oodnode_to_process)
+            child_majornode_name, isNew = self.addition_major_node(main_sent_dict, simple_sentences, boxer_graph, training_graph, "drop-ood", child_nodeset, processed_oodnode, filtered_mod_pos)
+            if isNew:
+                nodes_2_process.append(child_majornode_name)
+            training_graph.create_edge((opernode_name, child_majornode_name, "True"))
+        else:
+            # Don't drop this OOD node; adding the operation node
+            opernode_data = ("drop-ood", oodnode_to_process, "False")
+            opernode_name = training_graph.create_opernode(opernode_data)
+            training_graph.create_edge((node_name, opernode_name, oodnode_to_process))
+
+            # Check for adding OOD or subsequent nodes (nodeset is unchanged)
+            child_nodeset = nodeset
+            child_majornode_name, isNew = self.addition_major_node(main_sent_dict, simple_sentences, boxer_graph, training_graph, "drop-ood", child_nodeset, processed_oodnode, filtered_mod_pos)
+            if isNew:
+                nodes_2_process.append(child_majornode_name)
+            training_graph.create_edge((opernode_name, child_majornode_name, "False"))
+
+        return nodes_2_process
+    def addition_major_node(self, main_sent_dict, simple_sentences, boxer_graph, training_graph, opertype, nodeset, processed_candidates, extra_data):
+        # Operations are tried in a fixed order: split, then drop-rel, then
+        # drop-mod, then drop-ood; the first one that still has candidates
+        # becomes the next major node, otherwise a "fin" node is created.
+        type_val = {"split": 1, "drop-rel": 2, "drop-mod": 3, "drop-ood": 4}
+        operval = type_val[opertype]
+
+        # Checking for the addition of a "split" major node
+        if operval <= type_val["split"]:
+            if opertype in self.DISCOURSE_SENTENCE_MODEL:
+                # Calculating split candidates - DRS graph node tuples
+                split_candidate_tuples = boxer_graph.extract_split_candidate_tuples(nodeset, self.MAX_SPLIT_PAIR_SIZE)
+                #print "split_candidate_tuples : " + str(split_candidate_tuples)
+
+                if len(split_candidate_tuples) != 0:
+                    # Adding the major node for split
+                    majornode_data = ("split", nodeset, simple_sentences, split_candidate_tuples)
+                    majornode_name, isNew = training_graph.create_majornode(majornode_data)
+                    return majornode_name, isNew
+
+        if operval <= type_val["drop-rel"]:
+            if opertype in self.DISCOURSE_SENTENCE_MODEL:
+                # Calculate drop-rel candidates
+                processed_relnode = processed_candidates if opertype == "drop-rel" else []
+                filtered_mod_pos = extra_data if opertype == "drop-rel" else []
+                relnode_set = boxer_graph.extract_drop_rel_candidates(nodeset, self.RESTRICTED_DROP_REL, processed_relnode)
+                if len(relnode_set) != 0:
+                    # Adding the major node for drop-rel
+                    majornode_data = ("drop-rel", nodeset, simple_sentences, relnode_set, processed_relnode, filtered_mod_pos)
+                    majornode_name, isNew = training_graph.create_majornode(majornode_data)
+                    return majornode_name, isNew
+
+        if operval <= type_val["drop-mod"]:
+            if opertype in self.DISCOURSE_SENTENCE_MODEL:
+                # Calculate drop-mod candidates
+                processed_mod_pos = processed_candidates if opertype == "drop-mod" else []
+                filtered_mod_pos = extra_data
+                modcand_set = boxer_graph.extract_drop_mod_candidates(nodeset, main_sent_dict, self.ALLOWED_DROP_MOD, processed_mod_pos)
+                if len(modcand_set) != 0:
+                    # Adding the major node for drop-mod
+                    majornode_data = ("drop-mod", nodeset, simple_sentences, modcand_set, processed_mod_pos, filtered_mod_pos)
+                    majornode_name, isNew = training_graph.create_majornode(majornode_data)
+                    return majornode_name, isNew
+
+        if operval <= type_val["drop-ood"]:
+            if opertype in self.DISCOURSE_SENTENCE_MODEL:
+                # Check for drop-OOD node candidates
+                processed_oodnodes = processed_candidates if opertype == "drop-ood" else []
+                filtered_mod_pos = extra_data
+                oodnode_candidates = boxer_graph.extract_ood_candidates(nodeset, processed_oodnodes)
+                if len(oodnode_candidates) != 0:
+                    # Adding the major node for drop-ood
+                    majornode_data = ("drop-ood", nodeset, simple_sentences, oodnode_candidates, processed_oodnodes, filtered_mod_pos)
+                    majornode_name, isNew = training_graph.create_majornode(majornode_data)
+                    return majornode_name, isNew
+
+        # None of them matched, create a "fin" node
+        filtered_mod_pos = extra_data
+        majornode_data = ("fin", nodeset, simple_sentences, filtered_mod_pos)
+        majornode_name, isNew = training_graph.create_majornode(majornode_data)
+        return majornode_name, isNew
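+
+# A minimal usage sketch (illustrative only; the real driver lives elsewhere,
+# and the configuration values shown here are assumed, not the project's
+# actual settings):
+#
+#   out = open("training-graphs.xml", "w")
+#   explorer = Explore_Training_Graph(out,
+#                                     ["split", "drop-rel", "drop-mod", "drop-ood"],
+#                                     2,                 # MAX_SPLIT_PAIR_SIZE
+#                                     ["agent"],         # RESTRICTED_DROP_REL (assumed)
+#                                     ["JJ", "RB"],      # ALLOWED_DROP_MOD (assumed)
+#                                     "method-0.5-lteq-lteq")
+#   explorer.explore_training_graph("1", main_sentence, main_sent_dict,
+#                                   simple_sentences, boxer_graph)
+#
+# The METHOD_TRAINING_GRAPH string is resolved by
+# function_select_methods.select_training_graph_method, e.g.
+# "method-0.5-lteq-lteq" -> Method_OVERLAP_LED(0.5, "lteq", "lteq").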
training_graph, "split", nodeset, [], []) + nodes_2_process.append(majornode_name) # isNew = True + + # Start expanding the training graph + self.expand_training_graph(nodes_2_process[:], main_sent_dict, boxer_graph, training_graph) + + # Writing sentence element + functions_prepare_elementtree_dot.prepare_write_sentence_element(self.output_stream, sentid, main_sentence, main_sent_dict, simple_sentences, boxer_graph, training_graph) + + # # Check to create visual representation + # if int(sentid) <= 100: + # functions_prepare_elementtree_dot.run_visual_graph_creator(sentid, main_sentence, main_sent_dict, simple_sentences, boxer_graph, training_graph) + + def expand_training_graph(self, nodes_2_process, main_sent_dict, boxer_graph, training_graph): + #print nodes_2_process + if len(nodes_2_process) == 0: + return + + node_name = nodes_2_process[0] + operreq = training_graph.get_majornode_type(node_name) + nodeset = training_graph.get_majornode_nodeset(node_name)[:] + simple_sentences = training_graph.get_majornode_simple_sentences(node_name)[:] + oper_candidates = training_graph.get_majornode_oper_candidates(node_name)[:] + processed_oper_candidates = training_graph.get_majornode_processed_oper_candidates(node_name)[:] + filtered_postions = training_graph.get_majornode_filtered_postions(node_name)[:] + + if operreq == "split": + split_candidate_tuples = oper_candidates + nodes_2_process = self.process_split_node_training_graph(node_name, nodeset, simple_sentences, split_candidate_tuples, + nodes_2_process, main_sent_dict, boxer_graph, training_graph) + + if operreq == "drop-rel": + relnode_candidates = oper_candidates + processed_relnode_candidates = processed_oper_candidates + filtered_mod_pos = filtered_postions + nodes_2_process = self.process_droprel_node_training_graph(node_name, nodeset, simple_sentences, relnode_candidates, processed_relnode_candidates, filtered_mod_pos, + nodes_2_process, main_sent_dict, boxer_graph, training_graph) + + if operreq == "drop-mod": + mod_candidates = oper_candidates + processed_mod_pos = processed_oper_candidates + filtered_mod_pos = filtered_postions + nodes_2_process = self.process_dropmod_node_training_graph(node_name, nodeset, simple_sentences, mod_candidates, processed_mod_pos, filtered_mod_pos, + nodes_2_process, main_sent_dict, boxer_graph, training_graph) + + if operreq == "drop-ood": + oodnode_candidates = oper_candidates + processed_oodnode_candidates = processed_oper_candidates + filtered_mod_pos = filtered_postions + nodes_2_process = self.process_dropood_node_training_graph(node_name, nodeset, simple_sentences, oodnode_candidates, processed_oodnode_candidates, filtered_mod_pos, + nodes_2_process, main_sent_dict, boxer_graph, training_graph) + + self.expand_training_graph(nodes_2_process[1:], main_sent_dict, boxer_graph, training_graph) + + def process_split_node_training_graph(self, node_name, nodeset, simple_sentences, split_candidate_tuples, nodes_2_process, main_sent_dict, boxer_graph, training_graph): + split_candidate_results = [] + splitAchieved = False + for split_candidate in split_candidate_tuples: + isValidSplit, split_results = self.method_training_graph.process_split_candidate_for_split(split_candidate, simple_sentences, main_sent_dict, boxer_graph) + # print "split_candidate : "+str(split_candidate) + " : " + str(isValidSplit) + split_candidate_results.append((isValidSplit, split_results)) + if isValidSplit: + splitAchieved = True + + if splitAchieved: + # At least one split candidate succeed + for split_candidate, 
results_tuple in zip(split_candidate_tuples, split_candidate_results): + if results_tuple[0] == True: + # Adding the operation node + not_applied_cands = [item for item in split_candidate_tuples if item is not split_candidate] + opernode_data = ("split", split_candidate, not_applied_cands) + opernode_name = training_graph.create_opernode(opernode_data) + training_graph.create_edge((node_name, opernode_name, split_candidate)) + + # Adding children major nodes + for item in results_tuple[1]: + child_nodeset = item[1] + child_nodeset.sort() + parent_child_nodeset = item[2] + simple_sentence = item[3] + + # Check for adding OOD or subsequent nodes + child_majornode_name, isNew = self.addition_major_node(main_sent_dict, [simple_sentence], boxer_graph, training_graph, "drop-rel", child_nodeset, [], []) + if isNew: + nodes_2_process.append(child_majornode_name) + training_graph.create_edge((opernode_name, child_majornode_name, parent_child_nodeset)) + + else: + # None of the split candidate succeed, adding the operation node + not_applied_cands = [item for item in split_candidate_tuples] + opernode_data = ("split", None, not_applied_cands) + opernode_name = training_graph.create_opernode(opernode_data) + training_graph.create_edge((node_name, opernode_name, None)) + + # Check for adding drop-rel or drop-mod or fin nodes + child_nodeset = nodeset + child_majornode_name, isNew = self.addition_major_node(main_sent_dict, simple_sentences, boxer_graph, training_graph, "drop-rel", child_nodeset, [], []) + if isNew: + nodes_2_process.append(child_majornode_name) + training_graph.create_edge((opernode_name, child_majornode_name, None)) + + return nodes_2_process + + def process_droprel_node_training_graph(self, node_name, nodeset, simple_sentences, relnode_set, processed_relnode, filtered_mod_pos, nodes_2_process, main_sent_dict, boxer_graph, training_graph): + relnode_to_process = relnode_set[0] + processed_relnode.append(relnode_to_process) + + isValidDrop = self.method_training_graph.process_rel_candidate_for_drop(relnode_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph) + if isValidDrop: + # Drop this rel node, adding the operation node + opernode_data = ("drop-rel", relnode_to_process, "True") + opernode_name = training_graph.create_opernode(opernode_data) + training_graph.create_edge((node_name, opernode_name, relnode_to_process)) + + # Check for adding REL or subsequent nodes, (nodeset is changed) + child_nodeset, child_filtered_mod_pos = boxer_graph.drop_relation(nodeset, relnode_to_process, filtered_mod_pos) + child_majornode_name, isNew = self.addition_major_node(main_sent_dict, simple_sentences, boxer_graph, training_graph, "drop-rel", child_nodeset, processed_relnode, child_filtered_mod_pos) + if isNew: + nodes_2_process.append(child_majornode_name) + training_graph.create_edge((opernode_name, child_majornode_name, "True")) + else: + # Dont drop this rel node, adding the operation node + opernode_data = ("drop-rel", relnode_to_process, "False") + opernode_name = training_graph.create_opernode(opernode_data) + training_graph.create_edge((node_name, opernode_name, relnode_to_process)) + + # Check for adding REL or subsequent nodes, (nodeset is unchanged) + child_nodeset = nodeset + child_filtered_mod_pos = filtered_mod_pos + child_majornode_name, isNew = self.addition_major_node(main_sent_dict, simple_sentences, boxer_graph, training_graph, "drop-rel", child_nodeset, processed_relnode, child_filtered_mod_pos) + if isNew: + 
nodes_2_process.append(child_majornode_name) + training_graph.create_edge((opernode_name, child_majornode_name, "False")) + + return nodes_2_process + + def process_dropmod_node_training_graph(self, node_name, nodeset, simple_sentences, modcand_set, processed_mod_pos, filtered_mod_pos, nodes_2_process, main_sent_dict, boxer_graph, training_graph): + modcand_to_process = modcand_set[0] + modcand_position_to_process = modcand_to_process[0] + processed_mod_pos.append(modcand_position_to_process) + + isValidDrop = self.method_training_graph.process_mod_candidate_for_drop(modcand_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph) + if isValidDrop: + # Drop this mod pos, adding the operation node + opernode_data = ("drop-mod", modcand_to_process, "True") + opernode_name = training_graph.create_opernode(opernode_data) + training_graph.create_edge((node_name, opernode_name, modcand_to_process)) + + # Check for adding mod and their subsequent nodes, (nodeset is not changed) + child_nodeset = nodeset + filtered_mod_pos.append(modcand_position_to_process) + child_majornode_name, isNew = self.addition_major_node(main_sent_dict, simple_sentences, boxer_graph, training_graph, "drop-mod", child_nodeset, processed_mod_pos, filtered_mod_pos) + if isNew: + nodes_2_process.append(child_majornode_name) + training_graph.create_edge((opernode_name, child_majornode_name, "True")) + else: + # Dont drop this pos, adding the operation node + opernode_data = ("drop-mod", modcand_to_process, "False") + opernode_name = training_graph.create_opernode(opernode_data) + training_graph.create_edge((node_name, opernode_name, modcand_to_process)) + + # Check for adding mod and their subsequent nodes, (nodeset is not changed) + child_nodeset = nodeset + child_majornode_name, isNew = self.addition_major_node(main_sent_dict, simple_sentences, boxer_graph, training_graph, "drop-mod", child_nodeset, processed_mod_pos, filtered_mod_pos) + if isNew: + nodes_2_process.append(child_majornode_name) + training_graph.create_edge((opernode_name, child_majornode_name, "False")) + return nodes_2_process + + def process_dropood_node_training_graph(self, node_name, nodeset, simple_sentences, oodnode_set, processed_oodnode, filtered_mod_pos, nodes_2_process, main_sent_dict, boxer_graph, training_graph): + + oodnode_to_process = oodnode_set[0] + processed_oodnode.append(oodnode_to_process) + + isValidDrop = self.method_training_graph.process_ood_candidate_for_drop(oodnode_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph) + if isValidDrop: + # Drop this ood node, adding the operation node + opernode_data = ("drop-ood", oodnode_to_process, "True") + opernode_name = training_graph.create_opernode(opernode_data) + training_graph.create_edge((node_name, opernode_name, oodnode_to_process)) + + # Check for adding OOD or subsequent nodes, (nodeset is changed) + child_nodeset = nodeset + child_nodeset.remove(oodnode_to_process) + child_majornode_name, isNew = self.addition_major_node(main_sent_dict, simple_sentences, boxer_graph, training_graph, "drop-ood", child_nodeset, processed_oodnode, filtered_mod_pos) + if isNew: + nodes_2_process.append(child_majornode_name) + training_graph.create_edge((opernode_name, child_majornode_name, "True")) + else: + # Dont drop this ood node, adding the operation node + opernode_data = ("drop-ood", oodnode_to_process, "False") + opernode_name = training_graph.create_opernode(opernode_data) + training_graph.create_edge((node_name, opernode_name, 
oodnode_to_process)) + + # Check for adding OOD or subsequent nodes, (nodeset is unchanged) + child_nodeset = nodeset + child_majornode_name, isNew = self.addition_major_node(main_sent_dict, simple_sentences, boxer_graph, training_graph, "drop-ood", child_nodeset, processed_oodnode, filtered_mod_pos) + if isNew: + nodes_2_process.append(child_majornode_name) + training_graph.create_edge((opernode_name, child_majornode_name, "False")) + + return nodes_2_process + + def addition_major_node(self, main_sent_dict, simple_sentences, boxer_graph, training_graph, opertype, nodeset, processed_candidates, extra_data): + # node type - value + type_val = {"split":1, "drop-rel":2, "drop-mod":3, "drop-ood":4} + operval = type_val[opertype] + + # Checking for the addition of "split" major-node + if operval <= type_val["split"]: + if opertype in self.DISCOURSE_SENTENCE_MODEL: + # Calculating Split Candidates - DRS Graph node tuples + split_candidate_tuples = boxer_graph.extract_split_candidate_tuples(nodeset, self.MAX_SPLIT_PAIR_SIZE) + # print "split_candidate_tuples : " + str(split_candidate_tuples) + + if len(split_candidate_tuples) != 0: + # Adding the major node for split + majornode_data = ("split", nodeset, simple_sentences, split_candidate_tuples) + majornode_name, isNew = training_graph.create_majornode(majornode_data) + return majornode_name, isNew + + if operval <= type_val["drop-rel"]: + if opertype in self.DISCOURSE_SENTENCE_MODEL: + # Calculate drop-rel candidates + processed_relnode = processed_candidates if opertype == "drop-rel" else [] + filtered_mod_pos = extra_data if opertype == "drop-rel" else [] + relnode_set = boxer_graph.extract_drop_rel_candidates(nodeset, self.RESTRICTED_DROP_REL, processed_relnode) + if len(relnode_set) != 0: + # Adding the major nodes for drop-rel + majornode_data = ("drop-rel", nodeset, simple_sentences, relnode_set, processed_relnode, filtered_mod_pos) + majornode_name, isNew = training_graph.create_majornode(majornode_data) + return majornode_name, isNew + + if operval <= type_val["drop-mod"]: + if opertype in self.DISCOURSE_SENTENCE_MODEL: + # Calculate drop-mod candidates + processed_mod_pos = processed_candidates if opertype == "drop-mod" else [] + filtered_mod_pos = extra_data + modcand_set = boxer_graph.extract_drop_mod_candidates(nodeset, main_sent_dict, self.ALLOWED_DROP_MOD, processed_mod_pos) + if len(modcand_set) != 0: + # Adding the major nodes for drop-mod + majornode_data = ("drop-mod", nodeset, simple_sentences, modcand_set, processed_mod_pos, filtered_mod_pos) + majornode_name, isNew = training_graph.create_majornode(majornode_data) + return majornode_name, isNew + + if operval <= type_val["drop-ood"]: + if opertype in self.DISCOURSE_SENTENCE_MODEL: + # Check for drop-OOD node candidates + processed_oodnodes = processed_candidates if opertype == "drop-ood" else [] + filtered_mod_pos = extra_data + oodnode_candidates = boxer_graph.extract_ood_candidates(nodeset, processed_oodnodes) + if len(oodnode_candidates) != 0: + # Adding the major node for drop-ood + majornode_data = ("drop-ood", nodeset, simple_sentences, oodnode_candidates, processed_oodnodes, filtered_mod_pos) + majornode_name, isNew = training_graph.create_majornode(majornode_data) + return majornode_name, isNew + + + # None of them matched, create "fin" node + filtered_mod_pos = extra_data + majornode_data = ("fin", nodeset, simple_sentences, filtered_mod_pos) + majornode_name, isNew = training_graph.create_majornode(majornode_data) + return majornode_name, isNew diff --git 
diff --git a/source/function_select_methods.py b/source/function_select_methods.py
new file mode 100644
index 0000000..cd3be4c
--- /dev/null
+++ b/source/function_select_methods.py
@@ -0,0 +1,26 @@
+
+#===================================================================================
+#description   : Methods for training graph and features exploration              =
+#author        : Shashi Narayan, shashi.narayan(at){ed.ac.uk,loria.fr,gmail.com})=
+#date          : Created in 2014, Later revised in April 2016.                    =
+#version       : 0.1                                                              =
+#===================================================================================
+
+
+from methods_training_graph import Method_LED, Method_OVERLAP_LED
+from methods_feature_extract import Feature_Init, Feature_Nov27
+
+def select_training_graph_method(METHOD_TRAINING_GRAPH):
+    return {
+        "method-0.99-lteq-lt": Method_OVERLAP_LED(0.99, "lteq", "lt"),
+        "method-0.75-lteq-lt": Method_OVERLAP_LED(0.75, "lteq", "lt"),
+        "method-0.5-lteq-lteq": Method_OVERLAP_LED(0.5, "lteq", "lteq"),
+        "method-led-lteq": Method_LED("lteq", "lteq", "lteq"),
+        "method-led-lt": Method_LED("lt", "lt", "lt")
+    }[METHOD_TRAINING_GRAPH]
+
+def select_feature_extract_method(METHOD_FEATURE_EXTRACT):
+    return {
+        "feature-init": Feature_Init(),
+        "feature-Nov27": Feature_Nov27(),
+    }[METHOD_FEATURE_EXTRACT]
diff --git a/source/functions_configuration_file.py b/source/functions_configuration_file.py
new file mode 100644
index 0000000..5910958
--- /dev/null
+++ b/source/functions_configuration_file.py
@@ -0,0 +1,108 @@
+#===================================================================================
+#title         : functions_configuration_file.py                                  =
+#description   : Prepare/read configuration file                                  =
+#author        : Shashi Narayan, shashi.narayan(at){ed.ac.uk,loria.fr,gmail.com})=
+#date          : Created in 2014, Later revised in April 2016.                    =
+#version       : 0.1                                                              =
+#===================================================================================
+
+def write_config_file(config_filename, config_data_dict):
+    config_file = open(config_filename, "w")
+
+    config_file.write("##############################################################\n"+
+                      "####### Discourse-Complex-Simple Configuration File ##########\n"+
+                      "##############################################################\n\n")
+
+    config_file.write("# Generation Information\n")
+    if "TRAIN-BOXER-GRAPH" in config_data_dict:
+        config_file.write("[TRAIN-BOXER-GRAPH]\n"+config_data_dict["TRAIN-BOXER-GRAPH"]+"\n\n")
+
+    if "TRANSFORMATION-MODEL" in config_data_dict:
+        config_file.write("[TRANSFORMATION-MODEL]\n"+" ".join(config_data_dict["TRANSFORMATION-MODEL"])+"\n\n")
+
+    if "MAX-SPLIT-SIZE" in config_data_dict:
+        config_file.write("[MAX-SPLIT-SIZE]\n"+str(config_data_dict["MAX-SPLIT-SIZE"])+"\n\n")
+
+    if "RESTRICTED-DROP-RELATION" in config_data_dict:
+        config_file.write("[RESTRICTED-DROP-RELATION]\n"+" ".join(config_data_dict["RESTRICTED-DROP-RELATION"])+"\n\n")
+
+    if "ALLOWED-DROP-MODIFIER" in config_data_dict:
+        config_file.write("[ALLOWED-DROP-MODIFIER]\n"+" ".join(config_data_dict["ALLOWED-DROP-MODIFIER"])+"\n\n")
+
+    if "METHOD-TRAINING-GRAPH" in config_data_dict:
+        config_file.write("[METHOD-TRAINING-GRAPH]\n"+config_data_dict["METHOD-TRAINING-GRAPH"]+"\n\n")
+
+    if "METHOD-FEATURE-EXTRACT" in config_data_dict:
+        config_file.write("[METHOD-FEATURE-EXTRACT]\n"+config_data_dict["METHOD-FEATURE-EXTRACT"]+"\n\n")
+
+    if "NUM-EM-ITERATION" in config_data_dict:
+        config_file.write("[NUM-EM-ITERATION]\n"+str(config_data_dict["NUM-EM-ITERATION"])+"\n\n")
+
+    if "LANGUAGE-MODEL" in config_data_dict:
+        config_file.write("[LANGUAGE-MODEL]\n"+config_data_dict["LANGUAGE-MODEL"]+"\n\n")
+
+    config_file.write("# Step-1\n")
+    if "TRAIN-TRAINING-GRAPH" in config_data_dict:
+        config_file.write("[TRAIN-TRAINING-GRAPH]\n"+config_data_dict["TRAIN-TRAINING-GRAPH"]+"\n\n")
+
+    config_file.write("# Step-2\n")
+    if "TRANSFORMATION-MODEL-DIR" in config_data_dict:
+        config_file.write("[TRANSFORMATION-MODEL-DIR]\n"+config_data_dict["TRANSFORMATION-MODEL-DIR"]+"\n\n")
+
+    config_file.write("# Step-3\n")
+    if "MOSES-COMPLEX-SIMPLE-DIR" in config_data_dict:
+        config_file.write("[MOSES-COMPLEX-SIMPLE-DIR]\n"+config_data_dict["MOSES-COMPLEX-SIMPLE-DIR"]+"\n\n")
+
+    config_file.close()
+
+
+def parser_config_file(config_file):
+    config_data = (open(config_file, "r").read().strip()).split("\n")
+    config_data_dict = {}
+    count = 0
+    # Each "[KEY]" header line is expected to be immediately followed by its
+    # value line; the loop therefore advances two lines at a time on a match.
+    while count < len(config_data):
+        if config_data[count].startswith("["):
+            # Start Information
+            if config_data[count].strip()[1:-1] == "TRAIN-BOXER-GRAPH":
+                config_data_dict["TRAIN-BOXER-GRAPH"] = config_data[count+1].strip()
+
+            if config_data[count].strip()[1:-1] == "TRANSFORMATION-MODEL":
+                config_data_dict["TRANSFORMATION-MODEL"] = config_data[count+1].strip().split()
+
+            if config_data[count].strip()[1:-1] == "MAX-SPLIT-SIZE":
+                config_data_dict["MAX-SPLIT-SIZE"] = int(config_data[count+1].strip())
+
+            if config_data[count].strip()[1:-1] == "RESTRICTED-DROP-RELATION":
+                config_data_dict["RESTRICTED-DROP-RELATION"] = config_data[count+1].strip().split()
+
+            if config_data[count].strip()[1:-1] == "ALLOWED-DROP-MODIFIER":
+                config_data_dict["ALLOWED-DROP-MODIFIER"] = config_data[count+1].strip().split()
+
+            if config_data[count].strip()[1:-1] == "METHOD-TRAINING-GRAPH":
+                config_data_dict["METHOD-TRAINING-GRAPH"] = config_data[count+1].strip()
config_data[count].strip()[1:-1] == "METHOD-FEATURE-EXTRACT": + config_data_dict["METHOD-FEATURE-EXTRACT"] = config_data[count+1].strip() + + if config_data[count].strip()[1:-1] == "NUM-EM-ITERATION": + config_data_dict["NUM-EM-ITERATION"] = int(config_data[count+1].strip()) + + if config_data[count].strip()[1:-1] == "LANGUAGE-MODEL": + config_data_dict["LANGUAGE-MODEL"] = config_data[count+1].strip() + + # Step 1 + if config_data[count].strip()[1:-1] == "TRAIN-TRAINING-GRAPH": + config_data_dict["TRAIN-TRAINING-GRAPH"] = config_data[count+1].strip() + + # Step 2 + if config_data[count].strip()[1:-1] == "TRANSFORMATION-MODEL-DIR": + config_data_dict["TRANSFORMATION-MODEL-DIR"] = config_data[count+1].strip() + + # Step 3 + if config_data[count].strip()[1:-1] == "MOSES-COMPLEX-SIMPLE-DIR": + config_data_dict["MOSES-COMPLEX-SIMPLE-DIR"] = config_data[count+1].strip() + + count += 2 + else: + count += 1 + return config_data_dict diff --git a/source/functions_configuration_file.pyc b/source/functions_configuration_file.pyc new file mode 100644 index 0000000..16d546d Binary files /dev/null and b/source/functions_configuration_file.pyc differ diff --git a/source/functions_configuration_file.py~ b/source/functions_configuration_file.py~ new file mode 100644 index 0000000..4076fd2 --- /dev/null +++ b/source/functions_configuration_file.py~ @@ -0,0 +1,101 @@ + +def write_config_file(config_filename, config_data_dict): + config_file = open(config_filename, "w") + + config_file.write("##############################################################\n"+ + "####### Discourse-Complex-Simple Congifuration File ##########\n"+ + "##############################################################\n\n") + + config_file.write("# Generation Information\n") + if "TRAIN-BOXER-GRAPH" in config_data_dict: + config_file.write("[TRAIN-BOXER-GRAPH]\n"+config_data_dict["TRAIN-BOXER-GRAPH"]+"\n\n") + + if "TRANSFORMATION-MODEL" in config_data_dict: + config_file.write("[TRANSFORMATION-MODEL]\n"+" ".join(config_data_dict["TRANSFORMATION-MODEL"])+"\n\n") + + if "MAX-SPLIT-SIZE" in config_data_dict: + config_file.write("[MAX-SPLIT-SIZE]\n"+str(config_data_dict["MAX-SPLIT-SIZE"])+"\n\n") + + if "RESTRICTED-DROP-RELATION" in config_data_dict: + config_file.write("[RESTRICTED-DROP-RELATION]\n"+" ".join(config_data_dict["RESTRICTED-DROP-RELATION"])+"\n\n") + + if "ALLOWED-DROP-MODIFIER" in config_data_dict: + config_file.write("[ALLOWED-DROP-MODIFIER]\n"+" ".join(config_data_dict["ALLOWED-DROP-MODIFIER"])+"\n\n") + + if "METHOD-TRAINING-GRAPH" in config_data_dict: + config_file.write("[METHOD-TRAINING-GRAPH]\n"+config_data_dict["METHOD-TRAINING-GRAPH"]+"\n\n") + + if "METHOD-FEATURE-EXTRACT" in config_data_dict: + config_file.write("[METHOD-FEATURE-EXTRACT]\n"+config_data_dict["METHOD-FEATURE-EXTRACT"]+"\n\n") + + if "NUM-EM-ITERATION" in config_data_dict: + config_file.write("[NUM-EM-ITERATION]\n"+str(config_data_dict["NUM-EM-ITERATION"])+"\n\n") + + if "LANGUAGE-MODEL" in config_data_dict: + config_file.write("[LANGUAGE-MODEL]\n"+config_data_dict["LANGUAGE-MODEL"]+"\n\n") + + config_file.write("# Step-1\n") + if "TRAIN-TRAINING-GRAPH" in config_data_dict: + config_file.write("[TRAIN-TRAINING-GRAPH]\n"+config_data_dict["TRAIN-TRAINING-GRAPH"]+"\n\n") + + config_file.write("# Step-2\n") + if "TRANSFORMATION-MODEL-DIR" in config_data_dict: + config_file.write("[TRANSFORMATION-MODEL-DIR]\n"+config_data_dict["TRANSFORMATION-MODEL-DIR"]+"\n\n") + + config_file.write("# Step-3\n") + if "MOSES-COMPLEX-SIMPLE-DIR" in config_data_dict: + 
config_file.write("[MOSES-COMPLEX-SIMPLE-DIR]\n"+config_data_dict["MOSES-COMPLEX-SIMPLE-DIR"]+"\n\n") + + config_file.close() + + +def parser_config_file(config_file): + config_data = (open(config_file, "r").read().strip()).split("\n") + config_data_dict = {} + count = 0 + while count < len(config_data): + if config_data[count].startswith("["): + # Start Information + if config_data[count].strip()[1:-1] == "TRAIN-BOXER-GRAPH": + config_data_dict["TRAIN-BOXER-GRAPH"] = config_data[count+1].strip() + + if config_data[count].strip()[1:-1] == "TRANSFORMATION-MODEL": + config_data_dict["TRANSFORMATION-MODEL"] = config_data[count+1].strip().split() + + if config_data[count].strip()[1:-1] == "MAX-SPLIT-SIZE": + config_data_dict["MAX-SPLIT-SIZE"] = int(config_data[count+1].strip()) + + if config_data[count].strip()[1:-1] == "RESTRICTED-DROP-RELATION": + config_data_dict["RESTRICTED-DROP-RELATION"] = config_data[count+1].strip().split() + + if config_data[count].strip()[1:-1] == "ALLOWED-DROP-MODIFIER": + config_data_dict["ALLOWED-DROP-MODIFIER"] = config_data[count+1].strip().split() + + if config_data[count].strip()[1:-1] == "METHOD-TRAINING-GRAPH": + config_data_dict["METHOD-TRAINING-GRAPH"] = config_data[count+1].strip() + + if config_data[count].strip()[1:-1] == "METHOD-FEATURE-EXTRACT": + config_data_dict["METHOD-FEATURE-EXTRACT"] = config_data[count+1].strip() + + if config_data[count].strip()[1:-1] == "NUM-EM-ITERATION": + config_data_dict["NUM-EM-ITERATION"] = int(config_data[count+1].strip()) + + if config_data[count].strip()[1:-1] == "LANGUAGE-MODEL": + config_data_dict["LANGUAGE-MODEL"] = config_data[count+1].strip() + + # Step 1 + if config_data[count].strip()[1:-1] == "TRAIN-TRAINING-GRAPH": + config_data_dict["TRAIN-TRAINING-GRAPH"] = config_data[count+1].strip() + + # Step 2 + if config_data[count].strip()[1:-1] == "TRANSFORMATION-MODEL-DIR": + config_data_dict["TRANSFORMATION-MODEL-DIR"] = config_data[count+1].strip() + + # Step 3 + if config_data[count].strip()[1:-1] == "MOSES-COMPLEX-SIMPLE-DIR": + config_data_dict["MOSES-COMPLEX-SIMPLE-DIR"] = config_data[count+1].strip() + + count += 2 + else: + count += 1 + return config_data_dict diff --git a/source/functions_prepare_elementtree_dot.py b/source/functions_prepare_elementtree_dot.py new file mode 100644 index 0000000..9d19498 --- /dev/null +++ b/source/functions_prepare_elementtree_dot.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python +#=================================================================================== +#title : functions_prepare_elementtree_dot.py = +#description : Prepare dot file = +#author : Shashi Narayan, shashi.narayan(at){ed.ac.uk,loria.fr,gmail.com})= +#date : Created in 2014, Later revised in April 2016. = +#version : 0.1 = +#=================================================================================== + + +import os +import xml.etree.ElementTree as ET +from xml.dom import minidom + +def prettify_xml_element(element): + """Return a pretty-printed XML string for the Element. 
+ """ + rough_string = ET.tostring(element) + reparsed = minidom.parseString(rough_string) + prettyxml = reparsed.documentElement.toprettyxml(indent=" ") + return prettyxml.encode("utf-8") + +############################### Elementary Tree ########################################## + +def prepare_write_sentence_element(output_stream, sentid, main_sentence, main_sent_dict, simple_sentences, boxer_graph, training_graph): + # Creating Sentence element + sentence = ET.Element('sentence') + sentence.attrib={"id":str(sentid)} + + # Writing main sentence + main = ET.SubElement(sentence, "main") + mainsent = ET.SubElement(main, "s") + mainsent.text = main_sentence + wordinfo = ET.SubElement(main, "winfo") + mainpositions = main_sent_dict.keys() + mainpositions.sort() + for position in mainpositions: + word = ET.SubElement(wordinfo, "w") + word.text = main_sent_dict[position][0] + word.attrib = {"id":str(position), "pos":main_sent_dict[position][1]} + + # Writing simple sentence + simpleset = ET.SubElement(sentence, "simple-set") + for simple_sentence in simple_sentences: + simple = ET.SubElement(simpleset, "simple") + simplesent = ET.SubElement(simple, "s") + simplesent.text = simple_sentence + + # Writing boxer Data : boxer_graph + boxer = boxer_graph.convert_to_elementarytree() + sentence.append(boxer) + + # Writing Training Graph : training_graph + traininggraph = training_graph.convert_to_elementarytree() + sentence.append(traininggraph) + + output_stream.write(prettify_xml_element(sentence)) + +############################ Dot - PNG File ################################################### + +def run_visual_graph_creator(sentid, main_sentence, main_sent_dict, simple_sentences, boxer_graph, training_graph): + print "Creating boxer and training graphs for sentence id : "+sentid+" ..." + + # Start creating boxer graph + foutput = open("/tmp/boxer-graph-"+sentid+".dot", "w") + boxer_dotstring = boxer_graph.convert_to_dotstring(sentid, main_sentence, main_sent_dict, simple_sentences) + foutput.write(boxer_dotstring) + foutput.close() + os.system("dot -Tpng /tmp/boxer-graph-"+sentid+".dot -o /tmp/boxer-graph-"+sentid+".png") + + + # Start creating training graph + foutput = open("/tmp/training-graph-"+sentid+".dot", "w") + train_dotstring = training_graph.convert_to_dotstring(main_sent_dict, boxer_graph) + foutput.write(train_dotstring) + foutput.close() + os.system("dot -Tpng /tmp/training-graph-"+sentid+".dot -o /tmp/training-graph-"+sentid+".png") diff --git a/source/functions_prepare_elementtree_dot.pyc b/source/functions_prepare_elementtree_dot.pyc new file mode 100644 index 0000000..97fdd08 Binary files /dev/null and b/source/functions_prepare_elementtree_dot.pyc differ diff --git a/source/functions_prepare_elementtree_dot.py~ b/source/functions_prepare_elementtree_dot.py~ new file mode 100644 index 0000000..ebaa356 --- /dev/null +++ b/source/functions_prepare_elementtree_dot.py~ @@ -0,0 +1,68 @@ +#!/usr/bin/env python +import os +import xml.etree.ElementTree as ET +from xml.dom import minidom + +def prettify_xml_element(element): + """Return a pretty-printed XML string for the Element. 
+ """ + rough_string = ET.tostring(element) + reparsed = minidom.parseString(rough_string) + prettyxml = reparsed.documentElement.toprettyxml(indent=" ") + return prettyxml.encode("utf-8") + +############################### Elementary Tree ########################################## + +def prepare_write_sentence_element(output_stream, sentid, main_sentence, main_sent_dict, simple_sentences, boxer_graph, training_graph): + # Creating Sentence element + sentence = ET.Element('sentence') + sentence.attrib={"id":str(sentid)} + + # Writing main sentence + main = ET.SubElement(sentence, "main") + mainsent = ET.SubElement(main, "s") + mainsent.text = main_sentence + wordinfo = ET.SubElement(main, "winfo") + mainpositions = main_sent_dict.keys() + mainpositions.sort() + for position in mainpositions: + word = ET.SubElement(wordinfo, "w") + word.text = main_sent_dict[position][0] + word.attrib = {"id":str(position), "pos":main_sent_dict[position][1]} + + # Writing simple sentence + simpleset = ET.SubElement(sentence, "simple-set") + for simple_sentence in simple_sentences: + simple = ET.SubElement(simpleset, "simple") + simplesent = ET.SubElement(simple, "s") + simplesent.text = simple_sentence + + # Writing boxer Data : boxer_graph + boxer = boxer_graph.convert_to_elementarytree() + sentence.append(boxer) + + # Writing Training Graph : training_graph + traininggraph = training_graph.convert_to_elementarytree() + sentence.append(traininggraph) + + output_stream.write(prettify_xml_element(sentence)) + +############################ Dot - PNG File ################################################### + +def run_visual_graph_creator(sentid, main_sentence, main_sent_dict, simple_sentences, boxer_graph, training_graph): + print "Creating boxer and training graphs for sentence id : "+sentid+" ..." + + # Start creating boxer graph + foutput = open("/tmp/boxer-graph-"+sentid+".dot", "w") + boxer_dotstring = boxer_graph.convert_to_dotstring(sentid, main_sentence, main_sent_dict, simple_sentences) + foutput.write(boxer_dotstring) + foutput.close() + os.system("dot -Tpng /tmp/boxer-graph-"+sentid+".dot -o /tmp/boxer-graph-"+sentid+".png") + + + # Start creating training graph + foutput = open("/tmp/training-graph-"+sentid+".dot", "w") + train_dotstring = training_graph.convert_to_dotstring(main_sent_dict, boxer_graph) + foutput.write(train_dotstring) + foutput.close() + os.system("dot -Tpng /tmp/training-graph-"+sentid+".dot -o /tmp/training-graph-"+sentid+".png") diff --git a/source/methods_feature_extract.py b/source/methods_feature_extract.py new file mode 100644 index 0000000..98e8418 --- /dev/null +++ b/source/methods_feature_extract.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python +#=================================================================================== +#description : Methods for features exploration = +#author : Shashi Narayan, shashi.narayan(at){ed.ac.uk,loria.fr,gmail.com})= +#date : Created in 2014, Later revised in April 2016. 
+#version       : 0.1                                                              =
+#===================================================================================
+
+
+class Feature_Nov27:
+
+    def get_split_feature(self, split_tuple, parent_sentence, children_sentence_list, boxer_graph):
+        # Get split tuple pattern (iLength is no longer part of this feature)
+        #iLength = boxer_graph.calculate_iLength(parent_sentence, children_sentence_list)
+        split_pattern = boxer_graph.get_pattern_4_split_candidate(split_tuple)
+        #split_feature = split_pattern+"_"+str(iLength)
+        split_feature = split_pattern
+        return split_feature
+
+    def get_drop_ood_feature(self, ood_node, nodeset, main_sent_dict, boxer_graph):
+        ood_word = boxer_graph.extract_oodword(ood_node, main_sent_dict)
+        ood_position = boxer_graph.nodes[ood_node]["positions"][0]  # length of positions is one
+        span = boxer_graph.extract_span_min_max(nodeset)
+        boundaryVal = "false"
+        if ood_position <= span[0] or ood_position >= span[1]:
+            boundaryVal = "true"
+        drop_ood_feature = ood_word+"_"+boundaryVal
+        return drop_ood_feature
+
+    def get_drop_rel_feature(self, rel_node, nodeset, main_sent_dict, boxer_graph):
+        # Relation word plus the binned length of the relation span
+        rel_word = boxer_graph.relations[rel_node]["predicates"]
+        rel_span = boxer_graph.extract_span_for_nodeset_with_rel(rel_node, nodeset)
+        drop_rel_feature = rel_word+"_"
+        if len(rel_span) <= 2:
+            drop_rel_feature += "0-2"
+        elif len(rel_span) <= 5:
+            drop_rel_feature += "2-5"
+        elif len(rel_span) <= 10:
+            drop_rel_feature += "5-10"
+        elif len(rel_span) <= 15:
+            drop_rel_feature += "10-15"
+        else:
+            drop_rel_feature += "gt15"
+        return drop_rel_feature
+
+    def get_drop_mod_feature(self, mod_cand, main_sent_dict, boxer_graph):
+        mod_pos = int(mod_cand[0])
+        mod_word = main_sent_dict[mod_pos][0]
+        #mod_node = mod_cand[1]
+        drop_mod_feature = mod_word
+        return drop_mod_feature
+
+class Feature_Init:
+
+    def get_split_feature(self, split_tuple, parent_sentence, children_sentence_list, boxer_graph):
+        # Calculating iLength
+        iLength = boxer_graph.calculate_iLength(parent_sentence, children_sentence_list)
+        # Get split tuple pattern
+        split_pattern = boxer_graph.get_pattern_4_split_candidate(split_tuple)
+        split_feature = split_pattern+"_"+str(iLength)
+        return split_feature
+
+    def get_drop_ood_feature(self, ood_node, nodeset, main_sent_dict, boxer_graph):
+        ood_word = boxer_graph.extract_oodword(ood_node, main_sent_dict)
+        ood_position = boxer_graph.nodes[ood_node]["positions"][0]  # length of positions is one
+        span = boxer_graph.extract_span_min_max(nodeset)
+        boundaryVal = "false"
+        if ood_position <= span[0] or ood_position >= span[1]:
+            boundaryVal = "true"
+        drop_ood_feature = ood_word+"_"+boundaryVal
+        return drop_ood_feature
+
+    def get_drop_rel_feature(self, rel_node, nodeset, main_sent_dict, boxer_graph):
+        rel_word = boxer_graph.relations[rel_node]["predicates"]
+        rel_span = boxer_graph.extract_span_for_nodeset_with_rel(rel_node, nodeset)
+        drop_rel_feature = rel_word+"_"+str(len(rel_span))
+        return drop_rel_feature
+
+    def get_drop_mod_feature(self, mod_cand, main_sent_dict, boxer_graph):
+        mod_pos = int(mod_cand[0])
+        mod_word = main_sent_dict[mod_pos][0]
+        #mod_node = mod_cand[1]
+        drop_mod_feature = mod_word
+        return drop_mod_feature
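+
+# Illustrative feature values (the words are hypothetical toy inputs):
+#   get_drop_mod_feature            -> "quickly"        (just the modifier word)
+#   get_drop_ood_feature            -> "however_true"   (word + boundary flag)
+#   get_drop_rel_feature (Nov27)    -> "in_2-5"         (relation word + binned span length)
+#   get_drop_rel_feature (Init)     -> "in_4"           (relation word + raw span length)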
diff --git a/source/methods_training_graph.py b/source/methods_training_graph.py
new file mode 100644
index 0000000..9060d33
--- /dev/null
+++ b/source/methods_training_graph.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+#===================================================================================
+#description   : Methods for training graph exploration                           =
+#author        : Shashi Narayan, shashi.narayan(at){ed.ac.uk,loria.fr,gmail.com})=
+#date          : Created in 2014, Later revised in April 2016.                    =
+#version       : 0.1                                                              =
+#===================================================================================
+
+from nltk.metrics.distance import edit_distance
+
+# Compare edit distances
+def compare_edit_distance(operator, edit_dist_after_drop, edit_dist_before_drop):
+    if operator == "lt":
+        return edit_dist_after_drop < edit_dist_before_drop
+    if operator == "lteq":
+        return edit_dist_after_drop <= edit_dist_before_drop
+    return False
+
+# Split Candidate: common for all classes
+def process_split_candidate_for_split_common(split_candidate, simple_sentences, main_sent_dict, boxer_graph):
+    if len(split_candidate) != len(simple_sentences):
+        # The number of split nodes does not match the number of simple sentences
+        return False, []
+
+    else:
+        # Calculate all parent and following subtrees
+        parent_subgraph_nodeset_dict = boxer_graph.extract_parent_subgraph_nodeset_dict()
+        #print "parent_subgraph_nodeset_dict : "+str(parent_subgraph_nodeset_dict)
+
+        node_overlap_dict = {}
+        for nodename in split_candidate:
+            split_nodeset = parent_subgraph_nodeset_dict[nodename]
+            subsentence = boxer_graph.extract_main_sentence(split_nodeset, main_sent_dict, [])
+            subsentence_words_set = set(subsentence.split())
+
+            overlap_data = []
+            for index in range(len(simple_sentences)):
+                simple_sent_words_set = set(simple_sentences[index].split())
+                overlap_words_set = subsentence_words_set & simple_sent_words_set
+                overlap_data.append((len(overlap_words_set), index))
+            overlap_data.sort(reverse=True)
+
+            node_overlap_dict[nodename] = overlap_data[0]
+
+        # Check that every node has some overlap in its maximum overlap, else fail
+        overlap_maxvalues = [node_overlap_dict[node][0] for node in node_overlap_dict]
+        if 0 in overlap_maxvalues:
+            return False, []
+        else:
+            # Check that the mapping covers all simple sentences
+            overlap_max_simple_indices = [node_overlap_dict[node][1] for node in node_overlap_dict]
+            if len(set(overlap_max_simple_indices)) == len(simple_sentences):
+                # That's a valid split, attach unprocessed graph components
+                node_subgraph_nodeset_dict, node_span_dict = boxer_graph.partition_drs_for_successful_candidate(split_candidate, parent_subgraph_nodeset_dict)
+
+                results = []
+                for nodename in split_candidate:
+                    span = node_span_dict[nodename]
+                    nodeset = node_subgraph_nodeset_dict[nodename][:]
+                    simple_sentence = simple_sentences[node_overlap_dict[nodename][1]]
+                    results.append((span, nodeset, nodename, simple_sentence))
+                # Sort them based on their starting position
+                results.sort()
+                return True, results
+            else:
+                return False, []
+
+# functions : Drop-REL Candidate
+def process_rel_candidate_for_drop_overlap(relnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, overlap_percentage):
+    simple_sentence = " ".join(simple_sentences)
+    simple_words = simple_sentence.split()
+
+    rel_phrase = boxer_graph.extract_relation_phrase(relnode_candidate, nodeset, main_sent_dict, filtered_mod_pos)
+    #print relnode_candidate, rel_phrase
+
+    rel_words = rel_phrase.split()
+    if len(rel_words) == 0:
+        return True
+    else:
+        # Drop when at most overlap_percentage of the relation phrase's words
+        # also occur in the simple sentences
+        found = 0
+        for word in rel_words:
+            if word in simple_words:
+                found += 1
+        percentage_found = found/float(len(rel_words))
+
+        if percentage_found <= overlap_percentage:
+            return True
+        else:
+            return False
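+
+# Toy example (hypothetical values): if the relation phrase is
+# "located in Berlin" and only "Berlin" occurs in the simple sentences,
+# percentage_found = 1/3 ~= 0.33, so with overlap_percentage = 0.5 the
+# relation is dropped (0.33 <= 0.5).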
+def process_rel_candidate_for_drop_led(relnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, opr_drop_rel):
+    simple_sentence = " ".join(simple_sentences)
+
+    sentence_before_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, filtered_mod_pos)
+    edit_dist_before_drop = edit_distance(sentence_before_drop.split(), simple_sentence.split())
+
+    temp_nodeset, temp_filtered_mod_pos = boxer_graph.drop_relation(nodeset, relnode_candidate, filtered_mod_pos)
+    sentence_after_drop = boxer_graph.extract_main_sentence(temp_nodeset, main_sent_dict, temp_filtered_mod_pos)
+    edit_dist_after_drop = edit_distance(sentence_after_drop.split(), simple_sentence.split())
+
+    isDrop = compare_edit_distance(opr_drop_rel, edit_dist_after_drop, edit_dist_before_drop)
+    return isDrop
+
+# functions : Drop-MOD Candidate
+def process_mod_candidate_for_drop_led(modcand_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, opr_drop_mod):
+    simple_sentence = " ".join(simple_sentences)
+
+    sentence_before_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, filtered_mod_pos)
+    edit_dist_before_drop = edit_distance(sentence_before_drop.split(), simple_sentence.split())
+
+    modcand_position_to_process = modcand_to_process[0]
+    temp_filtered_mod_pos = filtered_mod_pos[:]+[modcand_position_to_process]
+    sentence_after_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, temp_filtered_mod_pos)
+    edit_dist_after_drop = edit_distance(sentence_after_drop.split(), simple_sentence.split())
+
+    isDrop = compare_edit_distance(opr_drop_mod, edit_dist_after_drop, edit_dist_before_drop)
+    return isDrop
+
+# functions : Drop-OOD Candidate
+def process_ood_candidate_for_drop_led(oodnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, opr_drop_ood):
+    simple_sentence = " ".join(simple_sentences)
+
+    sentence_before_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, filtered_mod_pos)
+    edit_dist_before_drop = edit_distance(sentence_before_drop.split(), simple_sentence.split())
+
+    temp_nodeset = nodeset[:]
+    temp_nodeset.remove(oodnode_candidate)
+    sentence_after_drop = boxer_graph.extract_main_sentence(temp_nodeset, main_sent_dict, filtered_mod_pos)
+    edit_dist_after_drop = edit_distance(sentence_after_drop.split(), simple_sentence.split())
+
+    isDrop = compare_edit_distance(opr_drop_ood, edit_dist_after_drop, edit_dist_before_drop)
+    return isDrop
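+
+# Toy example of the edit-distance test (hypothetical sentences): if dropping
+# a modifier turns "the very old house" into "the old house" and the
+# reference is "the old house", the word-level distance falls 1 -> 0, so
+# both "lt" (0 < 1) and "lteq" (0 <= 1) accept the drop.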
+class Method_OVERLAP_LED:
+    def __init__(self, overlap_percentage, opr_drop_mod, opr_drop_ood):
+        self.overlap_percentage = overlap_percentage
+        self.opr_drop_mod = opr_drop_mod
+        self.opr_drop_ood = opr_drop_ood
+
+    # Split candidate
+    def process_split_candidate_for_split(self, split_candidate, simple_sentences, main_sent_dict, boxer_graph):
+        isSplit, results = process_split_candidate_for_split_common(split_candidate, simple_sentences, main_sent_dict, boxer_graph)
+        return isSplit, results
+
+    # Drop-REL Candidate
+    def process_rel_candidate_for_drop(self, relnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph):
+        isDrop = process_rel_candidate_for_drop_overlap(relnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, self.overlap_percentage)
+        return isDrop
+
+    # Drop-MOD Candidate
+    def process_mod_candidate_for_drop(self, modcand_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph):
+        isDrop = process_mod_candidate_for_drop_led(modcand_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, self.opr_drop_mod)
+        return isDrop
+
+    # Drop-OOD Candidate
+    def process_ood_candidate_for_drop(self, oodnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph):
+        isDrop = process_ood_candidate_for_drop_led(oodnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, self.opr_drop_ood)
+        return isDrop
+
+class Method_LED:
+    def __init__(self, opr_drop_rel, opr_drop_mod, opr_drop_ood):
+        self.opr_drop_rel = opr_drop_rel
+        self.opr_drop_mod = opr_drop_mod
+        self.opr_drop_ood = opr_drop_ood
+
+    # Split candidate
+    def process_split_candidate_for_split(self, split_candidate, simple_sentences, main_sent_dict, boxer_graph):
+        isSplit, results = process_split_candidate_for_split_common(split_candidate, simple_sentences, main_sent_dict, boxer_graph)
+        return isSplit, results
+
+    # Drop-REL Candidate
+    def process_rel_candidate_for_drop(self, relnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph):
+        isDrop = process_rel_candidate_for_drop_led(relnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, self.opr_drop_rel)
+        return isDrop
+
+    # Drop-MOD Candidate
+    def process_mod_candidate_for_drop(self, modcand_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph):
+        isDrop = process_mod_candidate_for_drop_led(modcand_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, self.opr_drop_mod)
+        return isDrop
+
+    # Drop-OOD Candidate
+    def process_ood_candidate_for_drop(self, oodnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph):
+        isDrop = process_ood_candidate_for_drop_led(oodnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, self.opr_drop_ood)
+        return isDrop
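+
+# Constructor arguments map to operations as follows (see the selectors in
+# function_select_methods.py):
+#   Method_OVERLAP_LED(overlap_percentage, opr_drop_mod, opr_drop_ood)
+#       drop-rel decided by lexical overlap; drop-mod/drop-ood by edit distance.
+#   Method_LED(opr_drop_rel, opr_drop_mod, opr_drop_ood)
+#       all three decided by edit distance, with "lt"/"lteq" comparisons.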
subsentence_words_set & simple_sent_words_set + overlap_data.append((len(overlap_words_set), index)) + overlap_data.sort(reverse=True) + + node_overlap_dict[nodename] = overlap_data[0] + + # Check that every node has some overlap in their maximum overlap else fail + overlap_maxvalues = [node_overlap_dict[node][0] for node in node_overlap_dict] + if 0 in overlap_maxvalues: + return False, [] + else: + # check the mapping covers all simple sentences + overlap_max_simple_indixes = [node_overlap_dict[node][1] for node in node_overlap_dict] + if len(set(overlap_max_simple_indixes)) == len(simple_sentences): + # Thats a valid split, attach unprocessed graph components + node_subgraph_nodeset_dict, node_span_dict = boxer_graph.partition_drs_for_successful_candidate(split_candidate, parent_subgraph_nodeset_dict) + + results = [] + for nodename in split_candidate: + span = node_span_dict[nodename] + nodeset = node_subgraph_nodeset_dict[nodename][:] + simple_sentence = simple_sentences[node_overlap_dict[nodename][1]] + results.append((span, nodeset, nodename, simple_sentence)) + # Sort them based on starting + results.sort() + return True, results + else: + return False, [] + +# functions : Drop-REL Candidate +def process_rel_candidate_for_drop_overlap(relnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, overlap_percentage): + simple_sentence = " ".join(simple_sentences) + simple_words = simple_sentence.split() + + rel_phrase = boxer_graph.extract_relation_phrase(relnode_candidate, nodeset, main_sent_dict, filtered_mod_pos) + + #print relnode_candidate, rel_phrase + + rel_words = rel_phrase.split() + if len(rel_words) == 0: + return True + else: + found = 0 + for word in rel_words: + if word in simple_words: + found += 1 + percentage_found = found/float(len(rel_words)) + + if percentage_found <= overlap_percentage: + return True + else: + return False + +def process_rel_candidate_for_drop_led(relnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, opr_drop_rel): + simple_sentence = " ".join(simple_sentences) + + sentence_before_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, filtered_mod_pos) + edit_dist_before_drop = edit_distance(sentence_before_drop.split(), simple_sentence.split()) + + temp_nodeset, temp_filtered_mod_pos = boxer_graph.drop_relation(nodeset, relnode_candidate, filtered_mod_pos) + sentence_after_drop = boxer_graph.extract_main_sentence(temp_nodeset, main_sent_dict, temp_filtered_mod_pos) + edit_dist_after_drop = edit_distance(sentence_after_drop.split(), simple_sentence.split()) + + isDrop = compare_edit_distance(opr_drop_rel, edit_dist_after_drop, edit_dist_before_drop) + return isDrop + +# functions : Drop-MOD Candidate +def process_mod_candidate_for_drop_led(modcand_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, opr_drop_mod): + simple_sentence = " ".join(simple_sentences) + + sentence_before_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, filtered_mod_pos) + edit_dist_before_drop = edit_distance(sentence_before_drop.split(), simple_sentence.split()) + + modcand_position_to_process = modcand_to_process[0] + temp_filtered_mod_pos = filtered_mod_pos[:]+[modcand_position_to_process] + sentence_after_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, temp_filtered_mod_pos) + edit_dist_after_drop = edit_distance(sentence_after_drop.split(), simple_sentence.split()) + + isDrop = compare_edit_distance(opr_drop_mod, 
edit_dist_after_drop, edit_dist_before_drop) + return isDrop + +# functions : Drop-OOD Candidate +def process_ood_candidate_for_drop_led(oodnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, opr_drop_ood): + simple_sentence = " ".join(simple_sentences) + + sentence_before_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, filtered_mod_pos) + edit_dist_before_drop = edit_distance(sentence_before_drop.split(), simple_sentence.split()) + + temp_nodeset = nodeset[:] + temp_nodeset.remove(oodnode_candidate) + sentence_after_drop = boxer_graph.extract_main_sentence(temp_nodeset, main_sent_dict, filtered_mod_pos) + edit_dist_after_drop = edit_distance(sentence_after_drop.split(), simple_sentence.split()) + + isDrop = compare_edit_distance(opr_drop_ood, edit_dist_after_drop, edit_dist_before_drop) + return isDrop + +class Method_OVERLAP_LED: + def __init__(self, overlap_percentage, opr_drop_mod, opr_drop_ood): + self.overlap_percentage = overlap_percentage + self.opr_drop_mod = opr_drop_mod + self.opr_drop_ood = opr_drop_ood + + # Split candidate + def process_split_candidate_for_split(self, split_candidate, simple_sentences, main_sent_dict, boxer_graph): + isSplit, results = process_split_candidate_for_split_common(split_candidate, simple_sentences, main_sent_dict, boxer_graph) + return isSplit, results + + # Drop-REL Candidate + def process_rel_candidate_for_drop(self, relnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph): + isDrop = process_rel_candidate_for_drop_overlap(relnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, self.overlap_percentage) + return isDrop + + # Drop-MOD Candidate + def process_mod_candidate_for_drop(self, modcand_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph): + isDrop = process_mod_candidate_for_drop_led(modcand_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, self.opr_drop_mod) + return isDrop + + # Drop-OOD Candidate + def process_ood_candidate_for_drop(self, oodnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph): + isDrop = process_ood_candidate_for_drop_led(oodnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, self.opr_drop_ood) + return isDrop + +class Method_LED: + def __init__(self, opr_drop_rel, opr_drop_mod, opr_drop_ood): + self.opr_drop_rel = opr_drop_rel + self.opr_drop_mod = opr_drop_mod + self.opr_drop_ood = opr_drop_ood + + # Split candidate + def process_split_candidate_for_split(self, split_candidate, simple_sentences, main_sent_dict, boxer_graph): + isSplit, results = process_split_candidate_for_split_common(split_candidate, simple_sentences, main_sent_dict, boxer_graph) + return isSplit, results + + # Drop-REL Candidate + def process_rel_candidate_for_drop(self, relnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph): + isDrop = process_rel_candidate_for_drop_led(relnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, self.opr_drop_rel) + return isDrop + + # Drop-MOD Candidate + def process_mod_candidate_for_drop(self, modcand_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph): + isDrop = process_mod_candidate_for_drop_led(modcand_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, self.opr_drop_mod) + return isDrop + + # 
diff --git a/source/saxparser_xml_stanfordtokenized_boxergraph.py b/source/saxparser_xml_stanfordtokenized_boxergraph.py
new file mode 100644
index 0000000..53279cd
--- /dev/null
+++ b/source/saxparser_xml_stanfordtokenized_boxergraph.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python
+#===================================================================================
+#title : saxparser_xml_stanfordtokenized_boxergraph.py =
+#description : Boxer-Graph-XML-Handler =
+#author : Shashi Narayan, shashi.narayan(at){ed.ac.uk,loria.fr,gmail.com})=
+#date : Created in 2014, Later revised in April 2016. =
+#version : 0.1 =
+#===================================================================================
+
+from xml.sax import handler, make_parser
+
+from boxer_graph_module import Boxer_Graph
+from explore_training_graph import Explore_Training_Graph
+
+class SAXPARSER_XML_StanfordTokenized_BoxerGraph:
+    def __init__(self, process, xmlfile, output_stream, DISCOURSE_SENTENCE_MODEL, MAX_SPLIT_PAIR_SIZE, RESTRICTED_DROP_REL, ALLOWED_DROP_MOD, METHOD_TRAINING_GRAPH):
+        # process: "training" or "testing"
+        self.process = process
+
+        self.xmlfile = xmlfile
+
+        # output_stream: file stream for training and dictionary for testing
+        self.output_stream = output_stream
+
+        self.DISCOURSE_SENTENCE_MODEL = DISCOURSE_SENTENCE_MODEL
+        self.MAX_SPLIT_PAIR_SIZE = MAX_SPLIT_PAIR_SIZE
+        self.RESTRICTED_DROP_REL = RESTRICTED_DROP_REL
+        self.ALLOWED_DROP_MOD = ALLOWED_DROP_MOD
+        self.METHOD_TRAINING_GRAPH = METHOD_TRAINING_GRAPH
+
+    def parse_xmlfile_generating_training_graph(self):
+        # named content_handler so it does not shadow the imported xml.sax "handler" module
+        content_handler = SAX_Handler(self.process, self.output_stream, self.DISCOURSE_SENTENCE_MODEL, self.MAX_SPLIT_PAIR_SIZE,
+                                      self.RESTRICTED_DROP_REL, self.ALLOWED_DROP_MOD, self.METHOD_TRAINING_GRAPH)
+
+        parser = make_parser()
+        parser.setContentHandler(content_handler)
+        parser.parse(self.xmlfile)
+
+class SAX_Handler(handler.ContentHandler):
+    def __init__(self, process, output_stream, DISCOURSE_SENTENCE_MODEL, MAX_SPLIT_PAIR_SIZE,
+                 RESTRICTED_DROP_REL, ALLOWED_DROP_MOD, METHOD_TRAINING_GRAPH):
+        self.process = process
+        self.output_stream = output_stream
+
+        self.DISCOURSE_SENTENCE_MODEL = DISCOURSE_SENTENCE_MODEL
+        self.MAX_SPLIT_PAIR_SIZE = MAX_SPLIT_PAIR_SIZE
+        self.RESTRICTED_DROP_REL = RESTRICTED_DROP_REL
+        self.ALLOWED_DROP_MOD = ALLOWED_DROP_MOD
+        self.METHOD_TRAINING_GRAPH = METHOD_TRAINING_GRAPH
+
+        # Training Graph Creator
+        self.training_graph_handler = Explore_Training_Graph(self.output_stream, self.DISCOURSE_SENTENCE_MODEL, self.MAX_SPLIT_PAIR_SIZE,
+                                                             self.RESTRICTED_DROP_REL, self.ALLOWED_DROP_MOD, self.METHOD_TRAINING_GRAPH)
+
+        # Sentence Data
+        self.sentid = ""
+        self.main_sentence = ""
+        self.main_sent_dict = {}
+        self.boxer_graph = Boxer_Graph()
+        self.simple_sentences = []
+
+        # Sentence Flags, temporary variables
+        self.isMain = False
+
+        self.isS = False
+        self.sentence = ""
+        self.wordlist = []
+
+        self.isW = False
+        self.word = ""
+        self.wid = ""
+        self.wpos = ""
+
+        self.isSimple = False
+
+        # Boxer flags, temporary variables
+        self.isNode = False
+        self.isRel = False
+        self.symbol = ""
+        self.predsymbol = ""
+        self.locationlist = []
+
+    def startDocument(self):
+        print "Start parsing the document ..."
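+
+    # The callbacks below dispatch on element names: <sentence> resets the
+    # per-sentence state, <w>/<s> collect tokens, and <node>/<rel>/<span>/
+    # <pred>/<edge> populate the Boxer graph incrementally.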
+ + def endDocument(self): + print "End parsing the document ..." + + def startElement(self, nameElt, attrOfElt): + if nameElt == "sentence": + self.sentid = attrOfElt["id"] + + # Refreshing Sentence Data + self.main_sentence = "" + self.main_sent_dict = {} + self.boxer_graph = Boxer_Graph() + self.simple_sentences = [] + + if nameElt == "main": + self.isMain = True + + if nameElt == "simple": + self.isSimple = True + + if nameElt == "s": + self.isS = True + self.sentence = "" + self.wordlist = [] + + if nameElt == "w": + self.isW = True + self.wid = int(attrOfElt["id"][1:]) + self.wpos = attrOfElt["pos"] + self.word = "" + + if nameElt == "node": + self.isNode = True + self.symbol = attrOfElt["sym"] + self.boxer_graph.nodes[self.symbol] = {"positions":[], "predicates":[]} + + if nameElt == "rel": + self.isRel = True + self.symbol = attrOfElt["sym"] + self.boxer_graph.relations[self.symbol] = {"positions":[], "predicates":""} + + if nameElt == "span": + self.locationlist = [] + + if nameElt == "pred": + self.locationlist = [] + self.predsymbol = attrOfElt["sym"] + + if nameElt == "loc": + if int(attrOfElt["id"][1:]) in self.main_sent_dict: + self.locationlist.append(int(attrOfElt["id"][1:])) + + if nameElt == "edge": + self.boxer_graph.edges.append((attrOfElt["par"], attrOfElt["dep"], attrOfElt["lab"])) + + def endElement(self, nameElt): + if nameElt == "sentence": + #print self.sentid + # print self.main_sentence + # print self.main_sent_dict + # print self.simple_sentences + # print self.boxer_graph + + if self.process == "training": + self.training_graph_handler.explore_training_graph(self.sentid, self.main_sentence, self.main_sent_dict, self.simple_sentences, self.boxer_graph) + + if self.process == "testing": + self.output_stream[self.sentid] = [self.main_sentence, self.main_sent_dict, self.boxer_graph] + + # if len(self.main_sentence) > 600: + # print self.sentid + # if len(self.simple_sentences) == 6: + # print self.sentid + + if int(self.sentid)%10000 == 0: + print self.sentid + " training data processed ..." 
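+
+        # On </main>, prefer the tokenized word list gathered from <w>
+        # elements; fall back to the raw <s> text when no tokens were seen.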
+        if nameElt == "main":
+            self.isMain = False
+            if len(self.wordlist) == 0:
+                self.main_sentence = self.sentence.lower()
+            else:
+                self.main_sentence = (" ".join(self.wordlist)).lower()
+
+        if nameElt == "simple":
+            self.isSimple = False
+            self.simple_sentences.append(self.sentence.lower())
+
+        if nameElt == "s":
+            self.isS = False
+
+        if nameElt == "w":
+            self.isW = False
+            self.main_sent_dict[self.wid] = (self.word.lower(), self.wpos.lower())
+            self.wordlist.append(self.word.lower())
+
+        if nameElt == "node":
+            self.isNode = False
+            self.boxer_graph.nodes[self.symbol]["predicates"].sort()
+
+        if nameElt == "rel":
+            self.isRel = False
+
+        if nameElt == "span":
+            self.locationlist.sort()
+            if self.isNode:
+                self.boxer_graph.nodes[self.symbol]["positions"] = self.locationlist[:]
+            if self.isRel:
+                self.boxer_graph.relations[self.symbol]["positions"] = self.locationlist[:]
+
+        if nameElt == "pred":
+            self.locationlist.sort()
+            if self.isNode:
+                self.boxer_graph.nodes[self.symbol]["predicates"].append((self.predsymbol, self.locationlist[:]))
+            if self.isRel:
+                self.boxer_graph.relations[self.symbol]["predicates"] = self.predsymbol
+
+    def characters(self, chrs):
+        if self.isS:
+            self.sentence += chrs
+
+        if self.isW:
+            self.word += chrs
+
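+
+# Usage sketch (illustrative; the path and settings below are hypothetical,
+# mirroring the defaults in start_learning_training_models.py):
+#
+#   fout = open("PWKP.training-graph.xml", "w")
+#   sax_parser = SAXPARSER_XML_StanfordTokenized_BoxerGraph(
+#       "training", "PWKP.tokenized.boxer-graph.xml", fout,
+#       ["split", "drop-ood", "drop-rel", "drop-mod"], 2,
+#       ["agent", "patient", "eq", "theme"],
+#       ["jj", "jjr", "jjs", "rb", "rbr", "rbs"], "method-0.99-lteq-lt")
+#   sax_parser.parse_xmlfile_generating_training_graph()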
diff --git a/source/training_graph_module.py b/source/training_graph_module.py
new file mode 100644
index 0000000..d4b6d4c
--- /dev/null
+++ b/source/training_graph_module.py
@@ -0,0 +1,455 @@
+#!/usr/bin/env python
+#===================================================================================
+#title : training_graph_module.py =
+#description : Define Training Graph =
+#author : Shashi Narayan, shashi.narayan(at){ed.ac.uk,loria.fr,gmail.com})=
+#date : Created in 2014, Later revised in April 2016. =
+#version : 0.1 =
+#===================================================================================
+
+
+import xml.etree.ElementTree as ET
+import copy
+
+class Training_Graph:
+    def __init__(self):
+        '''
+        self.major_nodes["MN-*"]
+              ("split", nodeset, simple_sentences, split_candidate_tuples)
+              ("drop-rel", nodeset, simple_sentences, relnode_set, processed_relnode, filtered_mod_pos)
+              ("drop-mod", nodeset, simple_sentences, modcand_set, processed_mod_pos, filtered_mod_pos)
+              ("drop-ood", nodeset, simple_sentence, oodnode_set, processed_oodnode, filtered_mod_pos)
+              ("fin", nodeset, simple_sentences, filtered_mod_pos)
+
+        self.oper_nodes["ON-*"]
+              ("split", split_candidate, not_applied_cands)
+              ("split", None, not_applied_cands)
+              ("drop-rel", relnode_to_process, "True")
+              ("drop-rel", relnode_to_process, "False")
+              ("drop-mod", modcand_to_process, "True")
+              ("drop-mod", modcand_to_process, "False")
+              ("drop-ood", oodnode_to_process, "True")
+              ("drop-ood", oodnode_to_process, "False")
+
+        self.edges = [(par, dep, lab)]
+
+        '''
+        self.major_nodes = {}
+        self.oper_nodes = {}
+        self.edges = []
+
+    def get_majornode_type(self, majornode_name):
+        majornode_tuple = self.major_nodes[majornode_name]
+        return majornode_tuple[0]
+
+    def get_majornode_nodeset(self, majornode_name):
+        majornode_tuple = self.major_nodes[majornode_name]
+        return majornode_tuple[1]
+
+    def get_majornode_simple_sentences(self, majornode_name):
+        majornode_tuple = self.major_nodes[majornode_name]
+        return majornode_tuple[2]
+
+    def get_majornode_oper_candidates(self, majornode_name):
+        majornode_tuple = self.major_nodes[majornode_name]
+        if majornode_tuple[0] != "fin":
+            return majornode_tuple[3]
+        else:
+            return []
+
+    def get_majornode_processed_oper_candidates(self, 
majornode_name): + majornode_tuple = self.major_nodes[majornode_name] + if majornode_tuple[0] != "fin" and majornode_tuple[0] != "split": + return majornode_tuple[4] + else: + return [] + + def get_majornode_filtered_postions(self, majornode_name): + majornode_tuple = self.major_nodes[majornode_name] + if majornode_tuple[0] == "fin": + return majornode_tuple[3] + elif majornode_tuple[0] == "drop-rel" or majornode_tuple[0] == "drop-mod" or majornode_tuple[0] == "drop-ood": + return majornode_tuple[5] + else: + return [] + + def get_opernode_type(self, opernode_name): + opernode_tuple = self.oper_nodes[opernode_name] + return opernode_tuple[0] + + def get_opernode_oper_candidate(self, opernode_name): + opernode_tuple = self.oper_nodes[opernode_name] + return opernode_tuple[1] + + def get_opernode_failed_oper_candidates(self, opernode_name): + opernode_tuple = self.oper_nodes[opernode_name] + if opernode_tuple[0] == "split": + return opernode_tuple[2] + else: + return [] + + def get_opernode_drop_result(self, opernode_name): + opernode_tuple = self.oper_nodes[opernode_name] + if opernode_tuple[0] != "split": + return opernode_tuple[2] + else: + return None + + # @@@@@@@@@@@@@@@@@@@@@ Create nodes @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + + def create_majornode(self, majornode_data): + copy_data = copy.copy(majornode_data) + + # Check if node exists + for node_name in self.major_nodes: + node_data = self.major_nodes[node_name] + if node_data == copy_data: + return node_name, False + + # Otherwise create new node + majornode_name = "MN-"+str(len(self.major_nodes)+1) + self.major_nodes[majornode_name] = copy_data + return majornode_name, True + + def create_opernode(self, opernode_data): + copy_data = copy.copy(opernode_data) + opernode_name = "ON-"+str(len(self.oper_nodes)+1) + self.oper_nodes[opernode_name] = copy_data + return opernode_name + + def create_edge(self, edge_data): + self.edges.append(copy.copy(edge_data)) + + # @@@@@@@@@@@@@@@@@@@@@@@@ Final sentences @@@@@@@@@@@@@@@@@@@@@@@@@@ + + def get_final_sentences(self, main_sentence, main_sent_dict, boxer_graph): + fin_nodes = self.find_all_fin_majornode() + print + node_sent = [] + for node in fin_nodes: + # intpart = int(node[3:]) # removing "MN-", lower int part sentence comes before + if boxer_graph.isEmpty(): + #main_sentence = main_sentence.encode('utf-8') + simple_sentences = self.get_majornode_simple_sentences(node) + simple_sentence = " ".join(simple_sentences) + #node_sent.append((intpart, main_sentence, simple_sentence)) + + node_span = (0, len(main_sentence.split())) + node_sent.append((node_span, main_sentence, simple_sentence)) + + else: + nodeset = self.get_majornode_nodeset(node) + node_span = boxer_graph.extract_span_min_max(nodeset) + filtered_pos = self.get_majornode_filtered_postions(node) + main_sentence = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, filtered_pos) + simple_sentences = self.get_majornode_simple_sentences(node) + simple_sentence = " ".join(simple_sentences) + #node_sent.append((intpart, main_sentence, simple_sentence)) + node_sent.append((node_span, main_sentence, simple_sentence)) + node_sent.sort() + sentence_pairs = [(item[1], item[2]) for item in node_sent] + #sentence_pairs = [(item[1].encode('utf-8'), item[2].encode('utf-8')) for item in node_sent] + #print sentence_pairs + return sentence_pairs + + + # @@@@@@@@@@@@@@@@@@@@@ Find nodes in Training Graph @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + + def find_all_fin_majornode(self): + fin_nodes = [] + for major_node in self.major_nodes: + if 
self.major_nodes[major_node][0] == "fin": + fin_nodes.append(major_node) + return fin_nodes + + def find_children_of_majornode(self, major_node): + children_oper_nodes = [] + for edge in self.edges: + if edge[0] == major_node: + children_oper_nodes.append(edge[1]) + return children_oper_nodes + + def find_children_of_opernode(self, oper_node): + children_major_nodes = [] + for edge in self.edges: + if edge[0] == oper_node: + children_major_nodes.append(edge[1]) + return children_major_nodes + + def find_parents_of_majornode(self, major_node): + parents_oper_nodes = [] + for edge in self.edges: + if edge[1] == major_node: + parent_oper_node = edge[0] + parents_oper_nodes.append(parent_oper_node) + return parents_oper_nodes + + def find_parent_of_opernode(self, oper_node): + parent_major_node = "" + for edge in self.edges: + if edge[1] == oper_node: + parent_major_node = edge[0] + break + return parent_major_node + + # @@@@@@@@@@@@ Training Graph -> Elementary Tree @@@@@@@@@@@@@@@@@@@@ + + def convert_to_elementarytree(self): + traininggraph = ET.Element("train-graph") + + # Major nodes + major_nodes_elt = ET.SubElement(traininggraph, "major-nodes") + for major_nodename in self.major_nodes: + major_nodetype = self.get_majornode_type(major_nodename) + major_nodeset = self.get_majornode_nodeset(major_nodename) + major_simple_sentences = self.get_majornode_simple_sentences(major_nodename) + oper_candidates = self.get_majornode_oper_candidates(major_nodename) + processed_oper_candidates = self.get_majornode_processed_oper_candidates(major_nodename) + filtered_postions = self.get_majornode_filtered_postions(major_nodename) + + major_node_elt = ET.SubElement(major_nodes_elt, "node") + major_node_elt.attrib = {"sym":major_nodename} + + # Opertype + major_nodetype_elt = ET.SubElement(major_node_elt, "type") + major_nodetype_elt.text = major_nodetype + + # Nodeset + major_nodeset_elt = ET.SubElement(major_node_elt, "nodeset") + for node in major_nodeset: + node_elt = ET.SubElement(major_nodeset_elt, "n") + node_elt.attrib = {"sym":node} + + # Simple sentences + major_simple_sentences_elt = ET.SubElement(major_node_elt, "simple-set") + for simple_sentence in major_simple_sentences: + major_simple_sentence_elt = ET.SubElement(major_simple_sentences_elt, "simple") + sent_data_elt = ET.SubElement(major_simple_sentence_elt, "s") + sent_data_elt.text = simple_sentence + + # Oper Candidates + if major_nodetype == "split": + split_candidate_tuples = oper_candidates + major_split_candidates_elt = ET.SubElement(major_node_elt, "split-candidates") + for split_candidate in split_candidate_tuples: + major_split_candidate_elt = ET.SubElement(major_split_candidates_elt, "sc") + for node in split_candidate: + node_elt = ET.SubElement(major_split_candidate_elt, "n") + node_elt.attrib = {"sym":str(node)} + + if major_nodetype == "drop-rel": + relnode_set = oper_candidates + major_relnode_set_elt = ET.SubElement(major_node_elt, "rel-candidates") + for node in relnode_set: + node_elt = ET.SubElement(major_relnode_set_elt, "n") + node_elt.attrib = {"sym":str(node)} + + processed_relnodes = processed_oper_candidates + major_processed_relnodes_elt = ET.SubElement(major_node_elt, "rel-processed") + for node in processed_relnodes: + node_elt = ET.SubElement(major_processed_relnodes_elt, "n") + node_elt.attrib = {"sym":str(node)} + + filtered_mod_pos = filtered_postions + major_filtered_mod_pos_elt = ET.SubElement(major_node_elt, "mod-loc-filtered") + for node in filtered_mod_pos: + node_elt = 
ET.SubElement(major_filtered_mod_pos_elt, "loc") + node_elt.attrib = {"id":str(node)} + + if major_nodetype == "drop-mod": + modcand_set = oper_candidates + major_modcand_set_elt = ET.SubElement(major_node_elt, "mod-candidates") + for node in modcand_set: + node_elt = ET.SubElement(major_modcand_set_elt, "n") + node_elt.attrib = {"sym":node[1],"loc":str(node[0])} + + processed_mod_pos = processed_oper_candidates + major_processed_mod_pos_elt = ET.SubElement(major_node_elt, "mod-loc-processed") + for node in processed_mod_pos: + node_elt = ET.SubElement(major_processed_mod_pos_elt, "loc") + node_elt.attrib = {"id":str(node)} + + filtered_mod_pos = filtered_postions + major_filtered_mod_pos_elt = ET.SubElement(major_node_elt, "mod-loc-filtered") + for node in filtered_mod_pos: + node_elt = ET.SubElement(major_filtered_mod_pos_elt, "loc") + node_elt.attrib = {"id":str(node)} + + if major_nodetype == "drop-ood": + oodnode_set = oper_candidates + major_oodnode_set_elt = ET.SubElement(major_node_elt, "ood-candidates") + for node in oodnode_set: + node_elt = ET.SubElement(major_oodnode_set_elt, "n") + node_elt.attrib = {"sym":str(node)} + + processed_oodnodes = processed_oper_candidates + major_processed_oodnodes_elt = ET.SubElement(major_node_elt, "ood-processed") + for node in processed_oodnodes: + node_elt = ET.SubElement(major_processed_oodnodes_elt, "n") + node_elt.attrib = {"sym":str(node)} + + filtered_mod_pos = filtered_postions + major_filtered_mod_pos_elt = ET.SubElement(major_node_elt, "mod-loc-filtered") + for node in filtered_mod_pos: + node_elt = ET.SubElement(major_filtered_mod_pos_elt, "loc") + node_elt.attrib = {"id":str(node)} + + if major_nodetype == "fin": + filtered_mod_pos = filtered_postions + major_filtered_mod_pos_elt = ET.SubElement(major_node_elt, "mod-loc-filtered") + for node in filtered_mod_pos: + node_elt = ET.SubElement(major_filtered_mod_pos_elt, "loc") + node_elt.attrib = {"id":str(node)} + + # Oper nodes + oper_nodes_elt = ET.SubElement(traininggraph, "oper-nodes") + for oper_nodename in self.oper_nodes: + oper_node_elt = ET.SubElement(oper_nodes_elt, "node") + oper_node_elt.attrib = {"sym":oper_nodename} + + oper_nodedata = self.oper_nodes[oper_nodename] + + # Nodetype + oper_nodetype = oper_nodedata[0] + oper_nodetype_elt = ET.SubElement(oper_node_elt, "type") + oper_nodetype_elt.text = oper_nodetype + + if oper_nodetype == "split": + split_cand_applied = oper_nodedata[1] + split_cand_applied_elt = ET.SubElement(oper_node_elt, "split-candidate-applied") + if split_cand_applied != None: + split_candidate_elt = ET.SubElement(split_cand_applied_elt, "sc") + for node in split_cand_applied: + node_elt = ET.SubElement(split_candidate_elt, "n") + node_elt.attrib = {"sym":node} + + split_cand_left = oper_nodedata[2] + split_cand_left_elt = ET.SubElement(oper_node_elt, "split-candidate-left") + for split_candidate in split_cand_left: + split_candidate_elt = ET.SubElement(split_cand_left_elt, "sc") + for node in split_candidate: + node_elt = ET.SubElement(split_candidate_elt, "n") + node_elt.attrib = {"sym":node} + + if oper_nodetype == "drop-ood": + oodnode_to_process = oper_nodedata[1] + oodnode_to_process_elt = ET.SubElement(oper_node_elt, "ood-candidate") + node_elt = ET.SubElement(oodnode_to_process_elt, "n") + node_elt.attrib = {"sym":oodnode_to_process} + + dropped = oper_nodedata[2] + dropped_elt = ET.SubElement(oper_node_elt, "is-dropped") + dropped_elt.attrib = {"val":dropped} + + if oper_nodetype == "drop-rel": + relnode_to_process = oper_nodedata[1] + 
relnode_to_process_elt = ET.SubElement(oper_node_elt, "rel-candidate") + node_elt = ET.SubElement(relnode_to_process_elt, "n") + node_elt.attrib = {"sym":relnode_to_process} + + dropped = oper_nodedata[2] + dropped_elt = ET.SubElement(oper_node_elt, "is-dropped") + dropped_elt.attrib = {"val":dropped} + + if oper_nodetype == "drop-mod": + modcand_to_process = oper_nodedata[1] + modcand_to_process_elt = ET.SubElement(oper_node_elt, "mod-candidate") + node_elt = ET.SubElement(modcand_to_process_elt, "n") + node_elt.attrib = {"sym":modcand_to_process[1],"loc":str(modcand_to_process[0])} + + dropped = oper_nodedata[2] + dropped_elt = ET.SubElement(oper_node_elt, "is-dropped") + dropped_elt.attrib = {"val":dropped} + + tg_edges_elt = ET.SubElement(traininggraph, "tg-edges") + for tg_edge in self.edges: + tg_edge_elt = ET.SubElement(tg_edges_elt, "edge") + tg_edge_elt.attrib = {"lab":str(tg_edge[2]), "par":tg_edge[0], "dep":tg_edge[1]} + + return traininggraph + + # @@@@@@@@@@@@ Training Graph -> Dot Graph @@@@@@@@@@@@@@@@@@@@ + + def convert_to_dotstring(self, main_sent_dict, boxer_graph): + dot_string = "digraph boxer{\n" + + nodename = 0 + node_graph_dict = {} + # Writing Major nodes + for major_nodename in self.major_nodes: + major_nodetype = self.get_majornode_type(major_nodename) + major_nodeset = self.get_majornode_nodeset(major_nodename) + major_simple_sentences = self.get_majornode_simple_sentences(major_nodename) + oper_candidates = self.get_majornode_oper_candidates(major_nodename) + processed_oper_candidates = self.get_majornode_processed_oper_candidates(major_nodename) + filtered_postions = self.get_majornode_filtered_postions(major_nodename) + + main_sentence = boxer_graph.extract_main_sentence(major_nodeset, main_sent_dict, filtered_postions) + simple_sentence_string = " ".join(major_simple_sentences) + major_node_data = [major_nodetype, major_nodeset[:], main_sentence, simple_sentence_string] + + if major_nodetype == "split": + major_node_data += [oper_candidates[:]] + + if major_nodetype == "drop-rel" or major_nodetype == "drop-mod" or major_nodetype == "drop-ood": + major_node_data += [oper_candidates[:], processed_oper_candidates[:], filtered_postions[:]] + + if major_nodetype == "fin": + major_node_data += [filtered_postions[:]] + + major_node_string, nodename = self.textdot_majornode(nodename, major_nodename, major_node_data[:]) + node_graph_dict[major_nodename] = "struct"+str(nodename) + dot_string += major_node_string+"\n" + + # Writing operation nodes + for oper_nodename in self.oper_nodes: + oper_node_string, nodename = self.textdot_opernode(nodename, oper_nodename, self.oper_nodes[oper_nodename]) + node_graph_dict[oper_nodename] = "struct"+str(nodename) + dot_string += oper_node_string+"\n" + + # Writing edges + for edge in self.edges: + par_graphnode = node_graph_dict[edge[0]] + dep_graphnode = node_graph_dict[edge[1]] + dot_string += par_graphnode+" -> "+dep_graphnode+"[label=\""+str(edge[2])+"\"];\n" + dot_string += "}" + return dot_string + + def textdot_majornode(self, nodename, node, nodedata): + textdot_node = "struct"+str(nodename+1)+" [shape=record,label=\"{" + textdot_node += "major-node: "+node+"|" + index = 0 + for data in nodedata: + textdot_node += self.processtext(str(data)) + index += 1 + if index < len(nodedata): + textdot_node += "|" + textdot_node += "}\"];" + return textdot_node, nodename+1 + + def textdot_opernode(self, nodename, node, nodedata): + textdot_node = "struct"+str(nodename+1)+" [shape=record,label=\"{" + textdot_node += "oper-node: 
"+node+"|" + index = 0 + for data in nodedata: + textdot_node += self.processtext(str(data)) + index += 1 + if index < len(nodedata): + textdot_node += "|" + textdot_node += "}\"];" + return textdot_node, nodename+1 + + def processtext(self, inputstring): + linesize = 100 + outputstring = "" + index = 0 + substr = inputstring[index*linesize:(index+1)*linesize] + while (substr!=""): + outputstring += substr + index += 1 + substr = inputstring[index*linesize:(index+1)*linesize] + if substr!="": + outputstring += "\\n" + return outputstring + + # @@@@@@@@@@@@@@@@@@@@@@@@@@ DONE @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ diff --git a/source/training_graph_module.pyc b/source/training_graph_module.pyc new file mode 100644 index 0000000..4fb0a86 Binary files /dev/null and b/source/training_graph_module.pyc differ diff --git a/source/training_graph_module.py~ b/source/training_graph_module.py~ new file mode 100644 index 0000000..ec092f1 --- /dev/null +++ b/source/training_graph_module.py~ @@ -0,0 +1,446 @@ +#!/usr/bin/env python +import xml.etree.ElementTree as ET +import copy + +class Training_Graph: + def __init__(self): + ''' + self.major_nodes["MN-*"] + ("split", nodeset, simple_sentences, split_candidate_tuples) + ("drop-rel", nodeset, simple_sentences, relnode_set, processed_relnode, filtered_mod_pos) + ("drop-mod", nodeset, simple_sentences, modcand_set, processed_mod_pos, filtered_mod_pos) + ("drop-ood", nodeset, simple_sentence, oodnode_set, processed_oodnode, filtered_mod_pos) + ("fin", nodeset, simple_sentences, filtered_mod_pos) + + self.oper_nodes["ON-*"] + ("split", split_candidate, not_applied_cands) + ("split", None, not_applied_cands) + ("drop-rel", relnode_to_process, "True") + ("drop-rel", relnode_to_process, "False") + ("drop-mod", modcand_to_process, "True") + ("drop-mod", modcand_to_process, "False") + ("drop-ood", oodnode_to_process, "True") + ("drop-ood", oodnode_to_process, "False") + + self.edges = [(par, dep, lab)] + + ''' + self.major_nodes = {} + self.oper_nodes = {} + self.edges = [] + + def get_majornode_type(self, majornode_name): + majornode_tuple = self.major_nodes[majornode_name] + return majornode_tuple[0] + + def get_majornode_nodeset(self, majornode_name): + majornode_tuple = self.major_nodes[majornode_name] + return majornode_tuple[1] + + def get_majornode_simple_sentences(self, majornode_name): + majornode_tuple = self.major_nodes[majornode_name] + return majornode_tuple[2] + + def get_majornode_oper_candidates(self, majornode_name): + majornode_tuple = self.major_nodes[majornode_name] + if majornode_tuple[0] != "fin": + return majornode_tuple[3] + else: + return [] + + def get_majornode_processed_oper_candidates(self, majornode_name): + majornode_tuple = self.major_nodes[majornode_name] + if majornode_tuple[0] != "fin" and majornode_tuple[0] != "split": + return majornode_tuple[4] + else: + return [] + + def get_majornode_filtered_postions(self, majornode_name): + majornode_tuple = self.major_nodes[majornode_name] + if majornode_tuple[0] == "fin": + return majornode_tuple[3] + elif majornode_tuple[0] == "drop-rel" or majornode_tuple[0] == "drop-mod" or majornode_tuple[0] == "drop-ood": + return majornode_tuple[5] + else: + return [] + + def get_opernode_type(self, opernode_name): + opernode_tuple = self.oper_nodes[opernode_name] + return opernode_tuple[0] + + def get_opernode_oper_candidate(self, opernode_name): + opernode_tuple = self.oper_nodes[opernode_name] + return opernode_tuple[1] + + def get_opernode_failed_oper_candidates(self, opernode_name): + 
diff --git a/start_learning_training_models.py b/start_learning_training_models.py
new file mode 100644
index 0000000..fac6fc8
--- /dev/null
+++ b/start_learning_training_models.py
@@ -0,0 +1,246 @@
+#!/usr/bin/env python
+#===================================================================================
+#title : start_learning_training_models.py =
+#description : This will learn a model for sentence simplification. =
+#author : Shashi Narayan, shashi.narayan(at){ed.ac.uk,loria.fr,gmail.com})=
+#date : Created in 2014, Later revised in April 2016. =
+#version : 0.1 =
+#usage : python2.7 start_learning_training_models.py -help =
+#notes : Look at README for requirements. =
+#===================================================================================
+
+import os
+import argparse
+import sys
+import datetime
+
+sys.path.append("./source")
+import functions_configuration_file
+# import functions_model_files
+from saxparser_xml_stanfordtokenized_boxergraph import SAXPARSER_XML_StanfordTokenized_BoxerGraph
+# from saxparser_xml_stanfordtokenized_boxergraph_traininggraph import SAXPARSER_XML_StanfordTokenized_BoxerGraph_TrainingGraph
+
+if __name__=="__main__":
+    # Command line arguments ##############
+    argparser = argparse.ArgumentParser(prog='python start_learning_training_models.py', description=('Start the training process.'))
+
+    # Optional [default value: 1]
+    argparser.add_argument('--start-state', help='Start state of the training process', choices=['1','2','3'], default='1', metavar=('Start_State'))
+
+    # Optional [default value: 3]
+    argparser.add_argument('--end-state', help='End state of the training process', choices=['1','2','3'], default='3', metavar=('End_State'))
+
+    # Optional [default value: split:drop-ood:drop-rel:drop-mod] (Any of their combinations, order is not important), drop-ood only applied after split
+    argparser.add_argument('--transformation', help='Transformation models learned', default="split:drop-ood:drop-rel:drop-mod", metavar=('TRANSFORMATION_MODEL'))
+
+    # Optional [default value: 2]
+    argparser.add_argument('--max-split', help='Maximum split size', choices=['2','3'], default='2', metavar=('MAX_SPLIT_SIZE'))
+
+    # Optional [default value: agent:patient:eq:theme], (order is not important)
+    argparser.add_argument('--restricted-drop-rel', help='Restricted drop relations', default="agent:patient:eq:theme", metavar=('RESTRICTED_DROP_REL'))
+
+    # Optional [default value: jj:jjr:jjs:rb:rbr:rbs], (order is not important)
+    argparser.add_argument('--allowed-drop-mod', help='Allowed drop modifiers', default="jj:jjr:jjs:rb:rbr:rbs", metavar=('ALLOWED_DROP_MOD'))
+
+    # Optional [default value: method-0.99-lteq-lt]
+    argparser.add_argument('--method-training-graph', help='Operation set for training graph file', choices=['method-led-lt', 'method-led-lteq', 'method-0.5-lteq-lteq',
+                           'method-0.75-lteq-lt', 'method-0.99-lteq-lt'],
+                           default='method-0.99-lteq-lt', metavar=('Method_Training_Graph'))
+
+    # Optional [default value: feature-Nov27]
+    argparser.add_argument('--method-feature-extract', help='Operation set for extracting features', choices=['feature-init', 'feature-Nov27'], default='feature-Nov27',
+                           metavar=('Method_Feature_Extract'))
+
+    # Optional [default value: /disk/scratch/Sentence-Simplification/Zhu-2010/TrainingData/PWKP_108016.tokenized.boxer-graph.xml]
+    argparser.add_argument('--train-boxer-graph', help='The training corpus file (xml, stanford-tokenized, boxer-graph)', metavar=('Train_Boxer_Graph'),
+                           default='/disk/scratch/Sentence-Simplification/Zhu-2010/TrainingData/PWKP_108016.tokenized.boxer-graph.xml')
+
+    # Optional [default value: 10]
+    argparser.add_argument('--num-em', help='The number of EM Algorithm iterations', metavar=('NUM_EM_ITERATION'), default='10')
+
+    # Optional [default value: 0:3:/disk/scratch/Sentence-Simplification/Language-Model/simplewiki-20131030-data.srilm:0]
+    argparser.add_argument('--lang-model', help='Language model information (in the moses format)', metavar=('Lang_Model'),
+                           default="0:3:/disk/scratch/Sentence-Simplification/Language-Model/simplewiki-20131030-data.srilm:0")
+
+    # Optional (Compulsory when start state is >= 2)
+    argparser.add_argument('--d2s-config', help='D2S Configuration file', metavar=('D2S_Config'))
+
+    # Compulsory
+    argparser.add_argument('--output-dir', help='The output directory', required=True, metavar=('Output_Directory'))
+    # #####################################
+    args_dict = vars(argparser.parse_args(sys.argv[1:]))
+    # #####################################
+
+    # Creating the output directory to store training models
+    timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p")
+    print timestamp+", Creating the output directory: "+args_dict['output_dir']
+    try:
+        os.mkdir(args_dict['output_dir'])
+        print
+    except OSError:
+        print args_dict['output_dir'] + " directory already exists.\n"
+
+    # Configuration dictionary
+    D2S_Config_data = {}
+    D2S_Config = args_dict['d2s_config']
+    if D2S_Config is not None:
+        D2S_Config_data = functions_configuration_file.parser_config_file(D2S_Config)
+    else:
+        D2S_Config_data["TRAIN-BOXER-GRAPH"] = args_dict['train_boxer_graph']
+        D2S_Config_data["TRANSFORMATION-MODEL"] = args_dict['transformation'].split(":")
+        D2S_Config_data["MAX-SPLIT-SIZE"] = int(args_dict['max_split'])
+        D2S_Config_data["RESTRICTED-DROP-RELATION"] = args_dict['restricted_drop_rel'].split(":")
+        D2S_Config_data["ALLOWED-DROP-MODIFIER"] = args_dict['allowed_drop_mod'].split(":")
+        D2S_Config_data["METHOD-TRAINING-GRAPH"] = args_dict['method_training_graph']
+        D2S_Config_data["METHOD-FEATURE-EXTRACT"] = args_dict['method_feature_extract']
+        D2S_Config_data["NUM-EM-ITERATION"] = int(args_dict['num_em'])
+        D2S_Config_data["LANGUAGE-MODEL"] = args_dict['lang_model']
+
+    # Extracting arguments with their default values (default unless specified)
+    START_STATE = int(args_dict['start_state'])
+    END_STATE = int(args_dict['end_state'])
+
+    # Start state: 1, Starting building training graph
+    state = 1
+    if START_STATE <= state <= END_STATE:
+        timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p")
+        print timestamp+", Starting building training graph (Step-"+str(state)+") ..."
+
+        print "Input training file (xml, stanford tokenized and boxer graph): " + D2S_Config_data["TRAIN-BOXER-GRAPH"] + " ..."
+        TRAIN_TRAINING_GRAPH = args_dict['output_dir']+"/"+os.path.splitext(os.path.basename(D2S_Config_data["TRAIN-BOXER-GRAPH"]))[0]+".training-graph.xml"
+        print "Generating training graph file (xml, stanford tokenized, boxer graph and training graph): "+TRAIN_TRAINING_GRAPH+" ..."
+
+        foutput = open(TRAIN_TRAINING_GRAPH, "w")
+        foutput.write("\n")
+        foutput.write("\n")
+
+        print "Creating the SAX file (xml, stanford tokenized and boxer graph) handler ..."
+        training_xml_handler = SAXPARSER_XML_StanfordTokenized_BoxerGraph("training", D2S_Config_data["TRAIN-BOXER-GRAPH"], foutput, D2S_Config_data["TRANSFORMATION-MODEL"],
+                                                                          D2S_Config_data["MAX-SPLIT-SIZE"], D2S_Config_data["RESTRICTED-DROP-RELATION"],
+                                                                          D2S_Config_data["ALLOWED-DROP-MODIFIER"], D2S_Config_data["METHOD-TRAINING-GRAPH"])
+
+        print "Start generating training graph ..."
+        print "Start parsing "+D2S_Config_data["TRAIN-BOXER-GRAPH"]+" ..."
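+        # Parsing streams the corpus: each closed <sentence> element is handed
+        # to Explore_Training_Graph as soon as it is parsed, so the full corpus
+        # is never held in memory at once.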
+        training_xml_handler.parse_xmlfile_generating_training_graph()
+
+        # Close the root element opened above, then the file.
+        foutput.write("</dataset>\n")
+        foutput.close()
+
+        D2S_Config_data["TRAIN-TRAINING-GRAPH"] = TRAIN_TRAINING_GRAPH
+        timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p")
+        print timestamp+", Finished building the training graph (Step-"+str(state)+")\n"
+
+    # # Start state: 2
+    # state = 2
+    # if START_STATE <= state <= END_STATE:
+    #     timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p")
+    #     print timestamp+", Starting to learn transformation models (Step-"+str(state)+") ..."
+
+    #     if "TRAIN-TRAINING-GRAPH" not in D2S_Config_data:
+    #         print "The training graph file (xml, stanford tokenized, boxer graph and training graph) is not available."
+    #         print "Please provide the configuration file or start from State 1."
+    #         timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p")
+    #         print timestamp+", No transformation models learned (Step-"+str(state)+")\n"
+    #         exit(0)
+
+    #     # @ Defining data structures @
+    #     # Stores various sentence pairs (complex, simple) for SMT.
+    #     smt_sentence_pairs = {}
+    #     # Probability tables - store all probabilities
+    #     probability_tables = {}
+    #     # Count tables - store the counts for the next iteration
+    #     count_tables = {}
+    #     # @ @
+
+    #     print "Creating the em-training XML file (stanford tokenized, boxer graph and training graph) handler ..."
+    #     em_training_xml_handler = SAXPARSER_XML_StanfordTokenized_BoxerGraph_TrainingGraph(D2S_Config_data["TRAIN-TRAINING-GRAPH"], D2S_Config_data["NUM-EM-ITERATION"],
+    #                                                                                        smt_sentence_pairs, probability_tables, count_tables, D2S_Config_data["METHOD-FEATURE-EXTRACT"])
+
+    #     print "Start Expectation Maximization (Inside-Outside) algorithm ..."
+    #     timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p")
+    #     print timestamp+", Step "+str(state)+".1: Initialization of probability tables and populating smt_sentence_pairs ..."
+    #     em_training_xml_handler.parse_to_initialize_probabilitytable()
+    #     # print probability_tables
+
+    #     timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p")
+    #     print timestamp+", Step "+str(state)+".2: Start iterating for EM Inside-Outside probabilities ..."
+    #     em_training_xml_handler.parse_to_iterate_probabilitytable()
+    #     # print probability_tables
+
+    #     # Start writing model files
+    #     timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p")
+    #     print timestamp+", Step "+str(state)+".3: Start writing model files ..."
+    #     # Creating the output directory to store training models
+    #     model_dir = args_dict['output_dir']+"/TRANSFORMATION-MODEL-DIR"
+    #     timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p")
+    #     print timestamp+", Creating the output model directory: "+model_dir
+    #     try:
+    #         os.mkdir(model_dir)
+    #     except OSError:
+    #         print model_dir + " directory already exists."
+    #     # Writing model files
+    #     functions_model_files.write_model_files(model_dir, probability_tables, smt_sentence_pairs)
+
+    #     D2S_Config_data["TRANSFORMATION-MODEL-DIR"] = model_dir
+    #     timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p")
+    #     print timestamp+", Finished learning transformation models (Step-"+str(state)+")\n"
+
+    # # Start state: 3
+    # state = 3
+    # if START_STATE <= state <= END_STATE:
+    #     timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p")
+    #     print timestamp+", Starting to learn the moses translation model (Step-"+str(state)+") ..."
+ + # if "TRANSFORMATION-MODEL-DIR" not in D2S_Config_data: + # print "The moses training files are not available." + # print "Please enter the configuration file or start with the State 1." + # timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p") + # print timestamp+", No moses models learned (Step-"+str(state)+")\n" + # exit(0) + + # # Preparing the moses directory + # timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p") + # print timestamp+", Step "+str(state)+".1: Preparing the moses directory ..." + # # Creating the output directory to store moses files + # moses_dir = args_dict['output_dir']+"/MOSES-COMPLEX-SIMPLE-DIR" + # timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p") + # print timestamp+", Creating the moses directory: "+moses_dir + # try: + # os.mkdir(moses_dir) + # except OSError: + # print moses_dir + " directory already exists." + # # Creating the corpus directory + # moses_corpus_dir = args_dict['output_dir']+"/MOSES-COMPLEX-SIMPLE-DIR/corpus" + # timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p") + # print timestamp+", Creating the moses corpus directory: "+moses_corpus_dir + # try: + # os.mkdir(moses_corpus_dir) + # except OSError: + # print moses_corpus_dir + " directory already exists." + + # # Cleaning the moses training file + # timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p") + # print timestamp+", Step "+str(state)+".2: Cleaning the moses training file ..." + # command = "/home/ankh/Tools/moses/scripts/training/clean-corpus-n.perl "+D2S_Config_data["TRANSFORMATION-MODEL-DIR"]+"/D2S-SMT source target "+moses_corpus_dir+"/D2S-SMT-clean 1 95" + # os.system(command) + + # # Running moses training + # timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p") + # print timestamp+", Step "+str(state)+".3: Running the moses training ..." + # command = ("/home/ankh/Tools/moses/scripts/training/train-model.perl -mgiza -mgiza-cpus 3 -cores 3 -parallel -sort-buffer-size 3G -sort-batch-size 253 -sort-compress gzip -sort-parallel 3 "+ + # "-root-dir "+moses_dir+" -corpus "+moses_corpus_dir+"/D2S-SMT-clean -f source -e target -external-bin-dir /home/ankh/Tools/mgizapp/bin "+ + # "-lm "+D2S_Config_data["LANGUAGE-MODEL"]) + # os.system(command) + + # D2S_Config_data["MOSES-COMPLEX-SIMPLE-DIR"] = moses_dir + # timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p") + # print timestamp+", Finished learning moses translation model (Step-"+str(state)+")\n" + + # Last Step + config_file = args_dict['output_dir']+"/d2s.ini" + print "Writing the configuration file: "+config_file+" ..." + functions_configuration_file.write_config_file(config_file, D2S_Config_data) + + timestamp = datetime.datetime.now().strftime("%A%d-%B%Y-%I%M%p") + print timestamp+", Learning process done!!!" +