diff --git a/.gitignore b/.gitignore
index 67f1d4f..617b1d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,9 @@ __pycache__/
 /include/
 /lib/
 /pip-selfcheck.json
-neuralcoref/data/*
-neuralcoref/train/*
-.cache
\ No newline at end of file
+/runs/*
+test_corefs.txt
+test_mentions.txt
+.cache
+/.vscode/*
+/.vscode
\ No newline at end of file
diff --git a/neuralcoref/algorithm.py b/neuralcoref/algorithm.py
index 4e4841f..1e3d982 100644
--- a/neuralcoref/algorithm.py
+++ b/neuralcoref/algorithm.py
@@ -4,13 +4,13 @@
 from __future__ import unicode_literals
 from __future__ import print_function

-from pprint import pprint
-
+import sys
 import os
 import spacy
 import numpy as np

-from neuralcoref.data import Data, MENTION_TYPE, NO_COREF_LIST
+from neuralcoref.compat import unicode_
+from neuralcoref.document import Document, MENTION_TYPE, NO_COREF_LIST

 PACKAGE_DIRECTORY = os.path.dirname(os.path.abspath(__file__))

@@ -22,7 +22,7 @@
 #######################
 ###### CLASSES ########

-class Model:
+class Model(object):
     '''
     Coreference neural model
     '''
@@ -30,16 +30,20 @@ def __init__(self, model_path):
         weights, biases = [], []
         for file in sorted(os.listdir(model_path)):
             if file.startswith("single_mention_weights"):
-                weights.append(np.load(os.path.join(model_path, file)))
+                w = np.load(os.path.join(model_path, file))
+                weights.append(w)
             if file.startswith("single_mention_bias"):
-                biases.append(np.load(os.path.join(model_path, file)))
+                w = np.load(os.path.join(model_path, file))
+                biases.append(w)
         self.single_mention_model = list(zip(weights, biases))
         weights, biases = [], []
         for file in sorted(os.listdir(model_path)):
             if file.startswith("pair_mentions_weights"):
-                weights.append(np.load(os.path.join(model_path, file)))
+                w = np.load(os.path.join(model_path, file))
+                weights.append(w)
             if file.startswith("pair_mentions_bias"):
-                biases.append(np.load(os.path.join(model_path, file)))
+                w = np.load(os.path.join(model_path, file))
+                biases.append(w)
         self.pair_mentions_model = list(zip(weights, biases))

     def _score(self, features, layers):
@@ -49,8 +53,8 @@ def _score(self, features, layers):
             features = np.maximum(features, 0) # ReLU
         return np.sum(features)

-    def get_single_mention_score(self, mention_embedding, anaphoricity_features):
-        first_layer_input = np.concatenate([mention_embedding,
+    def get_single_mention_score(self, mention, anaphoricity_features):
+        first_layer_input = np.concatenate([mention.embedding,
                                             anaphoricity_features], axis=0)[:, np.newaxis]
         return self._score(first_layer_input, self.single_mention_model)

@@ -61,16 +65,20 @@ def get_pair_mentions_score(self, antecedent, mention, pair_features):
         return self._score(first_layer_input, self.pair_mentions_model)

-class Coref:
+class Coref(object):
     '''
     Main coreference resolution algorithm
     '''
-    def __init__(self, nlp=None, greedyness=0.5, max_dist=50, max_dist_match=500, conll=None, use_no_coref_list=True, debug=False):
+    def __init__(self, nlp=None, greedyness=0.5, max_dist=50, max_dist_match=500, conll=None,
+                 use_no_coref_list=True, debug=False):
         self.greedyness = greedyness
         self.max_dist = max_dist
         self.max_dist_match = max_dist_match
         self.debug = debug
-
+        model_path = os.path.join(PACKAGE_DIRECTORY, "weights/conll/" if conll is not None else "weights/")
+        trained_embed_path = os.path.join(PACKAGE_DIRECTORY, "weights/")
+        print("Loading neuralcoref model from", model_path)
+        self.coref_model = Model(model_path)
         if nlp is None:
             print("Loading spacy model")
             try:
@@ -78,15 +86,10 @@ def __init__(self, nlp=None, greedyness=0.5,
max_dist=50, max_dist_match=500, co model = 'en_core_web_sm' except IOError: print("No spacy 2 model detected, using spacy1 'en' model") + spacy.info('en') model = 'en' nlp = spacy.load(model) - - model_path = os.path.join(PACKAGE_DIRECTORY, "weights/conll/" if conll is not None else "weights/") - embed_model_path = os.path.join(PACKAGE_DIRECTORY, "weights/") - print("loading model from", model_path) - self.data = Data(nlp, model_path=embed_model_path, conll=conll, use_no_coref_list=use_no_coref_list, consider_speakers=conll) - self.coref_model = Model(model_path) - + self.data = Document(nlp, conll=conll, use_no_coref_list=use_no_coref_list, trained_embed_path=trained_embed_path) self.clusters = {} self.mention_to_cluster = [] self.mentions_single_scores = {} @@ -129,13 +132,22 @@ def _merge_coreference_clusters(self, ant_idx, mention_idx): del self.clusters[remove_id] + def remove_singletons_clusters(self): + remove_id = [] + for key, mentions in self.clusters.items(): + if len(mentions) == 1: + remove_id.append(key) + self.mention_to_cluster[key] = None + for rem in remove_id: + del self.clusters[rem] + def display_clusters(self): ''' Print clusters informations ''' print(self.clusters) for key, mentions in self.clusters.items(): - print("cluster", key, "(", ", ".join(str(self.data[m]) for m in mentions), ")") + print("cluster", key, "(", ", ".join(unicode_(self.data[m]) for m in mentions), ")") ################################### ####### MAIN COREF FUNCTIONS ###### @@ -150,11 +162,10 @@ def run_coref_on_mentions(self, mentions): for mention_idx, ant_list in self.data.get_candidate_pairs(mentions, self.max_dist, self.max_dist_match): mention = self.data[mention_idx] feats_, ana_feats = self.data.get_single_mention_features(mention) - anaphoricity_score = self.coref_model.get_single_mention_score(mention.embedding, ana_feats) - self.mentions_single_scores[mention_idx] = anaphoricity_score + single_score = self.coref_model.get_single_mention_score(mention, ana_feats) + self.mentions_single_scores[mention_idx] = single_score self.mentions_single_features[mention_idx] = {"spansEmbeddings": mention.spans_embeddings_, "wordsEmbeddings": mention.words_embeddings_, "features": feats_} - - best_score = anaphoricity_score - 50 * (self.greedyness - 0.5) + best_score = single_score - 50 * (self.greedyness - 0.5) for ant_idx in ant_list: antecedent = self.data[ant_idx] feats_, pwf = self.data.get_pair_mentions_features(antecedent, mention) @@ -164,7 +175,6 @@ def run_coref_on_mentions(self, mentions): "antecedentWordsEmbeddings": antecedent.words_embeddings_, "mentionSpansEmbeddings": mention.spans_embeddings_, "mentionWordsEmbeddings": mention.words_embeddings_ } - if score > best_score: best_score = score best_ant[mention_idx] = ant_idx @@ -173,18 +183,20 @@ def run_coref_on_mentions(self, mentions): self._merge_coreference_clusters(best_ant[mention_idx], mention_idx) return (n_ant, best_ant) - def run_coref_on_utterances(self, last_utterances_added=False, follow_chains=True): + def run_coref_on_utterances(self, last_utterances_added=False, follow_chains=True, debug=False): ''' Run the coreference model on some utterances Arg: last_utterances_added: run the coreference model over the last utterances added to the data follow_chains: follow coreference chains over previous utterances ''' + if debug: print("== run_coref_on_utterances == start") self._prepare_clusters() + if debug: self.display_clusters() mentions = list(self.data.get_candidate_mentions(last_utterances_added=last_utterances_added)) 
n_ant, antecedents = self.run_coref_on_mentions(mentions) mentions = antecedents.values() - if follow_chains and n_ant > 0: + if follow_chains and last_utterances_added and n_ant > 0: i = 0 while i < MAX_FOLLOW_UP: i += 1 @@ -192,6 +204,8 @@ def run_coref_on_utterances(self, last_utterances_added=False, follow_chains=Tru mentions = antecedents.values() if n_ant == 0: break + if debug: self.display_clusters() + if debug: print("== run_coref_on_utterances == end") def one_shot_coref(self, utterances, utterances_speakers_id=None, context=None, context_speakers_id=None, speakers_names=None): @@ -236,7 +250,7 @@ def continuous_coref(self, utterances, utterances_speakers_id=None, speakers_nam def get_utterances(self, last_utterances_added=True): ''' Retrieve the list of parsed uterrances''' - if last_utterances_added: + if last_utterances_added and len(self.data.last_utterances_loaded): return [self.data.utterances[idx] for idx in self.data.last_utterances_loaded] else: return self.data.utterances @@ -272,9 +286,10 @@ def get_scores(self): return {"single_scores": self.mentions_single_scores, "pair_scores": self.mentions_pairs_scores} - def get_clusters(self, remove_singletons=True, use_no_coref_list=True): + def get_clusters(self, remove_singletons=False, use_no_coref_list=False): ''' Retrieve cleaned clusters''' clusters = self.clusters + mention_to_cluster = self.mention_to_cluster remove_id = [] if use_no_coref_list: for key, mentions in clusters.items(): @@ -289,7 +304,7 @@ def get_clusters(self, remove_singletons=True, use_no_coref_list=True): for key, mentions in clusters.items(): if self.data.mentions[key].lower_ in NO_COREF_LIST: remove_id.append(key) - self.mention_to_cluster[key] = None + mention_to_cluster[key] = None if mentions: added[mentions[0]] = mentions for rem in remove_id: @@ -301,11 +316,11 @@ def get_clusters(self, remove_singletons=True, use_no_coref_list=True): for key, mentions in clusters.items(): if len(mentions) == 1: remove_id.append(key) - self.mention_to_cluster[key] = None + mention_to_cluster[key] = None for rem in remove_id: del clusters[rem] - return clusters + return clusters, mention_to_cluster def get_most_representative(self, last_utterances_added=True, use_no_coref_list=True): ''' @@ -314,7 +329,7 @@ def get_most_representative(self, last_utterances_added=True, use_no_coref_list= Return: Dictionnary of {original_mention: most_representative_resolved_mention, ...} ''' - clusters = self.get_clusters(remove_singletons=True, use_no_coref_list=use_no_coref_list) + clusters, _ = self.get_clusters(remove_singletons=True, use_no_coref_list=use_no_coref_list) coreferences = {} for key in self.data.get_candidate_mentions(last_utterances_added=last_utterances_added): if self.mention_to_cluster[key] is None: @@ -333,3 +348,19 @@ def get_most_representative(self, last_utterances_added=True, use_no_coref_list= representative = mention return coreferences + +if __name__ == '__main__': + coref = Coref(use_no_coref_list=False) + if len(sys.argv) > 1: + sent = sys.argv[1] + coref.one_shot_coref(sent) + else: + coref.one_shot_coref(u"Yes, I noticed that many friends, around me received it. It seems that almost everyone received this SMS.")#u"My sister has a dog. 
She loves him.") + mentions = coref.get_mentions() + print(mentions) + + utterances = coref.get_utterances() + print(utterances) + + resolved_utterance_text = coref.get_resolved_utterances() + print(resolved_utterance_text) diff --git a/neuralcoref/bld.bat b/neuralcoref/bld.bat new file mode 100644 index 0000000..417479d --- /dev/null +++ b/neuralcoref/bld.bat @@ -0,0 +1,2 @@ +"%PYTHON%" setup.py install --single-version-externally-managed --record=record.txt +if errorlevel 1 exit 1 \ No newline at end of file diff --git a/neuralcoref/build.sh b/neuralcoref/build.sh new file mode 100644 index 0000000..7fa0f85 --- /dev/null +++ b/neuralcoref/build.sh @@ -0,0 +1 @@ +$PYTHON setup.py install --single-version-externally-managed --record=record.txt # Python command to install the script. diff --git a/neuralcoref/checkpoints/.gitignore b/neuralcoref/checkpoints/.gitignore new file mode 100644 index 0000000..86d0cb2 --- /dev/null +++ b/neuralcoref/checkpoints/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything in this directory +* +# Except this file +!.gitignore \ No newline at end of file diff --git a/neuralcoref/compat.py b/neuralcoref/compat.py new file mode 100644 index 0000000..c4dcd92 --- /dev/null +++ b/neuralcoref/compat.py @@ -0,0 +1,32 @@ +# coding: utf8 +"""Py2/3 compatibility""" +import sys + +is_python2 = int(sys.version[0]) == 2 +is_windows = sys.platform.startswith('win') +is_linux = sys.platform.startswith('linux') +is_osx = sys.platform == 'darwin' + +if is_python2: + bytes_ = str + unicode_ = unicode + string_types = (str, unicode) + chr_ = unichr + + def unicode_to_bytes(s, encoding='utf8', errors='strict'): + return s.encode(encoding=encoding, errors=errors) + + def bytes_to_unicode(b, encoding='utf8', errors='strict'): + return unicode_(b, encoding=encoding, errors=errors) + +else: + bytes_ = bytes + unicode_ = str + string_types = (bytes, str) + chr_ = chr + + def unicode_to_bytes(s, encoding='utf8', errors='strict'): + return s.encode(encoding=encoding, errors=errors) + + def bytes_to_unicode(b, encoding='utf8', errors='strict'): + return b.decode(encoding=encoding, errors=errors) diff --git a/neuralcoref/conll_processing_scripts/conll2coreference.py b/neuralcoref/conll_processing_scripts/conll2coreference.py new file mode 100755 index 0000000..2077bd4 --- /dev/null +++ b/neuralcoref/conll_processing_scripts/conll2coreference.py @@ -0,0 +1,642 @@ +#!/usr/bin/env python + +#---- standard library imports ----# +import sys + +# version check +if(not (sys.hexversion >= 0x2050000)): + sys.exit("\n\nplease use python version >= 2.5\n\n") + +import string +import re +import math +import os +import time +import getopt +import zlib +import gzip +import codecs +import optparse +import collections +import ConfigParser +from optparse import OptionParser + + +PART_COLUMN=1 +WORD_COLUMN=3 + + + + + + + + + + +def error(error_string, terminate_program=True, current_frame=False): + """Print error messages to stderr, optionally sys.exit""" + + if(current_frame == False): + pretty_error_string = """ + +-------------------------------------------------------------------------------- + ERROR +-------------------------------------------------------------------------------- +%s +-------------------------------------------------------------------------------- + +""" % (error_string) + else: + pretty_error_string = """ + +-------------------------------------------------------------------------------- + ERROR +-------------------------------------------------------------------------------- +FILE: %s 
+LINE: %s +-------------------------------------------------------------------------------- +%s +-------------------------------------------------------------------------------- + +""" % (current_frame.f_code.co_filename, current_frame.f_lineno, error_string) + + sys.stderr.write(pretty_error_string) + + if(terminate_program == True): + sys.exit(1) + + + + + + + + + + +def parse_cfg_args(arg_list): + """Parse command-line style config settings to a dictionary. + + If you want to override configuration file values on the command + line or set ones that were not set, this should make it simpler. + Given a list in format [section.key=value, ...] return a + dictionary in form { (section, key): value, ...}. + + So we might have: + + .. code-block:: python + + ['corpus.load=english-mz', + 'corpus.data_in=/home/user/corpora/ontonotes/data/'] + + we would then return the dictionary: + + .. code-block:: python + + { ('corpus', 'load') : 'english-mz', + ('corpus', 'data_in') : '/home/user/corpora/ontonotes/data/' } + + See also :func:`load_config` and :func:`load_options` + + """ + + if not arg_list: + return {} + + config_append = {} + + for arg in arg_list: + if len(arg.split("=")) != 2 or len(arg.split("=")[0].split('.')) != 2: + raise Exception("Invalid argument; not in form section.key=value : " + arg) + + key, value = arg.split("=") + config_append[tuple(key.split("."))] = value + + return config_append + + + + + + + + + + + + + + +__registered_config_options = collections.defaultdict( dict ) + +def required_config_sections(): + return [section for section in __registered_config_options if + [True for value in __registered_config_options[section] + if __registered_config_options[section][value][3]]] # section_required + + + + + + + + + + + + +def load_options(parser=None, argv=[], positional_args=True): + """ parses sys.argv, possibly exiting if there are mistakes + + If you set parser to a ConfigParser object, then you have control + over the usage string and you can prepopulate it with options you + intend to use. But don't set a ``--config`` / ``-c`` option; + load_options uses that to find a configuration file to load + + If a parser was passed in, we return ``(config, parser, [args])``. + Otherwise we return ``(config, [args])``. Args is only included + if ``positional_args`` is True and there are positional arguments + + See :func:`load_config` for details on the ``--config`` option. + + """ + + def is_config_appender(arg): + return "." 
in arg and "=" in arg and arg.find(".") < arg.find("=") + + parser_passed_in=parser + if not parser: + parser = OptionParser() + + parser.add_option("-c", "--config", help="the path to a config file to read options from") + + if argv: + options, args = parser.parse_args(argv) + else: + options, args = parser.parse_args() + + config = load_config(options.config, [a for a in args if is_config_appender(a)]) + + other_args = [a for a in args if not is_config_appender(a)] + + return_list = [config] + if parser_passed_in: + return_list.append(options) + if other_args: + if positional_args: + return_list.append(other_args) + else: + raise Exception("Arguments %s not understood" % other_args) + else: + if positional_args: + raise Exception("This program expects one or more positional arguments that are missing") + + if len(return_list) == 1: + return return_list[0] + else: + return tuple(return_list) + + + + + + + + + + + + + + +class FancyConfigParserError(Exception): + """ raised by :class:`FancyConfigParser` when used improperly """ + + def __init__(self, vals): + Exception.__init__(self, 'Config usage must be in the form "config[\'section\', \'item\']". ' + 'Given something more like "config[%s]".' % (", ".join("%r"%v for v in vals))) + + + + + + + + + + +class FancyConfigParser(ConfigParser.SafeConfigParser): + """ make a config parser with support for config[section, value] + + raises :class:`FancyConfigParserError` on improper usage. + + """ + + def __getitem__(self, vals): + try: + section, item = vals + except (ValueError, TypeError): + raise FancyConfigParserError(vals) + return self.get(section, item) + + + def __setitem__(self, vals, value): + try: + section, item = vals + except (ValueError, TypeError): + raise FancyConfigParserError(vals) + return self.set(section, item, value) + + def __delitem__(self, vals): + try: + section, item = vals + except (ValueError, TypeError): + raise FancyConfigParserError(vals) + + self.remove_option(section, item) + + + + + + + + + + +def load_config(cfg_name=None, config_append=[]): + """ Load a configuration file to memory. + + The given configuration file name can be a full path, in which + case we simply read that configuration file. Otherwise, if you + give 'myconfig' or something similar, we look in the current + directory and the home directory. We also look to see if files + with this name and extension '.conf' exist. So for 'myconfig' we + would look in the following places: + + * ./myconfig + * ./myconfig.conf + * [home]/.myconfig + * [home]/.myconfig.conf + + Once we find the configuration, we load it. We also extend + ConfigParser to support ``[]`` notation. So you could look up key + ``k`` in section ``s`` with ``config[s,k]``. See + :func:`FancyConfigParser` . + + If config_append is set we use :func:`parse_cfg_args` and add any + values it creates to the config object. These values override any + previous ones. + + """ + + config = FancyConfigParser() + + if cfg_name: + config_locs = [cfg_name + '.conf', + os.path.expanduser('~/.' + cfg_name + '.conf'), + cfg_name, + os.path.expanduser('~/.' + cfg_name)] + l = config.read(config_locs) + if not l: + raise Exception("Couldn't find config file. 
Looked in:" + + "".join(["\n - " + c for c in config_locs]) + + "\nto no avail.") + + + for (section, key_name), value in parse_cfg_args(config_append).iteritems(): + if not config.has_section(section): + config.add_section(section) + config.set(section, key_name, value) + + problems = [] + for section in config.sections(): + if not is_config_section_registered(section): + on.common.log.status("Ignoring unknown configuration section", section) + continue + for option in config.options(section): + if not is_config_registered(section, option): + problems.append("Unknown configuration variable %s.%s" % (section, option)) + continue + + value = config.get(section, option) + allowed = allowed_config_values(section, option) + multiple = allow_multiple_config_values(section, option) + + values = value.split() if multiple else [value] + for value in values: + if allowed and not value in allowed: + problems.append("Illegal value '%s' for configuration variable %s.%s. Permitted values are: %s" % + (value, section, option, ", ".join(["'%s'" % x for x in allowed]))) + + for option in required_config_options(section): + if not config.has_option(section, option): + problems.append("Required configuration variable %s.%s is absent" % (section, option)) + + for section in required_config_sections(): + if not config.has_section(section): + problems.append("Required configuration section %s is absent" % section) + + if problems: + print_config_docs() + + on.common.log.status("Configuration Problems:") + for problem in problems: + on.common.log.status(" " + problem) + + sys.exit(-1) + + return config + + + + + + + + + + + + +def make_sgml_safe(s, reverse=False, keep_turn=True): + """ return a version of the string that can be put in an sgml document + + This means changing angle brackets and ampersands to '-LAB-', + '-RAB-', and '-AMP-'. Needed for creating ``.name`` and + ``.coref`` files. 
+ + If keep_turn is set, in the input is turned into [TURN], not turned into -LAB-TURN-RAB- + + """ + + if not reverse and keep_turn: + s = s.replace("", "[TURN]") + + for f, r in [("<", "-LAB-"), + (">", "-RAB-"), + ("&", "-AMP-")]: + if reverse: + r, f = f, r + s = s.replace(f, r) + + return s + + + + + + + +class link: + + def __init__(self, id, start, end): + self.id = id + self.start = start + self.end = end + + + def __repr__(self): + return "" % (self.id, self.start, self.end) + + + + + + + + + + +def link_compare(a_link, b_link): + if(a_link.start != b_link.start): + return cmp(a_link.start, b_link.start) + else: + return cmp(a_link.end, b_link.end) + + + + + + + + + + +class coreference_tagged_sentence: + + def __init__(self, r_c_matrix): + self.part_number = r_c_matrix[0][0] + self.words = [] + self.links = [] # list of all links in this sentence + self.chain_hash = {} # hash of all links in this sentence + + + + chain_start_hash = {} + + + #rows = [] + #for tuple in r_c_matrix: + # rows.append(" ".join(tuple)) + # + #on.common.util.pretty_print_table(rows) + #print + + + for r_i, r in enumerate(r_c_matrix): + + assert self.part_number == r[0], "all rows should contain the same part number" + self.words.append(r[1]) + + # process encoded chain + encoded_chain=r[2] + bits = encoded_chain.split("|") + + + ids = [] + for i in range(0, len(bits)): + id = bits[i].replace("(", "") + id = id.replace(")", "") + ids.append(id) + + assert len(ids) == len(bits), "the length of ids and bits should be the same" + + + + for i in range(0, len(bits)): + if(bits[i].startswith("(")): + if(not chain_start_hash.has_key(ids[i])): + chain_start_hash[ids[i]] = [] + chain_start_hash[ids[i]].append(r_i) + + if(bits[i].endswith(")")): + + if(not chain_start_hash.has_key(ids[i])): + print chain_start_hash + raise Exception("problem, found link end without a start") + + + + try: + a_link = link(ids[i], chain_start_hash[ids[i]].pop(), r_i) + + + + self.links.append(a_link) + + if(not self.chain_hash.has_key(ids[i])): + self.chain_hash[ids[i]] = [] + + self.chain_hash[ids[i]].append(a_link) + except: + sys.stderr.write("WARNING: dropping link with id [%s]" % (ids[i])) + + + + + + for k, v in chain_start_hash.iteritems(): + if( len(v) != 0): + raise Exception("all the lists in the start hash should be empty") + + + self.links.sort(link_compare) + + + + + + + + + + + def __repr__(self): + + coref_tagged_words = [] + coref_tagged_words.extend(self.words) + + # make words sgml safe + for i in range(0, len(coref_tagged_words)): + coref_tagged_words[i] = make_sgml_safe(coref_tagged_words[i]) + + + for a_link in self.links: + coref_tagged_words[a_link.start] = """%s""" % (a_link.id, coref_tagged_words[a_link.start]) + coref_tagged_words[a_link.end] = "%s" % (coref_tagged_words[a_link.end]) + + + return "%s" % (" ".join(coref_tagged_words)) + + + + + +def expand_document_id(document_id, language): + + if language == "english": + abbr_language = "en" + elif language == "chinese": + abbr_language = "ch" + elif language == "arabic": + abbr_language = "ar" + + + file_bit=document_id[-4:] + genre_bit, source_bit, ignore = document_id.split("/", 2) + constant="%s@on" % (abbr_language) + return "%s@%s@%s@%s@%s" % (document_id, file_bit, source_bit, genre_bit, constant) + + + + + + + + + +def main(): + # total number of expected actual arguments, not counting the command itself + required_number_of_args = 1 + + o_parser = optparse.OptionParser() + o_parser.set_defaults(DEBUG=False) + o_parser.set_defaults(LANGUAGE=None) + + 
o_parser.add_option("-d", "--debug", action="store_true", dest="DEBUG", help="Set debug mode on") + o_parser.add_option("-l", "--language", action="store", dest="LANGUAGE", help="Set language") + o_parser.add_option("-o", "--option", help="perform the specified task. this can be 'pre-process' or 'decode'") + + if(required_number_of_args > 0): + c_config, o_options, o_args = load_options(parser=o_parser) + + if(len(o_args) != required_number_of_args): + error("please specify %s arguments" % (required_number_of_args)) + else: + c_config, o_options = load_options(parser=o_parser, positional_args=False) + + + + legal_options = [] + if(legal_options != [] + and + o_options.option not in legal_options): + error("please specify one of %s options" % (" ".join(legal_options))) + + + + if o_options.LANGUAGE is None: + error("please specify language using -l option") + + + r_c_matrix = [] + sentence_index = 0 + if(required_number_of_args > 0): + file = open(o_args[0]) + file_line = file.readline() + while( file_line != "" ): + #---- start processing here ----# + file_line = file_line.strip() + + if(file_line.startswith("#begin")): + bits = file_line.split() + document_id=bits[2].replace("(", "").replace(");","") + part_number=bits[-1] + + if(part_number != "000"): + print '\n' % (part_number) + else: + print '\n' % (expand_document_id(document_id, o_options.LANGUAGE), part_number) + + elif(file_line.startswith("#end")): + pass + + elif(file_line == ""): + sentence_index = sentence_index + 1 + #print "sentence:", sentence_index + a_coreference_tagged_sentence = coreference_tagged_sentence(r_c_matrix) + print a_coreference_tagged_sentence + r_c_matrix = [] + + else: + columns = file_line.split() + (part_number, word, encoded_chain) = columns[PART_COLUMN], columns[WORD_COLUMN], columns[-1] + if o_options.LANGUAGE == "arabic": + r_c_matrix.append([part_number, re.sub("#.*", "", word), encoded_chain]) + else: + r_c_matrix.append([part_number, word, encoded_chain]) + + + file_line = file.readline() + + print "\n" + + #---- close the file ----# + if(file != sys.stdin): + file.close() + +if __name__ == '__main__': + main() + diff --git a/neuralcoref/conll_processing_scripts/conll2coreference.sh b/neuralcoref/conll_processing_scripts/conll2coreference.sh new file mode 100755 index 0000000..a58fc44 --- /dev/null +++ b/neuralcoref/conll_processing_scripts/conll2coreference.sh @@ -0,0 +1,181 @@ +#!/bin/bash + +function usage { +cat < + + +Description: +----------- + +Takes a *conll file as input and prints out the corresponding coreference file + +---------------------------------------------------------------------------------------------------- + + + + +EOF +exit; +} + + +function message +{ + echo "----------------------------------------------------------------------------------------------------" + echo + echo $* 1>&2 + echo + echo "----------------------------------------------------------------------------------------------------" + +} + + + +function r { echo ${1%.*}; } +function t { echo ${1##*/}; } +function e { echo $(t ${1##*.}); } +function h { echo ${1%/*}; } + +# define helper function: run a command and print its exit code +function erun () { + debug=0 + if [[ $1 == "-d" ]]; then + debug=1 + shift; + fi + + + if [[ $DEBUG -eq 1 ]]; then + debug=1 + fi + + + + + verbose=0 + if [[ $1 == "-v" ]]; then + verbose=1 + shift; + fi + + + if [[ $VERBOSE -eq 1 ]]; then + verbose=1 + fi + + + + + + + if [[ $debug -eq 1 ]]; then + echo "debug mode ..." + echo "eval $1" + else + echo "normal mode ..." 
+ if [[ $verbose -eq 1 ]]; then + echo -e "\nrun: $1\n-------------" + fi + + eval $1 + fi + + + local code=$? + if [ $code -ne 0 ]; then + echo "Exit code: $code" + exit $code + fi +} + + + + +# handle the valid command line options +DEBUG=0 +VERBOSE=0 +DEBUG_OPTION="" +while getopts vdh opt +do + case "$opt" in + v) + VERBOSE=1;; + + d) + DEBUG=1;; + + \?) + usage + exit 1;; + + h) + usage + exit 0;; + + :) + echo "option -$OPTARG requires an argument" + usage + exit 1;; + + esac +done +shift `expr $OPTIND - 1` + + +# at this point $* contains the arguments after interpreting the options + +d=$1 + +# if no arguments are specified, then just print usage +if [[ $# -eq 0 ]]; then + usage +fi + + +# debugging +if [[ $DEBUG -eq 1 ]]; then + echo "debugging mode is on ..." 1>&2 + DEBUG_OPTION="-d" +fi + + + + + +for file in $(find $d -name "*_conll"); do + + if [[ $file =~ "data/english/annotations" ]]; then + LANGUAGE=english + elif [[ $file =~ "data/chinese/annotations" ]]; then + LANGUAGE=chinese + else + LANGUAGE=arabic + fi + + echo "language: $LANGUAGE" + coref=${file/_conll/_coref} + echo "$file -> $coref ..." + erun -v "python conll2coreference.py -l $LANGUAGE $file > $coref" +# conll2coreference.py -l $LANGUAGE $file > $coref +done + + + + + + + + +# complain if the exit status of the last command executed is non-zero +if [[ $? != 0 ]]; then echo "the last command exited with a non-zero status" 1>&2; fi + + + diff --git a/neuralcoref/conll_processing_scripts/conll2name.py b/neuralcoref/conll_processing_scripts/conll2name.py new file mode 100755 index 0000000..92b9bef --- /dev/null +++ b/neuralcoref/conll_processing_scripts/conll2name.py @@ -0,0 +1,505 @@ +#!/usr/bin/env python + +#---- standard library imports ----# +import sys + +# version check +if(not (sys.hexversion >= 0x2050000)): + sys.exit("\n\nplease use python version >= 2.5\n\n") + +import string +import re +import math +import os +import time +import getopt +import zlib +import gzip +import codecs +import optparse +import collections +import ConfigParser +from optparse import OptionParser + + + + +WORD_COLUMN=3 +NAME_COLUMN=10 + + + + + + + +def error(error_string, terminate_program=True, current_frame=False): + """Print error messages to stderr, optionally sys.exit""" + + if(current_frame == False): + pretty_error_string = """ + +-------------------------------------------------------------------------------- + ERROR +-------------------------------------------------------------------------------- +%s +-------------------------------------------------------------------------------- + +""" % (error_string) + else: + pretty_error_string = """ + +-------------------------------------------------------------------------------- + ERROR +-------------------------------------------------------------------------------- +FILE: %s +LINE: %s +-------------------------------------------------------------------------------- +%s +-------------------------------------------------------------------------------- + +""" % (current_frame.f_code.co_filename, current_frame.f_lineno, error_string) + + sys.stderr.write(pretty_error_string) + + if(terminate_program == True): + sys.exit(1) + + + + + + + + + + +def parse_cfg_args(arg_list): + """Parse command-line style config settings to a dictionary. + + If you want to override configuration file values on the command + line or set ones that were not set, this should make it simpler. + Given a list in format [section.key=value, ...] return a + dictionary in form { (section, key): value, ...}. 
+ + So we might have: + + .. code-block:: python + + ['corpus.load=english-mz', + 'corpus.data_in=/home/user/corpora/ontonotes/data/'] + + we would then return the dictionary: + + .. code-block:: python + + { ('corpus', 'load') : 'english-mz', + ('corpus', 'data_in') : '/home/user/corpora/ontonotes/data/' } + + See also :func:`load_config` and :func:`load_options` + + """ + + if not arg_list: + return {} + + config_append = {} + + for arg in arg_list: + if len(arg.split("=")) != 2 or len(arg.split("=")[0].split('.')) != 2: + raise Exception("Invalid argument; not in form section.key=value : " + arg) + + key, value = arg.split("=") + config_append[tuple(key.split("."))] = value + + return config_append + + + + + + + + +__registered_config_options = collections.defaultdict( dict ) + +def required_config_sections(): + return [section for section in __registered_config_options if + [True for value in __registered_config_options[section] + if __registered_config_options[section][value][3]]] # section_required + + + + + + + + + + + + +def load_options(parser=None, argv=[], positional_args=True): + """ parses sys.argv, possibly exiting if there are mistakes + + If you set parser to a ConfigParser object, then you have control + over the usage string and you can prepopulate it with options you + intend to use. But don't set a ``--config`` / ``-c`` option; + load_options uses that to find a configuration file to load + + If a parser was passed in, we return ``(config, parser, [args])``. + Otherwise we return ``(config, [args])``. Args is only included + if ``positional_args`` is True and there are positional arguments + + See :func:`load_config` for details on the ``--config`` option. + + """ + + def is_config_appender(arg): + return "." in arg and "=" in arg and arg.find(".") < arg.find("=") + + parser_passed_in=parser + if not parser: + parser = OptionParser() + + parser.add_option("-c", "--config", help="the path to a config file to read options from") + + if argv: + options, args = parser.parse_args(argv) + else: + options, args = parser.parse_args() + + config = load_config(options.config, [a for a in args if is_config_appender(a)]) + + other_args = [a for a in args if not is_config_appender(a)] + + return_list = [config] + if parser_passed_in: + return_list.append(options) + if other_args: + if positional_args: + return_list.append(other_args) + else: + raise Exception("Arguments %s not understood" % other_args) + else: + if positional_args: + raise Exception("This program expects one or more positional arguments that are missing") + + if len(return_list) == 1: + return return_list[0] + else: + return tuple(return_list) + + + + + + + + + +class FancyConfigParserError(Exception): + """ raised by :class:`FancyConfigParser` when used improperly """ + + def __init__(self, vals): + Exception.__init__(self, 'Config usage must be in the form "config[\'section\', \'item\']". ' + 'Given something more like "config[%s]".' % (", ".join("%r"%v for v in vals))) + + + + + + + + + + +class FancyConfigParser(ConfigParser.SafeConfigParser): + """ make a config parser with support for config[section, value] + + raises :class:`FancyConfigParserError` on improper usage. 
+ + """ + + def __getitem__(self, vals): + try: + section, item = vals + except (ValueError, TypeError): + raise FancyConfigParserError(vals) + return self.get(section, item) + + + def __setitem__(self, vals, value): + try: + section, item = vals + except (ValueError, TypeError): + raise FancyConfigParserError(vals) + return self.set(section, item, value) + + def __delitem__(self, vals): + try: + section, item = vals + except (ValueError, TypeError): + raise FancyConfigParserError(vals) + + self.remove_option(section, item) + + + + + + + + + + + +def load_config(cfg_name=None, config_append=[]): + """ Load a configuration file to memory. + + The given configuration file name can be a full path, in which + case we simply read that configuration file. Otherwise, if you + give 'myconfig' or something similar, we look in the current + directory and the home directory. We also look to see if files + with this name and extension '.conf' exist. So for 'myconfig' we + would look in the following places: + + * ./myconfig + * ./myconfig.conf + * [home]/.myconfig + * [home]/.myconfig.conf + + Once we find the configuration, we load it. We also extend + ConfigParser to support ``[]`` notation. So you could look up key + ``k`` in section ``s`` with ``config[s,k]``. See + :func:`FancyConfigParser` . + + If config_append is set we use :func:`parse_cfg_args` and add any + values it creates to the config object. These values override any + previous ones. + + """ + + config = FancyConfigParser() + + if cfg_name: + config_locs = [cfg_name + '.conf', + os.path.expanduser('~/.' + cfg_name + '.conf'), + cfg_name, + os.path.expanduser('~/.' + cfg_name)] + l = config.read(config_locs) + if not l: + raise Exception("Couldn't find config file. Looked in:" + + "".join(["\n - " + c for c in config_locs]) + + "\nto no avail.") + + + for (section, key_name), value in parse_cfg_args(config_append).iteritems(): + if not config.has_section(section): + config.add_section(section) + config.set(section, key_name, value) + + problems = [] + for section in config.sections(): + if not is_config_section_registered(section): + on.common.log.status("Ignoring unknown configuration section", section) + continue + for option in config.options(section): + if not is_config_registered(section, option): + problems.append("Unknown configuration variable %s.%s" % (section, option)) + continue + + value = config.get(section, option) + allowed = allowed_config_values(section, option) + multiple = allow_multiple_config_values(section, option) + + values = value.split() if multiple else [value] + for value in values: + if allowed and not value in allowed: + problems.append("Illegal value '%s' for configuration variable %s.%s. 
Permitted values are: %s" % + (value, section, option, ", ".join(["'%s'" % x for x in allowed]))) + + for option in required_config_options(section): + if not config.has_option(section, option): + problems.append("Required configuration variable %s.%s is absent" % (section, option)) + + for section in required_config_sections(): + if not config.has_section(section): + problems.append("Required configuration section %s is absent" % section) + + if problems: + print_config_docs() + + on.common.log.status("Configuration Problems:") + for problem in problems: + on.common.log.status(" " + problem) + + sys.exit(-1) + + return config + + + + + + + +def make_sgml_safe(s, reverse=False, keep_turn=True): + """ return a version of the string that can be put in an sgml document + + This means changing angle brackets and ampersands to '-LAB-', + '-RAB-', and '-AMP-'. Needed for creating ``.name`` and + ``.coref`` files. + + If keep_turn is set, in the input is turned into [TURN], not turned into -LAB-TURN-RAB- + + """ + + if not reverse and keep_turn: + s = s.replace("", "[TURN]") + + for f, r in [("<", "-LAB-"), + (">", "-RAB-"), + ("&", "-AMP-")]: + if reverse: + r, f = f, r + s = s.replace(f, r) + + return s + + + + +class name_tagged_sentence: + + def __init__(self, r_c_matrix): + self.words = [] + + for r_i, r in enumerate(r_c_matrix): + + self.words.append(make_sgml_safe(r[0])) + + # process encoded chain + encoded_name=r[1] + + if(encoded_name != "*" and encoded_name != "*)"): + name_type = encoded_name.replace("*", "", ).replace("(", "").replace(")", "") + + if(encoded_name.startswith("(")): + self.words[r_i] = '%s' % (name_type, self.words[r_i]) + + if(encoded_name.endswith(")")): + self.words[r_i] = '%s' % (self.words[r_i]) + + + + + + def __repr__(self): + return "%s" % (" ".join(self.words)) + + + + + + + + +def expand_document_id(document_id, language): + + if language == "english": + abbr_language = "en" + elif language == "chinese": + abbr_language = "ch" + elif language == "arabic": + abbr_language = "ar" + + file_bit=document_id[-4:] + genre_bit, source_bit, ignore = document_id.split("/", 2) + constant="%s@on" % (abbr_language) + return "%s@%s@%s@%s@%s" % (document_id, file_bit, source_bit, genre_bit, constant) + + + + + + + + + +def main(): + # total number of expected actual arguments, not counting the command itself + required_number_of_args = 1 + + o_parser = optparse.OptionParser() + o_parser.set_defaults(DEBUG=False) + o_parser.set_defaults(LANGUAGE=None) + + o_parser.add_option("-d", "--debug", action="store_true", dest="DEBUG", help="Set debug mode on") + o_parser.add_option("-l", "--language", action="store", dest="LANGUAGE", help="Set language") + o_parser.add_option("-o", "--option", help="perform the specified task. 
this can be 'pre-process' or 'decode'") + + if(required_number_of_args > 0): + c_config, o_options, o_args = load_options(parser=o_parser) + if(len(o_args) != required_number_of_args): + error("please specify %s arguments" % (required_number_of_args)) + else: + c_config, o_options = load_options(parser=o_parser, positional_args=False) + + + + legal_options = [] + if(legal_options != [] + and + o_options.option not in legal_options): + error("please specify one of %s options" % (" ".join(legal_options))) + + + if o_options.LANGUAGE is None: + error("please specify language using -l option") + + + r_c_matrix = [] + if(required_number_of_args > 0): + file = open(o_args[0]) + file_line = file.readline() + + first_begin=True + while( file_line != "" ): + #---- start processing here ----# + file_line = file_line.strip() + + if(file_line.startswith("#begin")): + bits = file_line.split() + document_id=bits[2].replace("(", "").replace(");","") + if(first_begin == True): + print '''''' % (expand_document_id(document_id, o_options.LANGUAGE)) + first_begin = False + + elif(file_line.startswith("#end")): + pass + + elif(file_line == ""): + a_name_tagged_sentence = name_tagged_sentence(r_c_matrix) + print a_name_tagged_sentence + r_c_matrix = [] + + else: + columns = file_line.split() + (word, encoded_name) = columns[WORD_COLUMN], columns[NAME_COLUMN] + + if o_options.LANGUAGE == "arabic": + r_c_matrix.append([re.sub("#.*", "", word), encoded_name]) + else: + r_c_matrix.append([word, encoded_name]) + + file_line = file.readline() + print "" + + #---- close the file ----# + if(file != sys.stdin): + file.close() + +if __name__ == '__main__': + main() + diff --git a/neuralcoref/conll_processing_scripts/conll2name.sh b/neuralcoref/conll_processing_scripts/conll2name.sh new file mode 100755 index 0000000..64d0b9f --- /dev/null +++ b/neuralcoref/conll_processing_scripts/conll2name.sh @@ -0,0 +1,180 @@ +#!/bin/bash + +function usage { +cat < + + +Description: +----------- + +Takes a *conll file as input and prints out the corresponding coreference file + +---------------------------------------------------------------------------------------------------- + + + + +EOF +exit; +} + + +function message +{ + echo "----------------------------------------------------------------------------------------------------" + echo + echo $* 1>&2 + echo + echo "----------------------------------------------------------------------------------------------------" + +} + + + +function r { echo ${1%.*}; } +function t { echo ${1##*/}; } +function e { echo $(t ${1##*.}); } +function h { echo ${1%/*}; } + +# define helper function: run a command and print its exit code +function erun () { + debug=0 + if [[ $1 == "-d" ]]; then + debug=1 + shift; + fi + + + if [[ $DEBUG -eq 1 ]]; then + debug=1 + fi + + + + + verbose=0 + if [[ $1 == "-v" ]]; then + verbose=1 + shift; + fi + + + if [[ $VERBOSE -eq 1 ]]; then + verbose=1 + fi + + + + + + + if [[ $debug -eq 1 ]]; then + echo "debug mode ..." + echo "eval $1" + else + echo "normal mode ..." + if [[ $verbose -eq 1 ]]; then + echo -e "\nrun: $1\n-------------" + fi + + eval $1 + fi + + + local code=$? + if [ $code -ne 0 ]; then + echo "Exit code: $code" + exit $code + fi +} + + + + +# handle the valid command line options +DEBUG=0 +VERBOSE=0 +DEBUG_OPTION="" +while getopts vdh opt +do + case "$opt" in + v) + VERBOSE=1;; + + d) + DEBUG=1;; + + \?) 
+ usage + exit 1;; + + h) + usage + exit 0;; + + :) + echo "option -$OPTARG requires an argument" + usage + exit 1;; + + esac +done +shift `expr $OPTIND - 1` + + +# at this point $* contains the arguments after interpreting the options + +d=$1 + +# if no arguments are specified, then just print usage +if [[ $# -eq 0 ]]; then + usage +fi + + +# debugging +if [[ $DEBUG -eq 1 ]]; then + echo "debugging mode is on ..." 1>&2 + DEBUG_OPTION="-d" +fi + + + + +for file in $(find $d -name "*_conll"); do + + if [[ $file =~ "data/english/annotations" ]]; then + LANGUAGE=english + elif [[ $file =~ "data/chinese/annotations" ]]; then + LANGUAGE=chinese + else + LANGUAGE=arabic + fi + + echo "language: $LANGUAGE" + + name=${file/_conll/_name} + echo "$file -> $name ..." + erun -v "python conll2name.py -l $LANGUAGE $file > $name" +# conll2name.py -l $LANGUAGE $file > $name +done + + + + + + + +# complain if the exit status of the last command executed is non-zero +if [[ $? != 0 ]]; then echo "the last command exited with a non-zero status" 1>&2; fi + + + diff --git a/neuralcoref/conll_processing_scripts/conll2parse.py b/neuralcoref/conll_processing_scripts/conll2parse.py new file mode 100755 index 0000000..3d51b17 --- /dev/null +++ b/neuralcoref/conll_processing_scripts/conll2parse.py @@ -0,0 +1,543 @@ +#!/usr/bin/env python + +#---- standard library imports ----# +import sys + +# version check +if(not (sys.hexversion >= 0x2050000)): + sys.exit("\n\nplease use python version >= 2.5\n\n") + +import string +import re +import math +import os +import time +import getopt +import zlib +import gzip +import codecs +import optparse +import collections +import ConfigParser +from optparse import OptionParser + + + + + + + + + + + +def error(error_string, terminate_program=True, current_frame=False): + """Print error messages to stderr, optionally sys.exit""" + + if(current_frame == False): + pretty_error_string = """ + +-------------------------------------------------------------------------------- + ERROR +-------------------------------------------------------------------------------- +%s +-------------------------------------------------------------------------------- + +""" % (error_string) + else: + pretty_error_string = """ + +-------------------------------------------------------------------------------- + ERROR +-------------------------------------------------------------------------------- +FILE: %s +LINE: %s +-------------------------------------------------------------------------------- +%s +-------------------------------------------------------------------------------- + +""" % (current_frame.f_code.co_filename, current_frame.f_lineno, error_string) + + sys.stderr.write(pretty_error_string) + + if(terminate_program == True): + sys.exit(1) + + + + + + + + + + +def parse_cfg_args(arg_list): + """Parse command-line style config settings to a dictionary. + + If you want to override configuration file values on the command + line or set ones that were not set, this should make it simpler. + Given a list in format [section.key=value, ...] return a + dictionary in form { (section, key): value, ...}. + + So we might have: + + .. code-block:: python + + ['corpus.load=english-mz', + 'corpus.data_in=/home/user/corpora/ontonotes/data/'] + + we would then return the dictionary: + + .. 
code-block:: python + + { ('corpus', 'load') : 'english-mz', + ('corpus', 'data_in') : '/home/user/corpora/ontonotes/data/' } + + See also :func:`load_config` and :func:`load_options` + + """ + + if not arg_list: + return {} + + config_append = {} + + for arg in arg_list: + if len(arg.split("=")) != 2 or len(arg.split("=")[0].split('.')) != 2: + raise Exception("Invalid argument; not in form section.key=value : " + arg) + + key, value = arg.split("=") + config_append[tuple(key.split("."))] = value + + return config_append + + + + + + + + + + + + + + + + + + + +__registered_config_options = collections.defaultdict( dict ) + +def required_config_sections(): + return [section for section in __registered_config_options if + [True for value in __registered_config_options[section] + if __registered_config_options[section][value][3]]] # section_required + + + + + + + + + + + + +def load_options(parser=None, argv=[], positional_args=True): + """ parses sys.argv, possibly exiting if there are mistakes + + If you set parser to a ConfigParser object, then you have control + over the usage string and you can prepopulate it with options you + intend to use. But don't set a ``--config`` / ``-c`` option; + load_options uses that to find a configuration file to load + + If a parser was passed in, we return ``(config, parser, [args])``. + Otherwise we return ``(config, [args])``. Args is only included + if ``positional_args`` is True and there are positional arguments + + See :func:`load_config` for details on the ``--config`` option. + + """ + + def is_config_appender(arg): + return "." in arg and "=" in arg and arg.find(".") < arg.find("=") + + parser_passed_in=parser + if not parser: + parser = OptionParser() + + parser.add_option("-c", "--config", help="the path to a config file to read options from") + + if argv: + options, args = parser.parse_args(argv) + else: + options, args = parser.parse_args() + + config = load_config(options.config, [a for a in args if is_config_appender(a)]) + + other_args = [a for a in args if not is_config_appender(a)] + + return_list = [config] + if parser_passed_in: + return_list.append(options) + if other_args: + if positional_args: + return_list.append(other_args) + else: + raise Exception("Arguments %s not understood" % other_args) + else: + if positional_args: + raise Exception("This program expects one or more positional arguments that are missing") + + if len(return_list) == 1: + return return_list[0] + else: + return tuple(return_list) + + + + + + + + + + + + + + + +class FancyConfigParserError(Exception): + """ raised by :class:`FancyConfigParser` when used improperly """ + + def __init__(self, vals): + Exception.__init__(self, 'Config usage must be in the form "config[\'section\', \'item\']". ' + 'Given something more like "config[%s]".' % (", ".join("%r"%v for v in vals))) + + + + + + + + + + + +class FancyConfigParser(ConfigParser.SafeConfigParser): + """ make a config parser with support for config[section, value] + + raises :class:`FancyConfigParserError` on improper usage. 
+ + """ + + def __getitem__(self, vals): + try: + section, item = vals + except (ValueError, TypeError): + raise FancyConfigParserError(vals) + return self.get(section, item) + + + def __setitem__(self, vals, value): + try: + section, item = vals + except (ValueError, TypeError): + raise FancyConfigParserError(vals) + return self.set(section, item, value) + + def __delitem__(self, vals): + try: + section, item = vals + except (ValueError, TypeError): + raise FancyConfigParserError(vals) + + self.remove_option(section, item) + + + + + + + + + + +def load_config(cfg_name=None, config_append=[]): + """ Load a configuration file to memory. + + The given configuration file name can be a full path, in which + case we simply read that configuration file. Otherwise, if you + give 'myconfig' or something similar, we look in the current + directory and the home directory. We also look to see if files + with this name and extension '.conf' exist. So for 'myconfig' we + would look in the following places: + + * ./myconfig + * ./myconfig.conf + * [home]/.myconfig + * [home]/.myconfig.conf + + Once we find the configuration, we load it. We also extend + ConfigParser to support ``[]`` notation. So you could look up key + ``k`` in section ``s`` with ``config[s,k]``. See + :func:`FancyConfigParser` . + + If config_append is set we use :func:`parse_cfg_args` and add any + values it creates to the config object. These values override any + previous ones. + + """ + + config = FancyConfigParser() + + if cfg_name: + config_locs = [cfg_name + '.conf', + os.path.expanduser('~/.' + cfg_name + '.conf'), + cfg_name, + os.path.expanduser('~/.' + cfg_name)] + l = config.read(config_locs) + if not l: + raise Exception("Couldn't find config file. Looked in:" + + "".join(["\n - " + c for c in config_locs]) + + "\nto no avail.") + + + for (section, key_name), value in parse_cfg_args(config_append).iteritems(): + if not config.has_section(section): + config.add_section(section) + config.set(section, key_name, value) + + problems = [] + for section in config.sections(): + if not is_config_section_registered(section): + on.common.log.status("Ignoring unknown configuration section", section) + continue + for option in config.options(section): + if not is_config_registered(section, option): + problems.append("Unknown configuration variable %s.%s" % (section, option)) + continue + + value = config.get(section, option) + allowed = allowed_config_values(section, option) + multiple = allow_multiple_config_values(section, option) + + values = value.split() if multiple else [value] + for value in values: + if allowed and not value in allowed: + problems.append("Illegal value '%s' for configuration variable %s.%s. 
Permitted values are: %s" % + (value, section, option, ", ".join(["'%s'" % x for x in allowed]))) + + for option in required_config_options(section): + if not config.has_option(section, option): + problems.append("Required configuration variable %s.%s is absent" % (section, option)) + + for section in required_config_sections(): + if not config.has_section(section): + problems.append("Required configuration section %s is absent" % section) + + if problems: + print_config_docs() + + on.common.log.status("Configuration Problems:") + for problem in problems: + on.common.log.status(" " + problem) + + sys.exit(-1) + + return config + + + + + + + + + + + + + + + + + + + + + + + + +def pretty_print_parse_string(a_parse_string, offset=''): + + if not a_parse_string.strip(): + return "" + + # Maximum depth we're prepared for in parses + maxdepth=100 + maxindent=300 + + # Table of indentation at parse depth + depth_to_indent = [0 for i in xrange(maxdepth)] + + # Initialize indent_string[i] to be a string of i spaces + indent_string = ['' for i in xrange(maxindent)] + for i in xrange(maxindent-1): + indent_string[i+1] = indent_string[i] + ' ' + + # RE object for split that matches on a ')' followed by not a ')', but only consumes the ')' + close_paren = re.compile(r'\)(?=\s*[^\s\)])') + + # RE object to pick up first on this line(should be only) POS tag and the word of each lexical leaf of the parse + lexical_leaf = re.compile(r'\((?P[^\s\)\(]+)\s+(?P[^\s\)\(]+)\)') + + # RE object to parse OntoNotes Normal Form parse lines: + a_parse = a_parse_string + + pp_parse = "" + + def parseindent(depth): + return indent_string[depth_to_indent[depth]]+offset #Indent to appropriate point + + + current_depth = 0 + for frag in close_paren.split(a_parse): #Split input into lines ending with a lexical item + if frag[-1]!= '\n': + frag=frag+')' + else: frag=frag[0:-1] + + #Indent to appropriate point + pp_parse += parseindent(current_depth) + + pfrag = "" + for pfrag in (frag).split('(')[1:]: # Split line into segments each beginning with an '(' + pfrag='('+pfrag # Restore deleted initial '(' + pp_parse += pfrag # Print each + current_depth=current_depth+1 # Up the current depth count + + # Remember how far to indent following items at this depth + depth_to_indent[current_depth]=depth_to_indent[current_depth-1]+len(pfrag) + + current_depth=current_depth-pfrag.count(')') # Correct depth given closing parens + if current_depth<=0: + pp_parse += '' # Separate toplevel parses with blank lines + + pp_parse += '\n' # Print CRLF + + + return re.sub("\)$", "", pp_parse) + + + + + + + + + +class parsed_sentence: + + def __init__(self, r_c_matrix): + self.words = [] + self.parse_bits = [] + + for r_i, r in enumerate(r_c_matrix): + + word = r[0] + self.words.append(word) + + # process encoded chain + part_of_speech=r[1] + encoded_parse=r[2] + + self.parse_bits.append(encoded_parse.replace("*", "(%s %s)" % (part_of_speech, word))) + + + + def __repr__(self): + return pretty_print_parse_string("%s" % ("".join(self.parse_bits).replace("(", " (").strip())) + + + + + + + + + + + +def main(): + # total number of expected actual arguments, not counting the command itself + required_number_of_args = 1 + + o_parser = optparse.OptionParser() + o_parser.set_defaults(DEBUG=False) + o_parser.set_defaults(LANGUAGE=None) + + o_parser.add_option("-d", "--debug", action="store_true", dest="DEBUG", help="Set debug mode on") + o_parser.add_option("-l", "--language", action="store", dest="LANGUAGE", help="Set language") + 
o_parser.add_option("-o", "--option", help="perform the specified task. this can be 'pre-process' or 'decode'") + + if(required_number_of_args > 0): + c_config, o_options, o_args = load_options(parser=o_parser) + if(len(o_args) != required_number_of_args): + error("please specify %s arguments" % (required_number_of_args)) + else: + c_config, o_options = load_options(parser=o_parser, positional_args=False) + + + + legal_options = [] + if(legal_options != [] + and + o_options.option not in legal_options): + error("please specify one of %s options" % (" ".join(legal_options))) + + + if o_options.LANGUAGE is None: + error("please specify language using -l option") + + + r_c_matrix = [] + if(required_number_of_args > 0): + file = open(o_args[0]) + file_line = file.readline() + while( file_line != "" ): + #---- start processing here ----# + file_line = file_line.strip() + + if(file_line.startswith("#begin") or file_line.startswith("#end")): + pass + + elif(file_line == ""): + a_parsed_sentence = parsed_sentence(r_c_matrix) + print a_parsed_sentence + r_c_matrix = [] + + else: + columns = file_line.split() + (word, part_of_speech, encoded_parse) = columns[3], columns[4], columns[5] + + if o_options.LANGUAGE == "arabic": + r_c_matrix.append([re.sub("#.*", "", word), part_of_speech, encoded_parse]) + else: + r_c_matrix.append([word, part_of_speech, encoded_parse]) + + file_line = file.readline() + + #---- close the file ----# + if(file != sys.stdin): + file.close() + +if __name__ == '__main__': + main() + diff --git a/neuralcoref/conll_processing_scripts/conll2parse.sh b/neuralcoref/conll_processing_scripts/conll2parse.sh new file mode 100755 index 0000000..cc961ab --- /dev/null +++ b/neuralcoref/conll_processing_scripts/conll2parse.sh @@ -0,0 +1,187 @@ +#!/bin/bash + +function usage { +cat < + + +Description: +----------- + +Takes a *conll file as input and prints out the corresponding parse file + +---------------------------------------------------------------------------------------------------- + + + + +EOF +exit; +} + + +function message +{ + echo "----------------------------------------------------------------------------------------------------" + echo + echo $* 1>&2 + echo + echo "----------------------------------------------------------------------------------------------------" + +} + + + +function r { echo ${1%.*}; } +function t { echo ${1##*/}; } +function e { echo $(t ${1##*.}); } +function h { echo ${1%/*}; } + +# define helper function: run a command and print its exit code +function erun () { + debug=0 + if [[ $1 == "-d" ]]; then + debug=1 + shift; + fi + + + if [[ $DEBUG -eq 1 ]]; then + debug=1 + fi + + + + + verbose=0 + if [[ $1 == "-v" ]]; then + verbose=1 + shift; + fi + + + if [[ $VERBOSE -eq 1 ]]; then + verbose=1 + fi + + + + + + + if [[ $debug -eq 1 ]]; then + echo "debug mode ..." + echo "eval $1" + else + echo "normal mode ..." + if [[ $verbose -eq 1 ]]; then + echo -e "\nrun: $1\n-------------" + fi + + eval $1 + fi + + + local code=$? + if [ $code -ne 0 ]; then + echo "Exit code: $code" + exit $code + fi +} + + + + +# handle the valid command line options +DEBUG=0 +VERBOSE=0 +DEBUG_OPTION="" +while getopts vdh opt +do + case "$opt" in + v) + VERBOSE=1;; + + d) + DEBUG=1;; + + \?) 
+ usage + exit 1;; + + h) + usage + exit 0;; + + :) + echo "option -$OPTARG requires an argument" + usage + exit 1;; + + esac +done +shift `expr $OPTIND - 1` + + +# at this point $* contains the arguments after interpreting the options + +d=$1 + +# if no arguments are specified, then just print usage +if [[ $# -eq 0 ]]; then + usage +fi + + + +# debugging +if [[ $DEBUG -eq 1 ]]; then + echo "debugging mode is on ..." 1>&2 + DEBUG_OPTION="-d" +fi + + + + + +for file in $(find $d -name "*_conll"); do + + if [[ $file =~ "data/english/annotations" ]]; then + LANGUAGE=english + elif [[ $file =~ "data/chinese/annotations" ]]; then + LANGUAGE=chinese + else + LANGUAGE=arabic + fi + + + echo "language: $LANGUAGE" + + parse=${file/_conll/_parse} + echo "$file -> $parse ..." + erun -v "python conll2parse.py -l $LANGUAGE $file > $parse" +# conll2parse.py -l $LANGUAGE $file > $parse +done + + + + + + + + + + + +# complain if the exit status of the last command executed is non-zero +if [[ $? != 0 ]]; then echo "the last command exited with a non-zero status" 1>&2; fi + + + diff --git a/neuralcoref/conll_processing_scripts/skeleton2conll.py b/neuralcoref/conll_processing_scripts/skeleton2conll.py new file mode 100755 index 0000000..2a2e7ab --- /dev/null +++ b/neuralcoref/conll_processing_scripts/skeleton2conll.py @@ -0,0 +1,1176 @@ +#!/usr/bin/env python + +""" +Get most current usage with: + + python skeleton2conll.py --help + +""" + +from __future__ import with_statement +import codecs +import sys +import os +import re +import string +from collections import defaultdict + + +WORD_COLUMN=3 +LEMMA_COLUMN=6 + + + + +MIN_VERBOSITY = 0 +MED_VERBOSITY = 5 +MAX_VERBOSITY = 10 +SUPER_VERBOSITY = 15 + +DEBUG = False +VERBOSITY = MAX_VERBOSITY + + + + + + + + +def debug(debug_object, debug_flag=DEBUG, verbosity=MAX_VERBOSITY, nl=False): + if((debug_flag == True) and (verbosity <= VERBOSITY)): + if nl: + trailing_char = "\n" + else: + trailing_char = "" + + sys.stderr.write(str(debug_object) + trailing_char) + + + + + + + + + + + + +def warning(warning_string, verbosity=0): + """ print warning string depending on the value of VERBOSITY """ + + if(verbosity <= VERBOSITY): + sys.stderr.write(u""" + +-------------------------------------------------------------------------------- + WARNING +-------------------------------------------------------------------------------- +%s +-------------------------------------------------------------------------------- + +""" % (warning_string)) + + + + + + + + +class abstract_open_type_table: + + def __init__(self, a_id, data_pointer=None): + self.id = a_id + self.type_hash[self.id] += 1 + + @classmethod + def write_to_db(cls, cursor): + for a_type in cls.type_hash.keys(): + insert_ignoring_dups(cls, cursor, a_type) + + @classmethod + def __repr__(cls): + return " ".join(cls.type_hash.keys()) + + @classmethod + def get_table(cls): + try: + return cls.sql_insert_statement.strip().split("\n")[0].split()[2] + except Exception: + return "unknown" + + + + + + + + +class lemma_type(abstract_open_type_table): + type_hash = defaultdict(int) + + sql_table_name = "lemma_type" + sql_create_statement = \ +""" +create table lemma_type +( + id varchar(255) not null collate utf8_bin primary key +) +default character set utf8; +""" + + + sql_insert_statement = \ +"""insert into lemma_type +( + id +) +values (%s) +""" + + + + + + + +class lemma: + """ arabic trees have extra lemma information """ + + def __init__(self, input_string, b_transliteration, comment, index, offset, unvocalized_string, 
+ vocalized_string, vocalized_input, pos, gloss, lemma, coarse_sense, leaf_id): + + self.input_string = input_string + self.b_transliteration = b_transliteration + self.comment = comment + self.index = index + self.offset = offset + self.unvocalized_string = unvocalized_string + self.vocalized_string = vocalized_string + self.vocalized_input = vocalized_input + self.pos = pos + self.gloss = gloss + self.lemma = lemma + self.coarse_sense = coarse_sense + self.leaf_id = leaf_id + + self.id = "%s@%s" % (self.lemma, self.leaf_id) + + sql_table_name = "lemma" + + def __repr__(self): + return "\n".join(["lemma instance:", + " input_string: " + self.input_string, + " vocalized_input: " + self.vocalized_input, + " unvocalized_string: " + self.unvocalized_string, + " vocalized_string: " + self.vocalized_string, + " gloss: " + self.gloss, + " index: %s" % self.index, + " offset: %s" % self.offset]) + + def __str__(self): + tr = ["INPUT STRING:%s" % self.input_string, + " IS_TRANS:%s" % self.b_transliteration, + " COMMENT:%s" % self.comment, + " INDEX:%s" % self.index, + " OFFSETS:%s" % self.offset, + " UNVOCALIZED:%s" % self.unvocalized_string, + " VOCALIZED:%s" % self.vocalized_string, + " VOC_STRING:%s" % self.vocalized_input, + " POS:%s" % self.pos, + " GLOSS:%s" % self.gloss] + + if self.lemma != "lemma_not_set": + if self.coarse_sense: + lemma_str = "%s_%s" % (self.lemma, self.coarse_sense) + else: + lemma_str = self.lemma + + tr.append(" LEMMA: [%s]" % lemma_str) + + return "\n".join(tr) + + + @staticmethod + def from_db(a_leaf_id, a_cursor): + a_cursor.execute("SELECT * FROM lemma WHERE leaf_id = '%s'" % a_leaf_id) + rows = a_cursor.fetchall() + + if not rows: + return None + + if len(rows) != 1: + assert all(row["lemma"] == rows[0]["lemma"] for row in rows), \ + "\n".join(", ".join(": ".join(a) for a in row.iteritems()) for row in rows) + + r = rows[0] + + return lemma(r["input_string"], + r["b_transliteration"], + r["comment"], + r["lemma_index"], + r["lemma_offset"], + r["unvocalized_string"], + r["vocalized_string"], + r["vocalized_input"], + r["pos"], + r["gloss"], + r["lemma"], + r["coarse_sense"], + r["leaf_id"]) + + # sql create statement for the syntactic_link table + sql_create_statement = \ +""" +create table lemma +( + id varchar(255) not null, + input_string varchar(255), + b_transliteration varchar(255), + comment varchar(255), + lemma_index varchar(255), + lemma_offset varchar(255), + unvocalized_string varchar(255), + vocalized_string varchar(255), + vocalized_input varchar(255), + pos varchar(255), + gloss varchar(255), + lemma varchar(255), + coarse_sense varchar(16), + leaf_id varchar(255), + foreign key (leaf_id) references tree.id +) +default character set utf8; +""" + + + # sql insert statement for the syntactic_link table + sql_insert_statement = \ +""" +insert into lemma +( + id, + input_string, + b_transliteration, + comment, + lemma_index, + lemma_offset, + unvocalized_string, + vocalized_string, + vocalized_input, + pos, + gloss, + lemma, + coarse_sense, + leaf_id +) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) +""" + + + def write_to_db(self, cursor): + data = [(self.id, self.input_string, self.b_transliteration, self.comment, self.index, + self.offset, self.unvocalized_string, self.vocalized_string, self.vocalized_input, + self.pos, self.gloss, self.lemma, self.coarse_sense, self.leaf_id)] + + cursor.executemany("%s" % (self.__class__.sql_insert_statement), data) + + + + + + + + + +def iterate_trees(string_seq): + """ + + given string_seq which 
is a sequence of strings, read from + string_seq and produce strings one at a time that represent trees. + + """ + + return [x for x in _iterate_trees_helper(string_seq) if x.strip()] + + + + + + + + + + +def _iterate_trees_helper(string_seq): + + parens = 0 + cur_parse = [] + + for s in string_seq: + if (s.startswith(";") or s.startswith("<") or s.startswith("*")) and s.endswith("\n"): + continue # ignore comments and sgml + + for c in s: + if c == "(" and parens == 0 and cur_parse: + yield "".join(cur_parse) + cur_parse = [] + + cur_parse.append(c) + + if c == "(": + parens += 1 + elif c == ")": + parens -= 1 + + if parens == 0: + yield "".join(cur_parse).strip() + cur_parse = [] + + if parens != 0: + raise Exception("Parens should have been zero at end, were %s" % parens) + if "".join(cur_parse).strip(): + raise Exception("curparse should have been empty at end, was %s" % cur_parse) + + + + + + + + + + +class InvalidSexprException(Exception): + def __init__(self, sexpr, parent=None): + self.sexpr = sexpr + self.parent = parent + + def __str__(self): + + ns = "" + ns += self.sexpr + if self.parent: + ns += "\n\n" + ns += str(self.parent) + return ns + + + + + + + + + + +def parse_sexpr(s): + """ turn an s-expression into a tree of lists: + + (a (b c) d) -> [a, [b, c], d] + + uses spaces and parens only -- no way to have a token with a space in it + + """ + s = s.replace("\n", " ").replace("\t"," ").strip() + + if not s.startswith("(") and not s.endswith(")"): + return s + elif s.startswith("(") and s.endswith(")"): + tr = [] + cur = [] + parens = 0 + for c in s[1:-1].strip() + " ": + if c == "(": + parens += 1 + elif c == ")": + parens -= 1 + elif c == " " and cur: + if parens == 0: + try: + x = parse_sexpr("".join(cur)) + except InvalidSexprException, e: + raise InvalidSexprException("Parent: %s" % s, e) + + if x: + tr.append(x) + cur = [] + + cur.append(c) + + if (cur and cur != [" "]) or parens != 0: + raise InvalidSexprException("Invalid s-expression: " + s + " note: %s" % "".join(cur) + " parens: %s" % parens) + + return tr + else: + raise InvalidSexprException("Invalid s-expression: \n" + s) + + + + + + + + + + +def unparse_sexpr(l): + if type(l) == type([]): + return "(" + " ".join(unparse_sexpr(a) for a in l) + ")" + return str(l) + + + + + + + + + + +def pretty_print_tree_string(a_tree_string, offset=''): + + if not a_tree_string.strip(): + return "" + + # Maximum depth we're prepared for in trees + maxdepth=100 + maxindent=300 + + # Table of indentation at tree depth + depth_to_indent = [0 for i in xrange(maxdepth)] + + # Initialize indent_string[i] to be a string of i spaces + indent_string = ['' for i in xrange(maxindent)] + for i in xrange(maxindent-1): + indent_string[i+1] = indent_string[i] + ' ' + + # RE object for split that matches on a ')' followed by not a ')', but only consumes the ')' + close_paren = re.compile(r'\)(?=\s*[^\s\)])') + + # RE object to pick up first on this line(should be only) POS tag and the word of each lexical leaf of the tree + lexical_leaf = re.compile(r'\((?P[^\s\)\(]+)\s+(?P[^\s\)\(]+)\)') + + # RE object to parse OntoNotes Normal Form tree lines: + a_tree = a_tree_string + + pp_tree = "" + + def treeindent(depth): + return indent_string[depth_to_indent[depth]]+offset #Indent to appropriate point + + + current_depth = 0 + for frag in close_paren.split(a_tree): #Split input into lines ending with a lexical item + if frag[-1]!= '\n': + frag=frag+')' + else: frag=frag[0:-1] + + #Indent to appropriate point + pp_tree += treeindent(current_depth) + 
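+ # each '('-delimited segment below opens a new level; record its width so following items at that depth indent under it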
+ pfrag = "" + for pfrag in (frag).split('(')[1:]: # Split line into segments each beginning with an '(' + pfrag='('+pfrag # Restore deleted initial '(' + pp_tree += pfrag # Print each + current_depth=current_depth+1 # Up the current depth count + + # Remember how far to indent following items at this depth + depth_to_indent[current_depth]=depth_to_indent[current_depth-1]+len(pfrag) + + current_depth=current_depth-pfrag.count(')') # Correct depth given closing parens + if current_depth<=0: + pp_tree += '' # Separate toplevel trees with blank lines + + pp_tree += '\n' # Print CRLF + + + return re.sub("\)$", "", pp_tree) + + + + + + +DONT_DELETE_TREES = True + + + + + + + + + + +def car(sp): + return sp[0] + + + + + + + + + + +def cdr(sp): + return sp[1:] + + + + + + + + + + +def split_node(sp): + return car(sp), cdr(sp) + + + + + + + + + + +def is_leaf(sp): + return len(sp) == 2 and type(sp[1]) != type([]) + + +transformations = {} + + + + + + + + + + +def pp(sexpr, out_text=False): + """ pretty print the S-expr, or just spit text out if out_text is true + + out_text also skips traces + + """ + if not out_text: + return pretty_print_tree_string(unparse_sexpr(sexpr)) + else: + words = [word for tag, word in all_leaves(sexpr) + if tag != "-NONE-"] # skip traces + + return "\n".join(words) + + + + + + +def transforms(transformation): + assert transformation.startswith("+") or transformation.startswith("-") + def regfunc(func): + transformations[transformation] = func + return func + return regfunc + + + + + + + + + + +def require(b): + if not b: + raise Exception("Failed Requirement") + + + + + + + + + + +@transforms("-edited") +def remove_edits(sp): + """Remove subtrees tagged 'EDITED' (disfluencies) """ + + return remove_tagger(sp, "EDITED") + + + + + + + + + + +@transforms("-trace") +def remove_edits(sp): + """Remove traces part of speech tagged '-NONE-' """ + + return remove_tagger(sp, "-NONE-") + + + + + + + + + + +@transforms("-phrase-tags") +def all_leaves(sp): + """Make a tree of just the leaves + + .. code-block: scheme + + (TOP (S (NP-SBJ (NNP Zambia)) + (VP (VBD had) + (ADVP-TMP (RB previously)) + (VP (VBD lost) + (NP (PRP$ its) + (RB away) + (VBD game)) + (NP-ADV (NP (CD 0)) + (PP (SYM -) + (NP (CD 1)))))) + (. .))) + + becomes + + .. code-block: scheme + + ( (NNP Zambia) + (VBD Had) + (RB Previously) + (VBD lost) + (PRP$ its) + (RB away) + (VBG game) + (CD 0) + (SYM -) + (CD 0) ) + + """ + + tag, rest = split_node(sp) + if is_leaf(sp): + return [[tag, rest[0]]] + + tr = [] + for x in rest: + tr.extend(all_leaves(x)) + return tr + + + + + + + + + + +def remove_tagger(sp, tag_to_remove): + """ remove tag_to_remove from sp, culling empty branches """ + def callback(tag, rest): + return tag == tag_to_remove + return remover(sp, callback) + + + + + + + + + + +def remover(sp, callback): + tag, rest = split_node(sp) + if callback(tag, rest): + return [] + if is_leaf(sp): + return sp + + new_rest = [y for y in [remover(x, callback) for x in rest] if y] + + if not new_rest: + return [] + return [tag] + new_rest + + + + + + + + + + +def pad_items_in_list(a_list, a_character=None): + """ + this function will return the same list with the right amount of + padding equal to two spaces on each side of the widest string. it + will perform right justification. + + if the optional character is specified, then it will do a + centering around the character in the process of padding. + left/right justification does not work with this option. 
+ """ + + if(a_character != None): + for an_item in a_list: + if(an_item.find(a_character) == -1): + a_character = None + break + + if(a_character != None): + lmax=0 + rmax=0 + for an_item in a_list: + an_item = an_item.strip() + + lf = an_item.find("*") + if(lmax < lf): + lmax = lf + + rf = len(an_item) - an_item.find("*") + if(rmax < rf): + rmax = rf + + + + i=0 + for i in range(0, len(a_list)): + a_list[i] = a_list[i].strip() + + x = a_list[i].find(a_character) + + len_i=len(a_list[i]) + + a_list[i] = " "*(lmax-x+2) + a_list[i] + a_list[i] = a_list[i] + " "*(rmax-len_i+x+2) + + else: + max=0 + for an_item in a_list: + an_item = an_item.strip() + x = len(an_item) + if(max < x): + max = x + + i=0 + for i in range(0, len(a_list)): + a_list[i] = a_list[i].strip() + + if(a_list[i].endswith("*") or + a_list[i].endswith("-") or + a_list[i][-1] in string.digits ): + a_list[i] = "%s " % (a_list[i]) + + a_list[i] = a_list[i].rjust(max+2) + + return a_list + + + + + + + + + + +def rows2columns(matrix): + columns = [] + + for row in matrix: + c=0 + for cell in row: + if(c == len(columns)): + columns.append([]) + + columns[c].append(cell) + c = c + 1 + + return columns + + + + + + + + + + +def pretty_print_table(rows, separator=None, out_file=None): + + # cells is the matrix + r_c_matrix = [] + for row in rows: + r_c_matrix.append(row.split()) + + + c_r_matrix = rows2columns(r_c_matrix) + + + for i in range(0, len(c_r_matrix)): + + if(i==5 or i>10): + padding_character=separator + else: + padding_character=None + + c_r_matrix[i] = pad_items_in_list(c_r_matrix[i], padding_character) + + r_c_matrix = rows2columns(c_r_matrix) + + if(out_file == None): + for row in r_c_matrix: + print " ".join(row).strip() + print + + elif(out_file == "-"): + rows=[] + for row in r_c_matrix: + rows.append(" ".join(row).strip()) + return "%s\n" % ("\n".join(rows)) + + else: + raise NotImplementedError("this functionality has not yet been implemented") + + + + + + + + + + +def start(input_fname, conll_fname, output_fname, encoding, changes): + """ apply changes in order to the trees in input_fname, write to output_fname """ + + + out_text = False + if "--text" in changes: + out_text = True + changes.remove("--text") + + out = [] + with codecs.open(input_fname, "r", encoding) as inf: + + for a_tree in iterate_trees(inf): + sexpr = parse_sexpr(a_tree) + for change in changes: + if not sexpr: + continue + + try: + change_func = transformations[change] + except KeyError: + raise Exception("Invalid argument '%s' for change. 
Allowed changes are: %s" % (change, transformations.keys())) + + try: + old_sexpr = sexpr[:] + sexpr = change_func(sexpr) + except Exception: + sys.stderr.write("ERR in %s\n\nTree:\n%s\n\nInput sexpr:\n%s\n" % (change, a_tree, pp(sexpr))) + raise + + + if not sexpr and DONT_DELETE_TREES: + nullp = ["XX", "nullp"] + if old_sexpr and old_sexpr[0] == "TOP": + sexpr = ["TOP", nullp] + else: + sexpr = nullp + + if sexpr: + out.append(pp(sexpr, out_text)) + + + + w_list = [] + for o in out: + w_list.append(o.split("\n")) + + + num_words = 0 + for a_word_list in w_list: + for a_word in a_word_list: + num_words = num_words + 1 + + debug("number of words: %d\n" % (num_words), DEBUG, MAX_VERBOSITY) + debug("input_fname: %s" % (input_fname), DEBUG, MAX_VERBOSITY) + + + is_arabic = False + a_list_of_lemmas = [] + + if re.search('data%s+arabic%s+annotations' % (os.sep, os.sep), input_fname): + is_arabic = True + + + + + if is_arabic is True: + lemma_fname = re.sub("\.parse$", ".lemma", input_fname) + debug("lemma_fname: %s" % (lemma_fname), DEBUG, MAX_VERBOSITY) + + if os.path.exists(lemma_fname): + lemma_file = codecs.open(lemma_fname, "r", "utf-8") + + actual_word_list = [] + buckwalter_word_list = [] + lemma_list = [] + + input_string_regex = re.compile(r"^\s*INPUT STRING:(.*)", re.U|re.MULTILINE) + buckwalter_regex = re.compile(r"^\s*IS_TRANS:(.*)", re.U|re.MULTILINE) + comment_regex = re.compile(r"^\s*COMMENT:(.*)", re.U|re.MULTILINE) + index_regex = re.compile(r"^\s*INDEX:(.*)", re.U|re.MULTILINE) + offsets_regex = re.compile(r"^\s*OFFSETS:(.*)", re.U|re.MULTILINE) + unvocalized_string_regex = re.compile(r"^\s*UNVOCALIZED:(.*)", re.U|re.MULTILINE) + vocalized_string_regex = re.compile(r"^\s*VOCALIZED:(.*)", re.U|re.MULTILINE) + vocalized_input_string_regex = re.compile(r"^\s*VOC_STRING:(.*)", re.U|re.MULTILINE) + pos_string_regex = re.compile(r"^\s*POS:(.*)", re.U|re.MULTILINE) + gloss_string_regex = re.compile(r"^\s*GLOSS:(.*)", re.U|re.MULTILINE) + lemma_regex = re.compile(r"LEMMA:\s+\[([^\]]*)\]", re.U|re.MULTILINE) + + lemma_file_lines = lemma_file.readlines() + + list_of_lemma_blocks = [] + + i=0 + lemma_block = "" + list_of_lemma_blocks = [] + while(i [transformations] ..." 
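+ # list each registered transformation, using the first line of its docstring as the help text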
+ print "\nAllowed transforms:" + + max_key_len = max(len(t) for t in transformations) + 1 # +1 for colon + + for key in transformations: + print " %s %s" %(("%s:"%key).rjust(max_key_len), + transformations[key].__doc__.split("\n")[0]) + + print " %s %s" % ("--text:".rjust(max_key_len), + "Produce text output instead of parse trees") + print + print + print "Example:" + print "python skeleton2conll.py /data/.../bc/cnn/00/cnn_0000.parse conll-2011/dev/data/english/annotations/bc/cnn/00/cnn_0000.v0_gold_skel conll-2011/dev/data/english/annotations/bc/cnn/00/cnn_0000.v0_gold_conll -edited --text" + print "-"*120 + else: + input_fname, conll_fname, output_fname, changes = sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4:] + start(input_fname, conll_fname, output_fname, encoding, changes) diff --git a/neuralcoref/conll_processing_scripts/skeleton2conll.sh b/neuralcoref/conll_processing_scripts/skeleton2conll.sh new file mode 100755 index 0000000..48d0173 --- /dev/null +++ b/neuralcoref/conll_processing_scripts/skeleton2conll.sh @@ -0,0 +1,209 @@ +#!/bin/bash + +function usage { +cat < + + +Description: +----------- + +: Location of the data directory under the OntoNotes 4.0 release + : The directory inside which the *_skel files exist and need to + be convered to .conll files + +---------------------------------------------------------------------------------------------------- + + + + +EOF +exit; +} + + +function message +{ + (echo "----------------------------------------------------------------------------------------------------"; + echo "" ; + echo $* ; + echo "" ; + echo "----------------------------------------------------------------------------------------------------") 1>&2 + +} + +function warning +{ + message "$*" +} + +function error +{ + message "$*" + exit +} + + +function r { echo ${1%.*}; } +function t { echo ${1##*/}; } +function e { echo $(t ${1##*.}); } +function h { echo ${1%/*}; } + + + +# define helper function: run a command and print its exit code +function erun () +{ + local debug; + local verbose; + debug=0; + if [[ $1 == "-d" ]]; then + debug=1; + shift; + fi; + verbose=0; + if [[ $1 == "-v" ]]; then + verbose=1; + shift; + fi; + if [[ $DEBUG -eq 1 ]]; then + debug=1; + fi; + if [[ $VERBOSE -eq 1 ]]; then + verbose=1; + fi; + if [[ $debug -eq 1 ]]; then + echo "eval $1"; + else + if [[ $verbose -eq 1 ]]; then + echo "-> $1"; + fi; + eval $1; + fi; + local code=$?; + if [ $code -ne 0 ]; then + echo "Exit code: $code"; + break; + fi +} + + + +# handle the valid command line options +DEBUG=0 +TESTING=false +VERBOSE=0 +DEBUG_OPTION="" +EDITED="" +while getopts D:dhT opt +do + case "$opt" in + v) + VERBOSE=1;; + + d) + DEBUG=1 + DEBUG_OPTION="-d";; + + D) + ON_DATA_DIR="$OPTARG" + ON_DATA_DIR=${ON_DATA_DIR%/} + + if [[ -z $ON_DATA_DIR ]]; then + error "please specify a valid ontonotes data directory using the -D option" + usage + fi;; + + T) + # this option is used internally for testing + TESTING=true;; + + \?) 
+ usage + exit 1;; + + h) + usage + exit 0;; + + :) + echo "option -$OPTARG requires an argument" + usage + exit 1;; + + esac +done +shift `expr $OPTIND - 1` + + + + +# at this point $* contains the arguments after interpreting the options + +d=$1 +d=${d%/} + + +# if the conll release directory is not correct +if [[ $(t $d) != "conll-2012" ]]; then + error "please make sure that you are pointing to the directory 'conll-2012'" +fi + + + +# if we are testing the release, we do not want to clobber the +# true _conll files +if $TESTING; then + EXT="_skel2conll" +else + EXT="_conll" +fi + + +# if no arguments are specified, then just print usage +if [[ $# -eq 0 ]]; then + usage +fi + + + + +for language in arabic english chinese; do + # set the EDITED option only for english + if [[ $language == "english" ]]; then + EDITED="-edited" + else + EDITED="" + fi + + for partition in train development test; do + for skel in $(find $d/v?/data/$partition/data/$language/ -name "*_skel"); do + gold_parse=$ON_DATA_DIR/$(r ${skel/*data\//}).parse + + if [[ ! -e $gold_parse ]]; then + error "could not find the gold parse [$gold_parse] in the ontonotes distribution ... exiting ..." + exit + fi + + conll=${skel/_skel/$EXT} + erun -v "python $d/v?/scripts/skeleton2conll.py $gold_parse $skel $conll $EDITED --text" + done + done +done + + + + + +# complain if the exit status of the last command executed is non-zero +if [[ $? != 0 ]]; then echo "the last command exited with a non-zero status" 1>&2; fi + + + diff --git a/neuralcoref/conllparser.py b/neuralcoref/conllparser.py new file mode 100644 index 0000000..cf41a2d --- /dev/null +++ b/neuralcoref/conllparser.py @@ -0,0 +1,723 @@ +# coding: utf8 +"""Conll parser""" +from __future__ import absolute_import +from __future__ import division +from __future__ import unicode_literals +from __future__ import print_function + +import re +import sys +import codecs +import argparse +import time +import os +import io +import pickle + +import spacy +from spacy.tokens import Doc + +import numpy as np + +from tqdm import tqdm + +from neuralcoref.compat import unicode_ +from neuralcoref.document import Mention, Document, Speaker, EmbeddingExtractor, MISSING_WORD +from neuralcoref.utils import parallel_process + +PACKAGE_DIRECTORY = os.path.dirname(os.path.abspath(__file__)) +REMOVED_CHAR = ["/", "%", "*"] +NORMALIZE_DICT = {"/.": ".", + "/?": "?", + "-LRB-": "(", + "-RRB-": ")", + "-LCB-": "{", + "-RCB-": "}", + "-LSB-": "[", + "-RSB-": "]"} + +CONLL_GENRES = {"bc": 0, "bn": 1, "mz": 2, "nw": 3, "pt": 4, "tc": 5, "wb": 6} + +FEATURES_NAMES = ["mentions_features", # 0 + "mentions_labels", # 1 + "mentions_pairs_length", # 2 + "mentions_pairs_start_index", # 3 + "mentions_spans", # 4 + "mentions_words", # 5 + "pairs_ant_index", # 6 + "pairs_features", # 7 + "pairs_labels", # 8 + "locations", # 9 + "conll_tokens", # 10 + "spacy_lookup", # 11 + "doc", # 12 + ] + +MISSED_MENTIONS_FILE = os.path.join(PACKAGE_DIRECTORY, "test_mentions_identification.txt") +SENTENCES_PATH = os.path.join(PACKAGE_DIRECTORY, "test_sentences.txt") + +################### +### UTILITIES ##### + +def clean_token(token): + cleaned_token = token + if cleaned_token in NORMALIZE_DICT: + cleaned_token = NORMALIZE_DICT[cleaned_token] + if cleaned_token not in REMOVED_CHAR: + for char in REMOVED_CHAR: + cleaned_token = cleaned_token.replace(char, u'') + if len(cleaned_token) == 0: + cleaned_token = "," + return cleaned_token + +def mention_words_idx(embed_extractor, mention, debug=False): + # index of the word in the 
tuned embeddings no need for normalizing, + # it is already performed in set_mentions_features() + # We take them in the tuned vocabulary which is a smaller voc tailored from conll + words = [] + for _, w in sorted(mention.words_embeddings_.items()): + if w not in embed_extractor.tun_idx: + if debug: print("No matching tokens in tuned voc for word ", w, "surrounding or inside mention", mention) + words.append(MISSING_WORD) + else: + words.append(w) + return [embed_extractor.tun_idx[w] for w in words] + +def check_numpy_array(feature, array, n_mentions_list, compressed=True): + for n_mentions in n_mentions_list: + if feature == FEATURES_NAMES[0]: + assert array.shape[0] == len(n_mentions) + if compressed: + assert np.array_equiv(array[:, 3], np.array([len(n_mentions)] * len(n_mentions))) + assert np.max(array[:, 2]) == len(n_mentions)-1 + assert np.min(array[:, 2]) == 0 + elif feature == FEATURES_NAMES[1]: + assert array.shape[0] == len(n_mentions) + elif feature == FEATURES_NAMES[2]: + assert array.shape[0] == len(n_mentions) + assert np.array_equiv(array[:, 0], np.array(list(range(len(n_mentions))))) + elif feature == FEATURES_NAMES[3]: + assert array.shape[0] == len(n_mentions) + assert np.array_equiv(array[:, 0], np.array([p*(p-1)/2 for p in range(len(n_mentions))])) + elif feature == FEATURES_NAMES[4]: + assert array.shape[0] == len(n_mentions) + elif feature == FEATURES_NAMES[5]: + assert array.shape[0] == len(n_mentions) + elif feature == FEATURES_NAMES[6]: + assert array.shape[0] == len(n_mentions)*(len(n_mentions)-1)/2 + assert np.max(array) == len(n_mentions)-2 + elif feature == FEATURES_NAMES[7]: + if compressed: + assert array.shape[0] == len(n_mentions)*(len(n_mentions)-1)/2 + assert np.max(array[:, 7]) == len(n_mentions)-2 + assert np.min(array[:, 7]) == 0 + elif feature == FEATURES_NAMES[8]: + assert array.shape[0] == len(n_mentions)*(len(n_mentions)-1)/2 + +############################################################################################### +### PARALLEL FCT (has to be at top-level of the module to be pickled for multiprocessing) ##### +def load_file(full_name, debug=False): + ''' + load a *._conll file + Input: full_name: path to the file + Output: list of tuples for each conll doc in the file, where the tuple contains: + (utts_text ([str]): list of the utterances in the document + utts_tokens ([[str]]): list of the tokens (conll words) in the document + utts_corefs: list of coref objects (dicts) with the following properties: + coref['label']: id of the coreference cluster, + coref['start']: start index (index of first token in the utterance), + coref['end': end index (index of last token in the utterance). 
+ utts_speakers ([str]): list of the speaker associated to each utterances in the document + name (str): name of the document + part (str): part of the document + ) + ''' + docs = [] + with io.open(full_name, 'rt', encoding='utf-8', errors='strict') as f: + lines = list(f)#.readlines() + utts_text = [] + utts_tokens = [] + utts_corefs = [] + utts_speakers = [] + tokens = [] + corefs = [] + index = 0 + speaker = "" + name = "" + part = "" + for li, line in enumerate(lines): + cols = line.split() + if debug: print("line", li, "cols:", cols) + # End of utterance + if len(cols) == 0: + if tokens: + if debug: print("End of utterance") + utts_text.append(u''.join(t + u' ' for t in tokens)) + utts_tokens.append(tokens) + utts_speakers.append(speaker) + utts_corefs.append(corefs) + tokens = [] + corefs = [] + index = 0 + speaker = "" + continue + # End of doc + elif len(cols) == 2: + if debug: print("End of doc") + if cols[0] == "#end": + if debug: print("Saving doc") + docs.append((utts_text, utts_tokens, utts_corefs, utts_speakers, name, part)) + utts_text = [] + utts_tokens = [] + utts_corefs = [] + utts_speakers = [] + else: + raise ValueError("Error on end line " + line) + # New doc + elif len(cols) == 5: + if debug: print("New doc") + if cols[0] == "#begin": + name = re.match(r"\((.*)\);", cols[2]).group(1) + try: + part = cols[4] + except ValueError: + print("Error parsing document part " + line) + if debug: print("New doc", name, part, name[:2]) + tokens = [] + corefs = [] + index = 0 + else: + raise ValueError("Error on begin line " + line) + # Inside utterance + elif len(cols) > 7: + if debug: print("Inside utterance") + assert (cols[0] == name and int(cols[1]) == int(part)), "Doc name or part error " + line + assert (int(cols[2]) == index), "Index error on " + line + if speaker: + assert (cols[9] == speaker), "Speaker changed in " + line + speaker + else: + speaker = cols[9] + if debug: print("speaker", speaker) + if cols[-1] != u'-': + coref_expr = cols[-1].split(u'|') + if debug: print("coref_expr", coref_expr) + if not coref_expr: + raise ValueError("Coref expression empty " + line) + for tok in coref_expr: + if debug: print("coref tok", tok) + try: + match = re.match(r"^(\(?)(\d+)(\)?)$", tok) + except: + print("error getting coreferences for line " + line) + assert match is not None, "Error parsing coref " + tok + " in " + line + num = match.group(2) + assert (num is not u''), "Error parsing coref " + tok + " in " + line + if match.group(1) == u'(': + if debug: print("New coref", num) + corefs.append({'label': num, 'start': index, 'end': None}) + if match.group(3) == u')': + j = None + for i in range(len(corefs)-1, -1, -1): + if debug: print("i", i) + if corefs[i]['label'] == num and corefs[i]['end'] is None: + j = i + break + assert (j is not None), "coref closing error " + line + if debug: print("End coref", num) + corefs[j]['end'] = index + tokens.append(clean_token(cols[3])) + index += 1 + else: + raise ValueError("Line not standard " + line) + return docs + +def set_feats(doc): + doc.set_mentions_features() + +def get_feats(doc, i): + return doc.get_feature_array(doc_id=i) + +def gather_feats(gathering_array, array, feat_name, pairs_ant_index, pairs_start_index): + if gathering_array is None: + gathering_array = array + else: + if feat_name == FEATURES_NAMES[6]: + array = [a + pairs_ant_index for a in array] + elif feat_name == FEATURES_NAMES[3]: + array = [a + pairs_start_index for a in array] + gathering_array += array + return feat_name, gathering_array + +def 
read_file(full_name): + doc = "" + with io.open(full_name, 'rt', encoding='utf-8', errors='strict') as f: + doc = f.read() + return doc + +################### +### ConllDoc ##### + +class ConllDoc(Document): + def __init__(self, name, part, *args, **kwargs): + self.name = name + self.part = part + self.feature_matrix = {} + self.conll_tokens = [] + self.conll_lookup = [] + self.gold_corefs = [] + self.missed_gold = [] + super(ConllDoc, self).__init__(*args, **kwargs) + + def get_conll_spacy_lookup(self, conll_tokens, spacy_tokens, debug=False): + ''' + Compute a look up table between spacy tokens (from spacy tokenizer) + and conll pre-tokenized tokens + Output: list[conll_index] => list of associated spacy tokens (assume spacy tokenizer has a finer granularity) + ''' + lookup = [] + c_iter = (t for t in conll_tokens) + s_iter = enumerate(t for t in spacy_tokens) + i, s_tok = next(s_iter) + for c_tok in c_iter: + #if debug: print("conll", c_tok, "spacy", s_tok, "index", i) + c_lookup = [] + while i is not None and len(c_tok) and c_tok.startswith(s_tok.text): + c_lookup.append(i) + c_tok = c_tok[len(s_tok):] + i, s_tok = next(s_iter, (None, None)) + if debug and len(c_tok): print("eating token: conll", c_tok, "spacy", s_tok, "index", i) + assert len(c_lookup), "Unmatched conll and spacy tokens" + lookup.append(c_lookup) + return lookup + + def add_conll_utterance(self, parsed, tokens, corefs, speaker_id, use_gold_mentions=False, debug=False): + conll_lookup = self.get_conll_spacy_lookup(tokens, parsed) + self.conll_tokens.append(tokens) + self.conll_lookup.append(conll_lookup) + # Convert conll tokens coref index in spacy tokens indexes + identified_gold = [False] * len(corefs) + for coref in corefs: + assert (coref['label'] is not None and coref['start'] is not None and coref['end'] is not None), \ + ("Error in coreference " + coref + " in " + parsed) + coref['start'] = conll_lookup[coref['start']][0] + coref['end'] = conll_lookup[coref['end']][-1] + + if speaker_id not in self.speakers: + speaker_name = speaker_id.split(u'_') + if debug: print("New speaker: ", speaker_id, "name: ", speaker_name) + self.speakers[speaker_id] = Speaker(speaker_id, speaker_name) + if use_gold_mentions: + for coref in corefs: + # print("coref['label']", coref['label']) + # print("coref text",parsed[coref['start']:coref['end']]) + mention = Mention(parsed[coref['start']:coref['end']], len(self.mentions), len(self.utterances), + self.n_sents, speaker=self.speakers[speaker_id], gold_label=coref['label']) + self.mentions.append(mention) + # print("mention: ", mention, "label", mention.gold_label) + else: + self._extract_mentions(parsed, len(self.utterances), self.n_sents, self.speakers[speaker_id]) + # Assign a gold label to mentions which have one + if debug: print("Check corefs", corefs) + for i, coref in enumerate(corefs): + for m in self.mentions: + if m.utterance_index != len(self.utterances): + continue + # if debug: print("Checking mention", m, m.utterance_index, m.start, m.end) + if coref['start'] == m.start and coref['end'] == m.end - 1: + m.gold_label = coref['label'] + identified_gold[i] = True + # if debug: print("Gold mention found:", m, coref['label']) + for found, coref in zip(identified_gold, corefs): + if not found: + self.missed_gold.append([self.name, self.part, str(len(self.utterances)), parsed.text, parsed[coref['start']:coref['end']+1].text]) + if debug: + print("❄️ gold mention not in predicted mentions", coref, parsed[coref['start']:coref['end']+1]) + self.utterances.append(parsed) + 
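# keep the gold coreference spans for this utterance so they stay aligned with self.utterances
+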
self.gold_corefs.append(corefs) + self.utterances_speaker.append(self.speakers[speaker_id]) + self.n_sents += len(list(parsed.sents)) + + def get_single_mention_features_conll(self, mention, compressed=True): + ''' Compressed or not single mention features''' + if not compressed: + _, features = self.get_single_mention_features(mention) + return features[np.newaxis, :] + feat_l = [mention.features_["01_MentionType"], + mention.features_["02_MentionLength"], + mention.index, + len(self.mentions), + mention.features_["04_IsMentionNested"], + self.genre_, + ] + return feat_l + + def get_pair_mentions_features_conll(self, m1, m2, compressed=True): + ''' Compressed or not single mention features''' + if not compressed: + _, features = self.get_pair_mentions_features(m1, m2) + return features[np.newaxis, :] + features_, _ = self.get_pair_mentions_features(m1, m2) + feat_l = [features_["00_SameSpeaker"], + features_["01_AntMatchMentionSpeaker"], + features_["02_MentionMatchSpeaker"], + features_["03_HeadsAgree"], + features_["04_ExactStringMatch"], + features_["05_RelaxedStringMatch"], + features_["06_SentenceDistance"], + features_["07_MentionDistance"], + features_["08_Overlapping"], + ] + return feat_l + + def get_feature_array(self, doc_id, feature=None, compressed=True, debug=True): + """ + Prepare feature array: + mentions_spans: (N, S) + mentions_words: (N, W) + mentions_features: (N, Fs) + mentions_labels: (N, 1) + mentions_pairs_start_index: (N, 1) index of beggining of pair list in pair_labels + mentions_pairs_length: (N, 1) number of pairs (i.e. nb of antecedents) for each mention + pairs_features: (P, Fp) + pairs_labels: (P, 1) + pairs_ant_idx: (P, 1) => indexes of antecedents mention for each pair (mention index in doc) + """ + if not self.mentions: + print("No mention in this doc !") + return {} + if debug: print("🛎 features matrices") + mentions_spans = [] + mentions_words = [] + mentions_features = [] + pairs_ant_idx = [] + pairs_features = [] + pairs_labels = [] + mentions_labels = [] + mentions_pairs_start = [] + mentions_pairs_length = [] + mentions_location = [] + n_mentions = 0 + total_pairs = 0 + if debug: print("mentions", self.mentions, str([m.gold_label for m in self.mentions])) + for mention_idx, antecedents_idx in list(self.get_candidate_pairs(max_distance=None, max_distance_with_match=None)): + n_mentions += 1 + mention = self.mentions[mention_idx] + mentions_spans.append(mention.spans_embeddings) + w_idx = mention_words_idx(self.embed_extractor, mention) + if w_idx is None: + print("error in", self.name, self.part, mention.utterance_index) + mentions_words.append(w_idx) + mentions_features.append(self.get_single_mention_features_conll(mention, compressed)) + mentions_location.append([mention.start, mention.end, mention.utterance_index, mention_idx, doc_id]) + ants = [self.mentions[ant_idx] for ant_idx in antecedents_idx] + no_antecedent = not any(ant.gold_label == mention.gold_label for ant in ants) or mention.gold_label is None + if antecedents_idx: + pairs_ant_idx += [idx for idx in antecedents_idx] + pairs_features += [self.get_pair_mentions_features_conll(ant, mention, compressed) for ant in ants] + ant_labels = [0 for ant in ants] if no_antecedent else [1 if ant.gold_label == mention.gold_label else 0 for ant in ants] + pairs_labels += ant_labels + mentions_labels.append(1 if no_antecedent else 0) + mentions_pairs_start.append(total_pairs) + total_pairs += len(ants) + mentions_pairs_length.append(len(ants)) + + out_dict = {FEATURES_NAMES[0]: mentions_features, + 
FEATURES_NAMES[1]: mentions_labels, + FEATURES_NAMES[2]: mentions_pairs_length, + FEATURES_NAMES[3]: mentions_pairs_start, + FEATURES_NAMES[4]: mentions_spans, + FEATURES_NAMES[5]: mentions_words, + FEATURES_NAMES[6]: pairs_ant_idx if pairs_ant_idx else None, + FEATURES_NAMES[7]: pairs_features if pairs_features else None, + FEATURES_NAMES[8]: pairs_labels if pairs_labels else None, + FEATURES_NAMES[9]: [mentions_location], + FEATURES_NAMES[10]: [self.conll_tokens], + FEATURES_NAMES[11]: [self.conll_lookup], + FEATURES_NAMES[12]: [{'name': self.name, + 'part': self.part, + 'utterances': list(str(u) for u in self.utterances), + 'mentions': list(str(m) for m in self.mentions)}], + } + if debug: + print("🚘 Summary") + for k, v in out_dict.items(): + print(k, len(v)) + return n_mentions, total_pairs, out_dict + +################### +### ConllCorpus ##### +class ConllCorpus(object): + def __init__(self, n_jobs=4, embed_path=PACKAGE_DIRECTORY+"/weights/", use_gold_mentions=False): + self.n_jobs = n_jobs + self.features = {} + self.utts_text = [] + self.utts_tokens = [] + self.utts_corefs = [] + self.utts_speakers = [] + self.utts_doc_idx = [] + self.docs_names = [] + self.docs = [] + if embed_path is not None: + self.embed_extractor = EmbeddingExtractor(embed_path) + self.trainable_embed = [] + self.trainable_voc = [] + self.use_gold_mentions = use_gold_mentions + + def check_words_in_embeddings_voc(self, embedding, tuned=True, debug=False): + print("🌋 Checking if words are in embedding voc") + if tuned: + embed_voc = embedding.tun_idx + else: + embed_voc = embedding.stat_idx + missing_words = [] + missing_words_sents = [] + missing_words_doc = [] + for doc in self.docs: + # if debug: print("Checking doc", doc.name, doc.part) + for sent in doc.utterances: + # if debug: print(sent.text) + for word in sent: + w = embedding.normalize_word(word) + # if debug: print(w) + if w not in embed_voc: + missing_words.append(w) + missing_words_sents.append(sent.text) + missing_words_doc.append(doc.name + doc.part) + if debug: + out_str = "No matching tokens in tuned voc for " + w + \ + " in sentence " + sent.text + \ + " in doc " + doc.name + doc.part + print(out_str) + return missing_words, missing_words_sents, missing_words_doc + + def test_sentences_words(self, save_file, debug=False): + print("🌋 Saving sentence list") + with io.open(save_file, "w", encoding='utf-8') as f: + if debug: print("Sentences saved in", save_file) + for doc in self.docs: + out_str = "#begin document (" + doc.name + \ + "); part " + doc.part + "\n" + f.write(out_str) + for sent in doc.utterances: + f.write(sent.text + '\n') + out_str = "#end document\n\n" + f.write(out_str) + + def save_sentences(self, save_file, debug=False): + print("🌋 Saving sentence list") + with io.open(save_file, "w", encoding='utf-8') as f: + if debug: print("Sentences saved in", save_file) + for doc in self.docs: + out_str = "#begin document (" + doc.name + \ + "); part " + doc.part + "\n" + f.write(out_str) + for sent in doc.utterances: + f.write(sent.text + '\n') + out_str = "#end document\n\n" + f.write(out_str) + + def build_key_file(self, data_path, key_file, debug=False): + print("🌋 Building key file from corpus") + print("Saving in", key_file) + # Create a pool of processes. By default, one is created for each CPU in your machine. 
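+ # the parallel read itself happens below via parallel_process over the cleaned file list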
+ with io.open(key_file, "w", encoding='utf-8') as kf: + if debug: print("Key file saved in", key_file) + for dirpath, _, filenames in os.walk(data_path): + print("In", dirpath) + file_list = [os.path.join(dirpath, f) for f in filenames if f.endswith(".v4_auto_conll") \ + or f.endswith(".v4_gold_conll")] + cleaned_file_list = [] + for f in file_list: + fn = f.split('.') + if fn[1] == "v4_auto_conll": + gold = fn[0] + "." + "v4_gold_conll" + if gold not in file_list: + cleaned_file_list.append(f) + else: + cleaned_file_list.append(f) + #self.load_file(file_list[0]) + doc_list = parallel_process(cleaned_file_list, read_file) + for doc in doc_list: + kf.write(doc) + + def list_undetected_mentions(self, data_path, save_file, debug=True): + self.read_corpus(data_path) + print("🌋 Listing undetected mentions") + with io.open(save_file, 'w', encoding='utf-8') as out_file: + for doc in tqdm(self.docs): + for name, part, utt_i, utt, coref in doc.missed_gold: + out_str = name + u"\t" + part + u"\t" + utt_i + u'\t"' + utt + u'"\n' + out_str += coref + u"\n" + out_file.write(out_str) + if debug: print(out_str) + + def read_corpus(self, data_path, debug=False): + print("🌋 Reading files") + for dirpath, _, filenames in os.walk(data_path): + print("In", dirpath, os.path.abspath(dirpath)) + file_list = [os.path.join(dirpath, f) for f in filenames if f.endswith(".v4_auto_conll") \ + or f.endswith(".v4_gold_conll")] + cleaned_file_list = [] + for f in file_list: + fn = f.split('.') + if fn[1] == "v4_auto_conll": + gold = fn[0] + "." + "v4_gold_conll" + if gold not in file_list: + cleaned_file_list.append(f) + else: + cleaned_file_list.append(f) + doc_list = parallel_process(cleaned_file_list, load_file) + for docs in doc_list:#executor.map(self.load_file, cleaned_file_list): + for utts_text, utt_tokens, utts_corefs, utts_speakers, name, part in docs: + print("Imported", name) + if debug: + print("utts_text", utts_text) + print("utt_tokens", utt_tokens) + print("utts_corefs", utts_corefs) + print("utts_speakers", utts_speakers) + print("name, part", name, part) + self.utts_text += utts_text + self.utts_tokens += utt_tokens + self.utts_corefs += utts_corefs + self.utts_speakers += utts_speakers + self.utts_doc_idx += [len(self.docs_names)] * len(utts_text) + self.docs_names.append((name, part)) + print("utts_text size", len(self.utts_text)) + print("utts_tokens size", len(self.utts_tokens)) + print("utts_corefs size", len(self.utts_corefs)) + print("utts_speakers size", len(self.utts_speakers)) + print("utts_doc_idx size", len(self.utts_doc_idx)) + print("🌋 Building docs") + for name, part in self.docs_names: + self.docs.append(ConllDoc(name=name, part=part, nlp=None, + use_no_coref_list=False, consider_speakers=True, + embedding_extractor=self.embed_extractor, + conll=CONLL_GENRES[name[:2]])) + print("🌋 Loading spacy model") + try: + spacy.info('en_core_web_sm') + model = 'en_core_web_sm' + except IOError: + print("No spacy 2 model detected, using spacy1 'en' model") + spacy.info('en') + model = 'en' + nlp = spacy.load(model) + print("🌋 Parsing utterances and filling docs") + doc_iter = (s for s in self.utts_text) + for utt_tuple in tqdm(zip(nlp.pipe(doc_iter), + self.utts_tokens, self.utts_corefs, + self.utts_speakers, self.utts_doc_idx)): + spacy_tokens, conll_tokens, corefs, speaker, doc_id = utt_tuple + if debug: print(unicode_(self.docs_names[doc_id]), "-", spacy_tokens) + doc = spacy_tokens + if debug: + out_str = "utterance " + unicode_(doc) + " corefs " + unicode_(corefs) + \ + " speaker " + 
unicode_(speaker) + "doc_id" + unicode_(doc_id) + print(out_str.encode('utf-8')) + self.docs[doc_id].add_conll_utterance(doc, conll_tokens, corefs, speaker, + use_gold_mentions=self.use_gold_mentions) + + def build_and_gather_multiple_arrays(self, save_path): + print("🌋 Extracting mentions features") + parallel_process(self.docs, set_feats, n_jobs=self.n_jobs) + + print("🌋 Building and gathering arrays") + arr =[{'doc': doc, + 'i': i} for i, doc in enumerate(self.docs)] + arrays_dicts = parallel_process(arr, get_feats, use_kwargs=True, n_jobs=self.n_jobs) + gathering_dict = dict((feat, None) for feat in FEATURES_NAMES) + n_mentions_list = [] + pairs_ant_index = 0 + pairs_start_index = 0 + for n, p, arrays_dict in tqdm(arrays_dicts): + for f in FEATURES_NAMES: + if gathering_dict[f] is None: + gathering_dict[f] = arrays_dict[f] + else: + if f == FEATURES_NAMES[6]: + array = [a + pairs_ant_index for a in arrays_dict[f]] + elif f == FEATURES_NAMES[3]: + array = [a + pairs_start_index for a in arrays_dict[f]] + else: + array = arrays_dict[f] + gathering_dict[f] += array + pairs_ant_index += n + pairs_start_index += p + n_mentions_list.append(n) + + for feature in FEATURES_NAMES[:9]: + print("Building numpy array for", feature, "length", len(gathering_dict[feature])) + if feature != "mentions_spans": + array = np.array(gathering_dict[feature]) + if array.ndim == 1: + array = np.expand_dims(array, axis=1) + else: + array = np.stack(gathering_dict[feature]) + # check_numpy_array(feature, array, n_mentions_list) + print("Saving numpy", feature, "size", array.shape) + np.save(save_path + feature, array) + for feature in FEATURES_NAMES[9:]: + print("Saving pickle", feature, "size", len(gathering_dict[feature])) + with open(save_path + feature + '.bin', "wb") as fp: + pickle.dump(gathering_dict[feature], fp) + + def save_vocabulary(self, save_path, debug=False): + print("🌋 Building tunable vocabulary matrix from static vocabulary") + tunable_voc = self.embed_extractor.tun_voc + + print("🌋 Saving vocabulary") + with io.open(save_path+"tuned_word_vocabulary.txt", "w", encoding='utf-8') as f: + if debug: print("tunable voc saved in", save_path+"tuned_word_vocabulary.txt", "size", len(tunable_voc)) + for w in tunable_voc: + # print(w) + f.write(w + '\n') + # We also save a copy of the static vocabulary (for conll test set scoring) + with io.open(save_path+"static_word_vocabulary.txt", "w", encoding='utf-8') as f: + if debug: print("static voc saved in", save_path+"static_word_vocabulary.txt", "size", len(tunable_voc)) + for w in tunable_voc: + f.write(w + '\n') + + tuned_word_embeddings = np.vstack([self.embed_extractor.get_stat_word(w)[1] for w in tunable_voc]) + print("Saving tunable voc, size:", tuned_word_embeddings.shape) + np.save(save_path + "tuned_word_embeddings", tuned_word_embeddings) + + static_word_embeddings = np.vstack([self.embed_extractor.static_embeddings[w] for w in self.embed_extractor.stat_voc]) + print("Saving static voc, size:", static_word_embeddings.shape) + np.save(save_path + "static_word_embeddings", static_word_embeddings) + +if __name__ == '__main__': + DIR_PATH = os.path.dirname(os.path.realpath(__file__)) + parser = argparse.ArgumentParser(description='Training the neural coreference model') + parser.add_argument('--function', type=str, default='all', help='Function ("all", "key", "parse", "find_undetected")') + parser.add_argument('--path', type=str, default=DIR_PATH + '/data/', help='Path to the dataset') + parser.add_argument('--key', type=str, help='Path to an optional 
key file for scoring') + parser.add_argument('--n_jobs', type=int, default=1, help='Number of parallel jobs (default 1)') + args = parser.parse_args() + if args.key is None: + args.key = args.path + "/key.txt" + CORPUS = ConllCorpus(n_jobs=args.n_jobs) + if args.function == 'parse' or args.function == 'all': + SAVE_DIR = args.path + "/numpy/" + if not os.path.exists(SAVE_DIR): + os.makedirs(SAVE_DIR) + else: + if os.listdir(SAVE_DIR): + print("There are already data in", SAVE_DIR) + print("Erasing") + for file in os.listdir(SAVE_DIR): + print(file) + os.remove(SAVE_DIR + file) + start_time = time.time() + CORPUS.read_corpus(args.path) + print('=> read_corpus time elapsed', time.time() - start_time) + start_time2 = time.time() + CORPUS.build_and_gather_multiple_arrays(SAVE_DIR) + print('=> build_and_gather_multiple_arrays time elapsed', time.time() - start_time2) + start_time2 = time.time() + CORPUS.save_vocabulary(SAVE_DIR) + print('=> save_vocabulary time elapsed', time.time() - start_time2) + print('=> total time elapsed', time.time() - start_time) + if args.function == 'key' or args.function == 'all': + CORPUS.build_key_file(args.path, args.key) + if args.function == 'find_undetected': + CORPUS.list_undetected_mentions(args.path, args.path + "/undetected_mentions.txt") diff --git a/neuralcoref/data/.gitignore b/neuralcoref/data/.gitignore new file mode 100644 index 0000000..86d0cb2 --- /dev/null +++ b/neuralcoref/data/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything in this directory +* +# Except this file +!.gitignore \ No newline at end of file diff --git a/neuralcoref/dataset.py b/neuralcoref/dataset.py new file mode 100644 index 0000000..db86636 --- /dev/null +++ b/neuralcoref/dataset.py @@ -0,0 +1,355 @@ +# coding: utf8 +"""Conll training algorithm""" +from __future__ import absolute_import +from __future__ import unicode_literals +from __future__ import print_function + +import os +import io +import numpy as np + +import torch +import torch.utils.data + +from torch.utils.data.sampler import Sampler +from torch.utils.data import Dataset + +from neuralcoref.utils import encode_distance +from neuralcoref.conllparser import FEATURES_NAMES + +PACKAGE_DIRECTORY = os.path.dirname(os.path.abspath(__file__)) +BATCH_SIZE_PATH = os.path.join(PACKAGE_DIRECTORY, "test_batch_size.txt") #fernandes.txt")# + +SIZE_SPAN = 250 # size of the span vector (averaged word embeddings) +SIZE_WORD = 8 # number of words in a mention (tuned embeddings) +SIZE_EMBEDDING = 50 # size of the words embeddings +SIZE_FP = 70 # number of features for a pair of mention +SIZE_FP_COMPRESSED = 9 # size of the features for a pair of mentions as stored in numpy arrays +SIZE_FS = 24 # number of features of a single mention +SIZE_FS_COMPRESSED = 6 # size of the features for a mention as stored in numpy arrays +SIZE_GENRE = 7 # Size of the genre one-hot array + +SIZE_PAIR_IN = 2 * SIZE_SPAN + 2 * SIZE_WORD * SIZE_EMBEDDING + SIZE_FP # Input to the mentions pair neural network +SIZE_SINGLE_IN = SIZE_SPAN + SIZE_WORD * SIZE_EMBEDDING + SIZE_FS # Input to the single mention neural network + +def load_embeddings_from_file(name): + print("loading", name+"_embeddings.npy") + embed = torch.from_numpy(np.load(name+"_embeddings.npy")).float() + print(embed.size()) + print("loading", name+"_vocabulary.txt") + with io.open(name+"_vocabulary.txt", 'r', encoding='utf-8') as f: + voc = [line.strip() for line in f] + return embed, voc + +class NCDataset(Dataset): + def __init__(self, data_path, params, no_targets=False): + print("🏝 Loading 
Dataset at", data_path) + self.costs = params.costs + self.no_targets = no_targets + # Load files + datas = {} + if not os.listdir(data_path): + raise ValueError("Empty data_path") + numpy_files_found = False + print("Reading ", end='') + for file_name in os.listdir(data_path): + if not '.npy' in file_name: + continue + numpy_files_found = True + print(file_name, end=', ') + datas[file_name.split(u'.')[0]] = np.load(data_path + file_name) + if not numpy_files_found: + raise ValueError("Can't find numpy files in {}".format(data_path)) + + # Gather arrays in two lists of tuples for mention and pairs + self.mentions = list(zip(*(arr for key, arr in sorted(datas.items()) if key.startswith(u"mentions")))) + self.pairs = list(zip(*(arr for key, arr in sorted(datas.items()) if key.startswith(u"pairs")))) + self.mentions_pair_length = datas[FEATURES_NAMES[2]] + assert [arr.shape[0] for arr in self.mentions[0]] == [6, 1, 1, 1, 250, 8] # Cf order of FEATURES_NAMES in conllparser.py + assert [arr.shape[0] for arr in self.pairs[0]] == [1, 9, 1] # Cf order of FEATURES_NAMES in conllparser.py + + def __len__(self): + return len(self.mentions) + + def __getitem__(self, mention_idx, debug=False): + """ + Return: + Definitions: + P is the number of antecedent per mention (number of pairs for the mention) + S = 250 is the size of the span vector (averaged word embeddings) + W = 8 is the number of words in a mention (tuned embeddings) + Fp = 70 is the number of features for a pair of mention + Fs = 24 is the number of features of a single mention + + if there are some pairs: + inputs = (spans, words, features, ant_spans, ant_words, ana_spans, ana_words, pairs_features) + targets = (labels, costs, true_ants, false_ants) + else: + inputs = (spans, words, features) + targets = (labels, costs, true_ants) + + inputs: Tuple of + spans => (S,) + words => (W,) + features => (Fs,) + + if there are potential antecedents (P > 0): + ant_spans => (P, S) or nothing if no pairs + ant_words => (P, W) or nothing if no pairs + ana_spans => (P, S) or nothing if no pairs + ana_words => (P, W) or nothing if no pairs + pair_features => (P, Fp) or nothing if no pairs + + targets: Tuple of + labels => (P+1,) + costs => (P+1,) + true_ant => (P+1,) + + if there are potential antecedents (P > 0): + false_ant => (P+1,) + + """ + features_raw, label, pairs_length, pairs_start_index, spans, words = self.mentions[mention_idx] + pairs_start_index = np.asscalar(pairs_start_index) + pairs_length = np.asscalar(pairs_length) + + # Build features array (float) from raw features (int) + assert features_raw.shape[0] == SIZE_FS_COMPRESSED + features = np.zeros((SIZE_FS,)) + features[features_raw[0]] = 1 + features[4:15] = encode_distance(features_raw[1]) + features[15] = features_raw[2].astype(float) / features_raw[3].astype(float) + features[16] = features_raw[4] + features[features_raw[5] + 17] = 1 + + if pairs_length == 0: + spans = torch.from_numpy(spans).float() + words = torch.from_numpy(words) + features = torch.from_numpy(features).float() + inputs = (spans, words, features) + if self.no_targets: + return inputs + true_ant = torch.zeros(1).long() # zeros = indices of true ant + costs = torch.from_numpy((1 - label) * self.costs['FN']).float() + label = torch.from_numpy(label).float() + targets = (label, costs, true_ant) + if debug: + print("inputs shapes: ", [a.size() for a in inputs]) + print("targets shapes: ", [a.size() for a in targets]) + return inputs, targets + + start = pairs_start_index + end = pairs_start_index + pairs_length + 
pairs = self.pairs[start:end] + assert len(pairs) == pairs_length + assert len(pairs[0]) == 3 # pair[i] = (pairs_ant_index, pairs_features, pairs_labels) + pairs_ant_index, pairs_features_raw, pairs_labels = list(zip(*pairs)) + + pairs_features_raw = np.stack(pairs_features_raw) + pairs_labels = np.squeeze(np.stack(pairs_labels), axis=1) + + # Build pair features array (float) from raw features (int) + assert pairs_features_raw[0, :].shape[0] == SIZE_FP_COMPRESSED + pairs_features = np.zeros((len(pairs_ant_index), SIZE_FP)) + pairs_features[:, 0:6] = pairs_features_raw[:, 0:6] + pairs_features[:, 6:17] = encode_distance(pairs_features_raw[:, 6]) + pairs_features[:, 17:28] = encode_distance(pairs_features_raw[:, 7]) + pairs_features[:, 28] = pairs_features_raw[:, 8] + # prepare antecent features + ant_features_raw = np.concatenate([self.mentions[np.asscalar(idx)][0][np.newaxis, :] for idx in pairs_ant_index]) + ant_features = np.zeros((pairs_length, SIZE_FS-SIZE_GENRE)) + ant_features[:, ant_features_raw[:, 0]] = 1 + ant_features[:, 4:15] = encode_distance(ant_features_raw[:, 1]) + ant_features[:, 15] = ant_features_raw[:, 2].astype(float) / ant_features_raw[:, 3].astype(float) + ant_features[:, 16] = ant_features_raw[:, 4] + pairs_features[:, 29:46] = ant_features + # Here we keep the genre + ana_features = np.tile(features, (pairs_length, 1)) + pairs_features[:, 46:] = ana_features + + ant_spans = np.concatenate([self.mentions[np.asscalar(idx)][4][np.newaxis, :] for idx in pairs_ant_index]) + ant_words = np.concatenate([self.mentions[np.asscalar(idx)][5][np.newaxis, :] for idx in pairs_ant_index]) + ana_spans = np.tile(spans, (pairs_length, 1)) + ana_words = np.tile(words, (pairs_length, 1)) + ant_spans = torch.from_numpy(ant_spans).float() + ant_words = torch.from_numpy(ant_words) + ana_spans = torch.from_numpy(ana_spans).float() + ana_words = torch.from_numpy(ana_words) + pairs_features = torch.from_numpy(pairs_features).float() + + labels_stack = np.concatenate((pairs_labels, label), axis=0) + assert labels_stack.shape == (pairs_length + 1,) + labels = torch.from_numpy(labels_stack).float() + + spans = torch.from_numpy(spans).float() + words = torch.from_numpy(words) + features = torch.from_numpy(features).float() + + inputs = (spans, words, features, + ant_spans, ant_words, + ana_spans, ana_words, + pairs_features) + + if self.no_targets: + return inputs + + if label == 0: + costs = np.concatenate((self.costs['WL'] * (1 - pairs_labels), [self.costs['FN']])) # Inverse labels: 1=>0, 0=>1 + else: + costs = np.concatenate((self.costs['FL'] * np.ones_like(pairs_labels), [0])) + assert costs.shape == (pairs_length + 1,) + costs = torch.from_numpy(costs).float() + + true_ants_unpad = np.flatnonzero(labels_stack) + if len(true_ants_unpad) == 0: + raise ValueError("Error: no True antecedent for mention") + true_ants = np.pad(true_ants_unpad, (0, len(pairs_labels) + 1 - len(true_ants_unpad)), 'edge') + assert true_ants.shape == (pairs_length + 1,) + true_ants = torch.from_numpy(true_ants).long() + + false_ants_unpad = np.flatnonzero(1 - labels_stack) + assert len(false_ants_unpad) != 0 + false_ants = np.pad(false_ants_unpad, (0, len(pairs_labels) + 1 - len(false_ants_unpad)), 'edge') + assert false_ants.shape == (pairs_length + 1,) + false_ants = torch.from_numpy(false_ants).long() + + targets = (labels, costs, true_ants, false_ants) + if debug: + print("Mention", mention_idx) + print("inputs shapes: ", [a.size() for a in inputs]) + print("targets shapes: ", [a.size() for a in targets]) + 
return inputs, targets + +class NCBatchSampler(Sampler): + """A Batch sampler to group mentions in batches with close number of pairs to be padded together + """ + + def __init__(self, mentions_pairs_length, batchsize=600, + shuffle=False, debug=False): + """ Create and feed batches of mentions having close number of antecedents + The batch are padded and collated by the padder_collate function + + # Arguments: + mentions_pairs_length array of shape (N, 1): list/array of the number of pairs for each mention + batchsize: Number of pairs of each batch will be capped at this + """ + self.shuffle = shuffle + num_mentions = len(mentions_pairs_length) + mentions_lengths = np.concatenate([mentions_pairs_length, np.arange(0, num_mentions, 1, dtype=int)[:, np.newaxis]], axis=1) + sorted_lengths = mentions_lengths[mentions_lengths[:, 0].argsort()] + print("Preparing batches 📚") + + self.batches = [] + self.batches_pairs = [] + self.batches_size = [] + batch = [] + n_pairs = [] + num = 0 + for length, mention_idx in sorted_lengths: + if num > batchsize or (num == len(batch) and length != 0): # We keep the no_pairs batches pure + if debug: print("Added batch number", len(self.batches), + "with", len(batch), "mentions and", num, "pairs") + self.batches.append(batch) + self.batches_size.append(num) # We don't count the max 7 additional mentions that are repeated + self.batches_pairs.append(n_pairs) + + # Start a new batch + batch = [mention_idx] + n_pairs = [length] + num = length + 1 # +1 since we also have the single mention to add to the number of pairs + else: + num += length + 1 + batch.append(mention_idx) + n_pairs.append(length) + + # Complete and store the last batch + if debug: print("Added batch number", len(self.batches),"with", len(batch), "mentions and", num, "pairs") + self.batches.append(batch) + self.batches_size.append(num) + self.batches_pairs.append(n_pairs) + self.n_pairs = sum(sum(p) for p in self.batches_pairs) + self.n_mentions = sum(len(b) for b in self.batches) + self.n_batches = len(self.batches) + self.pairs_per_batch = float(self.n_pairs) / self.n_batches + self.mentions_per_batch = float(self.n_mentions) / self.n_batches + print("Dataset has:", self.n_batches, "batches,", self.n_mentions, "mentions,", self.n_pairs, "pairs") + + def get_batch_info(self): + return self.batches, self.batches_pairs + + def save_batch_sizes(self, save_file=BATCH_SIZE_PATH, debug=False): + print("🌋 Saving sizes of batches") + with io.open(save_file, "w", encoding='utf-8') as f: + if debug: print("Batch sizes saved in", save_file) + for batch, size in zip(self.batches, self.batches_size): + out_str = str(len(batch)) + "\t" + str(size) + "\n" + f.write(out_str) + + def __iter__(self): + if self.shuffle: + np.random.shuffle(self.batches) + for batch in self.batches: + yield batch + + def __len__(self): + return self.n_batches + +def padder_collate(batch, debug=False): + """ Puts each data field into a tensor with outer dimension batch size + Pad variable length input tensors and add a weight tensor to the target + """ + transposed_inputs = tuple(zip(*batch)) + if len(transposed_inputs) == 2: + inputs, targets = transposed_inputs + transposed_inputs = tuple(zip(*inputs)) + transposed_targets = tuple(zip(*targets)) + else: + transposed_targets = None + + max_pairs = max(len(t) for t in transposed_inputs[3]) if len(transposed_inputs) == 8 else 0 # Get max nb of pairs (batch are sorted by nb of pairs) + if max_pairs > 0: + out_inputs = [] + out_targets = [] + for t_inp in transposed_inputs: + if 
len(t_inp[0].shape) == 2: + out_inputs.append(torch.stack([torch.cat([t, t.new(max_pairs - len(t), len(t[0])).zero_()]) \ + if len(t) != max_pairs else t for t in t_inp], 0)) + else: + out_inputs.append(torch.stack(t_inp, 0)) + if transposed_targets is not None: + for i, t_targ in enumerate(transposed_targets): #0:labels, 1:costs, 2:true_ants, 3:false_ants + if i == 2 or i == 3: + if debug: + print("collate before", t_targ) + # shift the antecedent index associated to single anaphores (last) + t_targ = tuple(t.masked_fill_(torch.eq(t, len(t)-1), max_pairs) for t in t_targ) + if debug: + print("collate after", t_targ) + out_targets.append(torch.stack( + [torch.cat( + [t[:-1] if len(t) > 2 else t.new(1).fill_(t[0]), + t.new(max_pairs + 1 - len(t)).fill_(t[0]), + t.new(1).fill_(t[-1])] + ) if len(t) != max_pairs + 1 else t for t in t_targ + ], 0)) + + t_costs = transposed_targets[1] # We build the weights from the costs to have a float Tensor + out_targets.append(torch.stack( + [torch.cat([t.new(len(t)-1).fill_(1), + t.new(max_pairs + 1 - len(t)).zero_(), + t.new(1).fill_(1)]) if len(t) != max_pairs + 1 \ + else t.new(max_pairs + 1).fill_(1) for t in t_costs], 0)) + else: + # Remark this mask is the inverse of the weights in the above target (used for evaluation masking) + t_base = transposed_inputs[3] + out_targets = torch.stack( + [torch.cat([t.new(len(t)-1).zero_().byte(), + t.new(max_pairs + 1 - len(t)).fill_(1).byte(), + t.new(1).zero_().byte()]) if len(t) != max_pairs + 1 \ + else t.new(max_pairs + 1).zero_().byte() for t in t_base], 0) + else: + out_inputs = [torch.stack(t_inp, 0) for t_inp in transposed_inputs] + if transposed_targets is not None: + out_targets = [torch.stack(t_targ, 0) for t_targ in transposed_targets] + out_targets.append(out_targets[1].new(len(out_targets[1]), 1).fill_(1)) + else: + out_targets = out_inputs[0].new(len(out_inputs[0]), 1).zero_().byte() + return (out_inputs, out_targets) diff --git a/neuralcoref/data.py b/neuralcoref/document.py similarity index 59% rename from neuralcoref/data.py rename to neuralcoref/document.py index 7e3a0de..3bda044 100644 --- a/neuralcoref/data.py +++ b/neuralcoref/document.py @@ -5,8 +5,12 @@ from __future__ import print_function import re +import io from six import string_types, integer_types +from neuralcoref.compat import unicode_ +from neuralcoref.utils import encode_distance + try: from itertools import izip_longest as zip_longest except ImportError: # will be 3.x series @@ -28,19 +32,8 @@ ACCEPTED_ENTS = ["PERSON", "NORP", "FACILITY", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LANGUAGE"] WHITESPACE_PATTERN = r"\s+|_+" UNKNOWN_WORD = "*UNK*" -NORMALIZE_DICT = {"/.": ".", "/?": "?", "-LRB-": "(", "-RRB-": ")", - "-LCB-": "{", "-RCB-": "}", "-LSB-": "[", "-RSB-": "]"} -DISTANCE_BINS = list(range(5)) + [5]*3 + [6]*8 + [7]*16 +[8]*32 - -def encode_distance(d): - ''' Encode an integer as a (bined) one-hot numpy array ''' - dist_vect = np.zeros((11,)) - if d < 64: - dist_vect[DISTANCE_BINS[d]] = 1 - else: - dist_vect[9] = 1 - dist_vect[10] = min(float(d), 64.0) / 64.0 - return dist_vect +MISSING_WORD = "" +MAX_ITER = 100 ######################### ## MENTION EXTRACTION ### @@ -50,20 +43,28 @@ def extract_mentions_spans(doc, use_no_coref_list=True, debug=False): ''' Extract potential mentions from a spacy parsed Doc ''' - nouns_or_prp = re.compile(r"N.*|PRP.*|DT") - det_or_comp = ["det", "compound", "appos"] + keep_tags = re.compile(r"N.*|PRP.*|DT|IN") + leave_dep = ["det", "compound", "appos"] + keep_dep = ["nsubj", 
"dobj", "iobj", "pobj"] nsubj_or_dep = ["nsubj", "dep"] - conj_punct_pos = ["CCONJ", "PUNCT"] + conj_or_prep = ["conj", "prep"] + remove_pos = ["CCONJ", "INTJ", "ADP"] + lower_not_end = ["'s", ',', '.', '!', '?', ':', ';'] # Utility to remove bad endings def cleanup_endings(left, right, token): - minchild_idx = min(left) if left else token.i - maxchild_idx = max(right) if right else token.i + minchild_idx = min(left + [token.i]) if left else token.i + maxchild_idx = max(right + [token.i]) if right else token.i # Clean up endings and begginging - while maxchild_idx >= token.i and (doc[maxchild_idx].pos_ in conj_punct_pos or doc[maxchild_idx].lower_ == "'s"): + while maxchild_idx >= minchild_idx and (doc[maxchild_idx].pos_ in remove_pos + or doc[maxchild_idx].lower_ in lower_not_end): if debug: print("Removing last token", doc[maxchild_idx].lower_, doc[maxchild_idx].tag_) maxchild_idx -= 1 # We don't want mentions finishing with 's or conjunctions/punctuation - return min(minchild_idx, token.i), max(maxchild_idx, token.i)+1 + while minchild_idx <= maxchild_idx and (doc[minchild_idx].pos_ in remove_pos + or doc[minchild_idx].lower_ in lower_not_end): + if debug: print("Removing first token", doc[minchild_idx].lower_, doc[minchild_idx].tag_) + minchild_idx += 1 # We don't want mentions starting with 's or conjunctions/punctuation + return minchild_idx, maxchild_idx+1 if debug: print('===== doc ====:', doc) for c in doc: @@ -74,12 +75,12 @@ def cleanup_endings(left, right, token): # Pronouns and Noun phrases for token in doc: - if debug: print("🚀 tok:", token, "tok.tag_", token.tag_) + if debug: print("🚀 tok:", token, "tok.tag_:", token.tag_, "tok.pos_:", token.pos_, "tok.dep_:", token.dep_) if use_no_coref_list and token.lower_ in NO_COREF_LIST: if debug: print("token in no_coref_list") continue - if not nouns_or_prp.match(token.tag_) or token.dep_ in det_or_comp: + if (not keep_tags.match(token.tag_) or token.dep_ in leave_dep) and not token.dep_ in keep_dep: if debug: print("not pronoun or no right dependency") continue @@ -89,9 +90,7 @@ def cleanup_endings(left, right, token): endIdx = token.i + 1 span = doc[token.i: endIdx] - if not any((ent.start <= span.start and span.end <= ent.end for ent in doc.ents)): - if debug: print("==-- not in entity store:", span) - mentions_spans.append(span) + mentions_spans.append(span) # when pronoun is a part of conjunction (e.g., you and I) if token.n_rights > 0 or token.n_lefts > 0: @@ -101,17 +100,23 @@ def cleanup_endings(left, right, token): continue # Add NP mention - if debug: print("NP", token.lower_) + if debug: + print("NP or IN:", token.lower_) + if token.tag_ == 'IN': + print("IN tag") # Take care of 's if token.lower_ == "'s": if debug: print("'s detected") h = token.head - while h.head is not h: - if debug: print("token head", h, h.dep_, "head:", h.head) + j = 0 + while h.head.i != h.i and j < MAX_ITER: + if debug: + print("token head:", h, h.dep_, "head:", h.head) + print(id(h.head), id(h)) if h.dep_ == "nsubj": - minchild_idx = min((c.left_edge.i for c in doc if c.head == h.head and c.dep_ in nsubj_or_dep), + minchild_idx = min((c.left_edge.i for c in doc if c.head.i == h.head.i and c.dep_ in nsubj_or_dep), default=token.i) - maxchild_idx = max((c.right_edge.i for c in doc if c.head == h.head and c.dep_ in nsubj_or_dep), + maxchild_idx = max((c.right_edge.i for c in doc if c.head.i == h.head.i and c.dep_ in nsubj_or_dep), default=token.i) if debug: print("'s', i1:", doc[minchild_idx], " i2:", doc[maxchild_idx]) span = doc[minchild_idx : 
maxchild_idx+1] @@ -119,32 +124,49 @@ def cleanup_endings(left, right, token): mentions_spans.append(span) break h = h.head + j += 1 + assert j != MAX_ITER continue # clean up for c in doc: - if debug and c.head == token: print("🚧 token in span", c, c.head, c.dep_) - left = list(c.left_edge.i for c in doc if c.head == token) - if debug: print("left side", left) - right = list(c.right_edge.i for c in doc if c.head == token) - if debug: print("right side", right) + if debug and c.head.i == token.i: print("🚧 token in span:", c, "- head & dep:", c.head, c.dep_) + left = list(c.left_edge.i for c in doc if c.head.i == token.i) + right = list(c.right_edge.i for c in doc if c.head.i == token.i) + if token.tag_ == 'IN' and token.dep_ == "mark" and len(left) == 0 and len(right) == 0: + left = list(c.left_edge.i for c in doc if c.head.i == token.head.i) + right = list(c.right_edge.i for c in doc if c.head.i == token.head.i) + if debug: + print("left side:", left) + print("right side:", right) + minchild_idx = min(left) if left else token.i + maxchild_idx = max(right) if right else token.i + print("full span:", doc[minchild_idx:maxchild_idx+1]) start, end = cleanup_endings(left, right, token) + if start == end: + continue if doc[start].lower_ == "'s": continue # we probably already have stored this mention span = doc[start:end] - if debug: print("==-- full span store:", span) + if debug: + print("cleaned endings span:", doc[start:end]) + print("==-- full span store:", span) mentions_spans.append(span) - if any(tok.dep_ == "conj" for tok in span): + if debug and token.tag_ == 'IN': + print("IN tag") + if any(tok.dep_ in conj_or_prep for tok in span): if debug: print("Conjunction found, storing first element separately") for c in doc: - if c.head == token and c.dep_ != "conj": - if debug: print("left no conj", c, c.dep_, c.left_edge) - if debug: print("right no conj", c, c.dep_, c.right_edge) - left_no_conj = list(c.left_edge.i for c in doc if c.head == token and c.dep_ != "conj") - right_no_conj = list(c.right_edge.i for c in doc if c.head == token and c.dep_ != "conj") - if debug: print("left side no conj", [doc[i] for i in left_no_conj]) - if debug: print("right side no conj", [doc[i] for i in right_no_conj]) + if c.head.i == token.i and c.dep_ not in conj_or_prep: + if debug: print("left no conj:", c, 'dep & edge:', c.dep_, c.left_edge) + if debug: print("right no conj:", c, 'dep & edge:', c.dep_, c.right_edge) + left_no_conj = list(c.left_edge.i for c in doc if c.head.i == token.i and c.dep_ not in conj_or_prep) + right_no_conj = list(c.right_edge.i for c in doc if c.head.i == token.i and c.dep_ not in conj_or_prep) + if debug: print("left side no conj:", [doc[i] for i in left_no_conj]) + if debug: print("right side no conj:", [doc[i] for i in right_no_conj]) start, end = cleanup_endings(left_no_conj, right_no_conj, token) + if start == end: + continue span = doc[start:end] if debug: print("==-- full span store:", span) mentions_spans.append(span) @@ -152,7 +174,7 @@ def cleanup_endings(left, right, token): spans_set = set() cleaned_mentions_spans = [] for spans in mentions_spans: - if (spans.start, spans.end) not in spans_set: + if spans.end > spans.start and (spans.start, spans.end) not in spans_set: cleaned_mentions_spans.append(spans) spans_set.add((spans.start, spans.end)) @@ -170,7 +192,18 @@ def __new__(cls, span, mention_index, utterance_index, utterance_start_sent, spe obj = spacy.tokens.Span.__new__(cls, span.doc, span.start, span.end, *args, **kwargs) return obj - def __init__(self, 
span, mention_index, utterance_index, utterances_start_sent, speaker=None, gold_label=None): + def __init__(self, span, mention_index, utterance_index, utterances_start_sent, + speaker=None, gold_label=None): + ''' + Arguments: + span (spaCy Span): the spaCy span from which creating the Mention object + mention_index (int): index of the Mention in the Document + utterance_index (int): index of the utterance of the Mention in the Document + utterances_start_sent (int): index of the first sentence of the utterance of the Mention in the Document + (an utterance can comprise several sentences) + speaker (Speaker): the speaker of the mention + gold_label (anything): a gold label associated to the Mention (for training) + ''' self.index = mention_index self.utterance_index = utterance_index self.utterances_sent = utterances_start_sent + self._doc_sent_number() @@ -230,8 +263,8 @@ def find_type(self): def _doc_sent_number(self): ''' Index of the sentence of the Mention in the current utterance''' - for i, sent in enumerate(self.doc.sents): - if sent == self.sent: + for i, s in enumerate(self.doc.sents): + if s == self.sent: return i return None @@ -261,7 +294,7 @@ def speaker_match_mention(self, mention2): return self.speaker.speaker_matches_mention(mention2, strict_match=False) return False -class Speaker: +class Speaker(object): ''' A speaker with its names, list of mentions and matching test functions ''' @@ -269,13 +302,13 @@ def __init__(self, speaker_id, speaker_names=None): self.mentions = [] self.speaker_id = speaker_id if speaker_names is None: - self.speaker_names = [str(speaker_id)] + self.speaker_names = [unicode_(speaker_id)] elif isinstance(speaker_names, string_types): self.speaker_names = [speaker_names] elif len(speaker_names) > 1: self.speaker_names = speaker_names else: - self.speaker_names = str(speaker_names) + self.speaker_names = unicode_(speaker_names) self.speaker_tokens = [tok.lower() for s in self.speaker_names for tok in re.split(WHITESPACE_PATTERN, s)] def __str__(self): @@ -320,9 +353,9 @@ class EmbeddingExtractor: ''' Compute words embedding features for mentions ''' - def __init__(self, model_path): - self.average_mean, self.static_embeddings, self.static_voc = self.load_embeddings_from_file(model_path + "static_word") - _, self.tuned_embeddings, self.tuned_voc = self.load_embeddings_from_file(model_path + "tuned_word") + def __init__(self, pretrained_model_path): + _, self.static_embeddings, self.stat_idx, self.stat_voc = self.load_embeddings_from_file(pretrained_model_path + "static_word") + _, self.tuned_embeddings, self.tun_idx, self.tun_voc = self.load_embeddings_from_file(pretrained_model_path + "tuned_word") self.fallback = self.static_embeddings.get(UNKNOWN_WORD) self.shape = self.static_embeddings[UNKNOWN_WORD].shape @@ -331,23 +364,24 @@ def __init__(self, model_path): @staticmethod def load_embeddings_from_file(name): + print("Loading embeddings from", name) embeddings = {} - voc = {} + voc_to_idx = {} + idx_to_voc = [] mat = np.load(name+"_embeddings.npy") average_mean = np.average(mat, axis=0, weights=np.sum(mat, axis=1)) - with open(name+"_vocabulary.txt", encoding="utf8") as f: + with io.open(name+"_vocabulary.txt", 'r', encoding='utf-8') as f: for i, line in enumerate(f): embeddings[line.strip()] = mat[i, :] - voc[line.strip()] = i - return average_mean, embeddings, voc + voc_to_idx[line.strip()] = i + idx_to_voc.append(line.strip()) + return average_mean, embeddings, voc_to_idx, idx_to_voc @staticmethod def normalize_word(w): if w is None: - return 
"" - elif w.lower_ in NORMALIZE_DICT: - return NORMALIZE_DICT[w.lower_] - return w.lower_.replace("\\d", "0") + return MISSING_WORD + return re.sub(r"\d", u"0", w.lower_) def get_document_embedding(self, utterances_list): ''' Embedding for the document ''' @@ -355,31 +389,27 @@ def get_document_embedding(self, utterances_list): # return embed_vector embed_vector = np.zeros(self.shape) for utt in utterances_list: - embed_vector += self.get_average_embedding(utt) + _, utt_embed = self.get_average_embedding(utt) + embed_vector += utt_embed return embed_vector/max(len(utterances_list), 1) + def get_stat_word(self, word): + if word in self.static_embeddings: + return word, self.static_embeddings.get(word) + else: + return UNKNOWN_WORD, self.fallback + def get_word_embedding(self, word, static=False): ''' Embedding for a single word (tuned if possible, otherwise static) ''' norm_word = self.normalize_word(word) if static: - if norm_word in self.static_embeddings: - word = norm_word - embed = self.static_embeddings.get(norm_word) - else: - word = UNKNOWN_WORD - embed = self.fallback + return self.get_stat_word(norm_word) else: if norm_word in self.tuned_embeddings: - word = norm_word - embed = self.tuned_embeddings.get(norm_word) - elif norm_word in self.static_embeddings: - word = norm_word - embed = self.static_embeddings.get(norm_word) + return norm_word, self.tuned_embeddings.get(norm_word) else: - word = UNKNOWN_WORD - embed = self.fallback - return word, embed - + return self.get_stat_word(norm_word) + def get_word_in_sentence(self, word_idx, sentence): ''' Embedding for a word in a sentence ''' if word_idx < sentence.start or word_idx >= sentence.end: @@ -398,84 +428,99 @@ def get_average_embedding(self, token_list): return word_list, (embed_vector/max(len(word_list), 1)) def get_mention_embeddings(self, mention, doc_embedding): - ''' Embedding for a mention ''' - sent = mention.sent - mention_lefts = mention.doc[max(mention.start-5, sent.start):mention.start] - mention_rights = mention.doc[mention.end:min(mention.end+5, sent.end)] + ''' Get span (averaged) and word (single) embeddings of a mention ''' + st = mention.sent + mention_lefts = mention.doc[max(mention.start-5, st.start):mention.start] + mention_rights = mention.doc[mention.end:min(mention.end+5, st.end)] head = mention.root.head spans = [self.get_average_embedding(mention), self.get_average_embedding(mention_lefts), self.get_average_embedding(mention_rights), - self.get_average_embedding(sent), - (str(doc_embedding[0:8]) + "...", doc_embedding)] + self.get_average_embedding(st), + (unicode_(doc_embedding[0:8]) + "...", doc_embedding)] words = [self.get_word_embedding(mention.root), self.get_word_embedding(mention[0]), self.get_word_embedding(mention[-1]), - self.get_word_in_sentence(mention.start-1, sent), - self.get_word_in_sentence(mention.end, sent), - self.get_word_in_sentence(mention.start-2, sent), - self.get_word_in_sentence(mention.end+1, sent), + self.get_word_in_sentence(mention.start-1, st), + self.get_word_in_sentence(mention.end, st), + self.get_word_in_sentence(mention.start-2, st), + self.get_word_in_sentence(mention.end+1, st), self.get_word_embedding(head)] - spans_embeddings_ = {"Mention": spans[0][0], - "MentionLeft": spans[1][0], - "MentionRight": spans[2][0], - "Sentence": spans[3][0], - "Doc": spans[4][0]} - words_embeddings_ = {"MentionHead": words[0][0], - "MentionFirstWord": words[1][0], - "MentionLastWord": words[2][0], - "PreviousWord": words[3][0], - "NextWord": words[4][0], - "SecondPreviousWord": 
words[5][0], - "SecondNextWord": words[6][0], - "MentionRootHead": words[7][0]} + spans_embeddings_ = {"00_Mention": spans[0][0], + "01_MentionLeft": spans[1][0], + "02_MentionRight": spans[2][0], + "03_Sentence": spans[3][0], + "04_Doc": spans[4][0]} + words_embeddings_ = {"00_MentionHead": words[0][0], + "01_MentionFirstWord": words[1][0], + "02_MentionLastWord": words[2][0], + "03_PreviousWord": words[3][0], + "04_NextWord": words[4][0], + "05_SecondPreviousWord": words[5][0], + "06_SecondNextWord": words[6][0], + "07_MentionRootHead": words[7][0]} return (spans_embeddings_, words_embeddings_, - np.concatenate(list(em[1] for em in spans), axis=0)[: np.newaxis], - np.concatenate(list(em[1] for em in words), axis=0)[: np.newaxis]) + np.concatenate([em[1] for em in spans], axis=0), + np.concatenate([em[1] for em in words], axis=0)) -class Data: +class Document(object): ''' Main data class: encapsulate list of utterances, mentions and speakers Process utterances to extract mentions and pre-compute mentions features ''' - def __init__(self, nlp, model_path=None, conll=None, utterances=None, utterances_speaker=None, - speakers_names=None, use_no_coref_list=True, consider_speakers=False ,debug=False): + def __init__(self, nlp, utterances=None, utterances_speaker=None, speakers_names=None, + use_no_coref_list=False, consider_speakers=False, + trained_embed_path=None, embedding_extractor=None, + conll=None, debug=False): + ''' + Arguments: + nlp (spaCy Language Class): A spaCy Language Class for processing the text input + utterances: utterance(s) to load at initialization; see self.add_utterances() + utterances_speaker: speaker(s) of the utterance(s) to load; see self.add_utterances() + speakers_names: names of the speaker(s); see self.add_utterances() + use_no_coref_list (boolean): use a list of terms for which coreference is not performed + consider_speakers (boolean): take speaker information into account + trained_embed_path (string): path to a folder with pre-trained word embeddings + embedding_extractor (EmbeddingExtractor): Use a pre-loaded word embeddings extractor + conll (string): if training on CoNLL data, identifier of the document genre + debug (boolean): print debug information + ''' self.nlp = nlp self.use_no_coref_list = use_no_coref_list self.utterances = [] self.utterances_speaker = [] - self.last_utterances_loaded = None + self.last_utterances_loaded = [] self.mentions = [] self.speakers = {} self.n_sents = 0 self.debug = debug - self.consider_speakers = consider_speakers + self.consider_speakers = consider_speakers or conll is not None - self.genre_ = conll - if conll is not None: - self.genre = np.zeros((7,)) - genres = {"bc": 0, "bn": 1, "mz": 2, "nw": 3, "pt": 4, "tc": 5, "wb": 6} - #. 
We take broadcast conversations to use speaker infos - self.genre[genres[conll]] = 1 - else: - self.genre = np.array(0, ndmin=1, copy=False) - - if model_path is not None: - self.embed_extractor = EmbeddingExtractor(model_path) - assert self.embed_extractor.shape is not None - self.doc_embedding = np.zeros(self.embed_extractor.shape) + self.genre_, self.genre = self.set_genre(conll) + + if trained_embed_path is not None and embedding_extractor is None: + self.embed_extractor = EmbeddingExtractor(trained_embed_path) + elif embedding_extractor is not None: + self.embed_extractor = embedding_extractor else: self.embed_extractor = None - self.doc_embedding = None if utterances: self.add_utterances(utterances, utterances_speaker, speakers_names) + def set_genre(self, conll): + if conll is not None: + genre = np.zeros((7,)) + genre[conll] = 1 + else: + genre = np.array(0, ndmin=1, copy=False) + return conll, genre + def __str__(self): return ' \n {}\n \n {}' \ - .format('\n '.join(str(i) + " " + str(s) for i, s in zip(self.utterances, self.utterances_speaker)), - '\n '.join(str(i) + " " + str(i.speaker) for i in self.mentions)) + .format('\n '.join(unicode_(i) + " " + unicode_(s) for i, s in zip(self.utterances, self.utterances_speaker)), + '\n '.join(unicode_(i) + " " + unicode_(i.speaker) for i in self.mentions)) def __len__(self): ''' Return the number of mentions (not utterances) since it is what we really care about ''' @@ -527,7 +572,16 @@ def add_utterances(self, utterances, utterances_speaker=None, speakers_names=Non utterances_speaker = ((i + a + 1) % 2 for i in range(len(utterances))) utterances_index = [] utt_start = len(self.utterances) - for utt_index, (doc, speaker_id) in enumerate(zip_longest(self.nlp.pipe(utterances), utterances_speaker)): + for utt_index, (utterance, speaker_id) in enumerate(zip_longest(utterances, utterances_speaker)): + if utterance is None: + break + # Pipe currently broken in spacy 2 alpha + # Also, spacy 2 currently throws an exception on empty strings + try: + doc = self.nlp(utterance) + except IndexError: + doc = self.nlp(u" ") + if self.debug: print("Empty string") if speaker_id not in self.speakers: speaker_name = speakers_names.get(speaker_id, None) if speakers_names else None self.speakers[speaker_id] = Speaker(speaker_id, speaker_name) @@ -559,24 +613,24 @@ def set_mentions_features(self): ''' Compute features for the extracted mentions ''' - #TODO : we should probably update doc embedding here (not used currently) + doc_embedding = self.embed_extractor.get_document_embedding(self.utterances) if self.embed_extractor is not None else None for mention in self.mentions: one_hot_type = np.zeros((4,)) one_hot_type[mention.mention_type] = 1 - features_ = {"MentionType": mention.mention_type, - "MentionLength": len(mention)-1, - "MentionNormLocation": (mention.index)/len(self.mentions), - "IsMentionNested": 1 if any((m is not mention + features_ = {"01_MentionType": mention.mention_type, + "02_MentionLength": len(mention)-1, + "03_MentionNormLocation": (mention.index)/len(self.mentions), + "04_IsMentionNested": 1 if any((m is not mention and m.utterances_sent == mention.utterances_sent and m.start <= mention.start and mention.end <= m.end) for m in self.mentions) else 0} features = np.concatenate([one_hot_type, - encode_distance(features_["MentionLength"]), - np.array(features_["MentionNormLocation"], ndmin=1, copy=False), - np.array(features_["IsMentionNested"], ndmin=1, copy=False) + encode_distance(features_["02_MentionLength"]), + 
np.array(features_["03_MentionNormLocation"], ndmin=1, copy=False), + np.array(features_["04_IsMentionNested"], ndmin=1, copy=False) ], axis=0) - spans_embeddings_, words_embeddings_, spans_embeddings, words_embeddings = self.embed_extractor.get_mention_embeddings(mention, self.doc_embedding) + spans_embeddings_, words_embeddings_, spans_embeddings, words_embeddings = self.embed_extractor.get_mention_embeddings(mention, doc_embedding) mention.features_ = features_ mention.features = features mention.spans_embeddings = spans_embeddings @@ -592,27 +646,27 @@ def get_single_mention_features(self, mention): def get_pair_mentions_features(self, m1, m2): ''' Features for pair of mentions (same speakers, speaker mentioned, string match)''' - features_ = {"SameSpeaker": 1 if self.consider_speakers and m1.speaker == m2.speaker else 0, - "AntMatchMentionSpeaker": 1 if self.consider_speakers and m2.speaker_match_mention(m1) else 0, - "MentionMatchSpeaker": 1 if self.consider_speakers and m1.speaker_match_mention(m2) else 0, - "HeadsAgree": 1 if m1.heads_agree(m2) else 0, - "ExactStringMatch": 1 if m1.exact_match(m2) else 0, - "RelaxedStringMatch": 1 if m1.relaxed_match(m2) else 0, - "SentenceDistance": m2.utterances_sent - m1.utterances_sent, - "MentionDistance": 1, #m2.index - m1.index - 1, - "Overlapping": 1 if (m1.utterances_sent == m2.utterances_sent and m1.end > m2.start) else 0, - "M1Features": m1.features_, - "M2Features": m2.features_, - "DocGenre": self.genre_} - pairwise_features = [np.array([features_["SameSpeaker"], - features_["AntMatchMentionSpeaker"], - features_["MentionMatchSpeaker"], - features_["HeadsAgree"], - features_["ExactStringMatch"], - features_["RelaxedStringMatch"]]), - encode_distance(features_["SentenceDistance"]), - encode_distance(features_["MentionDistance"]), - np.array(features_["Overlapping"], ndmin=1), + features_ = {"00_SameSpeaker": 1 if self.consider_speakers and m1.speaker == m2.speaker else 0, + "01_AntMatchMentionSpeaker": 1 if self.consider_speakers and m2.speaker_match_mention(m1) else 0, + "02_MentionMatchSpeaker": 1 if self.consider_speakers and m1.speaker_match_mention(m2) else 0, + "03_HeadsAgree": 1 if m1.heads_agree(m2) else 0, + "04_ExactStringMatch": 1 if m1.exact_match(m2) else 0, + "05_RelaxedStringMatch": 1 if m1.relaxed_match(m2) else 0, + "06_SentenceDistance": m2.utterances_sent - m1.utterances_sent, + "07_MentionDistance": m2.index - m1.index - 1, + "08_Overlapping": 1 if (m1.utterances_sent == m2.utterances_sent and m1.end > m2.start) else 0, + "09_M1Features": m1.features_, + "10_M2Features": m2.features_, + "11_DocGenre": self.genre_} + pairwise_features = [np.array([features_["00_SameSpeaker"], + features_["01_AntMatchMentionSpeaker"], + features_["02_MentionMatchSpeaker"], + features_["03_HeadsAgree"], + features_["04_ExactStringMatch"], + features_["05_RelaxedStringMatch"]]), + encode_distance(features_["06_SentenceDistance"]), + encode_distance(features_["07_MentionDistance"]), + np.array(features_["08_Overlapping"], ndmin=1), m1.features, m2.features, self.genre] @@ -636,7 +690,7 @@ def get_candidate_mentions(self, last_utterances_added=False): for i in iterator: yield i - def get_candidate_pairs(self, mentions=None, max_distance=50, max_distance_with_match=500): + def get_candidate_pairs(self, mentions=None, max_distance=50, max_distance_with_match=500, debug=False): ''' Yield tuples of mentions, dictionnary of candidate antecedents for the mention @@ -648,21 +702,46 @@ def get_candidate_pairs(self, mentions=None, max_distance=50, 
max_distance_with_ ''' if mentions is None: mentions = range(len(self.mentions)) - - word_to_mentions = {} - for i in mentions: - for tok in self.mentions[i].content_words: - if not tok in word_to_mentions: - word_to_mentions[tok] = [i] - else: - word_to_mentions[tok].append(i) + if debug: print("get_candidate_pairs: mentions", mentions) + + if max_distance_with_match is not None: + word_to_mentions = {} + for i in mentions: + for tok in self.mentions[i].content_words: + if not tok in word_to_mentions: + word_to_mentions[tok] = [i] + else: + word_to_mentions[tok].append(i) for i in mentions: - antecedents = set(range(max(0, i - max_distance), i)) - for tok in self.mentions[i].content_words: - with_string_match = word_to_mentions.get(tok, None) - for match_idx in with_string_match: - if match_idx < i and match_idx >= i - max_distance_with_match: - antecedents.add(match_idx) - if antecedents: - yield i, antecedents + antecedents = set(range(i)) if max_distance is None else set(range(max(0, i - max_distance), i)) + if debug: print("antecedents", antecedents) + if max_distance_with_match is not None: + for tok in self.mentions[i].content_words: + with_string_match = word_to_mentions.get(tok, None) + for match_idx in with_string_match: + if match_idx < i and match_idx >= i - max_distance_with_match: + antecedents.add(match_idx) + yield i, antecedents + +def mention_detection_debug(sentence): + print(u"🌋 Loading spacy model") + try: + spacy.info('en_core_web_sm') + model = 'en_core_web_sm' + except IOError: + print("No spacy 2 model detected, using spacy1 'en' model") + spacy.info('en') + model = 'en' + nlp = spacy.load(model) + doc = nlp(sentence.decode('utf-8')) + mentions = extract_mentions_spans(doc, use_no_coref_list=False, debug=True) + for mention in mentions: + print(mention) + +if __name__ == '__main__': + if len(sys.argv) > 1: + sent = sys.argv[1] + mention_detection_debug(sent) + else: + mention_detection_debug(u"My sister has a dog. 
She loves him.") diff --git a/neuralcoref/evaluator.py b/neuralcoref/evaluator.py new file mode 100644 index 0000000..e973207 --- /dev/null +++ b/neuralcoref/evaluator.py @@ -0,0 +1,249 @@ +# coding: utf8 +"""Conll Evaluation - Scoring""" +from __future__ import absolute_import +from __future__ import unicode_literals +from __future__ import print_function + +import os +import subprocess +import io +#import concurrent.futures +import pickle + +from torch.autograd import Variable +from torch.utils.data import DataLoader + +#from algorithm import Coref +from neuralcoref.conllparser import FEATURES_NAMES +from neuralcoref.dataset import NCBatchSampler, padder_collate +from neuralcoref.compat import unicode_ + +PACKAGE_DIRECTORY = os.path.dirname(os.path.abspath(__file__)) + +OUT_PATH = os.path.join(PACKAGE_DIRECTORY, "test_corefs.txt") #fernandes.txt")# +ALL_MENTIONS_PATH = os.path.join(PACKAGE_DIRECTORY, "test_mentions.txt") +#KEY_PATH = os.path.join(PACKAGE_DIRECTORY, "conll-2012-test-test-key.txt") +SCORING_SCRIPT = os.path.join(PACKAGE_DIRECTORY, "scorer_wrapper.pl") + +METRICS = ['muc', 'bcub', 'ceafm', 'ceafe', 'blanc'] +CONLL_METRICS = ['muc', 'bcub', 'ceafe'] + +class ConllEvaluator(object): + def __init__(self, model, dataset, test_data_path, test_key_file, embed_path, + args): + """ Evaluate the pytorch model that is currently being build + We take the embedding vocabulary currently being trained + """ + self.test_key_file = test_key_file + self.cuda = args.cuda + self.model = model + batch_sampler = NCBatchSampler(dataset.mentions_pair_length, + batchsize=args.batchsize, shuffle=False) + self.dataloader = DataLoader(dataset, + collate_fn=padder_collate, + batch_sampler=batch_sampler, + num_workers=args.numworkers, + pin_memory=args.cuda) + self.mentions_idx, self.n_pairs = batch_sampler.get_batch_info() + self.load_meta(test_data_path) + + def load_meta(self, test_data_path): + # Load meta files + datas = {} + if not os.listdir(test_data_path): + raise ValueError("Empty test_data_path") + bin_files_found = False + print("Reading ", end='') + for file_name in os.listdir(test_data_path): + if '.bin' not in file_name: + continue + bin_files_found = True + print(file_name, end=', ') + with open(test_data_path + file_name, 'rb') as f: + datas[file_name.split(u'.')[0]] = pickle.load(f) + if not bin_files_found: + raise ValueError("Can't find bin files in {}".format(test_data_path)) + print("Done") + self.m_loc = datas[FEATURES_NAMES[9]] + self.tokens = datas[FEATURES_NAMES[10]] + self.lookup = datas[FEATURES_NAMES[11]] + self.docs = datas[FEATURES_NAMES[12]] + self.flat_m_idx = list((doc_i, m_i) for doc_i, l in enumerate(self.m_loc) for m_i in range(len(l))) + + ########################### + #### CLUSTER FUNCTIONS #### + ########################### + + def _prepare_clusters(self): + ''' + Clean up and prepare one cluster for each mention + ''' + self.mention_to_cluster = list(list(range(len(doc_mentions))) for doc_mentions in self.m_loc) + self.clusters = list(dict((i, [i]) for i in doc_mentions) for doc_mentions in self.mention_to_cluster) + + def _merge_coreference_clusters(self, ant_flat_idx, mention_flat_idx): + ''' + Merge two clusters together + ''' + doc_idx, ant_idx = self.flat_m_idx[ant_flat_idx] + doc_idx2, mention_idx = self.flat_m_idx[mention_flat_idx] + assert doc_idx2 == doc_idx + if self.mention_to_cluster[doc_idx][ant_idx] == self.mention_to_cluster[doc_idx][mention_idx]: + return + remove_id = self.mention_to_cluster[doc_idx][ant_idx] + keep_id = 
self.mention_to_cluster[doc_idx][mention_idx] + for idx in self.clusters[doc_idx][remove_id]: + self.mention_to_cluster[doc_idx][idx] = keep_id + self.clusters[doc_idx][keep_id].append(idx) + del self.clusters[doc_idx][remove_id] + + def remove_singletons_clusters(self, debug=False): + for doc_idx in range(len(self.docs)): + remove_id = [] + kept = False + for key, mentions in self.clusters[doc_idx].items(): + if len(mentions) == 1: + remove_id.append(key) + self.mention_to_cluster[doc_idx][key] = None + else: + kept = True + if debug: + l = list(self.m_loc[doc_idx][m][3] for m in mentions) + print("Cluster found", key) + print("Corefs:", "|".join(str(self.docs[doc_idx]['mentions'][m_idx]) \ + + " (" + str(m_idx) + ")" for m_idx in l)) + if not kept: + print("❄️ No coreference found") + for rem in remove_id: + del self.clusters[doc_idx][rem] + + def display_clusters(self, doc_idx=None): + ''' + Print cluster information + ''' + doc_it = range(len(self.docs)) if doc_idx is None else [doc_idx] + for d_i in doc_it: + print("Clusters in doc:", d_i, self.docs[d_i]['name'], self.docs[d_i]['part']) + print(self.clusters[d_i]) + for key, mentions in self.clusters[d_i].items(): + l = list(self.m_loc[d_i][m][3] for m in mentions) + print("cluster", key, "(", ", ".join(self.docs[d_i]['mentions'][m_idx] for m_idx in l), ")") + + ######################## + #### MAIN FUNCTIONS #### + ######################## + def get_max_score(self, batch, debug=False): + inputs, mask = batch + inputs = tuple(Variable(i, volatile=True) for i in inputs) + if self.cuda: + inputs = tuple(i.cuda() for i in inputs) + mask = mask.cuda() + self.model.eval() + scores = self.model.forward(inputs, concat_axis=1).data + scores.masked_fill_(mask, -float('Inf')) + _, max_idx = scores.max(dim=1) # We may want to weight the single score with coref.greedyness + if debug: + print("Max_idx", max_idx) + return scores.cpu().numpy(), max_idx.cpu().numpy() + + def test_model(self): + print("🌋 Test evaluator / print all mentions") + self.build_test_file(out_path=ALL_MENTIONS_PATH, print_all_mentions=True) + self.get_score(file_path=ALL_MENTIONS_PATH) + + def build_test_file(self, out_path=OUT_PATH, remove_singleton=True, print_all_mentions=False, debug=None): + """ Build a test file to supply to the coreference scoring perl script + """ + print("🌋 Building test file") + self._prepare_clusters() + self.dataloader.dataset.no_targets = True + if not print_all_mentions: + print("🌋 Build coreference clusters") + cur_m = 0 + for sample_batched, mentions_idx, n_pairs_l in zip(self.dataloader, self.mentions_idx, self.n_pairs): + scores, max_i = self.get_max_score(sample_batched) + for m_idx, ind, n_pairs in zip(mentions_idx, max_i, n_pairs_l): + if ind < n_pairs : # the single score is not the highest, we have a match ! 
+ prev_idx = m_idx - n_pairs + ind + if debug is not None and (debug == -1 or debug == prev_idx or debug == m_idx): + m1_doc, m1_idx = self.flat_m_idx[m_idx] + m1 = self.docs[m1_doc]['mentions'][m1_idx] + m2_doc, m2_idx = self.flat_m_idx[prev_idx] + m2 = self.docs[m2_doc]['mentions'][m2_idx] + print("We have a match between:", m1, "(" + str(m1_idx) + ")", "and:", m2, "(" + str(m2_idx) + ")") + self._merge_coreference_clusters(prev_idx, m_idx) + if remove_singleton: + self.remove_singletons_clusters() + self.dataloader.dataset.no_targets = False + + print("🌋 Construct test file") + out_str = "" + for doc, d_tokens, d_lookup, d_m_loc, d_m_to_c in zip(self.docs, self.tokens, self.lookup, self.m_loc, self.mention_to_cluster): + out_str += u"#begin document (" + doc['name'] + u"); part " + doc['part'] + u"\n" + for utt_idx, (c_tokens, c_lookup) in enumerate(zip(d_tokens, d_lookup)): + for i, (token, lookup) in enumerate(zip(c_tokens, c_lookup)): + out_coref = u"" + for m_str, mention, mention_cluster in zip(doc['mentions'], d_m_loc, d_m_to_c): + m_start, m_end, m_utt, m_idx, m_doc = mention + if mention_cluster is None: + pass + elif m_utt == utt_idx: + if m_start in lookup: + out_coref += u"|" if out_coref else u"" + out_coref += u"(" + unicode_(mention_cluster) + if (m_end - 1) in lookup: + out_coref += u")" + else: + out_coref += u"" + elif (m_end - 1) in lookup: + out_coref += u"|" if out_coref else u"" + out_coref += unicode_(mention_cluster) + u")" + out_line = doc['name'] + u" " + doc['part'] + u" " + unicode_(i) \ + + u" " + token + u" " + out_line += u"-" if len(out_coref) == 0 else out_coref + out_str += out_line + u"\n" + out_str += u"\n" + out_str += u"#end document\n" + + # Write test file + print("Writing in", out_path) + with io.open(out_path, 'w', encoding='utf-8') as out_file: + out_file.write(out_str) + + def get_score(self, file_path=OUT_PATH, debug=False): + """ Call the coreference scoring perl script on the created test file + """ + print("🌋 Computing score") + score = {} + ident = None + for metric_name in CONLL_METRICS: + if debug: print("Computing metric:", metric_name) + try: + scorer_out = subprocess.check_output(["perl", + SCORING_SCRIPT, + metric_name, + self.test_key_file, + file_path], stderr=subprocess.STDOUT, encoding='utf-8') + except subprocess.CalledProcessError as err: + print("Error during the scoring") + print(err) + print(err.output) + raise + if debug: print("scorer_out", scorer_out) + value, ident = scorer_out.split(u"\n")[-2], scorer_out.split(u"\n")[-1] + if debug: print("value", value, "identification", ident) + NR, DR, NP, DP = [float(x) for x in value.split(u" ")] + ident_NR, ident_DR, ident_NP, ident_DP = [float(x) for x in ident.split(u" ")] + precision = NP/DP if DP else 0 + recall = NR/DR if DR else 0 + F1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0 + ident_precision = ident_NP/ident_DP if ident_DP else 0 + ident_recall = ident_NR/ident_DR if ident_DR else 0 + ident_F1 = 2 * ident_precision * ident_recall / (ident_precision + ident_recall) if ident_precision + ident_recall > 0 else 0 + score[metric_name] = (precision, recall, F1) + ident = (ident_precision, ident_recall, ident_F1, ident_NR, ident_DR, ident_NP, ident_DP) + F1_conll = sum([score[metric][2] for metric in CONLL_METRICS])/len(CONLL_METRICS) + print("Mention identification recall", ident[1], "<= Detected mentions", ident[3], "True mentions", ident[4]) + print("Scores", score) + print("F1_conll", F1_conll) + return score, F1_conll, ident diff --git 
a/neuralcoref/learn.py b/neuralcoref/learn.py new file mode 100644 index 0000000..e360187 --- /dev/null +++ b/neuralcoref/learn.py @@ -0,0 +1,323 @@ +# coding: utf8 +"""Conll training algorithm""" +from __future__ import absolute_import +from __future__ import unicode_literals +from __future__ import print_function + +import os +import time +import argparse +import socket +from datetime import datetime +import numpy as np + +import torch +import torch.nn as nn +from torch.autograd import Variable +from torch.optim import RMSprop +from torch.utils.data import DataLoader +from tensorboardX import SummaryWriter + +from neuralcoref.model import Model +from neuralcoref.dataset import (NCDataset, NCBatchSampler, + load_embeddings_from_file, padder_collate, + SIZE_PAIR_IN, SIZE_SINGLE_IN, SIZE_EMBEDDING) +from neuralcoref.evaluator import ConllEvaluator + +PACKAGE_DIRECTORY = os.path.dirname(os.path.abspath(__file__)) +STAGES = ["allpairs", "toppairs", "ranking"] + +def clipped_sigmoid(inputs): + epsilon = 1.0e-7 + return torch.sigmoid(inputs).clamp(epsilon, 1.0 - epsilon) + +def get_all_pairs_loss(n): + def all_pair_loss(scores, targets): + """ All pairs and single mentions probabilistic loss + """ + labels = targets[0] + weights = targets[4].data if len(targets) == 5 else None + loss_op = nn.BCEWithLogitsLoss(weight=weights, size_average=False) + loss = loss_op(scores, labels) + return loss / n + return all_pair_loss + +def get_top_pair_loss(n): + def top_pair_loss(scores, targets, debug=False): + """ Top pairs (best true and best mistaken) and single mention probabilistic loss + """ + true_ants = targets[2] + false_ants = targets[3] if len(targets) == 5 else None + s_scores = clipped_sigmoid(scores) + true_pairs = torch.gather(s_scores, 1, true_ants) + top_true, top_true_arg = torch.log(true_pairs).max(dim=1) # max(log(p)), p=sigmoid(s) + if debug: + print("true_pairs", true_pairs.data) + print("top_true", top_true.data) + print("top_true_arg", top_true_arg.data) + out_score = torch.sum(top_true).neg() + if false_ants is not None: # We have no false antecedents when there are no pairs + false_pairs = torch.gather(s_scores, 1, false_ants) + top_false, _ = torch.log(1-false_pairs).min(dim=1) # min(log(1-p)), p=sigmoid(s) + out_score = out_score + torch.sum(top_false).neg() + return out_score / n + return top_pair_loss + +def get_ranking_loss(n): + def ranking_loss(scores, targets): + """ Slack-rescaled max margin loss + """ + costs = targets[1] + true_ants = targets[2] + weights = targets[4] if len(targets) == 5 else None + true_ant_score = torch.gather(scores, 1, true_ants) + top_true, _ = true_ant_score.max(dim=1) + tmp_loss = scores.add(1).add(top_true.unsqueeze(1).neg()) # 1 + scores - top_true + if weights is not None: + tmp_loss = tmp_loss.mul(weights) + tmp_loss = tmp_loss.mul(costs) + loss, _ = tmp_loss.max(dim=1) + out_score = torch.sum(loss) + return out_score / n + return ranking_loss + +def decrease_lr(optim_func, factor=0.1, min_lrs=0, eps=0, verbose=True): + for i, param_group in enumerate(optim_func.param_groups): + old_lr = float(param_group['lr']) + new_lr = max(old_lr * factor, min_lrs) + if old_lr - new_lr > eps: + param_group['lr'] = new_lr + if verbose: + print('Reducing learning rate' + ' of group {} to {:.4e}.'.format(i, new_lr)) + return new_lr + +def load_model(model, path): + print("⛄️ Reloading model from", path) + model.load_state_dict(torch.load(path) if args.cuda \ + else torch.load(path, map_location=lambda storage, loc: storage)) + +def run_model(args): + 
print("Training for", args.all_pairs_epoch, + args.top_pairs_epoch, args.ranking_epoch, "epochs") + # Tensorboard server + writer = SummaryWriter() + + # Load datasets and embeddings + embed_path = args.weights if args.weights is not None else args.train + tensor_embeddings, voc = load_embeddings_from_file(embed_path + "tuned_word") + dataset = NCDataset(args.train, args) + eval_dataset = NCDataset(args.eval, args) + print("Vocabulary:", len(voc)) + + # Construct model + print("🏝 Build model") + model = Model(len(voc), SIZE_EMBEDDING, args.h1, + args.h2, args.h3, SIZE_PAIR_IN, SIZE_SINGLE_IN) + model.load_embeddings(tensor_embeddings) + if args.cuda: + model.cuda() + if args.weights is not None: + print("🏝 Loading pre-trained weights") + model.load_weights(args.weights) + if args.checkpoint_file is not None: + print("⛄️ Loading model from", args.checkpoint_file) + model.load_state_dict(torch.load(args.checkpoint_file) if args.cuda \ + else torch.load(args.checkpoint_file, map_location=lambda storage, loc: storage)) + + print("🏝 Loading conll evaluator") + eval_evaluator = ConllEvaluator(model, eval_dataset, args.eval, args.evalkey, + embed_path, args) + train_evaluator = ConllEvaluator(model, dataset, args.train, args.trainkey, + embed_path, args) + print("🏝 Testing evaluator and getting first eval score") + eval_evaluator.test_model() + start_time = time.time() + eval_evaluator.build_test_file() + score, f1_conll, ident = eval_evaluator.get_score() + elapsed = time.time() - start_time + print('|| s/evaluation {:5.2f}'.format(elapsed)) + writer.add_scalar("eval/" + "F1_conll", f1_conll, 0) + + # Preparing dataloader + print("🏝 Preparing dataloader") + print("Dataloader parameters: batchsize", args.batchsize, "numworkers", args.numworkers) + batch_sampler = NCBatchSampler(dataset.mentions_pair_length, shuffle=True, + batchsize=args.batchsize) + dataloader = DataLoader(dataset, collate_fn=padder_collate, batch_sampler=batch_sampler, + num_workers=args.numworkers, pin_memory=args.cuda) + mentions_idx, n_pairs = batch_sampler.get_batch_info() + + print("🏝 Start training") + g_step = 0 + start_from = args.startstep if args.startstep is not None and args.startstage is not None else 0 + def run_epochs(start_epoch, end_epoch, loss_func, optim_func, save_name, lr, g_step, debug=None): + best_model_path = args.save_path + "best_model" + save_name + start_time_all = time.time() + best_f1_conll = 0 + lower_eval = 0 + for epoch in range(start_epoch, end_epoch): + """ Run an epoch """ + print("🚘 {} Epoch {:d}".format(save_name, epoch)) + model.train() + start_time_log = time.time() + start_time_epoch = time.time() + epoch_loss = 0 + for batch_i, (m_idx, n_pairs_l, batch) in enumerate(zip(mentions_idx, n_pairs, dataloader)): + if debug is not None and (debug == -1 or debug in m_idx): + l = list(dataset.flat_m_loc[m][2:] for m in m_idx) + print("🏔 Batch", batch_i, "m_idx:", "|".join(str(i) for i in m_idx),"mentions:","|".join(dataset.docs[d]['mentions'][i] for u, i, d in l)) + print("Batch n_pairs:","|".join(str(p) for p in n_pairs_l)) + inputs, targets = batch + inputs = tuple(Variable(inp, requires_grad=False) for inp in inputs) + targets = tuple(Variable(tar, requires_grad=False) for tar in targets) + if args.cuda: + inputs = tuple(i.cuda() for i in inputs) + targets = tuple(t.cuda() for t in targets) + scores = model(inputs) + if debug is not None and (debug == -1 or debug in m_idx): + print("Scores:\n" + "\n".join("|".join(str(s) for s in s_l) for s_l in scores.data.cpu().numpy())) + 
print("Labels:\n" + "\n".join("|".join(str(s) for s in s_l) for s_l in targets[0].data.cpu().numpy())) + loss = loss_func(scores, targets) + if debug is not None and (debug == -1 or debug in m_idx): + print('Loss', loss.data[0]) + # Zero gradients, perform a backward pass, and update the weights. + optim_func.zero_grad() + loss.backward() + epoch_loss += loss.data[0] + optim_func.step() + writer.add_scalar("train/" + save_name + "_loss", loss.data[0], g_step) + writer.add_scalar("meta/" + "lr", lr, g_step) + writer.add_scalar("meta/" + "stage", STAGES.index(save_name), g_step) + g_step += 1 + if batch_i % args.log_interval == 0 and batch_i > 0: + elapsed = time.time() - start_time_log + print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:.2e} | ms/batch {:5.2f} | ' + 'loss {:.2e}'.format( + epoch, batch_i, len(dataloader), optim_func.param_groups[0]['lr'], + elapsed * 1000 / args.log_interval, loss.data[0])) + start_time_log = time.time() + elapsed_all = time.time() - start_time_all + elapsed_epoch = time.time() - start_time_epoch + print('|| min/epoch {:5.2f} | est. remaining time (h) {:5.2f} | loss {:.2e}'.format(elapsed_epoch/60, + elapsed_all/3600*float(end_epoch-epoch)/float(epoch-start_epoch+1), epoch_loss)) + writer.add_scalar("epoch/" + "loss", epoch_loss, g_step) + if epoch % args.conll_train_interval == 0: + start_time = time.time() + train_evaluator.build_test_file() + score, f1_conll, ident = train_evaluator.get_score() + elapsed = time.time() - start_time + print('|| min/train evaluation {:5.2f} | F1_conll {:5.2f}'.format(elapsed/60, f1_conll)) + writer.add_scalar("epoch/" + "F1_conll", f1_conll, g_step) + if epoch % args.conll_eval_interval == 0: + start_time = time.time() + eval_evaluator.build_test_file() + score, f1_conll, ident = eval_evaluator.get_score() + elapsed = time.time() - start_time + print('|| min/evaluation {:5.2f}'.format(elapsed/60)) + writer.add_scalar("eval/" + "F1_conll", f1_conll, g_step) + g_step += 1 + save_path = args.save_path + save_name + "_" + str(epoch) + torch.save(model.state_dict(), save_path) + if f1_conll > best_f1_conll: + best_f1_conll = f1_conll + torch.save(model.state_dict(), best_model_path) + lower_eval = 0 + elif args.on_eval_decrease != 'nothing': + print("Evaluation metric decreases") + lower_eval += 1 + if lower_eval >= args.patience: + if args.on_eval_decrease == 'divide_lr' or args.on_eval_decrease == 'divide_then_next': + print("reload best model and decrease lr") + load_model(model, best_model_path) + lr = decrease_lr(optim_func) + if args.on_eval_decrease == 'next_stage' or lr <= args.min_lr: + print("Switch to next stage") + break + # Save last step + start_time = time.time() + eval_evaluator.build_test_file() + score, f1_conll, ident = eval_evaluator.get_score() + elapsed = time.time() - start_time + print('|| min/evaluation {:5.2f}'.format(elapsed/60)) + writer.add_scalar("eval/" + "F1_conll", f1_conll, g_step) + g_step += 1 + save_path = args.save_path + save_name + "_" + str(epoch) + torch.save(model.state_dict(), save_path) + load_model(model, best_model_path) + return g_step + + if args.startstage is None or args.startstage == "allpairs": + optimizer = RMSprop(model.parameters(), lr=args.all_pairs_lr, weight_decay=args.all_pairs_l2) + loss_func = get_all_pairs_loss(batch_sampler.pairs_per_batch) + g_step = run_epochs(start_from, args.all_pairs_epoch, loss_func, optimizer, + "allpairs", args.all_pairs_lr, g_step) + start_from = 0 + + if args.startstage is None or args.startstage in ["allpairs", "toppairs"]: + optimizer = 
RMSprop(model.parameters(), lr=args.top_pairs_lr, weight_decay=args.top_pairs_l2) + loss_func = get_top_pair_loss(10 * batch_sampler.mentions_per_batch) + g_step = run_epochs(start_from, args.top_pairs_epoch, loss_func, optimizer, + "toppairs", args.top_pairs_lr, g_step) + start_from = 0 + + if args.startstage is None or args.startstage in ["ranking", "allpairs", "toppairs"]: + optimizer = RMSprop(model.parameters(), lr=args.ranking_lr, weight_decay=args.ranking_l2) + loss_func = get_ranking_loss(batch_sampler.mentions_per_batch) + g_step = run_epochs(start_from, args.ranking_epoch, loss_func, optimizer, + "ranking", args.ranking_lr, g_step) + +if __name__ == '__main__': + DIR_PATH = os.path.dirname(os.path.realpath(__file__)) + parser = argparse.ArgumentParser(description='Training the neural coreference model') + parser.add_argument('--train', type=str, default=DIR_PATH + '/data/tiny/', help='Path to the train dataset') + parser.add_argument('--eval', type=str, default=DIR_PATH + '/data/tiny/', help='Path to the eval dataset') + parser.add_argument('--evalkey', type=str, help='Path to an optional key file for scoring') + parser.add_argument('--weights', type=str, help='Path to pre-trained weights (if you only want to test the scoring for e.g.)') + parser.add_argument('--batchsize', type=int, default=20000, help='Size of a batch in total number of pairs') + parser.add_argument('--numworkers', type=int, default=8, help='Number of workers for loading batches') + parser.add_argument('--startstage', type=str, help='Start from a specific stage ("allpairs", "toppairs", "ranking")') + parser.add_argument('--startstep', type=int, help='Start from a specific step') + parser.add_argument('--checkpoint_file', type=str, help='Start from a previously saved checkpoint file') + parser.add_argument('--log_interval', type=int, default=10, help='test every X mini-batches') + parser.add_argument('--conll_eval_interval', type=int, default=10, help='evaluate eval F1 conll every X epochs') + parser.add_argument('--conll_train_interval', type=int, default=20, help='evaluate train F1 conll every X epochs') + parser.add_argument('--seed', type=int, default=1111, help='random seed') + parser.add_argument('--costfn', type=float, default=0.8, help='cost of false new') + parser.add_argument('--costfl', type=float, default=0.4, help='cost of false link') + parser.add_argument('--costwl', type=float, default=1.0, help='cost of wrong link') + parser.add_argument('--h1', type=int, default=1000, help='number of hidden unit on layer 1') + parser.add_argument('--h2', type=int, default=500, help='number of hidden unit on layer 2') + parser.add_argument('--h3', type=int, default=500, help='number of hidden unit on layer 3') + parser.add_argument('--all_pairs_epoch', type=int, default=200, help='number of epochs for all-pairs pre-training') + parser.add_argument('--top_pairs_epoch', type=int, default=200, help='number of epochs for top-pairs pre-training') + parser.add_argument('--ranking_epoch', type=int, default=200, help='number of epochs for ranking training') + parser.add_argument('--all_pairs_lr', type=float, default=2e-4, help='all pairs pre-training learning rate') + parser.add_argument('--top_pairs_lr', type=float, default=2e-4, help='top pairs pre-training learning rate') + parser.add_argument('--ranking_lr', type=float, default=2e-6, help='ranking training learning rate') + parser.add_argument('--all_pairs_l2', type=float, default=1e-6, help='all pairs pre-training l2 regularization') + 
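[Editor's note - the remaining hyper-parameters of the argument list continue below. To make the staged schedule above easier to follow (all-pairs pre-training, then top-pairs, then ranking, each with its own RMSprop learning rate and L2 weight decay, plus a patience counter that lowers the learning rate or jumps to the next stage once the eval F1 stops improving), here is a minimal, self-contained sketch. It is not part of the diff: ToyNet, toy_eval, the fixed epoch count and the divide-by-10 factor are illustrative placeholders for the real model, the CoNLL evaluator, args.*_epoch and decrease_lr().

    import torch
    import torch.nn as nn
    from torch.optim import RMSprop

    class ToyNet(nn.Module):
        # stand-in for the coreference scoring model trained by the script
        def __init__(self):
            super(ToyNet, self).__init__()
            self.fc = nn.Linear(10, 1)
        def forward(self, x):
            return self.fc(x)

    def toy_eval(model):
        # stand-in for eval_evaluator.get_score(); returns a pseudo "F1"
        with torch.no_grad():
            return -float(model(torch.ones(1, 10)).abs())

    model = ToyNet()
    # (stage name, learning rate, l2 weight decay) mirroring the defaults above
    stages = [("allpairs", 2e-4, 1e-6), ("toppairs", 2e-4, 1e-5), ("ranking", 2e-6, 1e-5)]
    patience, min_lr = 3, 2e-8

    for name, lr, l2 in stages:
        optim = RMSprop(model.parameters(), lr=lr, weight_decay=l2)
        best_f1, lower_eval = float("-inf"), 0
        for epoch in range(5):                      # args.*_epoch in the real script
            # ... one epoch of mini-batch training would run here ...
            f1 = toy_eval(model)
            if f1 > best_f1:
                best_f1, lower_eval = f1, 0         # the real script also checkpoints the best model
            else:
                lower_eval += 1                     # evaluation metric decreased
            if lower_eval >= patience:              # behaviour sketched for --on_eval_decrease divide_lr
                lr /= 10.0                          # illustrative factor only
                for group in optim.param_groups:
                    group["lr"] = lr
                lower_eval = 0
            if lr <= min_lr:                        # give up on this stage, move to the next
                break

End of editor's note.]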
parser.add_argument('--top_pairs_l2', type=float, default=1e-5, help='top pairs pre-training l2 regularization') + parser.add_argument('--ranking_l2', type=float, default=1e-5, help='ranking training l2 regularization') + parser.add_argument('--patience', type=int, default=3, help='patience (epochs) before considering evaluationhas decreased') + parser.add_argument('--min_lr', type=float, default=2e-8, help='min learning rate') + parser.add_argument('--on_eval_decrease', type=str, default='nothing', + help='What to do when evaluation decreases ("nothing", "divide_lr", "next_stage", "divide_then_next")') + args = parser.parse_args() + + args.costs = {'FN': args.costfn, 'FL': args.costfl, 'WL' : args.costwl } + + current_time = datetime.now().strftime('%b%d_%H-%M-%S') + args.save_path = os.path.join(PACKAGE_DIRECTORY, 'checkpoints', current_time + '_' + socket.gethostname() + '_') + + np.random.seed(args.seed) + torch.manual_seed(args.seed) + args.cuda = torch.cuda.is_available() + if args.cuda: + torch.cuda.manual_seed(args.seed) + + args.evalkey = args.evalkey if args.evalkey is not None else args.eval + "/key.txt" + args.trainkey = args.train + "/key.txt" + args.train = args.train + "/numpy/" + args.eval = args.eval + "/numpy/" + print(args) + run_model(args) diff --git a/neuralcoref/meta.yaml b/neuralcoref/meta.yaml new file mode 100644 index 0000000..152103c --- /dev/null +++ b/neuralcoref/meta.yaml @@ -0,0 +1,24 @@ +package: + name: neuralcoref + version: "2.0" + +source: + git_rev: v2.0 + git_url: https://github.com/huggingface/neuralcoref.git + +requirements: + build: + - python + - setuptools + + run: + - python + +test: + imports: + - neuralcoref + +about: + home: https://github.com/huggingface/neuralcoref + license: MIT + license_file: LICENCE.txt \ No newline at end of file diff --git a/neuralcoref/model.py b/neuralcoref/model.py new file mode 100644 index 0000000..396e673 --- /dev/null +++ b/neuralcoref/model.py @@ -0,0 +1,82 @@ +# coding: utf8 +"""Conll training algorithm""" +from __future__ import absolute_import +from __future__ import unicode_literals +from __future__ import print_function + +import os +import numpy as np + +import torch +import torch.nn as nn +import torch.utils.data + +class Model(nn.Module): + def __init__(self, vocab_size, embedding_dim, H1, H2, H3, D_pair_in, D_single_in, dropout=0.5): + super(Model, self).__init__() + self.word_embeds = nn.Embedding(vocab_size, embedding_dim) + self.drop = nn.Dropout(dropout) + self.pair_top = nn.Sequential(nn.Linear(D_pair_in, H1), nn.ReLU(), nn.Dropout(dropout), + nn.Linear(H1, H2), nn.ReLU(), nn.Dropout(dropout), + nn.Linear(H2, H3), nn.ReLU(), nn.Dropout(dropout), + nn.Linear(H3, 1), + nn.Linear(1, 1)) + self.single_top = nn.Sequential(nn.Linear(D_single_in, H1), nn.ReLU(), nn.Dropout(dropout), + nn.Linear(H1, H2), nn.ReLU(), nn.Dropout(dropout), + nn.Linear(H2, H3), nn.ReLU(), nn.Dropout(dropout), + nn.Linear(H3, 1), + nn.Linear(1, 1)) + self.init_weights() + + def init_weights(self): + w = (param.data for name, param in self.named_parameters() if 'weight' in name) + b = (param.data for name, param in self.named_parameters() if 'bias' in name) + nn.init.uniform(self.word_embeds.weight.data, a=-0.5, b=0.5) + for t in w: + nn.init.xavier_uniform(t) + for t in b: + nn.init.constant(t, 0) + + def load_embeddings(self, preloaded_weights): + self.word_embeds.weight = nn.Parameter(preloaded_weights) + + def load_weights(self, weights_path): + print("Loading weights") + single_layers_weights, single_layers_biases = [], [] 
+ for f in sorted(os.listdir(weights_path)): + if f.startswith("single_mention_weights"): + single_layers_weights.append(np.load(os.path.join(weights_path, f))) + if f.startswith("single_mention_bias"): + single_layers_biases.append(np.load(os.path.join(weights_path, f))) + top_single_linear = (layer for layer in self.single_top if isinstance(layer, nn.Linear)) + for w, b, layer in zip(single_layers_weights, single_layers_biases, top_single_linear): + layer.weight = nn.Parameter(torch.from_numpy(w).float()) + layer.bias = nn.Parameter(torch.from_numpy(b).float().squeeze()) + pair_layers_weights, pair_layers_biases = [], [] + for f in sorted(os.listdir(weights_path)): + if f.startswith("pair_mentions_weights"): + pair_layers_weights.append(np.load(os.path.join(weights_path, f))) + if f.startswith("pair_mentions_bias"): + pair_layers_biases.append(np.load(os.path.join(weights_path, f))) + top_pair_linear = (layer for layer in self.pair_top if isinstance(layer, nn.Linear)) + for w, b, layer in zip(pair_layers_weights, pair_layers_biases, top_pair_linear): + layer.weight = nn.Parameter(torch.from_numpy(w).float()) + layer.bias = nn.Parameter(torch.from_numpy(b).float().squeeze()) + + def forward(self, inputs, concat_axis=1): + pairs = (len(inputs) == 8) + if pairs: + spans, words, single_features, ant_spans, ant_words, ana_spans, ana_words, pair_features = inputs + else: + spans, words, single_features = inputs + embed_words = self.drop(self.word_embeds(words).view(words.size()[0], -1)) + single_input = torch.cat([spans, embed_words, single_features], 1) + single_scores = self.single_top(single_input) + if pairs: + batchsize, pairs_num, _ = ana_spans.size() + ant_embed_words = self.drop(self.word_embeds(ant_words.view(batchsize, -1)).view(batchsize, pairs_num, -1)) + ana_embed_words = self.drop(self.word_embeds(ana_words.view(batchsize, -1)).view(batchsize, pairs_num, -1)) + pair_input = torch.cat([ant_spans, ant_embed_words, ana_spans, ana_embed_words, pair_features], 2) + pair_scores = self.pair_top(pair_input).squeeze(dim=2) + total_scores = torch.cat([pair_scores, single_scores], concat_axis) + return total_scores if pairs else single_scores diff --git a/neuralcoref/scorer/README.txt b/neuralcoref/scorer/README.txt new file mode 100644 index 0000000..2533a29 --- /dev/null +++ b/neuralcoref/scorer/README.txt @@ -0,0 +1,100 @@ +NAME + CorScorer: Perl package for scoring coreference resolution systems + using different metrics. + + +VERSION + v8.01 -- reference implementations of MUC, B-cubed, CEAF and BLANC metrics. + + +CHANGES SINCE v8.0 + - fixed a bug that crashed the BLANC scorer when a duplicate singleton + mention was present in the response. + +INSTALLATION + Requirements: + 1. Perl: downloadable from http://perl.org + 2. Algorithm-Munkres: included in this package and downloadable + from CPAN http://search.cpan.org/~tpederse/Algorithm-Munkres-0.08 + +USE + This package is distributed with two scripts to execute the scorer from + the command line. 
+ + Windows (tm): scorer.bat + Linux: scorer.pl + + +SYNOPSIS + use CorScorer; + + $metric = 'ceafm'; + + # Scores the whole dataset + &CorScorer::Score($metric, $keys_file, $response_file); + + # Scores one file + &CorScorer::Score($metric, $keys_file, $response_file, $name); + + +INPUT + metric: the metric desired to score the results: + muc: MUCScorer (Vilain et al, 1995) + bcub: B-Cubed (Bagga and Baldwin, 1998) + ceafm: CEAF (Luo et al., 2005) using mention-based similarity + ceafe: CEAF (Luo et al., 2005) using entity-based similarity + blanc: BLANC (Luo et al., 2014) BLANC metric for gold and predicted mentions + all: uses all the metrics to score + + keys_file: file with expected coreference chains in CoNLL-2011/2012 format + + response_file: file with output of coreference system (CoNLL-2011/2012 format) + + name: [optional] the name of the document to score. If name is not + given, all the documents in the dataset will be scored. If given + name is "none" then all the documents are scored but only total + results are shown. + + +OUTPUT + The score subroutine returns an array with four values in this order: + 1) Recall numerator + 2) Recall denominator + 3) Precision numerator + 4) Precision denominator + + Also recall, precision and F1 are printed in the standard output when variable + $VERBOSE is not null. + + Final scores: + Recall = recall_numerator / recall_denominator + Precision = precision_numerator / precision_denominator + F1 = 2 * Recall * Precision / (Recall + Precision) + + Identification of mentions + An scorer for identification of mentions (recall, precision and F1) is also included. + Mentions from system response are compared with key mentions. This version performs + strict mention matching as was used in the CoNLL-2011 and 2012 shared tasks. + +AUTHORS + Emili Sapena, Universitat Politècnica de Catalunya, http://www.lsi.upc.edu/~esapena, esapena lsi.upc.edu + Sameer Pradhan, sameer.pradhan childrens.harvard.edu + Sebastian Martschat, sebastian.martschat h-its.org + Xiaoqiang Luo, xql google.com + +COPYRIGHT AND LICENSE + Copyright (C) 2009-2011, Emili Sapena esapena lsi.upc.edu + 2011-2014, Sameer Pradhan sameer.pradhan childrens.harvard.edu + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2 of the License, or (at your + option) any later version. This program is distributed in the hope that + it will be useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + diff --git a/neuralcoref/scorer/lib/Algorithm/Munkres.pm b/neuralcoref/scorer/lib/Algorithm/Munkres.pm new file mode 100644 index 0000000..b0864f1 --- /dev/null +++ b/neuralcoref/scorer/lib/Algorithm/Munkres.pm @@ -0,0 +1,596 @@ +package Algorithm::Munkres; + +use 5.006; +use strict; +use warnings; + +require Exporter; + +our @ISA = qw(Exporter); + +our @EXPORT = qw( assign ); + +our $VERSION = '0.08'; + +#Variables global to the package +my @mat = (); +my @mask = (); +my @colcov = (); +my @rowcov = (); +my $Z0_row = 0; +my $Z0_col = 0; +my @path = (); + +#The exported subroutine. 
+#Expected Input: Reference to the input matrix (MxN) +#Output: Mx1 matrix, giving the column number of the value assigned to each row. (For more explaination refer perldoc) +sub assign +{ + #reference to the input matrix + my $rmat = shift; + my $rsolution_mat = shift; + my ($row, $row_len) = (0,0); + + # re-initialize that global variables + @mat = (); + @mask = (); + @colcov = (); + @rowcov = (); + $Z0_row = 0; + $Z0_col = 0; + @path = (); + + #variables local to the subroutine + my $step = 0; + my ($i, $j) = (0,0); + + #the input matrix + my @inp_mat = @$rmat; + + #copy the orginal matrix, before applying the algorithm to the matrix + foreach (@inp_mat) + { + push @mat, [ @$_ ]; + } + + #check if the input matrix is well-formed i.e. either square or rectangle. + $row_len = $#{$mat[0]}; + foreach my $row (@mat) + { + if($row_len != $#$row) + { + die "Please check the input matrix.\nThe input matrix is not a well-formed matrix!\nThe input matrix has to be rectangular or square matrix.\n"; + } + } + + #check if the matrix is a square matrix, + #if not convert it to square matrix by padding zeroes. + if($#mat < $#{$mat[0]}) + { + # Add rows + my $diff = $#{$mat[0]} - $#mat; + for (1 .. $diff) + { + push @mat, [ (0) x @{$mat[0]} ]; + } + } + elsif($#mat > $#{$mat[0]}) + { + # Add columns + my $diff = $#mat - $#{$mat[0]}; + for (0 .. $#mat) + { + push @{$mat[$_]}, (0) x $diff; + } + } + + #initialize mask, column cover and row cover matrices + clear_covers(); + + for($i=0;$i<=$#mat;$i++) + { + push @mask, [ (0) x @mat ]; + } + + #The algorithm can be grouped in 6 steps. + &stepone(); + &steptwo(); + $step = &stepthree(); + while($step == 4) + { + $step = &stepfour(); + while($step == 6) + { + &stepsix(); + $step = &stepfour(); + } + &stepfive(); + $step = &stepthree(); + } + + #create the output matrix + for my $i (0 .. $#mat) + { + for my $j (0 .. $#{$mat[$i]}) + { + if($mask[$i][$j] == 1) + { + $rsolution_mat->[$i] = $j; + } + } + } + + +#Code for tracing------------------ + <<'ee'; + print "\nInput Matrix:\n"; + for($i=0;$i<=$#mat;$i++) + { + for($j=0;$j<=$#mat;$j++) + { + print $mat[$i][$j] . "\t"; + } + print "\n"; + } + + print "\nMask Matrix:\n"; + for($i=0;$i<=$#mat;$i++) + { + for($j=0;$j<=$#mat;$j++) + { + print $mask[$i][$j] . "\t"; + } + print "\n"; + } + + print "\nOutput Matrix:\n"; + print "$_\n" for @$rsolution_mat; +ee + +#---------------------------------- + +} + +#Step 1 - Find minimum value for every row and subtract this min from each element of the row. +sub stepone +{ +# print "Step 1 \n"; + + #Find the minimum value for every row + for my $row (@mat) + { + my $min = $row->[0]; + for (@$row) + { + $min = $_ if $min > $_; + } + + #Subtract the minimum value of the row from each element of the row. + @$row = map {$_ - $min} @$row; + } +# print "Step 1 end \n"; +} + +#Step 2 - Star the zeroes, Create the mask and cover matrices. Re-initialize the cover matrices for next steps. +#To star a zero: We search for a zero in the matrix and than cover the column and row in which it occurs. Now this zero is starred. +#A next starred zero can occur only in those columns and rows which have not been previously covered by any other starred zero. 
+sub steptwo +{ +# print "Step 2 \n"; + + my ($i, $j) = (0,0); + + for($i=0;$i<=$#mat;$i++) + { + for($j=0;$j<=$#{$mat[$i]};$j++) + { + if($mat[$i][$j] == 0 && $colcov[$j] == 0 && $rowcov[$i] == 0) + { + $mask[$i][$j] = 1; + $colcov[$j] = 1; + $rowcov[$i] = 1; + } + } + } + #Re-initialize the cover matrices + &clear_covers(); +# print "Step 2 end\n"; +} + +#Step 3 - Check if each column has a starred zero. If yes then the problem is solved else proceed to step 4 +sub stepthree +{ +# print "Step 3 \n"; + + my $cnt = 0; + + for my $i (0 .. $#mat) + { + for my $j (0 .. $#mat) + { + if($mask[$i][$j] == 1) + { + $colcov[$j] = 1; + $cnt++; + } + } + } + if($cnt > $#mat) + { +# print "Step 3 end. Next expected step 7 \n"; + return 7; + } + else + { +# print "Step 3 end. Next expected step 4 \n"; + return 4; + } + +} + +#Step 4 - Try to find a zero which is not starred and whose columns and rows are not yet covered. +#If such a zero found, prime it, try to find a starred zero in its row, +# if not found proceed to step 5 +# else continue +#Else proceed to step 6. +sub stepfour +{ +# print "Step 4 \n"; + + while(1) + { + my ($row, $col) = &find_a_zero(); + if ($row < 0) + { + # No zeroes + return 6; + } + + $mask[$row][$col] = 2; + my $star_col = &find_star_in_row($row); + if ($star_col >= 0) + { + $col = $star_col; + $rowcov[$row] = 1; + $colcov[$col] = 0; + } + else + { + $Z0_row = $row; + $Z0_col = $col; + return 5; + } + } +} + +#Tries to find yet uncovered zero +sub find_a_zero +{ + for my $i (0 .. $#mat) + { + next if $rowcov[$i]; + + for my $j (reverse(0 .. $#mat)) # Prefer large $j + { + next if $colcov[$j]; + return ($i, $j) if $mat[$i][$j] == 0; + } + } + + return (-1, -1); +} + +#Tries to find starred zero in the given row and returns the column number +sub find_star_in_row +{ + my $row = shift; + + for my $j (0 .. $#mat) + { + if($mask[$row][$j] == 1) + { + return $j; + } + } + return -1; +} + +#Step 5 - Try to find a starred zero in the column of the uncovered zero found in the step 4. +#If starred zero found, try to find a prime zero in its row. +#Continue finding starred zero in the column and primed zero in the row until, +#we get to a primed zero which does not have a starred zero in its column. +#At this point reduce the non-zero values of mask matrix by 1. i.e. change prime zeros to starred zeroes. +#Clear the cover matrices and clear any primes i.e. values=2 from mask matrix. +sub stepfive +{ +# print "Step 5 \n"; + + my $cnt = 0; + my $done = 0; + + $path[$cnt][0] = $Z0_row; + $path[$cnt][1] = $Z0_col; + + while($done == 0) + { + my $row = &find_star_in_col($path[$cnt][1]); + if($row > -1) + { + $cnt++; + $path[$cnt][0] = $row; + $path[$cnt][1] = $path[$cnt - 1][1]; + } + else + { + $done = 1; + } + if($done == 0) + { + my $col = &find_prime_in_row($path[$cnt][0]); + $cnt++; + $path[$cnt][0] = $path[$cnt - 1][0]; + $path[$cnt][1] = $col; + } + } + &convert_path($cnt); + &clear_covers(); + &erase_primes(); + +# print "Step 5 end \n"; +} + +#Tries to find starred zero in the given column and returns the row number +sub find_star_in_col +{ + my $col = shift; + + for my $i (0 .. $#mat) + { + return $i if $mask[$i][$col] == 1; + } + + return -1; +} + +#Tries to find primed zero in the given row and returns the column number +sub find_prime_in_row +{ + my $row = shift; + + for my $j (0 .. $#mat) + { + return $j if $mask[$row][$j] == 2; + } + + return -1; +} + +#Reduces non-zero value in the mask matrix by 1. +#i.e. converts all primes to stars and stars to none. 
+sub convert_path +{ + my $cnt = shift; + + for my $i (0 .. $cnt) + { + for ( $mask[$path[$i][0]][$path[$i][1]] ) { + $_ = ( $_ == 1 ) ? 0 : 1; + } + } +} + +#Clears cover matrices +sub clear_covers +{ + @rowcov = @colcov = (0) x @mat; +} + +#Changes all primes i.e. values=2 to 0. +sub erase_primes +{ + for my $row (@mask) + { + for my $j (0 .. $#$row) + { + $row->[$j] = 0 if $row->[$j] == 2; + } + } +} + +#Step 6 - Find the minimum value from the rows and columns which are currently not covered. +#Subtract this minimum value from all the elements of the columns which are not covered. +#Add this minimum value to all the elements of the rows which are covered. +#Proceed to step 4. +sub stepsix +{ +# print "Step 6 \n"; + my ($i, $j); + my $minval = 0; + + $minval = &find_smallest(); + + for($i=0;$i<=$#mat;$i++) + { + for($j=0;$j<=$#{$mat[$i]};$j++) + { + if($rowcov[$i] == 1) + { + $mat[$i][$j] += $minval; + } + if($colcov[$j] == 0) + { + $mat[$i][$j] -= $minval; + } + } + } + +# print "Step 6 end \n"; +} + +#Finds the minimum value from all the matrix values which are not covered. +sub find_smallest +{ + my $minval; + + for my $i (0 .. $#mat) + { + next if $rowcov[$i]; + + for my $j (0 .. $#mat) + { + next if $colcov[$j]; + if( !defined($minval) || $minval > $mat[$i][$j]) + { + $minval = $mat[$i][$j]; + } + } + } + return $minval; +} + + +1; +__END__ + +=head1 NAME + + Algorithm::Munkres - Perl extension for Munkres' solution to + classical Assignment problem for square and rectangular matrices + This module extends the solution of Assignment problem for square + matrices to rectangular matrices by padding zeros. Thus a rectangular + matrix is converted to square matrix by padding necessary zeros. + +=head1 SYNOPSIS + +use Algorithm::Munkres; + + @mat = ( + [2, 4, 7, 9], + [3, 9, 5, 1], + [8, 2, 9, 7], + ); + +assign(\@mat,\@out_mat); + + Then the @out_mat array will have the output as: (0,3,1,2), + where + 0th element indicates that 0th row is assigned 0th column i.e value=2 + 1st element indicates that 1st row is assigned 3rd column i.e.value=1 + 2nd element indicates that 2nd row is assigned 1st column.i.e.value=2 + 3rd element indicates that 3rd row is assigned 2nd column.i.e.value=0 + + +=head1 DESCRIPTION + + Assignment Problem: Given N jobs, N workers and the time taken by + each worker to complete a job then how should the assignment of a + Worker to a Job be done, so as to minimize the time taken. + + Thus if we have 3 jobs p,q,r and 3 workers x,y,z such that: + x y z + p 2 4 7 + q 3 9 5 + r 8 2 9 + + where the cell values of the above matrix give the time required + for the worker(given by column name) to complete the job(given by + the row name) + + then possible solutions are: + Total + 1. 2, 9, 9 20 + 2. 2, 2, 5 9 + 3. 3, 4, 9 16 + 4. 3, 2, 7 12 + 5. 8, 9, 7 24 + 6. 8, 4, 5 17 + + Thus (2) is the optimal solution for the above problem. + This kind of brute-force approach of solving Assignment problem + quickly becomes slow and bulky as N grows, because the number of + possible solution are N! and thus the task is to evaluate each + and then find the optimal solution.(If N=10, number of possible + solutions: 3628800 !) + Munkres' gives us a solution to this problem, which is implemented + in this module. + + This module also solves Assignment problem for rectangular matrices + (M x N) by converting them to square matrices by padding zeros. 
ex: + If input matrix is: + [2, 4, 7, 9], + [3, 9, 5, 1], + [8, 2, 9, 7] + i.e 3 x 4 then we will convert it to 4 x 4 and the modified input + matrix will be: + [2, 4, 7, 9], + [3, 9, 5, 1], + [8, 2, 9, 7], + [0, 0, 0, 0] + +=head1 EXPORT + + "assign" function by default. + +=head1 INPUT + + The input matrix should be in a two dimensional array(array of + array) and the 'assign' subroutine expects a reference to this + array and not the complete array. + eg:assign(\@inp_mat, \@out_mat); + The second argument to the assign subroutine is the reference + to the output array. + +=head1 OUTPUT + + The assign subroutine expects references to two arrays as its + input paramenters. The second parameter is the reference to the + output array. This array is populated by assign subroutine. This + array is single dimensional Nx1 matrix. + For above example the output array returned will be: + (0, + 2, + 1) + + where + 0th element indicates that 0th row is assigned 0th column i.e value=2 + 1st element indicates that 1st row is assigned 2nd column i.e.value=5 + 2nd element indicates that 2nd row is assigned 1st column.i.e.value=2 + +=head1 SEE ALSO + + 1. http://216.249.163.93/bob.pilgrim/445/munkres.html + + 2. Munkres, J. Algorithms for the assignment and transportation + Problems. J. Siam 5 (Mar. 1957), 32-38 + + 3. François Bourgeois and Jean-Claude Lassalle. 1971. + An extension of the Munkres algorithm for the assignment + problem to rectangular matrices. + Communication ACM, 14(12):802-804 + +=head1 AUTHOR + + Anagha Kulkarni, University of Minnesota Duluth + kulka020 d.umn.edu + + Ted Pedersen, University of Minnesota Duluth + tpederse d.umn.edu + +=head1 COPYRIGHT AND LICENSE + +Copyright (C) 2007-2008, Ted Pedersen and Anagha Kulkarni + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +=cut diff --git a/neuralcoref/scorer/lib/Algorithm/README.Munkres b/neuralcoref/scorer/lib/Algorithm/README.Munkres new file mode 100644 index 0000000..7b9c234 --- /dev/null +++ b/neuralcoref/scorer/lib/Algorithm/README.Munkres @@ -0,0 +1,130 @@ +NAME + Algorithm-Munkres : Perl extension for Munkres' solution to + classical Assignment problem for square and rectangular matrices + This module extends the solution of Assignment problem for square + matrices to rectangular matrices by padding zeros. Thus a rectangular + matrix is converted to square matrix by padding necessary zeros. 
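[Editor's note - as a side check (not part of the package), the worked example above, repeated in the SYNOPSIS below, can be reproduced from Python with SciPy's Hungarian-algorithm solver; SciPy is used here purely for illustration and is not a dependency of the scorer.

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    # 3 x 4 cost matrix from the documentation example
    cost = np.array([[2, 4, 7, 9],
                     [3, 9, 5, 1],
                     [8, 2, 9, 7]])

    row_ind, col_ind = linear_sum_assignment(cost)
    print(col_ind)                        # [0 3 1]: rows 0,1,2 -> columns 0,3,1
    print(cost[row_ind, col_ind].sum())   # 5, the minimal total cost
    # Algorithm::Munkres pads the matrix to 4 x 4 and reports (0, 3, 1, 2);
    # the extra entry is the all-zero padding row taking the leftover column.

End of editor's note.]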
+ +SYNOPSIS + use Algorithm::Munkres; + + @mat = ( + [2, 4, 7, 9], + [3, 9, 5, 1], + [8, 2, 9, 7], + ); + + assign(\@mat,\@out_mat); + + Then the @out_mat array will have the output as: (0,3,1,2), + where + 0th element indicates that 0th row is assigned 0th column i.e value=2 + 1st element indicates that 1st row is assigned 3rd column i.e.value=1 + 2nd element indicates that 2nd row is assigned 1st column.i.e.value=2 + 3rd element indicates that 3rd row is assigned 2nd column.i.e.value=0 + +DESCRIPTION + Assignment Problem: Given N jobs, N workers and the time taken by + each worker to complete a job then how should the assignment of a + Worker to a Job be done, so as to minimize the time taken. + + Thus if we have 3 jobs p,q,r and 3 workers x,y,z such that: + x y z + p 2 4 7 + q 3 9 5 + r 8 2 9 + + where the cell values of the above matrix give the time required + for the worker(given by column name) to complete the job(given by + the row name) + + then possible solutions are: + Total + 1. 2, 9, 9 20 + 2. 2, 2, 5 9 + 3. 3, 4, 9 16 + 4. 3, 2, 7 12 + 5. 8, 9, 7 24 + 6. 8, 4, 5 17 + + Thus (2) is the optimal solution for the above problem. + This kind of brute-force approach of solving Assignment problem + quickly becomes slow and bulky as N grows, because the number of + possible solution are N! and thus the task is to evaluate each + and then find the optimal solution.(If N=10, number of possible + solutions: 3628800 !) + Munkres' gives us a solution to this problem, which is implemented + in this module. + + This module also solves Assignment problem for rectangular matrices + (M x N) by converting them to square matrices by padding zeros. ex: + If input matrix is: + [2, 4, 7, 9], + [3, 9, 5, 1], + [8, 2, 9, 7] + i.e 3 x 4 then we will convert it to 4 x 4 and the modified input + matrix will be: + [2, 4, 7, 9], + [3, 9, 5, 1], + [8, 2, 9, 7], + [0, 0, 0, 0] + +EXPORT + "assign" function by default. + +INPUT + The input matrix should be in a two dimensional array(array of + array) and the 'assign' subroutine expects a reference to this + array and not the complete array. + eg:assign(\@inp_mat, \@out_mat); + The second argument to the assign subroutine is the reference + to the output array. + +OUTPUT + The assign subroutine expects references to two arrays as its + input paramenters. The second parameter is the reference to the + output array. This array is populated by assign subroutine. This + array is single dimensional Nx1 matrix. + For above example the output array returned will be: + (0, + 2, + 1) + + where + 0th element indicates that 0th row is assigned 0th column i.e value=2 + 1st element indicates that 1st row is assigned 2nd column i.e.value=5 + 2nd element indicates that 2nd row is assigned 1st column.i.e.value=2 + +SEE ALSO + 1. http://216.249.163.93/bob.pilgrim/445/munkres.html + + 2. Munkres, J. Algorithms for the assignment and transportation + Problems. J. Siam 5 (Mar. 1957), 32-38 + + 3. François Bourgeois and Jean-Claude Lassalle. 1971. + An extension of the Munkres algorithm for the assignment + problem to rectangular matrices. 
+ Communication ACM, 14(12):802-804 + +AUTHOR + Anagha Kulkarni, University of Minnesota Duluth + kulka020 d.umn.edu + + Ted Pedersen, University of Minnesota Duluth + tpederse d.umn.edu + +COPYRIGHT AND LICENSE + Copyright (C) 2007-2008, Ted Pedersen and Anagha Kulkarni + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2 of the License, or (at your + option) any later version. This program is distributed in the hope that + it will be useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + diff --git a/neuralcoref/scorer/lib/CorScorer.pm b/neuralcoref/scorer/lib/CorScorer.pm new file mode 100644 index 0000000..187ccae --- /dev/null +++ b/neuralcoref/scorer/lib/CorScorer.pm @@ -0,0 +1,1348 @@ +package CorScorer; + +# Copyright (C) 2009-2011, Emili Sapena esapena lsi.upc.edu +# 2011-2014, Sameer Pradhan childrens.harvard.edu +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. This program is distributed in the hope that +# it will be useful, but WITHOUT ANY WARRANTY; without even the implied +# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# Modified in 2013 for v1.07 by Sebastian Martschat, +# sebastian.martschat h-its.org +# +# Revised in July, 2013 by Xiaoqiang Luo (xql google.com) to create v6.0. +# See comments under $VERSION for modifications. +# +# Revised in March, 2014 by Sameer Pradhan (sameer.pradhan childrens.harvard.edu) +# to implement the BLANC metric for predicted mentions + + +use strict; +use Algorithm::Munkres; +use Data::Dumper; + +#use Algorithm::Combinatorics qw(combinations); +use Math::Combinatorics; +use Cwd; + +our $VERSION = '8.01'; +print "version: " . $VERSION . " " . Cwd::realpath(__FILE__) . "\n"; + +## +# 8.01 fixed a bug that crashed the the BLANC scoring when duplicate +# (potentially singleton) mentions were present in the +# response. as part of the fix, wee will allow a maximum of 10 +# duplicate mentions in response, but if there are more, than it +# is a sign of a systematic error/manipulation and we will refuse +# to score that run. + +# 8.0 added code to compute the BLANC metric (generalized for both gold +# and system mentions (Luo et al., 2014) +# +# 7.0 Removed code to compute *_cs metrics +# +# 6.0 The directory hosting the scorer is under v6 and internal $VERSION is +# set to "6.0." +# Changes: +# - 'ceafm', 'ceafe' and 'bcub' in the previous version are renamed +# 'ceafm_cs', 'ceafe_cs', and 'bcub_cs', respectively. +# - 'ceafm', 'ceafe' and 'bcub' are implemented without (Cai&Strube 2010) +# modification. These metrics can handle twinless mentions and entities +# just fine. 
+# +# 1.07 Modifications to implement BCUB and CEAFM +# exactly as proposed by (Cai & Strube, 2010). +# 1.06 ? +# 1.05 Modification of IdentifMentions in order to correctly evaluate the +# outputs with detected mentions. Based on (Cai & Strubbe, 2010) +# 1.04 Some output corrections in BLANC functions. Changed package name to "Scorer" +# 1.03 Detects mentions that start in a document but do not end +# 1.02 Corrected BCUB bug. It fails when the key file does not have any mention + +# global variables +my $VERBOSE = 0;#2;#3; +my $HEAD_COLUMN = 8; +my $RESPONSE_COLUMN = -1; +my $KEY_COLUMN = -1; + +# Score. Scores the results of a coreference resolution system +# Input: Metric, keys file, response file, [name] +# Metric: the metric desired to evaluate: +# muc: MUCScorer (Vilain et al, 1995) +# bcub: B-Cubed (Bagga and Baldwin, 1998) +# ceafm: CEAF (Luo et al, 2005) using mention-based similarity +# ceafe: CEAF (Luo et al, 2005) using entity-based similarity +# keys file: file with expected coreference chains in SemEval format +# response file: file with output of coreference system (SemEval format) +# name: [optional] the name of the document to score. If name is not +# given, all the documents in the dataset will be scored. +# +# Output: an array with numerators and denominators of recall and precision +# (recall_num, recall_den, precision_num, precision_den) +# +# Final scores: +# Recall = recall_num / recall_den +# Precision = precision_num / precision_den +# F1 = 2 * Recall * Precision / (Recall + Precision) +sub Score { + my ($metric, $kFile, $rFile, $name) = @_; + our $repeated_mentions = 0; + + if (lc($metric) eq 'blanc') { + return ScoreBLANC($kFile, $rFile, $name); + } + + my %idenTotals = + (recallDen => 0, recallNum => 0, precisionDen => 0, precisionNum => 0); + my ($acumNR, $acumDR, $acumNP, $acumDP) = (0, 0, 0, 0); + + if (defined($name) && $name ne 'none') { + print "$name:\n" if ($VERBOSE); + my $keys = GetCoreference($kFile, $KEY_COLUMN, $name); + my $response = GetCoreference($rFile, $RESPONSE_COLUMN, $name); + my ( + $keyChains, $keyChainsWithSingletonsFromResponse, + $responseChains, $responseChainsWithoutMentionsNotInKey, + $keyChainsOrig, $responseChainsOrig + ) = IdentifMentions($keys, $response, \%idenTotals); + ($acumNR, $acumDR, $acumNP, $acumDP) = Eval( + $metric, $keyChains, + $keyChainsWithSingletonsFromResponse, $responseChains, + $responseChainsWithoutMentionsNotInKey, $keyChainsOrig, + $responseChainsOrig + ); + } + else { + my $kIndexNames = GetFileNames($kFile); + my $rIndexNames = GetFileNames($rFile); + + $VERBOSE = 0 if ($name eq 'none'); + foreach my $iname (keys(%{$kIndexNames})) { + my $keys = + GetCoreference($kFile, $KEY_COLUMN, $iname, $kIndexNames->{$iname}); + my $response = GetCoreference($rFile, $RESPONSE_COLUMN, $iname, + $rIndexNames->{$iname}); + + print "$iname:\n" if ($VERBOSE); + my ( + $keyChains, $keyChainsWithSingletonsFromResponse, + $responseChains, $responseChainsWithoutMentionsNotInKey, + $keyChainsOrig, $responseChainsOrig + ) = IdentifMentions($keys, $response, \%idenTotals); + my ($nr, $dr, $np, $dp) = Eval( + $metric, $keyChains, + $keyChainsWithSingletonsFromResponse, $responseChains, + $responseChainsWithoutMentionsNotInKey, $keyChainsOrig, + $responseChainsOrig + ); + + $acumNR += $nr; + $acumDR += $dr; + $acumNP += $np; + $acumDP += $dp; + } + } + + if ($VERBOSE || $name eq 'none') { + print "\n====== TOTALS =======\n"; + print "Identification of Mentions: "; + ShowRPF( + $idenTotals{recallNum}, $idenTotals{recallDen}, + 
$idenTotals{precisionNum}, $idenTotals{precisionDen} + ); + print "Coreference: "; + ShowRPF($acumNR, $acumDR, $acumNP, $acumDP); + } + + return ($acumNR, $acumDR, $acumNP, $acumDP, + $idenTotals{recallNum}, $idenTotals{recallDen}, + $idenTotals{precisionNum}, $idenTotals{precisionDen}); +} + +sub GetIndex { + my ($ind, $i) = @_; + if (!defined($ind->{$i})) { + my $n = $ind->{nexti} || 0; + $ind->{$i} = $n; + $n++; + $ind->{nexti} = $n; + } + + return $ind->{$i}; +} + +# Get the coreference information from column $column of the file $file +# If $name is defined, only keys between "#begin document $name" and +# "#end file $name" are taken. +# The output is an array of entites, where each entity is an array +# of mentions and each mention is an array with two values corresponding +# to the mention's begin and end. For example: +# @entities = ( [ [1,3], [45,45], [57,62] ], # <-- entity 0 +# [ [5,5], [25,27], [31,31] ], # <-- entity 1 +# ... +# ); +# entity 0 is composed of 3 mentions: from token 1 to 3, token 45 and +# from token 57 to 62 (both included) +# +# if $name is not specified, the output is a hash including each file +# found in the document: +# $coref{$file} = \@entities +sub GetCoreference { + my ($file, $column, $name, $pos) = @_; + my %coref; + my %ind; + + open(F, $file) || die "Can not open $file: $!"; + if ($pos) { + seek(F, $pos, 0); + } + my $fName; + my $getout = 0; + do { + # look for the begin of a file + while (my $l = ) { + chomp($l); + $l =~ s/\r$//; # m$ format jokes + if ($l =~ /^\#\s*begin document (.*?)$/) { + if (defined($name)) { + if ($name eq $1) { + $fName = $name; + $getout = 1; + last; + } + } + else { + $fName = $1; + last; + } + } + } + print "====> $fName:\n" if ($VERBOSE > 1); + + # Extract the keys from the file until #end is found + my $lnumber = 0; + my @entities; + my @half; + my @head; + my @sentId; + while (my $l = ) { + chomp($l); + $l =~ s/^\s+$//; + next if ($l eq ''); + if ($l =~ /\#\s*end document/) { + foreach my $h (@half) { + if (defined($h) && @$h) { + die "Error: some mentions in the document do not close\n"; + } + } + last; + } + my @columns = split(/\t/, $l); + my $cInfo = $columns[$column]; + push(@head, $columns[$HEAD_COLUMN]); + push(@sentId, $columns[0]); + if ($cInfo ne '_') { + + #discard double antecedent + while ($cInfo =~ s/\((\d+\+\d)\)//) { + print "Discarded ($1)\n" if ($VERBOSE > 1); + } + + # one-token mention(s) + while ($cInfo =~ s/\((\d+)\)//) { + my $ie = GetIndex(\%ind, $1); + push(@{$entities[$ie]}, [$lnumber, $lnumber, $lnumber]); + print "+mention (entity $ie): ($lnumber,$lnumber)\n" + if ($VERBOSE > 2); + } + + # begin of mention(s) + while ($cInfo =~ s/\((\d+)//) { + my $ie = GetIndex(\%ind, $1); + push(@{$half[$ie]}, $lnumber); + print "+init mention (entity $ie): ($lnumber\n" if ($VERBOSE > 2); + } + + # end of mention(s) + while ($cInfo =~ s/(\d+)\)//) { + my $numberie = $1; + my $ie = GetIndex(\%ind, $numberie); + my $start = pop(@{$half[$ie]}); + if (defined($start)) { + my $inim = $sentId[$start]; + my $endm = $sentId[$lnumber]; + my $tHead = $start; + + # the token whose head is outside the mention is the head of the mention + for (my $t = $start ; $t <= $lnumber ; $t++) { + if ($head[$t] < $inim || $head[$t] > $endm) { + $tHead = $t; + last; + } + } + push(@{$entities[$ie]}, [$start, $lnumber, $tHead]); + } + else { + die +"Detected the end of a mention [$numberie]($ie) without begin (?,$lnumber)"; + } + print "+mention (entity $ie): ($start,$lnumber)\n" if ($VERBOSE > 2); + + } + } + $lnumber++; + } + 
+ # verbose + if ($VERBOSE > 1) { + print "File $fName:\n"; + for (my $e = 0 ; $e < scalar(@entities) ; $e++) { + print "Entity $e:"; + foreach my $mention (@{$entities[$e]}) { + print " ($mention->[0],$mention->[1])"; + } + print "\n"; + } + } + + $coref{$fName} = \@entities; + } while (!$getout && !eof(F)); + + if (defined($name)) { + return $coref{$name}; + } + return \%coref; +} + +sub GetFileNames { + my $file = shift; + my %hash; + my $last = 0; + open(F, $file) || die "Can not open $file: $!"; + while (my $l = ) { + chomp($l); + $l =~ s/\r$//; # m$ format jokes + if ($l =~ /^\#\s*begin document (.*?)$/) { + my $name = $1; + $hash{$name} = $last; + } + $last = tell(F); + } + close(F); + return \%hash; +} + +sub IdentifMentions { + my ($keys, $response, $totals) = @_; + my @kChains; + my @kChainsWithSingletonsFromResponse; + my @rChains; + my @rChainsWithoutMentionsNotInKey; + my %id; + my %map; + my $idCount = 0; + my @assigned; + my @kChainsOrig = (); + my @rChainsOrig = (); + + # for each mention found in keys an ID is generated + foreach my $entity (@$keys) { + foreach my $mention (@$entity) { + if (defined($id{"$mention->[0],$mention->[1]"})) { + print "Repeated mention in the key: $mention->[0], $mention->[1] ", + $id{"$mention->[0],$mention->[1]"}, $idCount, "\n"; + } + $id{"$mention->[0],$mention->[1]"} = $idCount; + $idCount++; + } + } + + # correct identification: Exact bound limits + my $exact = 0; + foreach my $entity (@$response) { + + my $i = 0; + my @remove; + + foreach my $mention (@$entity) { + if (defined($map{"$mention->[0],$mention->[1]"})) { + print "Repeated mention in the response: $mention->[0], $mention->[1] ", + $map{"$mention->[0],$mention->[1]"}, + $id{"$mention->[0],$mention->[1]"}, + "\n"; + push(@remove, $i); + $main::repeated_mentions++; + + if ($main::repeated_mentions > 10) + { + print STDERR "Found too many repeated mentions (> 10) in the response, so refusing to score. Please fix the output.\n"; + exit 1; + } + + } + elsif (defined($id{"$mention->[0],$mention->[1]"}) + && !$assigned[$id{"$mention->[0],$mention->[1]"}]) + { + $assigned[$id{"$mention->[0],$mention->[1]"}] = 1; + $map{"$mention->[0],$mention->[1]"} = + $id{"$mention->[0],$mention->[1]"}; + $exact++; + } + $i++; + } + + # Remove repeated mentions in the response + foreach my $i (sort { $b <=> $a } (@remove)) { + splice(@$entity, $i, 1); + } + } + + + # now, lets remove any empty elements in the response array after removing + # potential repeats + my @another_remove = (); + my $ii; + + foreach my $eentity (@$response) + { + if ( @$eentity == 0) + { + push(@another_remove, $ii); + } + $ii++; + } + + foreach my $iii (sort { $b <=> $a } (@another_remove)) { + splice(@$response, $iii, 1); + } + + + # Partial identificaiton: Inside bounds and including the head + my $part = 0; + + # Each mention in response not included in keys has a new ID + my $mresp = 0; + foreach my $entity (@$response) { + foreach my $mention (@$entity) { + my $ini = $mention->[0]; + my $end = $mention->[1]; + if (!defined($map{"$mention->[0],$mention->[1]"})) { + $map{"$mention->[0],$mention->[1]"} = $idCount; + $idCount++; + } + $mresp++; + } + } + + if ($VERBOSE) { + print "Total key mentions: " . scalar(keys(%id)) . "\n"; + print "Total response mentions: " . scalar(keys(%map)) . "\n"; + print "Strictly correct identified mentions: $exact\n"; + print "Partially correct identified mentions: $part\n"; + print "No identified: " . (scalar(keys(%id)) - $exact - $part) . "\n"; + print "Invented: " . 
($idCount - scalar(keys(%id))) . "\n"; + } + + if (defined($totals)) { + $totals->{recallDen} += scalar(keys(%id)); + $totals->{recallNum} += $exact; + $totals->{precisionDen} += scalar(keys(%map)); + $totals->{precisionNum} += $exact; + $totals->{precisionExact} += $exact; + $totals->{precisionPart} += $part; + } + + # The coreference chains arrays are generated again with ID of mentions + # instead of token coordenates + my $e = 0; + foreach my $entity (@$keys) { + foreach my $mention (@$entity) { + push(@{$kChainsOrig[$e]}, $id{"$mention->[0],$mention->[1]"}); + push(@{$kChains[$e]}, $id{"$mention->[0],$mention->[1]"}); + } + $e++; + } + $e = 0; + foreach my $entity (@$response) { + foreach my $mention (@$entity) { + push(@{$rChainsOrig[$e]}, $map{"$mention->[0],$mention->[1]"}); + push(@{$rChains[$e]}, $map{"$mention->[0],$mention->[1]"}); + } + $e++; + } + + # In order to use the metrics as in (Cai & Strube, 2010): + # 1. Include the non-detected key mentions into the response as singletons + # 2. Discard the detected mentions not included in key resolved as singletons + # 3a. For computing precision: put twinless system mentions in key + # 3b. For computing recall: discard twinless system mentions in response + + my $kIndex = Indexa(\@kChains); + my $rIndex = Indexa(\@rChains); + + # 1. Include the non-detected key mentions into the response as singletons + my $addkey = 0; + if (scalar(keys(%id)) - $exact - $part > 0) { + foreach my $kc (@kChains) { + foreach my $m (@$kc) { + if (!defined($rIndex->{$m})) { + push(@rChains, [$m]); + $addkey++; + } + } + } + } + + @kChainsWithSingletonsFromResponse = @kChains; + @rChainsWithoutMentionsNotInKey = []; + + # 2. Discard the detected mentions not included in key resolved as singletons + my $delsin = 0; + + if ($idCount - scalar(keys(%id)) > 0) { + foreach my $rc (@rChains) { + if (scalar(@$rc) == 1) { + if (!defined($kIndex->{$rc->[0]})) { + @$rc = (); + $delsin++; + } + } + } + } + +# 3a. For computing precision: put twinless system mentions in key as singletons + my $addinv = 0; + + if ($idCount - scalar(keys(%id)) > 0) { + foreach my $rc (@rChains) { + if (scalar(@$rc) > 1) { + foreach my $m (@$rc) { + if (!defined($kIndex->{$m})) { + push(@kChainsWithSingletonsFromResponse, [$m]); + $addinv++; + } + } + } + } + } + + # 3b. 
For computing recall: discard twinless system mentions in response + my $delsys = 0; + + foreach my $rc (@rChains) { + my @temprc; + my $i = 0; + + foreach my $m (@$rc) { + if (defined($kIndex->{$m})) { + push(@temprc, $m); + $i++; + } + else { + $delsys++; + } + } + + if ($i > 0) { + push(@rChainsWithoutMentionsNotInKey, \@temprc); + } + } + + # We clean the empty chains + my @newrc; + foreach my $rc (@rChains) { + if (scalar(@$rc) > 0) { + push(@newrc, $rc); + } + } + @rChains = @newrc; + + return ( + \@kChains, \@kChainsWithSingletonsFromResponse, + \@rChains, \@rChainsWithoutMentionsNotInKey, + \@kChainsOrig, \@rChainsOrig + ); +} + +sub Eval { + my ($scorer, $keys, $keysPrecision, $response, $responseRecall, + $keyChainsOrig, $responseChainsOrig) + = @_; + $scorer = lc($scorer); + my ($nr, $dr, $np, $dp); + if ($scorer eq 'muc') { + ($nr, $dr, $np, $dp) = + MUCScorer($keys, $keysPrecision, $response, $responseRecall); + } + elsif ($scorer eq 'bcub') { + ($nr, $dr, $np, $dp) = BCUBED($keyChainsOrig, $responseChainsOrig); + } + elsif ($scorer eq 'ceafm') { + ($nr, $dr, $np, $dp) = CEAF($keyChainsOrig, $responseChainsOrig, 1); + } + elsif ($scorer eq 'ceafe') { + ($nr, $dr, $np, $dp) = CEAF($keyChainsOrig, $responseChainsOrig, 0); + } + else { + die "Metric $scorer not implemented yet\n"; + } + return ($nr, $dr, $np, $dp); +} + +# Indexes an array of arrays, in order to easily know the position of an element +sub Indexa { + my ($arrays) = @_; + my %index; + + for (my $i = 0 ; $i < @$arrays ; $i++) { + foreach my $e (@{$arrays->[$i]}) { + $index{$e} = $i; + } + } + return \%index; +} + +# Consider the "links" within every coreference chain. For example, +# chain A-B-C-D has 3 links: A-B, B-C and C-D. +# Recall: num correct links / num expected links. +# Precision: num correct links / num output links + +sub MUCScorer { + my ($keys, $keysPrecision, $response, $responseRecall) = @_; + + my $kIndex = Indexa($keys); + + # Calculate correct links + my $correct = 0; + foreach my $rEntity (@$response) { + next if (!defined($rEntity)); + + # for each possible pair + for (my $i = 0 ; $i < @$rEntity ; $i++) { + my $id_i = $rEntity->[$i]; + for (my $j = $i + 1 ; $j < @$rEntity ; $j++) { + my $id_j = $rEntity->[$j]; + if ( defined($kIndex->{$id_i}) + && defined($kIndex->{$id_j}) + && $kIndex->{$id_i} == $kIndex->{$id_j}) + { + $correct++; + last; + } + } + } + } + + # Links in key + my $keylinks = 0; + foreach my $kEntity (@$keys) { + next if (!defined($kEntity)); + $keylinks += scalar(@$kEntity) - 1 if (scalar(@$kEntity)); + } + + # Links in response + my $reslinks = 0; + foreach my $rEntity (@$response) { + next if (!defined($rEntity)); + $reslinks += scalar(@$rEntity) - 1 if (scalar(@$rEntity)); + } + + ShowRPF($correct, $keylinks, $correct, $reslinks) if ($VERBOSE); + return ($correct, $keylinks, $correct, $reslinks); +} + +# Compute precision for every mention in the response, and compute +# recall for every mention in the keys +sub BCUBED { + my ($keys, $response) = @_; + my $kIndex = Indexa($keys); + my $rIndex = Indexa($response); + my $acumP = 0; + my $acumR = 0; + foreach my $rChain (@$response) { + foreach my $m (@$rChain) { + my $kChain = (defined($kIndex->{$m})) ? 
$keys->[$kIndex->{$m}] : []; + my $ci = 0; + my $ri = scalar(@$rChain); + my $ki = scalar(@$kChain); + + # common mentions in rChain and kChain => Ci + foreach my $mr (@$rChain) { + foreach my $mk (@$kChain) { + if ($mr == $mk) { + $ci++; + last; + } + } + } + + $acumP += $ci / $ri if ($ri); + $acumR += $ci / $ki if ($ki); + } + } + + # Mentions in key + my $keymentions = 0; + foreach my $kEntity (@$keys) { + $keymentions += scalar(@$kEntity); + } + + # Mentions in response + my $resmentions = 0; + foreach my $rEntity (@$response) { + $resmentions += scalar(@$rEntity); + } + + ShowRPF($acumR, $keymentions, $acumP, $resmentions) if ($VERBOSE); + return ($acumR, $keymentions, $acumP, $resmentions); +} + +# type = 0: Entity-based +# type = 1: Mention-based +sub CEAF { + my ($keys, $response, $type) = @_; + + my @sim; + for (my $i = 0 ; $i < scalar(@$keys) ; $i++) { + for (my $j = 0 ; $j < scalar(@$response) ; $j++) { + if (defined($keys->[$i]) && defined($response->[$j])) { + if ($type == 0) { # entity-based + $sim[$i][$j] = 1 - SIMEntityBased($keys->[$i], $response->[$j]); + + # 1 - X => the library searches minima not maxima + } + elsif ($type == 1) { # mention-based + $sim[$i][$j] = 1 - SIMMentionBased($keys->[$i], $response->[$j]); + } + } + else { + $sim[$i][$j] = 1; + } + } + + # fill the matrix when response chains are less than key ones + for (my $j = scalar(@$response) ; $j < scalar(@$keys) ; $j++) { + $sim[$i][$j] = 1; + } + + #$denrec += SIMEntityBased($kChain->[$i], $kChain->[$i]); + } + + my @out; + + # Munkres algorithm + assign(\@sim, \@out); + + my $numerador = 0; + my $denpre = 0; + my $denrec = 0; + + # entity-based + if ($type == 0) { + foreach my $c (@$response) { + $denpre++ if (defined($c) && scalar(@$c) > 0); + } + foreach my $c (@$keys) { + $denrec++ if (defined($c) && scalar(@$c) > 0); + } + } + + # mention-based + elsif ($type == 1) { + foreach my $c (@$response) { + $denpre += scalar(@$c) if (defined($c)); + } + foreach my $c (@$keys) { + $denrec += scalar(@$c) if (defined($c)); + } + } + + for (my $i = 0 ; $i < scalar(@$keys) ; $i++) { + $numerador += 1 - $sim[$i][$out[$i]]; + } + + ShowRPF($numerador, $denrec, $numerador, $denpre) if ($VERBOSE); + + return ($numerador, $denrec, $numerador, $denpre); +} + +sub SIMEntityBased { + my ($a, $b) = @_; + my $intersection = 0; + + # Common elements in A and B + foreach my $ma (@$a) { + next if (!defined($ma)); + foreach my $mb (@$b) { + next if (!defined($mb)); + if ($ma == $mb) { + $intersection++; + last; + } + } + } + + my $r = 0; + my $d = scalar(@$a) + scalar(@$b); + if ($d != 0) { + $r = 2 * $intersection / $d; + } + + return $r; +} + +sub SIMMentionBased { + my ($a, $b) = @_; + my $intersection = 0; + + # Common elements in A and B + foreach my $ma (@$a) { + next if (!defined($ma)); + foreach my $mb (@$b) { + next if (!defined($mb)); + if ($ma == $mb) { + $intersection++; + last; + } + } + } + + return $intersection; +} + +sub ShowRPF { + my ($numrec, $denrec, $numpre, $denpre, $f1) = @_; + + my $precisio = $denpre ? $numpre / $denpre : 0; + my $recall = $denrec ? $numrec / $denrec : 0; + if (!defined($f1)) { + $f1 = 0; + if ($recall + $precisio) { + $f1 = 2 * $precisio * $recall / ($precisio + $recall); + } + } + + print "Recall: ($numrec / $denrec) " . int($recall * 10000) / 100 . '%'; + print "\tPrecision: ($numpre / $denpre) " + . int($precisio * 10000) / 100 . '%'; + print "\tF1: " . int($f1 * 10000) / 100 . 
"\%\n"; + print +"--------------------------------------------------------------------------\n"; +} + +# NEW +sub ScoreBLANC { + my ($kFile, $rFile, $name) = @_; + my ($acumNRa, $acumDRa, $acumNPa, $acumDPa) = (0, 0, 0, 0); + my ($acumNRr, $acumDRr, $acumNPr, $acumDPr) = (0, 0, 0, 0); + my %idenTotals = + (recallDen => 0, recallNum => 0, precisionDen => 0, precisionNum => 0); + + if (defined($name) && $name ne 'none') { + print "$name:\n" if ($VERBOSE); + my $keys = GetCoreference($kFile, $KEY_COLUMN, $name); + my $response = GetCoreference($rFile, $RESPONSE_COLUMN, $name); + my ( + $keyChains, $keyChainsWithSingletonsFromResponse, + $responseChains, $responseChainsWithoutMentionsNotInKey, + $keyChainsOrig, $responseChainsOrig + ) = IdentifMentions($keys, $response, \%idenTotals); + ( + $acumNRa, $acumDRa, $acumNPa, $acumDPa, + $acumNRr, $acumDRr, $acumNPr, $acumDPr + ) = BLANC_Internal($keyChainsOrig, $responseChainsOrig); + } + else { + my $kIndexNames = GetFileNames($kFile); + my $rIndexNames = GetFileNames($rFile); + + $VERBOSE = 0 if ($name eq 'none'); + foreach my $iname (keys(%{$kIndexNames})) { + my $keys = + GetCoreference($kFile, $KEY_COLUMN, $iname, $kIndexNames->{$iname}); + my $response = GetCoreference($rFile, $RESPONSE_COLUMN, $iname, + $rIndexNames->{$iname}); + + print "$name:\n" if ($VERBOSE); + my ( + $keyChains, $keyChainsWithSingletonsFromResponse, + $responseChains, $responseChainsWithoutMentionsNotInKey, + $keyChainsOrig, $responseChainsOrig + ) = IdentifMentions($keys, $response, \%idenTotals); + my ($nra, $dra, $npa, $dpa, $nrr, $drr, $npr, $dpr) = + BLANC_Internal($keyChainsOrig, $responseChainsOrig); + + $acumNRa += $nra; + $acumDRa += $dra; + $acumNPa += $npa; + $acumDPa += $dpa; + $acumNRr += $nrr; + $acumDRr += $drr; + $acumNPr += $npr; + $acumDPr += $dpr; + } + } + + if ($VERBOSE || $name eq 'none') { + print "\n====== TOTALS =======\n"; + print "Identification of Mentions: "; + ShowRPF( + $idenTotals{recallNum}, $idenTotals{recallDen}, + $idenTotals{precisionNum}, $idenTotals{precisionDen} + ); + print "\nCoreference:\n"; + print "Coreference links: "; + ShowRPF($acumNRa, $acumDRa, $acumNPa, $acumDPa); + print "Non-coreference links: "; + ShowRPF($acumNRr, $acumDRr, $acumNPr, $acumDPr); + print "BLANC: "; + + my $Ra = ($acumDRa) ? $acumNRa / $acumDRa : -1; + my $Rr = ($acumDRr) ? $acumNRr / $acumDRr : -1; + my $Pa = ($acumDPa) ? $acumNPa / $acumDPa : 0; + my $Pr = ($acumDPr) ? $acumNPr / $acumDPr : 0; + + my $R = ($Ra + $Rr) / 2; + my $P = ($Pa + $Pr) / 2; + + my $Fa = ($Pa + $Ra) ? 2 * $Pa * $Ra / ($Pa + $Ra) : 0; + my $Fr = ($Pr + $Rr) ? 
2 * $Pr * $Rr / ($Pr + $Rr) : 0; + + my $f1 = ($Fa + $Fr) / 2; + + if ($Ra == -1 && $Rr == -1) { + $R = 0; + $P = 0; + $f1 = 0; + } + elsif ($Ra == -1) { + $R = $Rr; + $P = $Pr; + $f1 = $Fr; + } + elsif ($Rr == -1) { + $R = $Ra; + $P = $Pa; + $f1 = $Fa; + } + + ShowRPF($R, 1, $P, 1, $f1); + } + return ( + $acumNRa, $acumDRa, $acumNPa, $acumDPa, + $acumNRr, $acumDRr, $acumNPr, $acumDPr + ); +} + +sub cartesian { + my @C = map { [$_] } @{shift @_}; + + foreach (@_) { + my @A = @$_; + + @C = map { + my $n = $_; + map { [$n, @$_] } @C + } @A; + } + + return @C; +} + +sub BLANC_Internal { + my ($keys, $response) = @_; + my ($ga, $gr, $ba, $br) = (0, 0, 0, 0); + my $key_coreference_links = {}; + my $key_non_coreference_links = {}; + my $response_coreference_links = {}; + my $response_non_coreference_links = {}; + + print "list containing list of chains in key:\n" if ($VERBOSE > 2); + print Dumper $keys if ($VERBOSE > 2); + + print "each key chain printed individually:\n" if ($VERBOSE > 2); + + if ($VERBOSE > 2) { + foreach my $z (@$keys) { + print Dumper $z; + } + } + + print "list containing list of chains in response:\n" if ($VERBOSE > 2); + print Dumper $response if ($VERBOSE > 2); + + print "each response chain printed individually:\n" if ($VERBOSE > 2); + + if ($VERBOSE > 2) { + foreach my $z (@$response) { + print Dumper $z; + } + } + + print +"---------------------------------------------------------------------------------" + . "\n" + if ($VERBOSE > 2); + + print "combinations of links for each chain in the key:\n" if ($VERBOSE > 2); + for my $kkk (@$keys) { + my $ccombinat = Math::Combinatorics->new( + count => 2, + data => [@$kkk], + ); + + while (my @zcombo = $ccombinat->next_combination) { + print Dumper [@zcombo] if ($VERBOSE > 2); + my @zzcombo = sort { $a <=> $b } @zcombo; + + $key_coreference_links->{$zzcombo[0] . "-" . $zzcombo[1]} = 1; + } + + print +"................................................................................\n" + if ($VERBOSE > 2); + } + + print Dumper $key_coreference_links if ($VERBOSE > 2); + print +"********************************************************************************\n" + if ($VERBOSE > 2); + + print +"---------------------------------------------------------------------------------" + . "\n" + if ($VERBOSE > 2); + print "combinations of links for each chain in the response:\n" + if ($VERBOSE > 2); + for my $rrr (@$response) { + my $ccombinat = Math::Combinatorics->new( + count => 2, + data => [@$rrr], + ); + + while (my @zcombo = $ccombinat->next_combination) { + print Dumper [@zcombo] if ($VERBOSE > 2); + my @zzcombo = sort { $a <=> $b } @zcombo; + + $response_coreference_links->{$zzcombo[0] . "-" . $zzcombo[1]} = 1; + } + + print +"................................................................................\n" + if ($VERBOSE > 2); + } + + print Dumper $response_coreference_links if ($VERBOSE > 2); + print +"********************************************************************************\n" + if ($VERBOSE > 2); + + my $number_chains_in_key = @$keys; + print "number chains in key: " . $number_chains_in_key . "\n" + if ($VERBOSE > 2); + + my @s = (0 .. $number_chains_in_key - 1); + my $ss = join(' ', @s); + my @n = split(' ', $ss); + + my $combinat = Math::Combinatorics->new( + count => 2, + data => [@n], + ); + + print "combinations of 2 from: " . join(" ", @n) . "\n" if ($VERBOSE > 2); + print "------------------------" . ("--" x scalar(@n)) . 
"\n" + if ($VERBOSE > 2); + + while (my @combo = $combinat->next_combination) { + + my @kcombo = (); + foreach my $comboo (@combo) { + push(@kcombo, @$keys[$comboo]); + } + + my $lkcombo = @kcombo; + print "length: " . $lkcombo . "\n" if ($VERBOSE > 2); + print "kcombo:\n" if ($VERBOSE > 2); + print "+++++\n" if ($VERBOSE > 2); + print Dumper [@kcombo] if ($VERBOSE > 2); + my @kccar = cartesian($kcombo[0], $kcombo[1]); + + foreach my $x (@kccar) { + print "--->>>>>>>>>>>>\n" if ($VERBOSE > 2); + print Dumper $x if ($VERBOSE > 2); + my @y = sort { $a <=> $b } @$x; + print Dumper [@y] if ($VERBOSE > 2); + $key_non_coreference_links->{@y[0] . "-" . @y[1]} = 1; + } + + print Dumper $key_non_coreference_links if ($VERBOSE > 2); + print "" . "\n" if ($VERBOSE > 2); + + print ".....\n" if ($VERBOSE > 2); + + print "\n" if ($VERBOSE > 2); + } + + print "\n" if ($VERBOSE > 2); + my $number_chains_in_response = @$response; + print "number chains in response: " . $number_chains_in_response . "\n" + if ($VERBOSE > 2); + + my @s = (0 .. $number_chains_in_response - 1); + my $ss = join(' ', @s); + my @n = split(' ', $ss); + + my $combinat = Math::Combinatorics->new( + count => 2, + data => [@n], + ); + + print "combinations of 2 from: " . join(" ", @n) . "\n" if ($VERBOSE > 2); + print "------------------------" . ("--" x scalar(@n)) . "\n" + if ($VERBOSE > 2); + + while (my @combo = $combinat->next_combination) { + my @kcombo = (); + foreach my $comboo (@combo) { + push(@kcombo, @$response[$comboo]); + } + + my $lkcombo = @kcombo; + print "length: " . $lkcombo . "\n" if ($VERBOSE > 2); + print "kcombo:\n" if ($VERBOSE > 2); + print "+++++\n" if ($VERBOSE > 2); + print Dumper [@kcombo] if ($VERBOSE > 2); + my @kccar = cartesian($kcombo[0], $kcombo[1]); + + foreach my $x (@kccar) { + print "--->>>>>>>>>>>>\n" if ($VERBOSE > 2); + print Dumper $x if ($VERBOSE > 2); + my @y = sort { $a <=> $b } @$x; + print Dumper [@y] if ($VERBOSE > 2); + $response_non_coreference_links->{@y[0] . "-" . @y[1]} = 1; + } + + print Dumper $response_non_coreference_links if ($VERBOSE > 2); + print "" . 
"\n" if ($VERBOSE > 2); + + print ".....\n" if ($VERBOSE > 2); + print "\n" if ($VERBOSE > 2); + } + + print "\n" if ($VERBOSE > 2); + + print +"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n" + if ($VERBOSE > 2); + print Dumper $key_coreference_links if ($VERBOSE > 2); + print Dumper $response_coreference_links if ($VERBOSE > 2); + print Dumper $key_non_coreference_links if ($VERBOSE > 2); + print Dumper $response_non_coreference_links if ($VERBOSE > 2); + print +"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n" + if ($VERBOSE > 2); + + my @union_cl = my @isect_cl = (); + my %union_cl = my %isect_cl = (); + + my @kcl = keys %$key_coreference_links; + my @rcl = keys %$response_coreference_links; + + print Dumper @kcl if ($VERBOSE > 2); + print +"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n" + if ($VERBOSE > 2); + print Dumper @rcl if ($VERBOSE > 2); + print +"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n" + if ($VERBOSE > 2); + + foreach my $e (@kcl, @rcl) { $union_cl{$e}++ && $isect_cl{$e}++ } + + @union_cl = keys %union_cl; + @isect_cl = keys %isect_cl; + + print Dumper @isect_cl if ($VERBOSE > 2); + print +"********************************************************************************\n" + if ($VERBOSE > 2); + + my @union_ncl = my @isect_ncl = (); + my %union_ncl = my %isect_ncl = (); + + my @kncl = keys %$key_non_coreference_links; + my @rncl = keys %$response_non_coreference_links; + + print Dumper @kncl if ($VERBOSE > 2); + print +"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n" + if ($VERBOSE > 2); + print Dumper @rncl if ($VERBOSE > 2); + print +"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n" + if ($VERBOSE > 2); + + foreach my $e (@kncl, @rncl) { $union_ncl{$e}++ && $isect_ncl{$e}++ } + + @union_ncl = keys %union_ncl; + @isect_ncl = keys %isect_ncl; + + print Dumper @isect_ncl if ($VERBOSE > 2); + print +"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n" + if ($VERBOSE > 2); + + my $num_isect_cl = @isect_cl; + print +" number of links in the intersection of key and response coreference links: " + . $num_isect_cl . "\n" + if ($VERBOSE > 2); + + my $num_isect_ncl = @isect_ncl; + print +"number of links in the intersection of key and response non-coreference links: " + . $num_isect_ncl . "\n" + if ($VERBOSE > 2); + + my $num_key_coreference_links = keys %$key_coreference_links; + print "number of key coreference links: " . $num_key_coreference_links . "\n" + if ($VERBOSE > 2); + + my $num_response_coreference_links = keys %$response_coreference_links; + print "number of response coreference links: " + . $num_response_coreference_links . "\n" + if ($VERBOSE > 2); + + my $num_key_non_coreference_links = keys %$key_non_coreference_links; + print "number of key non-coreference links: " + . $num_key_non_coreference_links . "\n" + if ($VERBOSE > 2); + + my $num_response_non_coreference_links = + keys %$response_non_coreference_links; + print "number of response non-coreference links: " + . $num_response_non_coreference_links . "\n" + if ($VERBOSE > 2); + + my ($r_blanc, $p_blanc, $f_blanc) = ComputeBLANCFromCounts( + $num_isect_cl, $num_key_coreference_links, + $num_response_coreference_links, $num_isect_ncl, + $num_key_non_coreference_links, $num_response_non_coreference_links + ); + + print " blanc recall: " . $r_blanc . 
"\n" if ($VERBOSE > 2); + print "blanc precision: " . $p_blanc . "\n" if ($VERBOSE > 2); + print " blanc score: " . $f_blanc . "\n" if ($VERBOSE > 2); + print +">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n" + if ($VERBOSE > 2); + + return ( + $num_isect_cl, $num_key_coreference_links, + $num_isect_cl, $num_response_coreference_links, + $num_isect_ncl, $num_key_non_coreference_links, + $num_isect_ncl, $num_response_non_coreference_links + ); +} + +################################################################################ +# Compute BLANC recall, precision and F-measure from counts. +# Parameters: +# (#correct_coref_links, #key_coref_links, #response_coref_links, +# #correct_noncoref_links, #key_noncoref_links, #response_noncoref_links). +# Returns: (recall, precision, F-measure). +################################################################################ +sub ComputeBLANCFromCounts { + my ( + $num_isect_cl, $num_key_coreference_links, + $num_response_coreference_links, $num_isect_ncl, + $num_key_non_coreference_links, $num_response_non_coreference_links + ) = @_; + + my $kcl_recall = + ($num_key_coreference_links == 0) + ? 0 + : ($num_isect_cl / $num_key_coreference_links); + my $kcl_precision = + ($num_response_coreference_links == 0) + ? 0 + : ($num_isect_cl / $num_response_coreference_links); + + print +"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n" + if ($VERBOSE > 2); + print " coreference recall: " . $kcl_recall . "\n" if ($VERBOSE > 2); + print " coreference precision: " . $kcl_precision . "\n" if ($VERBOSE > 2); + + my $fcl = + ($kcl_recall + $kcl_precision == 0) + ? 0 + : (2 * $kcl_recall * $kcl_precision / ($kcl_recall + $kcl_precision)); + print " coreference f-score: " . $fcl . "\n" if ($VERBOSE > 2); + + my $kncl_recall = + ($num_key_non_coreference_links == 0) + ? 0 + : ($num_isect_ncl / $num_key_non_coreference_links); + my $kncl_precision = + ($num_response_non_coreference_links == 0) + ? 0 + : ($num_isect_ncl / $num_response_non_coreference_links); + + print +"--------------------------------------------------------------------------------\n" + if ($VERBOSE > 2); + print " non-coreference recall: " . $kncl_recall . "\n" if ($VERBOSE > 2); + print "non-coreference precision: " . $kncl_precision . "\n" + if ($VERBOSE > 2); + + my $fncl = + ($kncl_recall + $kncl_precision == 0) + ? 0 + : (2 * $kncl_recall * $kncl_precision / ($kncl_recall + $kncl_precision)); + print " non-coreference f-score: " . $fncl . 
"\n" if ($VERBOSE > 2); + print +"--------------------------------------------------------------------------------\n" + if ($VERBOSE > 2); + + my $r_blanc = -1; + my $p_blanc = -1; + my $f_blanc = -1; + + if ($num_key_coreference_links == 0 && $num_key_non_coreference_links == 0) { + $r_blanc = 0; + $p_blanc = 0; + $f_blanc = 0; + } + elsif ($num_key_coreference_links == 0 || $num_key_non_coreference_links == 0) + { + if ($num_key_coreference_links == 0) { + $r_blanc = $kncl_recall; + $p_blanc = $kncl_precision; + $f_blanc = $fncl; + } + elsif ($num_key_non_coreference_links == 0) { + $r_blanc = $kcl_recall; + $p_blanc = $kcl_precision; + $f_blanc = $fcl; + } + } + else { + $r_blanc = ($kcl_recall + $kncl_recall) / 2; + $p_blanc = ($kcl_precision + $kncl_precision) / 2; + $f_blanc = ($fcl + $fncl) / 2; + } + + return ($r_blanc, $p_blanc, $f_blanc); +} + +1; diff --git a/neuralcoref/scorer/lib/Cwd.pm b/neuralcoref/scorer/lib/Cwd.pm new file mode 100644 index 0000000..a48d205 --- /dev/null +++ b/neuralcoref/scorer/lib/Cwd.pm @@ -0,0 +1,836 @@ +package Cwd; + +=head1 NAME + +Cwd - get pathname of current working directory + +=head1 SYNOPSIS + + use Cwd; + my $dir = getcwd; + + use Cwd 'abs_path'; + my $abs_path = abs_path($file); + +=head1 DESCRIPTION + +This module provides functions for determining the pathname of the +current working directory. It is recommended that getcwd (or another +*cwd() function) be used in I code to ensure portability. + +By default, it exports the functions cwd(), getcwd(), fastcwd(), and +fastgetcwd() (and, on Win32, getdcwd()) into the caller's namespace. + + +=head2 getcwd and friends + +Each of these functions are called without arguments and return the +absolute path of the current working directory. + +=over 4 + +=item getcwd + + my $cwd = getcwd(); + +Returns the current working directory. + +Exposes the POSIX function getcwd(3) or re-implements it if it's not +available. + +=item cwd + + my $cwd = cwd(); + +The cwd() is the most natural form for the current architecture. For +most systems it is identical to `pwd` (but without the trailing line +terminator). + +=item fastcwd + + my $cwd = fastcwd(); + +A more dangerous version of getcwd(), but potentially faster. + +It might conceivably chdir() you out of a directory that it can't +chdir() you back into. If fastcwd encounters a problem it will return +undef but will probably leave you in a different directory. For a +measure of extra security, if everything appears to have worked, the +fastcwd() function will check that it leaves you in the same directory +that it started in. If it has changed it will C with the message +"Unstable directory path, current directory changed +unexpectedly". That should never happen. + +=item fastgetcwd + + my $cwd = fastgetcwd(); + +The fastgetcwd() function is provided as a synonym for cwd(). + +=item getdcwd + + my $cwd = getdcwd(); + my $cwd = getdcwd('C:'); + +The getdcwd() function is also provided on Win32 to get the current working +directory on the specified drive, since Windows maintains a separate current +working directory for each drive. If no drive is specified then the current +drive is assumed. + +This function simply calls the Microsoft C library _getdcwd() function. + +=back + + +=head2 abs_path and friends + +These functions are exported only on request. They each take a single +argument and return the absolute pathname for it. If no argument is +given they'll use the current working directory. 
+ +=over 4 + +=item abs_path + + my $abs_path = abs_path($file); + +Uses the same algorithm as getcwd(). Symbolic links and relative-path +components ("." and "..") are resolved to return the canonical +pathname, just like realpath(3). + +=item realpath + + my $abs_path = realpath($file); + +A synonym for abs_path(). + +=item fast_abs_path + + my $abs_path = fast_abs_path($file); + +A more dangerous, but potentially faster version of abs_path. + +=back + +=head2 $ENV{PWD} + +If you ask to override your chdir() built-in function, + + use Cwd qw(chdir); + +then your PWD environment variable will be kept up to date. Note that +it will only be kept up to date if all packages which use chdir import +it from Cwd. + + +=head1 NOTES + +=over 4 + +=item * + +Since the path separators are different on some operating systems ('/' +on Unix, ':' on MacPerl, etc...) we recommend you use the File::Spec +modules wherever portability is a concern. + +=item * + +Actually, on Mac OS, the C, C and C +functions are all aliases for the C function, which, on Mac OS, +calls `pwd`. Likewise, the C function is an alias for +C. + +=back + +=head1 AUTHOR + +Originally by the perl5-porters. + +Maintained by Ken Williams + +=head1 COPYRIGHT + +Copyright (c) 2004 by the Perl 5 Porters. All rights reserved. + +This program is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +Portions of the C code in this library are copyright (c) 1994 by the +Regents of the University of California. All rights reserved. The +license on this code is compatible with the licensing of the rest of +the distribution - please see the source code in F for the +details. + +=head1 SEE ALSO + +L + +=cut + +use strict; +use Exporter; +use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); + +$VERSION = '3.39_02'; +my $xs_version = $VERSION; +$VERSION =~ tr/_//; + +@ISA = qw/ Exporter /; +@EXPORT = qw(cwd getcwd fastcwd fastgetcwd); +push @EXPORT, qw(getdcwd) if $^O eq 'MSWin32'; +@EXPORT_OK = qw(chdir abs_path fast_abs_path realpath fast_realpath); + +# sys_cwd may keep the builtin command + +# All the functionality of this module may provided by builtins, +# there is no sense to process the rest of the file. +# The best choice may be to have this in BEGIN, but how to return from BEGIN? + +if ($^O eq 'os2') { + local $^W = 0; + + *cwd = defined &sys_cwd ? \&sys_cwd : \&_os2_cwd; + *getcwd = \&cwd; + *fastgetcwd = \&cwd; + *fastcwd = \&cwd; + + *fast_abs_path = \&sys_abspath if defined &sys_abspath; + *abs_path = \&fast_abs_path; + *realpath = \&fast_abs_path; + *fast_realpath = \&fast_abs_path; + + return 1; +} + +# Need to look up the feature settings on VMS. The preferred way is to use the +# VMS::Feature module, but that may not be available to dual life modules. + +my $use_vms_feature; +BEGIN { + if ($^O eq 'VMS') { + if (eval { local $SIG{__DIE__}; require VMS::Feature; }) { + $use_vms_feature = 1; + } + } +} + +# Need to look up the UNIX report mode. This may become a dynamic mode +# in the future. +sub _vms_unix_rpt { + my $unix_rpt; + if ($use_vms_feature) { + $unix_rpt = VMS::Feature::current("filename_unix_report"); + } else { + my $env_unix_rpt = $ENV{'DECC$FILENAME_UNIX_REPORT'} || ''; + $unix_rpt = $env_unix_rpt =~ /^[ET1]/i; + } + return $unix_rpt; +} + +# Need to look up the EFS character set mode. This may become a dynamic +# mode in the future. 
+sub _vms_efs { + my $efs; + if ($use_vms_feature) { + $efs = VMS::Feature::current("efs_charset"); + } else { + my $env_efs = $ENV{'DECC$EFS_CHARSET'} || ''; + $efs = $env_efs =~ /^[ET1]/i; + } + return $efs; +} + + +# If loading the XS stuff doesn't work, we can fall back to pure perl +eval { + if ( $] >= 5.006 ) { + require XSLoader; + XSLoader::load( __PACKAGE__, $xs_version); + } else { + require DynaLoader; + push @ISA, 'DynaLoader'; + __PACKAGE__->bootstrap( $xs_version ); + } +}; + +# Big nasty table of function aliases +my %METHOD_MAP = + ( + VMS => + { + cwd => '_vms_cwd', + getcwd => '_vms_cwd', + fastcwd => '_vms_cwd', + fastgetcwd => '_vms_cwd', + abs_path => '_vms_abs_path', + fast_abs_path => '_vms_abs_path', + }, + + MSWin32 => + { + # We assume that &_NT_cwd is defined as an XSUB or in the core. + cwd => '_NT_cwd', + getcwd => '_NT_cwd', + fastcwd => '_NT_cwd', + fastgetcwd => '_NT_cwd', + abs_path => 'fast_abs_path', + realpath => 'fast_abs_path', + }, + + dos => + { + cwd => '_dos_cwd', + getcwd => '_dos_cwd', + fastgetcwd => '_dos_cwd', + fastcwd => '_dos_cwd', + abs_path => 'fast_abs_path', + }, + + # QNX4. QNX6 has a $os of 'nto'. + qnx => + { + cwd => '_qnx_cwd', + getcwd => '_qnx_cwd', + fastgetcwd => '_qnx_cwd', + fastcwd => '_qnx_cwd', + abs_path => '_qnx_abs_path', + fast_abs_path => '_qnx_abs_path', + }, + + cygwin => + { + getcwd => 'cwd', + fastgetcwd => 'cwd', + fastcwd => 'cwd', + abs_path => 'fast_abs_path', + realpath => 'fast_abs_path', + }, + + epoc => + { + cwd => '_epoc_cwd', + getcwd => '_epoc_cwd', + fastgetcwd => '_epoc_cwd', + fastcwd => '_epoc_cwd', + abs_path => 'fast_abs_path', + }, + + MacOS => + { + getcwd => 'cwd', + fastgetcwd => 'cwd', + fastcwd => 'cwd', + abs_path => 'fast_abs_path', + }, + ); + +$METHOD_MAP{NT} = $METHOD_MAP{MSWin32}; + + +# Find the pwd command in the expected locations. We assume these +# are safe. This prevents _backtick_pwd() consulting $ENV{PATH} +# so everything works under taint mode. +my $pwd_cmd; +foreach my $try ('/bin/pwd', + '/usr/bin/pwd', + '/QOpenSys/bin/pwd', # OS/400 PASE. + ) { + + if( -x $try ) { + $pwd_cmd = $try; + last; + } +} +my $found_pwd_cmd = defined($pwd_cmd); +unless ($pwd_cmd) { + # Isn't this wrong? _backtick_pwd() will fail if somenone has + # pwd in their path but it is not /bin/pwd or /usr/bin/pwd? + # See [perl #16774]. --jhi + $pwd_cmd = 'pwd'; +} + +# Lazy-load Carp +sub _carp { require Carp; Carp::carp(@_) } +sub _croak { require Carp; Carp::croak(@_) } + +# The 'natural and safe form' for UNIX (pwd may be setuid root) +sub _backtick_pwd { + # Localize %ENV entries in a way that won't create new hash keys + my @localize = grep exists $ENV{$_}, qw(PATH IFS CDPATH ENV BASH_ENV); + local @ENV{@localize}; + + my $cwd = `$pwd_cmd`; + # Belt-and-suspenders in case someone said "undef $/". + local $/ = "\n"; + # `pwd` may fail e.g. if the disk is full + chomp($cwd) if defined $cwd; + $cwd; +} + +# Since some ports may predefine cwd internally (e.g., NT) +# we take care not to override an existing definition for cwd(). + +unless ($METHOD_MAP{$^O}{cwd} or defined &cwd) { + # The pwd command is not available in some chroot(2)'ed environments + my $sep = $Config::Config{path_sep} || ':'; + my $os = $^O; # Protect $^O from tainting + + + # Try again to find a pwd, this time searching the whole PATH. 
+ if (defined $ENV{PATH} and $os ne 'MSWin32') { # no pwd on Windows + my @candidates = split($sep, $ENV{PATH}); + while (!$found_pwd_cmd and @candidates) { + my $candidate = shift @candidates; + $found_pwd_cmd = 1 if -x "$candidate/pwd"; + } + } + + # MacOS has some special magic to make `pwd` work. + if( $os eq 'MacOS' || $found_pwd_cmd ) + { + *cwd = \&_backtick_pwd; + } + else { + *cwd = \&getcwd; + } +} + +if ($^O eq 'cygwin') { + # We need to make sure cwd() is called with no args, because it's + # got an arg-less prototype and will die if args are present. + local $^W = 0; + my $orig_cwd = \&cwd; + *cwd = sub { &$orig_cwd() } +} + + +# set a reasonable (and very safe) default for fastgetcwd, in case it +# isn't redefined later (20001212 rspier) +*fastgetcwd = \&cwd; + +# A non-XS version of getcwd() - also used to bootstrap the perl build +# process, when miniperl is running and no XS loading happens. +sub _perl_getcwd +{ + abs_path('.'); +} + +# By John Bazik +# +# Usage: $cwd = &fastcwd; +# +# This is a faster version of getcwd. It's also more dangerous because +# you might chdir out of a directory that you can't chdir back into. + +sub fastcwd_ { + my($odev, $oino, $cdev, $cino, $tdev, $tino); + my(@path, $path); + local(*DIR); + + my($orig_cdev, $orig_cino) = stat('.'); + ($cdev, $cino) = ($orig_cdev, $orig_cino); + for (;;) { + my $direntry; + ($odev, $oino) = ($cdev, $cino); + CORE::chdir('..') || return undef; + ($cdev, $cino) = stat('.'); + last if $odev == $cdev && $oino == $cino; + opendir(DIR, '.') || return undef; + for (;;) { + $direntry = readdir(DIR); + last unless defined $direntry; + next if $direntry eq '.'; + next if $direntry eq '..'; + + ($tdev, $tino) = lstat($direntry); + last unless $tdev != $odev || $tino != $oino; + } + closedir(DIR); + return undef unless defined $direntry; # should never happen + unshift(@path, $direntry); + } + $path = '/' . join('/', @path); + if ($^O eq 'apollo') { $path = "/".$path; } + # At this point $path may be tainted (if tainting) and chdir would fail. + # Untaint it then check that we landed where we started. + $path =~ /^(.*)\z/s # untaint + && CORE::chdir($1) or return undef; + ($cdev, $cino) = stat('.'); + die "Unstable directory path, current directory changed unexpectedly" + if $cdev != $orig_cdev || $cino != $orig_cino; + $path; +} +if (not defined &fastcwd) { *fastcwd = \&fastcwd_ } + + +# Keeps track of current working directory in PWD environment var +# Usage: +# use Cwd 'chdir'; +# chdir $newdir; + +my $chdir_init = 0; + +sub chdir_init { + if ($ENV{'PWD'} and $^O ne 'os2' and $^O ne 'dos' and $^O ne 'MSWin32') { + my($dd,$di) = stat('.'); + my($pd,$pi) = stat($ENV{'PWD'}); + if (!defined $dd or !defined $pd or $di != $pi or $dd != $pd) { + $ENV{'PWD'} = cwd(); + } + } + else { + my $wd = cwd(); + $wd = Win32::GetFullPathName($wd) if $^O eq 'MSWin32'; + $ENV{'PWD'} = $wd; + } + # Strip an automounter prefix (where /tmp_mnt/foo/bar == /foo/bar) + if ($^O ne 'MSWin32' and $ENV{'PWD'} =~ m|(/[^/]+(/[^/]+/[^/]+))(.*)|s) { + my($pd,$pi) = stat($2); + my($dd,$di) = stat($1); + if (defined $pd and defined $dd and $di == $pi and $dd == $pd) { + $ENV{'PWD'}="$2$3"; + } + } + $chdir_init = 1; +} + +sub chdir { + my $newdir = @_ ? 
shift : ''; # allow for no arg (chdir to HOME dir) + $newdir =~ s|///*|/|g unless $^O eq 'MSWin32'; + chdir_init() unless $chdir_init; + my $newpwd; + if ($^O eq 'MSWin32') { + # get the full path name *before* the chdir() + $newpwd = Win32::GetFullPathName($newdir); + } + + return 0 unless CORE::chdir $newdir; + + if ($^O eq 'VMS') { + return $ENV{'PWD'} = $ENV{'DEFAULT'} + } + elsif ($^O eq 'MacOS') { + return $ENV{'PWD'} = cwd(); + } + elsif ($^O eq 'MSWin32') { + $ENV{'PWD'} = $newpwd; + return 1; + } + + if (ref $newdir eq 'GLOB') { # in case a file/dir handle is passed in + $ENV{'PWD'} = cwd(); + } elsif ($newdir =~ m#^/#s) { + $ENV{'PWD'} = $newdir; + } else { + my @curdir = split(m#/#,$ENV{'PWD'}); + @curdir = ('') unless @curdir; + my $component; + foreach $component (split(m#/#, $newdir)) { + next if $component eq '.'; + pop(@curdir),next if $component eq '..'; + push(@curdir,$component); + } + $ENV{'PWD'} = join('/',@curdir) || '/'; + } + 1; +} + + +sub _perl_abs_path +{ + my $start = @_ ? shift : '.'; + my($dotdots, $cwd, @pst, @cst, $dir, @tst); + + unless (@cst = stat( $start )) + { + _carp("stat($start): $!"); + return ''; + } + + unless (-d _) { + # Make sure we can be invoked on plain files, not just directories. + # NOTE that this routine assumes that '/' is the only directory separator. + + my ($dir, $file) = $start =~ m{^(.*)/(.+)$} + or return cwd() . '/' . $start; + + # Can't use "-l _" here, because the previous stat was a stat(), not an lstat(). + if (-l $start) { + my $link_target = readlink($start); + die "Can't resolve link $start: $!" unless defined $link_target; + + require File::Spec; + $link_target = $dir . '/' . $link_target + unless File::Spec->file_name_is_absolute($link_target); + + return abs_path($link_target); + } + + return $dir ? abs_path($dir) . "/$file" : "/$file"; + } + + $cwd = ''; + $dotdots = $start; + do + { + $dotdots .= '/..'; + @pst = @cst; + local *PARENT; + unless (opendir(PARENT, $dotdots)) + { + # probably a permissions issue. Try the native command. + require File::Spec; + return File::Spec->rel2abs( $start, _backtick_pwd() ); + } + unless (@cst = stat($dotdots)) + { + _carp("stat($dotdots): $!"); + closedir(PARENT); + return ''; + } + if ($pst[0] == $cst[0] && $pst[1] == $cst[1]) + { + $dir = undef; + } + else + { + do + { + unless (defined ($dir = readdir(PARENT))) + { + _carp("readdir($dotdots): $!"); + closedir(PARENT); + return ''; + } + $tst[0] = $pst[0]+1 unless (@tst = lstat("$dotdots/$dir")) + } + while ($dir eq '.' || $dir eq '..' || $tst[0] != $pst[0] || + $tst[1] != $pst[1]); + } + $cwd = (defined $dir ? "$dir" : "" ) . "/$cwd" ; + closedir(PARENT); + } while (defined $dir); + chop($cwd) unless $cwd eq '/'; # drop the trailing / + $cwd; +} + + +my $Curdir; +sub fast_abs_path { + local $ENV{PWD} = $ENV{PWD} || ''; # Guard against clobberage + my $cwd = getcwd(); + require File::Spec; + my $path = @_ ? shift : ($Curdir ||= File::Spec->curdir); + + # Detaint else we'll explode in taint mode. This is safe because + # we're not doing anything dangerous with it. + ($path) = $path =~ /(.*)/; + ($cwd) = $cwd =~ /(.*)/; + + unless (-e $path) { + _croak("$path: No such file or directory"); + } + + unless (-d _) { + # Make sure we can be invoked on plain files, not just directories. + + my ($vol, $dir, $file) = File::Spec->splitpath($path); + return File::Spec->catfile($cwd, $path) unless length $dir; + + if (-l $path) { + my $link_target = readlink($path); + die "Can't resolve link $path: $!" 
unless defined $link_target; + + $link_target = File::Spec->catpath($vol, $dir, $link_target) + unless File::Spec->file_name_is_absolute($link_target); + + return fast_abs_path($link_target); + } + + return $dir eq File::Spec->rootdir + ? File::Spec->catpath($vol, $dir, $file) + : fast_abs_path(File::Spec->catpath($vol, $dir, '')) . '/' . $file; + } + + if (!CORE::chdir($path)) { + _croak("Cannot chdir to $path: $!"); + } + my $realpath = getcwd(); + if (! ((-d $cwd) && (CORE::chdir($cwd)))) { + _croak("Cannot chdir back to $cwd: $!"); + } + $realpath; +} + +# added function alias to follow principle of least surprise +# based on previous aliasing. --tchrist 27-Jan-00 +*fast_realpath = \&fast_abs_path; + + +# --- PORTING SECTION --- + +# VMS: $ENV{'DEFAULT'} points to default directory at all times +# 06-Mar-1996 Charles Bailey bailey@newman.upenn.edu +# Note: Use of Cwd::chdir() causes the logical name PWD to be defined +# in the process logical name table as the default device and directory +# seen by Perl. This may not be the same as the default device +# and directory seen by DCL after Perl exits, since the effects +# the CRTL chdir() function persist only until Perl exits. + +sub _vms_cwd { + return $ENV{'DEFAULT'}; +} + +sub _vms_abs_path { + return $ENV{'DEFAULT'} unless @_; + my $path = shift; + + my $efs = _vms_efs; + my $unix_rpt = _vms_unix_rpt; + + if (defined &VMS::Filespec::vmsrealpath) { + my $path_unix = 0; + my $path_vms = 0; + + $path_unix = 1 if ($path =~ m#(?<=\^)/#); + $path_unix = 1 if ($path =~ /^\.\.?$/); + $path_vms = 1 if ($path =~ m#[\[<\]]#); + $path_vms = 1 if ($path =~ /^--?$/); + + my $unix_mode = $path_unix; + if ($efs) { + # In case of a tie, the Unix report mode decides. + if ($path_vms == $path_unix) { + $unix_mode = $unix_rpt; + } else { + $unix_mode = 0 if $path_vms; + } + } + + if ($unix_mode) { + # Unix format + return VMS::Filespec::unixrealpath($path); + } + + # VMS format + + my $new_path = VMS::Filespec::vmsrealpath($path); + + # Perl expects directories to be in directory format + $new_path = VMS::Filespec::pathify($new_path) if -d $path; + return $new_path; + } + + # Fallback to older algorithm if correct ones are not + # available. + + if (-l $path) { + my $link_target = readlink($path); + die "Can't resolve link $path: $!" unless defined $link_target; + + return _vms_abs_path($link_target); + } + + # may need to turn foo.dir into [.foo] + my $pathified = VMS::Filespec::pathify($path); + $path = $pathified if defined $pathified; + + return VMS::Filespec::rmsexpand($path); +} + +sub _os2_cwd { + $ENV{'PWD'} = `cmd /c cd`; + chomp $ENV{'PWD'}; + $ENV{'PWD'} =~ s:\\:/:g ; + return $ENV{'PWD'}; +} + +sub _win32_cwd_simple { + $ENV{'PWD'} = `cd`; + chomp $ENV{'PWD'}; + $ENV{'PWD'} =~ s:\\:/:g ; + return $ENV{'PWD'}; +} + +sub _win32_cwd { + # Need to avoid taking any sort of reference to the typeglob or the code in + # the optree, so that this tests the runtime state of things, as the + # ExtUtils::MakeMaker tests for "miniperl" need to be able to fake things at + # runtime by deleting the subroutine. *foo{THING} syntax on a symbol table + # lookup avoids needing a string eval, which has been reported to cause + # problems (for reasons that we haven't been able to get to the bottom of - + # rt.cpan.org #56225) + if (*{$DynaLoader::{boot_DynaLoader}}{CODE}) { + $ENV{'PWD'} = Win32::GetCwd(); + } + else { # miniperl + chomp($ENV{'PWD'} = `cd`); + } + $ENV{'PWD'} =~ s:\\:/:g ; + return $ENV{'PWD'}; +} + +*_NT_cwd = defined &Win32::GetCwd ? 
\&_win32_cwd : \&_win32_cwd_simple; + +sub _dos_cwd { + if (!defined &Dos::GetCwd) { + $ENV{'PWD'} = `command /c cd`; + chomp $ENV{'PWD'}; + $ENV{'PWD'} =~ s:\\:/:g ; + } else { + $ENV{'PWD'} = Dos::GetCwd(); + } + return $ENV{'PWD'}; +} + +sub _qnx_cwd { + local $ENV{PATH} = ''; + local $ENV{CDPATH} = ''; + local $ENV{ENV} = ''; + $ENV{'PWD'} = `/usr/bin/fullpath -t`; + chomp $ENV{'PWD'}; + return $ENV{'PWD'}; +} + +sub _qnx_abs_path { + local $ENV{PATH} = ''; + local $ENV{CDPATH} = ''; + local $ENV{ENV} = ''; + my $path = @_ ? shift : '.'; + local *REALPATH; + + defined( open(REALPATH, '-|') || exec '/usr/bin/fullpath', '-t', $path ) or + die "Can't open /usr/bin/fullpath: $!"; + my $realpath = ; + close REALPATH; + chomp $realpath; + return $realpath; +} + +sub _epoc_cwd { + $ENV{'PWD'} = EPOC::getcwd(); + return $ENV{'PWD'}; +} + + +# Now that all the base-level functions are set up, alias the +# user-level functions to the right places + +if (exists $METHOD_MAP{$^O}) { + my $map = $METHOD_MAP{$^O}; + foreach my $name (keys %$map) { + local $^W = 0; # assignments trigger 'subroutine redefined' warning + no strict 'refs'; + *{$name} = \&{$map->{$name}}; + } +} + +# In case the XS version doesn't load. +*abs_path = \&_perl_abs_path unless defined &abs_path; +*getcwd = \&_perl_getcwd unless defined &getcwd; + +# added function alias for those of us more +# used to the libc function. --tchrist 27-Jan-00 +*realpath = \&abs_path; + +1; diff --git a/neuralcoref/scorer/lib/Data/Dumper.pm b/neuralcoref/scorer/lib/Data/Dumper.pm new file mode 100644 index 0000000..a099277 --- /dev/null +++ b/neuralcoref/scorer/lib/Data/Dumper.pm @@ -0,0 +1,1341 @@ +# +# Data/Dumper.pm +# +# convert perl data structures into perl syntax suitable for both printing +# and eval +# +# Documentation at the __END__ +# + +package Data::Dumper; + +BEGIN { + $VERSION = '2.135_06'; # Don't forget to set version and release +} # date in POD! + +#$| = 1; + +use 5.006_001; +require Exporter; +require overload; + +use Carp; + +BEGIN { + @ISA = qw(Exporter); + @EXPORT = qw(Dumper); + @EXPORT_OK = qw(DumperX); + + # if run under miniperl, or otherwise lacking dynamic loading, + # XSLoader should be attempted to load, or the pure perl flag + # toggled on load failure. + eval { + require XSLoader; + XSLoader::load( 'Data::Dumper' ); + 1 + } + or $Useperl = 1; +} + +# module vars and their defaults +$Indent = 2 unless defined $Indent; +$Purity = 0 unless defined $Purity; +$Pad = "" unless defined $Pad; +$Varname = "VAR" unless defined $Varname; +$Useqq = 0 unless defined $Useqq; +$Terse = 0 unless defined $Terse; +$Freezer = "" unless defined $Freezer; +$Toaster = "" unless defined $Toaster; +$Deepcopy = 0 unless defined $Deepcopy; +$Quotekeys = 1 unless defined $Quotekeys; +$Bless = "bless" unless defined $Bless; +#$Expdepth = 0 unless defined $Expdepth; +$Maxdepth = 0 unless defined $Maxdepth; +$Pair = ' => ' unless defined $Pair; +$Useperl = 0 unless defined $Useperl; +$Sortkeys = 0 unless defined $Sortkeys; +$Deparse = 0 unless defined $Deparse; + +# +# expects an arrayref of values to be dumped. +# can optionally pass an arrayref of names for the values. +# names must have leading $ sign stripped. begin the name with * +# to cause output of arrays and hashes rather than refs. 
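# For example (variable names are illustrative):
#   my $d = Data::Dumper->new([$foo, \@ary], [qw(foo *ary)]);
#   print $d->Dump;
# dumps the first value under the name '$foo', and the second as a real
# array '@ary' rather than as an array reference, because its name begins
# with '*'.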
+# +sub new { + my($c, $v, $n) = @_; + + croak "Usage: PACKAGE->new(ARRAYREF, [ARRAYREF])" + unless (defined($v) && (ref($v) eq 'ARRAY')); + $n = [] unless (defined($n) && (ref($n) eq 'ARRAY')); + + my($s) = { + level => 0, # current recursive depth + indent => $Indent, # various styles of indenting + pad => $Pad, # all lines prefixed by this string + xpad => "", # padding-per-level + apad => "", # added padding for hash keys n such + sep => "", # list separator + pair => $Pair, # hash key/value separator: defaults to ' => ' + seen => {}, # local (nested) refs (id => [name, val]) + todump => $v, # values to dump [] + names => $n, # optional names for values [] + varname => $Varname, # prefix to use for tagging nameless ones + purity => $Purity, # degree to which output is evalable + useqq => $Useqq, # use "" for strings (backslashitis ensues) + terse => $Terse, # avoid name output (where feasible) + freezer => $Freezer, # name of Freezer method for objects + toaster => $Toaster, # name of method to revive objects + deepcopy => $Deepcopy, # dont cross-ref, except to stop recursion + quotekeys => $Quotekeys, # quote hash keys + 'bless' => $Bless, # keyword to use for "bless" +# expdepth => $Expdepth, # cutoff depth for explicit dumping + maxdepth => $Maxdepth, # depth beyond which we give up + useperl => $Useperl, # use the pure Perl implementation + sortkeys => $Sortkeys, # flag or filter for sorting hash keys + deparse => $Deparse, # use B::Deparse for coderefs + }; + + if ($Indent > 0) { + $s->{xpad} = " "; + $s->{sep} = "\n"; + } + return bless($s, $c); +} + +# Packed numeric addresses take less memory. Plus pack is faster than sprintf + +# Most users of current versions of Data::Dumper will be 5.008 or later. +# Anyone on 5.6.1 and 5.6.2 upgrading will be rare (particularly judging by +# the bug reports from users on those platforms), so for the common case avoid +# complexity, and avoid even compiling the unneeded code. + +sub init_refaddr_format { +} + +sub format_refaddr { + require Scalar::Util; + pack "J", Scalar::Util::refaddr(shift); +}; + +if ($] < 5.008) { + eval <<'EOC' or die; + no warnings 'redefine'; + my $refaddr_format; + sub init_refaddr_format { + require Config; + my $f = $Config::Config{uvxformat}; + $f =~ tr/"//d; + $refaddr_format = "0x%" . $f; + } + + sub format_refaddr { + require Scalar::Util; + sprintf $refaddr_format, Scalar::Util::refaddr(shift); + } + + 1 +EOC +} + +# +# add-to or query the table of already seen references +# +sub Seen { + my($s, $g) = @_; + if (defined($g) && (ref($g) eq 'HASH')) { + init_refaddr_format(); + my($k, $v, $id); + while (($k, $v) = each %$g) { + if (defined $v and ref $v) { + $id = format_refaddr($v); + if ($k =~ /^[*](.*)$/) { + $k = (ref $v eq 'ARRAY') ? ( "\\\@" . $1 ) : + (ref $v eq 'HASH') ? ( "\\\%" . $1 ) : + (ref $v eq 'CODE') ? ( "\\\&" . $1 ) : + ( "\$" . $1 ) ; + } + elsif ($k !~ /^\$/) { + $k = "\$" . 
$k; + } + $s->{seen}{$id} = [$k, $v]; + } + else { + carp "Only refs supported, ignoring non-ref item \$$k"; + } + } + return $s; + } + else { + return map { @$_ } values %{$s->{seen}}; + } +} + +# +# set or query the values to be dumped +# +sub Values { + my($s, $v) = @_; + if (defined($v) && (ref($v) eq 'ARRAY')) { + $s->{todump} = [@$v]; # make a copy + return $s; + } + else { + return @{$s->{todump}}; + } +} + +# +# set or query the names of the values to be dumped +# +sub Names { + my($s, $n) = @_; + if (defined($n) && (ref($n) eq 'ARRAY')) { + $s->{names} = [@$n]; # make a copy + return $s; + } + else { + return @{$s->{names}}; + } +} + +sub DESTROY {} + +sub Dump { + return &Dumpxs + unless $Data::Dumper::Useperl || (ref($_[0]) && $_[0]->{useperl}) || + $Data::Dumper::Useqq || (ref($_[0]) && $_[0]->{useqq}) || + $Data::Dumper::Deparse || (ref($_[0]) && $_[0]->{deparse}); + return &Dumpperl; +} + +# +# dump the refs in the current dumper object. +# expects same args as new() if called via package name. +# +sub Dumpperl { + my($s) = shift; + my(@out, $val, $name); + my($i) = 0; + local(@post); + init_refaddr_format(); + + $s = $s->new(@_) unless ref $s; + + for $val (@{$s->{todump}}) { + my $out = ""; + @post = (); + $name = $s->{names}[$i++]; + if (defined $name) { + if ($name =~ /^[*](.*)$/) { + if (defined $val) { + $name = (ref $val eq 'ARRAY') ? ( "\@" . $1 ) : + (ref $val eq 'HASH') ? ( "\%" . $1 ) : + (ref $val eq 'CODE') ? ( "\*" . $1 ) : + ( "\$" . $1 ) ; + } + else { + $name = "\$" . $1; + } + } + elsif ($name !~ /^\$/) { + $name = "\$" . $name; + } + } + else { + $name = "\$" . $s->{varname} . $i; + } + + my $valstr; + { + local($s->{apad}) = $s->{apad}; + $s->{apad} .= ' ' x (length($name) + 3) if $s->{indent} >= 2 and !$s->{terse}; + $valstr = $s->_dump($val, $name); + } + + $valstr = "$name = " . $valstr . ';' if @post or !$s->{terse}; + $out .= $s->{pad} . $valstr . $s->{sep}; + $out .= $s->{pad} . join(';' . $s->{sep} . $s->{pad}, @post) + . ';' . $s->{sep} if @post; + + push @out, $out; + } + return wantarray ? @out : join('', @out); +} + +# wrap string in single quotes (escaping if needed) +sub _quote { + my $val = shift; + $val =~ s/([\\\'])/\\$1/g; + return "'" . $val . "'"; +} + +# Old Perls (5.14-) have trouble resetting vstring magic when it is no +# longer valid. +use constant _bad_vsmg => defined &_vstring && (_vstring(~v0)||'') eq "v0"; + +# +# twist, toil and turn; +# and recurse, of course. +# sometimes sordidly; +# and curse if no recourse. +# +sub _dump { + my($s, $val, $name) = @_; + my($sname); + my($out, $realpack, $realtype, $type, $ipad, $id, $blesspad); + + $type = ref $val; + $out = ""; + + if ($type) { + + # Call the freezer method if it's specified and the object has the + # method. Trap errors and warn() instead of die()ing, like the XS + # implementation. + my $freezer = $s->{freezer}; + if ($freezer and UNIVERSAL::can($val, $freezer)) { + eval { $val->$freezer() }; + warn "WARNING(Freezer method call failed): $@" if $@; + } + + require Scalar::Util; + $realpack = Scalar::Util::blessed($val); + $realtype = $realpack ? Scalar::Util::reftype($val) : ref $val; + $id = format_refaddr($val); + + # if it has a name, we need to either look it up, or keep a tab + # on it so we know when we hit it later + if (defined($name) and length($name)) { + # keep a tab on it so that we dont fall into recursive pit + if (exists $s->{seen}{$id}) { +# if ($s->{expdepth} < $s->{level}) { + if ($s->{purity} and $s->{level} > 0) { + $out = ($realtype eq 'HASH') ? 
'{}' : + ($realtype eq 'ARRAY') ? '[]' : + 'do{my $o}' ; + push @post, $name . " = " . $s->{seen}{$id}[0]; + } + else { + $out = $s->{seen}{$id}[0]; + if ($name =~ /^([\@\%])/) { + my $start = $1; + if ($out =~ /^\\$start/) { + $out = substr($out, 1); + } + else { + $out = $start . '{' . $out . '}'; + } + } + } + return $out; +# } + } + else { + # store our name + $s->{seen}{$id} = [ (($name =~ /^[@%]/) ? ('\\' . $name ) : + ($realtype eq 'CODE' and + $name =~ /^[*](.*)$/) ? ('\\&' . $1 ) : + $name ), + $val ]; + } + } + my $no_bless = 0; + my $is_regex = 0; + if ( $realpack and ($] >= 5.009005 ? re::is_regexp($val) : $realpack eq 'Regexp') ) { + $is_regex = 1; + $no_bless = $realpack eq 'Regexp'; + } + + # If purity is not set and maxdepth is set, then check depth: + # if we have reached maximum depth, return the string + # representation of the thing we are currently examining + # at this depth (i.e., 'Foo=ARRAY(0xdeadbeef)'). + if (!$s->{purity} + and $s->{maxdepth} > 0 + and $s->{level} >= $s->{maxdepth}) + { + return qq['$val']; + } + + # we have a blessed ref + if ($realpack and !$no_bless) { + $out = $s->{'bless'} . '( '; + $blesspad = $s->{apad}; + $s->{apad} .= ' ' if ($s->{indent} >= 2); + } + + $s->{level}++; + $ipad = $s->{xpad} x $s->{level}; + + if ($is_regex) { + my $pat; + # This really sucks, re:regexp_pattern is in ext/re/re.xs and not in + # universal.c, and even worse we cant just require that re to be loaded + # we *have* to use() it. + # We should probably move it to universal.c for 5.10.1 and fix this. + # Currently we only use re::regexp_pattern when the re is blessed into another + # package. This has the disadvantage of meaning that a DD dump won't round trip + # as the pattern will be repeatedly wrapped with the same modifiers. + # This is an aesthetic issue so we will leave it for now, but we could use + # regexp_pattern() in list context to get the modifiers separately. + # But since this means loading the full debugging engine in process we wont + # bother unless its necessary for accuracy. + if (($realpack ne 'Regexp') && defined(*re::regexp_pattern{CODE})) { + $pat = re::regexp_pattern($val); + } else { + $pat = "$val"; + } + $pat =~ s <(\\.)|/> { $1 || '\\/' }ge; + $out .= "qr/$pat/"; + } + elsif ($realtype eq 'SCALAR' || $realtype eq 'REF' + || $realtype eq 'VSTRING') { + if ($realpack) { + $out .= 'do{\\(my $o = ' . $s->_dump($$val, "\${$name}") . ')}'; + } + else { + $out .= '\\' . $s->_dump($$val, "\${$name}"); + } + } + elsif ($realtype eq 'GLOB') { + $out .= '\\' . $s->_dump($$val, "*{$name}"); + } + elsif ($realtype eq 'ARRAY') { + my($pad, $mname); + my($i) = 0; + $out .= ($name =~ /^\@/) ? '(' : '['; + $pad = $s->{sep} . $s->{pad} . $s->{apad}; + ($name =~ /^\@(.*)$/) ? ($mname = "\$" . $1) : + # omit -> if $foo->[0]->{bar}, but not ${$foo->[0]}->{bar} + ($name =~ /^\\?[\%\@\*\$][^{].*[]}]$/) ? ($mname = $name) : + ($mname = $name . '->'); + $mname .= '->' if $mname =~ /^\*.+\{[A-Z]+\}$/; + for my $v (@$val) { + $sname = $mname . '[' . $i . ']'; + $out .= $pad . $ipad . '#' . $i if $s->{indent} >= 3; + $out .= $pad . $ipad . $s->_dump($v, $sname); + $out .= "," if $i++ < $#$val; + } + $out .= $pad . ($s->{xpad} x ($s->{level} - 1)) if $i; + $out .= ($name =~ /^\@/) ? ')' : ']'; + } + elsif ($realtype eq 'HASH') { + my($k, $v, $pad, $lpad, $mname, $pair); + $out .= ($name =~ /^\%/) ? '(' : '{'; + $pad = $s->{sep} . $s->{pad} . $s->{apad}; + $lpad = $s->{apad}; + $pair = $s->{pair}; + ($name =~ /^\%(.*)$/) ? ($mname = "\$" . 
$1) : + # omit -> if $foo->[0]->{bar}, but not ${$foo->[0]}->{bar} + ($name =~ /^\\?[\%\@\*\$][^{].*[]}]$/) ? ($mname = $name) : + ($mname = $name . '->'); + $mname .= '->' if $mname =~ /^\*.+\{[A-Z]+\}$/; + my ($sortkeys, $keys, $key) = ("$s->{sortkeys}"); + if ($sortkeys) { + if (ref($s->{sortkeys}) eq 'CODE') { + $keys = $s->{sortkeys}($val); + unless (ref($keys) eq 'ARRAY') { + carp "Sortkeys subroutine did not return ARRAYREF"; + $keys = []; + } + } + else { + $keys = [ sort keys %$val ]; + } + } + + # Ensure hash iterator is reset + keys(%$val); + + while (($k, $v) = ! $sortkeys ? (each %$val) : + @$keys ? ($key = shift(@$keys), $val->{$key}) : + () ) + { + my $nk = $s->_dump($k, ""); + $nk = $1 if !$s->{quotekeys} and $nk =~ /^[\"\']([A-Za-z_]\w*)[\"\']$/; + $sname = $mname . '{' . $nk . '}'; + $out .= $pad . $ipad . $nk . $pair; + + # temporarily alter apad + $s->{apad} .= (" " x (length($nk) + 4)) if $s->{indent} >= 2; + $out .= $s->_dump($val->{$k}, $sname) . ","; + $s->{apad} = $lpad if $s->{indent} >= 2; + } + if (substr($out, -1) eq ',') { + chop $out; + $out .= $pad . ($s->{xpad} x ($s->{level} - 1)); + } + $out .= ($name =~ /^\%/) ? ')' : '}'; + } + elsif ($realtype eq 'CODE') { + if ($s->{deparse}) { + require B::Deparse; + my $sub = 'sub ' . (B::Deparse->new)->coderef2text($val); + $pad = $s->{sep} . $s->{pad} . $s->{apad} . $s->{xpad} x ($s->{level} - 1); + $sub =~ s/\n/$pad/gse; + $out .= $sub; + } else { + $out .= 'sub { "DUMMY" }'; + carp "Encountered CODE ref, using dummy placeholder" if $s->{purity}; + } + } + else { + croak "Can\'t handle $realtype type."; + } + + if ($realpack and !$no_bless) { # we have a blessed ref + $out .= ', ' . _quote($realpack) . ' )'; + $out .= '->' . $s->{toaster} . '()' if $s->{toaster} ne ''; + $s->{apad} = $blesspad; + } + $s->{level}--; + + } + else { # simple scalar + + my $ref = \$_[1]; + my $v; + # first, catalog the scalar + if ($name ne '') { + $id = format_refaddr($ref); + if (exists $s->{seen}{$id}) { + if ($s->{seen}{$id}[2]) { + $out = $s->{seen}{$id}[0]; + #warn "[<$out]\n"; + return "\${$out}"; + } + } + else { + #warn "[>\\$name]\n"; + $s->{seen}{$id} = ["\\$name", $ref]; + } + } + $ref = \$val; + if (ref($ref) eq 'GLOB') { # glob + my $name = substr($val, 1); + if ($name =~ /^[A-Za-z_][\w:]*$/ && $name ne 'main::') { + $name =~ s/^main::/::/; + $sname = $name; + } + else { + $sname = $s->_dump( + $name eq 'main::' || $] < 5.007 && $name eq "main::\0" + ? '' + : $name, + "", + ); + $sname = '{' . $sname . '}'; + } + if ($s->{purity}) { + my $k; + local ($s->{level}) = 0; + for $k (qw(SCALAR ARRAY HASH)) { + my $gval = *$val{$k}; + next unless defined $gval; + next if $k eq "SCALAR" && ! defined $$gval; # always there + + # _dump can push into @post, so we hold our place using $postlen + my $postlen = scalar @post; + $post[$postlen] = "\*$sname = "; + local ($s->{apad}) = " " x length($post[$postlen]) if $s->{indent} >= 2; + $post[$postlen] .= $s->_dump($gval, "\*$sname\{$k\}"); + } + } + $out .= '*' . 
$sname; + } + elsif (!defined($val)) { + $out .= "undef"; + } + elsif (defined &_vstring and $v = _vstring($val) + and !_bad_vsmg || eval $v eq $val) { + $out .= $v; + } + elsif (!defined &_vstring + and ref $ref eq 'VSTRING' || eval{Scalar::Util::isvstring($val)}) { + $out .= sprintf "%vd", $val; + } + elsif ($val =~ /^(?:0|-?[1-9]\d{0,8})\z/) { # safe decimal number + $out .= $val; + } + else { # string + if ($s->{useqq} or $val =~ tr/\0-\377//c) { + # Fall back to qq if there's Unicode + $out .= qquote($val, $s->{useqq}); + } + else { + $out .= _quote($val); + } + } + } + if ($id) { + # if we made it this far, $id was added to seen list at current + # level, so remove it to get deep copies + if ($s->{deepcopy}) { + delete($s->{seen}{$id}); + } + elsif ($name) { + $s->{seen}{$id}[2] = 1; + } + } + return $out; +} + +# +# non-OO style of earlier version +# +sub Dumper { + return Data::Dumper->Dump([@_]); +} + +# compat stub +sub DumperX { + return Data::Dumper->Dumpxs([@_], []); +} + +sub Dumpf { return Data::Dumper->Dump(@_) } + +sub Dumpp { print Data::Dumper->Dump(@_) } + +# +# reset the "seen" cache +# +sub Reset { + my($s) = shift; + $s->{seen} = {}; + return $s; +} + +sub Indent { + my($s, $v) = @_; + if (defined($v)) { + if ($v == 0) { + $s->{xpad} = ""; + $s->{sep} = ""; + } + else { + $s->{xpad} = " "; + $s->{sep} = "\n"; + } + $s->{indent} = $v; + return $s; + } + else { + return $s->{indent}; + } +} + +sub Pair { + my($s, $v) = @_; + defined($v) ? (($s->{pair} = $v), return $s) : $s->{pair}; +} + +sub Pad { + my($s, $v) = @_; + defined($v) ? (($s->{pad} = $v), return $s) : $s->{pad}; +} + +sub Varname { + my($s, $v) = @_; + defined($v) ? (($s->{varname} = $v), return $s) : $s->{varname}; +} + +sub Purity { + my($s, $v) = @_; + defined($v) ? (($s->{purity} = $v), return $s) : $s->{purity}; +} + +sub Useqq { + my($s, $v) = @_; + defined($v) ? (($s->{useqq} = $v), return $s) : $s->{useqq}; +} + +sub Terse { + my($s, $v) = @_; + defined($v) ? (($s->{terse} = $v), return $s) : $s->{terse}; +} + +sub Freezer { + my($s, $v) = @_; + defined($v) ? (($s->{freezer} = $v), return $s) : $s->{freezer}; +} + +sub Toaster { + my($s, $v) = @_; + defined($v) ? (($s->{toaster} = $v), return $s) : $s->{toaster}; +} + +sub Deepcopy { + my($s, $v) = @_; + defined($v) ? (($s->{deepcopy} = $v), return $s) : $s->{deepcopy}; +} + +sub Quotekeys { + my($s, $v) = @_; + defined($v) ? (($s->{quotekeys} = $v), return $s) : $s->{quotekeys}; +} + +sub Bless { + my($s, $v) = @_; + defined($v) ? (($s->{'bless'} = $v), return $s) : $s->{'bless'}; +} + +sub Maxdepth { + my($s, $v) = @_; + defined($v) ? (($s->{'maxdepth'} = $v), return $s) : $s->{'maxdepth'}; +} + +sub Useperl { + my($s, $v) = @_; + defined($v) ? (($s->{'useperl'} = $v), return $s) : $s->{'useperl'}; +} + +sub Sortkeys { + my($s, $v) = @_; + defined($v) ? (($s->{'sortkeys'} = $v), return $s) : $s->{'sortkeys'}; +} + +sub Deparse { + my($s, $v) = @_; + defined($v) ? 
(($s->{'deparse'} = $v), return $s) : $s->{'deparse'}; +} + +# used by qquote below +my %esc = ( + "\a" => "\\a", + "\b" => "\\b", + "\t" => "\\t", + "\n" => "\\n", + "\f" => "\\f", + "\r" => "\\r", + "\e" => "\\e", +); + +# put a string value in double quotes +sub qquote { + local($_) = shift; + s/([\\\"\@\$])/\\$1/g; + my $bytes; { use bytes; $bytes = length } + s/([^\x00-\x7f])/'\x{'.sprintf("%x",ord($1)).'}'/ge if $bytes > length; + return qq("$_") unless + /[^ !"\#\$%&'()*+,\-.\/0-9:;<=>?\@A-Z[\\\]^_`a-z{|}~]/; # fast exit + + my $high = shift || ""; + s/([\a\b\t\n\f\r\e])/$esc{$1}/g; + + if (ord('^')==94) { # ascii + # no need for 3 digits in escape for these + s/([\0-\037])(?!\d)/'\\'.sprintf('%o',ord($1))/eg; + s/([\0-\037\177])/'\\'.sprintf('%03o',ord($1))/eg; + # all but last branch below not supported --BEHAVIOR SUBJECT TO CHANGE-- + if ($high eq "iso8859") { + s/([\200-\240])/'\\'.sprintf('%o',ord($1))/eg; + } elsif ($high eq "utf8") { +# use utf8; +# $str =~ s/([^\040-\176])/sprintf "\\x{%04x}", ord($1)/ge; + } elsif ($high eq "8bit") { + # leave it as it is + } else { + s/([\200-\377])/'\\'.sprintf('%03o',ord($1))/eg; + s/([^\040-\176])/sprintf "\\x{%04x}", ord($1)/ge; + } + } + else { # ebcdic + s{([^ !"\#\$%&'()*+,\-.\/0-9:;<=>?\@A-Z[\\\]^_`a-z{|}~])(?!\d)} + {my $v = ord($1); '\\'.sprintf(($v <= 037 ? '%o' : '%03o'), $v)}eg; + s{([^ !"\#\$%&'()*+,\-.\/0-9:;<=>?\@A-Z[\\\]^_`a-z{|}~])} + {'\\'.sprintf('%03o',ord($1))}eg; + } + + return qq("$_"); +} + +# helper sub to sort hash keys in Perl < 5.8.0 where we don't have +# access to sortsv() from XS +sub _sortkeys { [ sort keys %{$_[0]} ] } + +1; +__END__ + +=head1 NAME + +Data::Dumper - stringified perl data structures, suitable for both printing and C + +=head1 SYNOPSIS + + use Data::Dumper; + + # simple procedural interface + print Dumper($foo, $bar); + + # extended usage with names + print Data::Dumper->Dump([$foo, $bar], [qw(foo *ary)]); + + # configuration variables + { + local $Data::Dumper::Purity = 1; + eval Data::Dumper->Dump([$foo, $bar], [qw(foo *ary)]); + } + + # OO usage + $d = Data::Dumper->new([$foo, $bar], [qw(foo *ary)]); + ... + print $d->Dump; + ... + $d->Purity(1)->Terse(1)->Deepcopy(1); + eval $d->Dump; + + +=head1 DESCRIPTION + +Given a list of scalars or reference variables, writes out their contents in +perl syntax. The references can also be objects. The content of each +variable is output in a single Perl statement. Handles self-referential +structures correctly. + +The return value can be Ced to get back an identical copy of the +original reference structure. + +Any references that are the same as one of those passed in will be named +C<$VAR>I (where I is a numeric suffix), and other duplicate references +to substructures within C<$VAR>I will be appropriately labeled using arrow +notation. You can specify names for individual values to be dumped if you +use the C method, or you can change the default C<$VAR> prefix to +something else. See C<$Data::Dumper::Varname> and C<$Data::Dumper::Terse> +below. + +The default output of self-referential structures can be Ced, but the +nested references to C<$VAR>I will be undefined, since a recursive +structure cannot be constructed using one Perl statement. You should set the +C flag to 1 to get additional statements that will correctly fill in +these references. Moreover, if Ced when strictures are in effect, +you need to ensure that any variables it accesses are previously declared. 
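To make that concrete, here is a minimal sketch (variable names are illustrative) of round-tripping a self-referential hash when strictures are in effect:

  use strict;
  use Data::Dumper;

  my $node = { name => 'root' };
  $node->{self} = $node;                # self-referential structure

  my $dumped = do {
      local $Data::Dumper::Purity = 1;  # emit the extra fix-up statements
      Data::Dumper->Dump([$node], ['copy']);
  };

  my $copy;                             # declare before eval'ing under strict
  eval $dumped;
  die $@ if $@;
  print $copy->{self}{name}, "\n";      # prints "root"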
+ +In the extended usage form, the references to be dumped can be given +user-specified names. If a name begins with a C<*>, the output will +describe the dereferenced type of the supplied reference for hashes and +arrays, and coderefs. Output of names will be avoided where possible if +the C flag is set. + +In many cases, methods that are used to set the internal state of the +object will return the object itself, so method calls can be conveniently +chained together. + +Several styles of output are possible, all controlled by setting +the C flag. See L below +for details. + + +=head2 Methods + +=over 4 + +=item I->new(I, I) + +Returns a newly created C object. The first argument is an +anonymous array of values to be dumped. The optional second argument is an +anonymous array of names for the values. The names need not have a leading +C<$> sign, and must be comprised of alphanumeric characters. You can begin +a name with a C<*> to specify that the dereferenced type must be dumped +instead of the reference itself, for ARRAY and HASH references. + +The prefix specified by C<$Data::Dumper::Varname> will be used with a +numeric suffix if the name for a value is undefined. + +Data::Dumper will catalog all references encountered while dumping the +values. Cross-references (in the form of names of substructures in perl +syntax) will be inserted at all possible points, preserving any structural +interdependencies in the original set of values. Structure traversal is +depth-first, and proceeds in order from the first supplied value to +the last. + +=item I<$OBJ>->Dump I I->Dump(I, I) + +Returns the stringified form of the values stored in the object (preserving +the order in which they were supplied to C), subject to the +configuration options below. In a list context, it returns a list +of strings corresponding to the supplied values. + +The second form, for convenience, simply calls the C method on its +arguments before dumping the object immediately. + +=item I<$OBJ>->Seen(I<[HASHREF]>) + +Queries or adds to the internal table of already encountered references. +You must use C to explicitly clear the table if needed. Such +references are not dumped; instead, their names are inserted wherever they +are encountered subsequently. This is useful especially for properly +dumping subroutine references. + +Expects an anonymous hash of name => value pairs. Same rules apply for names +as in C. If no argument is supplied, will return the "seen" list of +name => value pairs, in a list context. Otherwise, returns the object +itself. + +=item I<$OBJ>->Values(I<[ARRAYREF]>) + +Queries or replaces the internal array of values that will be dumped. +When called without arguments, returns the values. Otherwise, returns the +object itself. + +=item I<$OBJ>->Names(I<[ARRAYREF]>) + +Queries or replaces the internal array of user supplied names for the values +that will be dumped. When called without arguments, returns the names. +Otherwise, returns the object itself. + +=item I<$OBJ>->Reset + +Clears the internal table of "seen" references and returns the object +itself. + +=back + +=head2 Functions + +=over 4 + +=item Dumper(I) + +Returns the stringified form of the values in the list, subject to the +configuration options below. The values will be named C<$VAR>I in the +output, where I is a numeric suffix. Will return a list of strings +in a list context. 
+ +=back + +=head2 Configuration Variables or Methods + +Several configuration variables can be used to control the kind of output +generated when using the procedural interface. These variables are usually +Cized in a block so that other parts of the code are not affected by +the change. + +These variables determine the default state of the object created by calling +the C method, but cannot be used to alter the state of the object +thereafter. The equivalent method names should be used instead to query +or set the internal state of the object. + +The method forms return the object itself when called with arguments, +so that they can be chained together nicely. + +=over 4 + +=item * + +$Data::Dumper::Indent I I<$OBJ>->Indent(I<[NEWVAL]>) + +Controls the style of indentation. It can be set to 0, 1, 2 or 3. Style 0 +spews output without any newlines, indentation, or spaces between list +items. It is the most compact format possible that can still be called +valid perl. Style 1 outputs a readable form with newlines but no fancy +indentation (each level in the structure is simply indented by a fixed +amount of whitespace). Style 2 (the default) outputs a very readable form +which takes into account the length of hash keys (so the hash value lines +up). Style 3 is like style 2, but also annotates the elements of arrays +with their index (but the comment is on its own line, so array output +consumes twice the number of lines). Style 2 is the default. + +=item * + +$Data::Dumper::Purity I I<$OBJ>->Purity(I<[NEWVAL]>) + +Controls the degree to which the output can be Ced to recreate the +supplied reference structures. Setting it to 1 will output additional perl +statements that will correctly recreate nested references. The default is +0. + +=item * + +$Data::Dumper::Pad I I<$OBJ>->Pad(I<[NEWVAL]>) + +Specifies the string that will be prefixed to every line of the output. +Empty string by default. + +=item * + +$Data::Dumper::Varname I I<$OBJ>->Varname(I<[NEWVAL]>) + +Contains the prefix to use for tagging variable names in the output. The +default is "VAR". + +=item * + +$Data::Dumper::Useqq I I<$OBJ>->Useqq(I<[NEWVAL]>) + +When set, enables the use of double quotes for representing string values. +Whitespace other than space will be represented as C<[\n\t\r]>, "unsafe" +characters will be backslashed, and unprintable characters will be output as +quoted octal integers. Since setting this variable imposes a performance +penalty, the default is 0. C will run slower if this flag is set, +since the fast XSUB implementation doesn't support it yet. + +=item * + +$Data::Dumper::Terse I I<$OBJ>->Terse(I<[NEWVAL]>) + +When set, Data::Dumper will emit single, non-self-referential values as +atoms/terms rather than statements. This means that the C<$VAR>I names +will be avoided where possible, but be advised that such output may not +always be parseable by C. + +=item * + +$Data::Dumper::Freezer I $I->Freezer(I<[NEWVAL]>) + +Can be set to a method name, or to an empty string to disable the feature. +Data::Dumper will invoke that method via the object before attempting to +stringify it. This method can alter the contents of the object (if, for +instance, it contains data allocated from C), and even rebless it in a +different package. The client is responsible for making sure the specified +method can be called via the object, and that the object ends up containing +only perl data types after the method has been called. Defaults to an empty +string. 
+ +If an object does not support the method specified (determined using +UNIVERSAL::can()) then the call will be skipped. If the method dies a +warning will be generated. + +=item * + +$Data::Dumper::Toaster I $I->Toaster(I<[NEWVAL]>) + +Can be set to a method name, or to an empty string to disable the feature. +Data::Dumper will emit a method call for any objects that are to be dumped +using the syntax CMETHOD()>. Note that this means that +the method specified will have to perform any modifications required on the +object (like creating new state within it, and/or reblessing it in a +different package) and then return it. The client is responsible for making +sure the method can be called via the object, and that it returns a valid +object. Defaults to an empty string. + +=item * + +$Data::Dumper::Deepcopy I $I->Deepcopy(I<[NEWVAL]>) + +Can be set to a boolean value to enable deep copies of structures. +Cross-referencing will then only be done when absolutely essential +(i.e., to break reference cycles). Default is 0. + +=item * + +$Data::Dumper::Quotekeys I $I->Quotekeys(I<[NEWVAL]>) + +Can be set to a boolean value to control whether hash keys are quoted. +A false value will avoid quoting hash keys when it looks like a simple +string. Default is 1, which will always enclose hash keys in quotes. + +=item * + +$Data::Dumper::Bless I $I->Bless(I<[NEWVAL]>) + +Can be set to a string that specifies an alternative to the C +builtin operator used to create objects. A function with the specified +name should exist, and should accept the same arguments as the builtin. +Default is C. + +=item * + +$Data::Dumper::Pair I $I->Pair(I<[NEWVAL]>) + +Can be set to a string that specifies the separator between hash keys +and values. To dump nested hash, array and scalar values to JavaScript, +use: C<$Data::Dumper::Pair = ' : ';>. Implementing C in JavaScript +is left as an exercise for the reader. +A function with the specified name exists, and accepts the same arguments +as the builtin. + +Default is: C< =E >. + +=item * + +$Data::Dumper::Maxdepth I $I->Maxdepth(I<[NEWVAL]>) + +Can be set to a positive integer that specifies the depth beyond which +we don't venture into a structure. Has no effect when +C is set. (Useful in debugger when we often don't +want to see more than enough). Default is 0, which means there is +no maximum depth. + +=item * + +$Data::Dumper::Useperl I $I->Useperl(I<[NEWVAL]>) + +Can be set to a boolean value which controls whether the pure Perl +implementation of C is used. The C module is +a dual implementation, with almost all functionality written in both +pure Perl and also in XS ('C'). Since the XS version is much faster, it +will always be used if possible. This option lets you override the +default behavior, usually for testing purposes only. Default is 0, which +means the XS implementation will be used if possible. + +=item * + +$Data::Dumper::Sortkeys I $I->Sortkeys(I<[NEWVAL]>) + +Can be set to a boolean value to control whether hash keys are dumped in +sorted order. A true value will cause the keys of all hashes to be +dumped in Perl's default sort order. Can also be set to a subroutine +reference which will be called for each hash that is dumped. In this +case C will call the subroutine once for each hash, +passing it the reference of the hash. The purpose of the subroutine is +to return a reference to an array of the keys that will be dumped, in +the order that they should be dumped. 
Using this feature, you can +control both the order of the keys, and which keys are actually used. In +other words, this subroutine acts as a filter by which you can exclude +certain keys from being dumped. Default is 0, which means that hash keys +are not sorted. + +=item * + +$Data::Dumper::Deparse I $I->Deparse(I<[NEWVAL]>) + +Can be set to a boolean value to control whether code references are +turned into perl source code. If set to a true value, C +will be used to get the source of the code reference. Using this option +will force using the Perl implementation of the dumper, since the fast +XSUB implementation doesn't support it. + +Caution : use this option only if you know that your coderefs will be +properly reconstructed by C. + +=back + +=head2 Exports + +=over 4 + +=item Dumper + +=back + +=head1 EXAMPLES + +Run these code snippets to get a quick feel for the behavior of this +module. When you are through with these examples, you may want to +add or change the various configuration variables described above, +to see their behavior. (See the testsuite in the Data::Dumper +distribution for more examples.) + + + use Data::Dumper; + + package Foo; + sub new {bless {'a' => 1, 'b' => sub { return "foo" }}, $_[0]}; + + package Fuz; # a weird REF-REF-SCALAR object + sub new {bless \($_ = \ 'fu\'z'), $_[0]}; + + package main; + $foo = Foo->new; + $fuz = Fuz->new; + $boo = [ 1, [], "abcd", \*foo, + {1 => 'a', 023 => 'b', 0x45 => 'c'}, + \\"p\q\'r", $foo, $fuz]; + + ######## + # simple usage + ######## + + $bar = eval(Dumper($boo)); + print($@) if $@; + print Dumper($boo), Dumper($bar); # pretty print (no array indices) + + $Data::Dumper::Terse = 1; # don't output names where feasible + $Data::Dumper::Indent = 0; # turn off all pretty print + print Dumper($boo), "\n"; + + $Data::Dumper::Indent = 1; # mild pretty print + print Dumper($boo); + + $Data::Dumper::Indent = 3; # pretty print with array indices + print Dumper($boo); + + $Data::Dumper::Useqq = 1; # print strings in double quotes + print Dumper($boo); + + $Data::Dumper::Pair = " : "; # specify hash key/value separator + print Dumper($boo); + + + ######## + # recursive structures + ######## + + @c = ('c'); + $c = \@c; + $b = {}; + $a = [1, $b, $c]; + $b->{a} = $a; + $b->{b} = $a->[1]; + $b->{c} = $a->[2]; + print Data::Dumper->Dump([$a,$b,$c], [qw(a b c)]); + + + $Data::Dumper::Purity = 1; # fill in the holes for eval + print Data::Dumper->Dump([$a, $b], [qw(*a b)]); # print as @a + print Data::Dumper->Dump([$b, $a], [qw(*b a)]); # print as %b + + + $Data::Dumper::Deepcopy = 1; # avoid cross-refs + print Data::Dumper->Dump([$b, $a], [qw(*b a)]); + + + $Data::Dumper::Purity = 0; # avoid cross-refs + print Data::Dumper->Dump([$b, $a], [qw(*b a)]); + + ######## + # deep structures + ######## + + $a = "pearl"; + $b = [ $a ]; + $c = { 'b' => $b }; + $d = [ $c ]; + $e = { 'd' => $d }; + $f = { 'e' => $e }; + print Data::Dumper->Dump([$f], [qw(f)]); + + $Data::Dumper::Maxdepth = 3; # no deeper than 3 refs down + print Data::Dumper->Dump([$f], [qw(f)]); + + + ######## + # object-oriented usage + ######## + + $d = Data::Dumper->new([$a,$b], [qw(a b)]); + $d->Seen({'*c' => $c}); # stash a ref without printing it + $d->Indent(3); + print $d->Dump; + $d->Reset->Purity(0); # empty the seen cache + print join "----\n", $d->Dump; + + + ######## + # persistence + ######## + + package Foo; + sub new { bless { state => 'awake' }, shift } + sub Freeze { + my $s = shift; + print STDERR "preparing to sleep\n"; + $s->{state} = 'asleep'; + return bless 
$s, 'Foo::ZZZ'; + } + + package Foo::ZZZ; + sub Thaw { + my $s = shift; + print STDERR "waking up\n"; + $s->{state} = 'awake'; + return bless $s, 'Foo'; + } + + package Foo; + use Data::Dumper; + $a = Foo->new; + $b = Data::Dumper->new([$a], ['c']); + $b->Freezer('Freeze'); + $b->Toaster('Thaw'); + $c = $b->Dump; + print $c; + $d = eval $c; + print Data::Dumper->Dump([$d], ['d']); + + + ######## + # symbol substitution (useful for recreating CODE refs) + ######## + + sub foo { print "foo speaking\n" } + *other = \&foo; + $bar = [ \&other ]; + $d = Data::Dumper->new([\&other,$bar],['*other','bar']); + $d->Seen({ '*foo' => \&foo }); + print $d->Dump; + + + ######## + # sorting and filtering hash keys + ######## + + $Data::Dumper::Sortkeys = \&my_filter; + my $foo = { map { (ord, "$_$_$_") } 'I'..'Q' }; + my $bar = { %$foo }; + my $baz = { reverse %$foo }; + print Dumper [ $foo, $bar, $baz ]; + + sub my_filter { + my ($hash) = @_; + # return an array ref containing the hash keys to dump + # in the order that you want them to be dumped + return [ + # Sort the keys of %$foo in reverse numeric order + $hash eq $foo ? (sort {$b <=> $a} keys %$hash) : + # Only dump the odd number keys of %$bar + $hash eq $bar ? (grep {$_ % 2} keys %$hash) : + # Sort keys in default order for all other hashes + (sort keys %$hash) + ]; + } + +=head1 BUGS + +Due to limitations of Perl subroutine call semantics, you cannot pass an +array or hash. Prepend it with a C<\> to pass its reference instead. This +will be remedied in time, now that Perl has subroutine prototypes. +For now, you need to use the extended usage form, and prepend the +name with a C<*> to output it as a hash or array. + +C cheats with CODE references. If a code reference is +encountered in the structure being processed (and if you haven't set +the C flag), an anonymous subroutine that +contains the string '"DUMMY"' will be inserted in its place, and a warning +will be printed if C is set. You can C the result, but bear +in mind that the anonymous sub that gets created is just a placeholder. +Someday, perl will have a switch to cache-on-demand the string +representation of a compiled piece of code, I hope. If you have prior +knowledge of all the code refs that your data structures are likely +to have, you can use the C method to pre-seed the internal reference +table and make the dumped output point to them, instead. See L +above. + +The C and C flags makes Dump() run slower, since the +XSUB implementation does not support them. + +SCALAR objects have the weirdest looking C workaround. + +Pure Perl version of C escapes UTF-8 strings correctly +only in Perl 5.8.0 and later. + +=head2 NOTE + +Starting from Perl 5.8.1 different runs of Perl will have different +ordering of hash keys. The change was done for greater security, +see L. This means that +different runs of Perl will have different Data::Dumper outputs if +the data contains hashes. If you need to have identical Data::Dumper +outputs from different runs of Perl, use the environment variable +PERL_HASH_SEED, see L. Using this restores +the old (platform-specific) ordering: an even prettier solution might +be to use the C filter of Data::Dumper. + +=head1 AUTHOR + +Gurusamy Sarathy gsar@activestate.com + +Copyright (c) 1996-98 Gurusamy Sarathy. All rights reserved. +This program is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. 
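+
+As a minimal sketch of the C<Sortkeys> workaround mentioned in the NOTE
+above (the hash contents here are purely illustrative):
+
+  use Data::Dumper;
+  $Data::Dumper::Sortkeys = 1;                # dump hash keys in sorted order
+  my %h = (one => 1, two => 2, three => 3);
+  print Dumper(\%h);                          # key order is stable across runs
+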
+ +=head1 VERSION + +Version 2.135_06 (March 20 2012) + +=head1 SEE ALSO + +perl(1) + +=cut diff --git a/neuralcoref/scorer/lib/Math/Combinatorics.pm b/neuralcoref/scorer/lib/Math/Combinatorics.pm new file mode 100644 index 0000000..badb49d --- /dev/null +++ b/neuralcoref/scorer/lib/Math/Combinatorics.pm @@ -0,0 +1,1044 @@ +=head1 NAME + +Math::Combinatorics - Perform combinations and permutations on lists + +=head1 SYNOPSIS + +Available as an object oriented API. + + use Math::Combinatorics; + + my @n = qw(a b c); + my $combinat = Math::Combinatorics->new(count => 2, + data => [@n], + ); + + print "combinations of 2 from: ".join(" ",@n)."\n"; + print "------------------------".("--" x scalar(@n))."\n"; + while(my @combo = $combinat->next_combination){ + print join(' ', @combo)."\n"; + } + + print "\n"; + + print "permutations of 3 from: ".join(" ",@n)."\n"; + print "------------------------".("--" x scalar(@n))."\n"; + while(my @permu = $combinat->next_permutation){ + print join(' ', @permu)."\n"; + } + + output: + +Or available via exported functions 'permute', 'combine', and 'factorial'. + + use Math::Combinatorics; + + my @n = qw(a b c); + print "combinations of 2 from: ".join(" ",@n)."\n"; + print "------------------------".("--" x scalar(@n))."\n"; + print join("\n", map { join " ", @$_ } combine(2,@n)),"\n"; + print "\n"; + print "permutations of 3 from: ".join(" ",@n)."\n"; + print "------------------------".("--" x scalar(@n))."\n"; + print join("\n", map { join " ", @$_ } permute(@n)),"\n"; + + +Output: + + combinations of 2 from: a b c + ------------------------------ + a b + a c + b c + + permutations of 3 from: a b c + ------------------------------ + a b c + a c b + b a c + b c a + c a b + c b a + +Output from both types of calls is the same, but the object-oriented approach consumes +much less memory for large sets. + +=head1 DESCRIPTION + +Combinatorics is the branch of mathematics studying the enumeration, combination, +and permutation of sets of elements and the mathematical relations that characterize +their properties. As a jumping off point, refer to: + + http://mathworld.wolfram.com/Combinatorics.html + +This module provides a pure-perl implementation of nCk, nCRk, nPk, nPRk, !n and n! +(combination, multiset, permutation, string, derangement, and factorial, respectively). +Functional and object-oriented usages allow problems such as the following to be solved: + +=over + +=item combine - nCk + + http://mathworld.wolfram.com/Combination.html + +"Fun questions to ask the pizza parlor wait staff: how many possible combinations +of 2 toppings can I get on my pizza?". + +=item derange - !n + + http://mathworld.wolfram.com/Derangement.html + +"A derangement of n ordered objects, denoted !n, is a permutation in which none of the +objects appear in their "natural" (i.e., ordered) place." + +=item permute - nPk + + http://mathworld.wolfram.com/Permutation.html + +"Master Mind Game: ways to arrange pieces of different colors in a +certain number of positions, without repetition of a color". + +=back + +Object-oriented usage additionally allows solving these problems by calling L +with a B vector: + +=over + +=item string - nPRk + + http://mathworld.wolfram.com/String.html + +"Morse signals: diferent signals of 3 positions using the two symbols - and .". + + $o = Math::Combinatorics->new( count=>3 , data=>[qw(. 
-)] , frequency=>[3,3] ); + while ( my @x = $o->next_multiset ) { + my $p = Math::Combinatorics->new( data=>\@x , frequency=>[map{1} @x] ); + while ( my @y = $p->next_string ) { + #do something + } + } + +=item multiset/multichoose - nCRk + + http://mathworld.wolfram.com/Multiset.html + +"ways to extract 3 balls at once of a bag with 3 black and 3 white balls". + + $o = Math::Combinatorics->new( count=>3 , data=>[qw(white black)] , frequency=>[3,3] ); + while ( my @x = $o->next_multiset ) { + #do something + } + +=back + +=head2 EXPORT + +the following export tags will bring a single method into the caller's +namespace. no symbols are exported by default. see pod documentation below for +method descriptions. + + combine + derange + multiset + permute + string + factorial + +=head1 AUTHOR + +Allen Day , with algorithmic contributions from Christopher Eltschka and +Tye. + +Copyright (c) 2004-2005 Allen Day. All rights reserved. This program is free software; you +can redistribute it and/or modify it under the same terms as Perl itself. + +=head1 ACKNOWLEDGEMENTS + +A sincere thanks to everyone for helping to make this a better module. After initial +development I've only had time to accept patches and improvements. Math::Combinatorics +continues to be developed and improved by the community. Contributors of note include: + +For adding new features: Carlos Rica, David Coppit, Carlos Segre, Lyon Lemmens + +For bug reports: Ying Yang, Joerg Beyer, Marc Logghe, Yunheng Wang, +Torsten Seemann, Gerrit Haase, Joern Behre, Lyon Lemmens, Federico Lucifredi + +=head1 BUGS / TODO + +Report them to the author. + + * Need more extensive unit tests. + + * tests for new()'s frequency argment + + * A known bug (more of a missing feature, actually) does not allow parameterization of k + for nPk in permute(). it is assumed k == n. L for details. You can work + around this by making calls to both L and L + + * Lots of really interesting stuff from Mathworld.Wolfram.com. MathWorld rocks! Expect + to see implementation of more concepts from their site, e.g.: + + http://mathworld.wolfram.com/BellNumber.html + http://mathworld.wolfram.com/StirlingNumberoftheSecondKind.html + http://mathworld.wolfram.com/Word.html + + * Other combinatorics stuff + http://en.wikipedia.org/wiki/Catalan_number + http://en.wikipedia.org/wiki/Stirling_number + +=head1 SEE ALSO + +L + +L + +L (alas misnamed, it actually returns permutations on a string). + + http://perlmonks.thepen.com/29374.html + + http://groups.google.com/groups?selm=38568F79.13680B86%40physik.tu-muenchen.de&output=gplain + + +=cut + +package Math::Combinatorics; + +use strict; +use Data::Dumper; +require Exporter; + +our @ISA = qw(Exporter); +our @EXPORT = qw( combine derange factorial permute ); +our $VERSION = '0.09'; + +=head1 EXPORTED FUNCTIONS + +=head2 combine() + + Usage : my @combinations = combine($k,@n); + Function: implements nCk (n choose k), or n!/(k!*(n-k!)). + returns all unique unorderd combinations of k items from set n. + items in n are assumed to be character data, and are + copied into the return data structure (see "Returns" below). + Example : my @n = qw(a b c); + my @c = combine(2,@n); + print join "\n", map { join " ", @$_ } @c; + # prints: + # b c + # a c + # a b + Returns : a list of arrays, where each array contains a unique combination + of k items from n + Args : a list of items to be combined + Notes : data is internally assumed to be alphanumeric. this is necessary + to efficiently generate combinations of large sets. 
if you need + combinations of non-alphanumeric data, or on data + C would not be appropriate, use the + object-oriented API. See L and the B option. + + Identical items are assumed to be non-unique. That is, calling + Cnew(data => [@n], count => $k); + while(my(@combo) = $c->next_combination){ + push @result, [@combo]; + } + + return @result; +} + +=head2 derange() + + Usage : my @deranges = derange(@n); + Function: implements !n, a derangement of n items in which none of the + items appear in their originally ordered place. + Example : my @n = qw(a b c); + my @d = derange(@n); + print join "\n", map { join " ", @$_ } @d; + # prints: + # a c b + # b a c + # b c a + # c a b + # c b a + Returns : a list of arrays, where each array contains a derangement of + k items from n (where k == n). + Args : a list of items to be deranged. + Note : k should really be parameterizable. this will happen + in a later version of the module. send me a patch to + make that version come out sooner. + Notes : data is internally assumed to be alphanumeric. this is necessary + to efficiently generate combinations of large sets. if you need + combinations of non-alphanumeric data, or on data + C would not be appropriate, use the + object-oriented API. See L, and the B option. + +=cut + +sub derange { + my(@n) = @_; + + my @result = (); + + my $c = __PACKAGE__->new(data => [@n]); + while(my(@derange) = $c->next_derangement){ + push @result, [@derange]; + } + + return @result; +} + +=head2 next_derangement() + + Usage : my @derangement = $c->next_derangement(); + Function: get derangements for @data. + Returns : returns a permutation of items from @data (see L), + where none of the items appear in their natural order. repeated calls + retrieve all unique derangements of @data elements. a returned empty + list signifies all derangements have been iterated. + Args : none. 
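+
+A minimal usage sketch (the element list is illustrative only):
+
+  my $c = Math::Combinatorics->new(data => [qw(a b c d)]);
+  while (my @d = $c->next_derangement) {
+    print join(' ', @d), "\n";   # no element appears in its original position
+  }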
+ +=cut + +sub next_derangement { + my $self = shift; + my $data = $self->data(); + + my $cursor = $self->_permutation_cursor(); + my $values = @$cursor; + if($self->{pin}){ + $self->{pin} = 0; + + my $i; + for ($i = 1; $i < $values; $i += 2) { + $$cursor[$i - 1] = $i; + $$cursor[$i] = $i - 1; + } + if ($values % 2 != 0) { + $$cursor[$values - 1] = $values - 3; + $$cursor[$values - 2] = $values - 1; + } + goto RESULT; + } + else { + my $values = @$cursor; + my $i; + my @found; # stores for each element if it has been found previously + for ($i = 0; $i < $values; $i++) { $found[$i] = 0 } + my $e; + my $elemfound = 0; + for ($i = $values - 1; $i > -1; $i--) { + $found[$$cursor[$i]] = 1; + if ($i > $values - 3) { # $values-1 or $values-2 + if ($i == $values - 2) { + #print "i=$i (values-2)\n";## + $e = $$cursor[$i + 1]; + if ($e > $$cursor[$i] && $e != $i + && $$cursor[$i] != $i + 1) { + $$cursor[$i + 1] = $$cursor[$i]; + $$cursor[$i] = $e; + #print "!\n";## + goto RESULT; + } + } + next; + } + for ($e = $$cursor[$i] + 1; $e < $values; $e++) { + if ($found[$e] && $e != $i) { + $elemfound = 1; + last; + } + } + last if ($elemfound); + } + if ($elemfound) { + $$cursor[$i] = $e; + $found[$e] = 0; + $i++; + my $j; + my @elems; + for ($j = 0; $j < $values; $j++) { + if ($found[$j]) { push(@elems, $j) } + } + for ($j = 0; $j < @elems; $j++) { + if ($elems[$j] != $i) { + # if the next is the last and it will be wrong: + if ($j + 2 == @elems + && $elems[$j + 1] == $i + 1) { + # interchange them: + $$cursor[$i] = $elems[$j + 1]; + $$cursor[$i + 1] = $elems[$j]; + last; + } + $$cursor[$i] = $elems[$j]; + } + elsif ($j + 1 < @elems) { + # use the next element: + $$cursor[$i] = $elems[$j + 1]; + $elems[$j + 1] = $elems[$j]; + } + else { die() } + $i++; + } + goto RESULT; + } + return (); + } + RESULT: + # map cursor to data array + my @result; + foreach my $c (@$cursor){ + push @result, $${ $data->[$c] }; + } + return @result; +} + +=head2 factorial() + + Usage : my $f = factorial(4); #returns 24, or 4*3*2*1 + Function: calculates n! (n factorial). + Returns : undef if n is non-integer or n < 0 + Args : a positive, non-zero integer + Note : this function is used internally by combine() and permute() + +=cut + +sub factorial { + my $n = shift; + return undef unless $n >= 0 and $n == int($n); + + my $f; + + for($f = 1 ; $n > 0 ; $n--){ + $f *= $n + } + + return $f; +} + +=head2 permute() + + Usage : my @permutations = permute(@n); + Function: implements nPk (n permute k) (where k == n), or n!/(n-k)! + returns all unique permutations of k items from set n + (where n == k, see "Note" below). items in n are assumed to + be character data, and are copied into the return data + structure. + Example : my @n = qw(a b c); + my @p = permute(@n); + print join "\n", map { join " ", @$_ } @p; + # prints: + # b a c + # b c a + # c b a + # c a b + # a c b + # a b c + Returns : a list of arrays, where each array contains a permutation of + k items from n (where k == n). + Args : a list of items to be permuted. + Note : k should really be parameterizable. this will happen + in a later version of the module. send me a patch to + make that version come out sooner. + Notes : data is internally assumed to be alphanumeric. this is necessary + to efficiently generate combinations of large sets. if you need + combinations of non-alphanumeric data, or on data + C would not be appropriate, use the + object-oriented API. See L, and the B option. + + Identical items are assumed to be non-unique. 
That is, calling + Cnew(data => [@n]); + while(my(@permu) = $c->next_permutation){ + push @result, [@permu]; + } + + return @result; +} + +=head1 CONSTRUCTOR + +=cut + +=head2 new() + + Usage : my $c = Math::Combinatorics->new( count => 2, #treated as int + data => [1,2,3,4] #arrayref or anonymous array + ); + Function: build a new Math::Combinatorics object. + Returns : a Math::Combinatorics object + Args : count - required for combinatoric functions/methods. number of elements to be + present in returned set(s). + data - required for combinatoric B permutagenic functions/methods. this is the + set elements are chosen from. B: this array is modified in place; make + a copy of your array if the order matters in the caller's space. + frequency - optional vector of data frequencies. must be the same length as the B + constructor argument. These two constructor calls here are equivalent: + + $a = 'a'; + $b = 'b'; + + Math::Combinatorics->new( count=>2, data=>[\$a,\$a,\$a,\$a,\$a,\$b,\$b] ); + Math::Combinatorics->new( count=>2, data=>[\$a,\$b], frequency=>[5,2] ); + + so why use this? sometimes it's useful to have multiple identical entities in + a set (in set theory jargon, this is called a "bag", See L). + compare - optional subroutine reference used in sorting elements of the set. examples: + + #appropriate for character elements + compare => sub { $_[0] cmp $_[1] } + #appropriate for numeric elements + compare => sub { $_[0] <=> $_[1] } + #appropriate for object elements, perhaps + compare => sub { $_[0]->value <=> $_[1]->value } + + The default sort mechanism is based on references, and cannot be predicted. + Improvements for a more flexible compare() mechanism are most welcome. + +=cut + +sub new { + my($class,%arg) = @_; + my $self = bless {}, $class; + + $self->{compare} = $arg{compare} || sub { $_[0] cmp $_[1] }; + $self->{count} = $arg{count}; + + #convert bag to set + my $freq = $arg{frequency}; + if(ref($freq) eq 'ARRAY' and scalar(@$freq) == scalar(@{$arg{data}})){ + $self->{frequency}++; + my @bag = @{$arg{data}}; + my @set = (); + + #allow '0 but defined' elements (Yunheng Wang) + foreach my $type ( @bag ) { + my $f = shift @$freq; + next if $f < 1; + for(1..$f){ + #we push on a reference to make sure, for instance, that objects + #are identical and not copied + push @set, \$type; + } + } + $arg{data} = \@set; + } + elsif(!ref($freq)){ + $arg{data} = [map { \$_ } @{$arg{data}}]; + } + +#warn join ' ', @{$arg{data}}; + + #OK, this is hokey, but I don't have time to fix it properly right now. + #We want to allow both user-specified sorting as well as our own + #reference-based internal sorting -- the latter only because unit tests + #are failing if we don't have it. Additionally, we don't want to require + #the triple derefernce necessary for comparison of the pristine data in + #the user-supplied compare coderef. The solution for now is to do an + #if/else. If you're staring at this please fix it! + my $compare = $self->{compare}; + if ( defined $arg{compare} ) { + $self->{data} = [sort {&$compare($$$a,$$$b)} map {\$_} @{$arg{data}}]; + } + else { + $self->{data} = [sort {&$compare($a,$b)} map {\$_} @{$arg{data}}]; + } + +#warn Dumper($self->{data}); + + $self->{cin} = 1; + $self->{pin} = 1; + + return $self; +} + +=head1 OBJECT METHODS + +=cut + +=head2 next_combination() + + Usage : my @combo = $c->next_combination(); + Function: get combinations of size $count from @data. + Returns : returns a combination of $count items from @data (see L). 
+ repeated calls retrieve all unique combinations of $count elements. + a returned empty list signifies all combinations have been iterated. + Note : this method may only be used if a B argument is B + given to L, otherwise use L. + Args : none. + +=cut + +sub next_combination { + my $self = shift; + if ( $self->{frequency} ) { + print STDERR "must use next_multiset() if 'frequency' argument passed to constructor\n"; + return (); + } + return $self->_next_combination; +} + +sub _next_combination { + my $self = shift; + my $data = $self->data(); + my $combo_end = $self->count(); + + my $begin = 0; + my $end = $#{$data} + 1; + + my @result; + + return () if scalar(@$data) < $self->count(); + + if($self->{cin}){ + $self->{cin} = 0; + + for(0..$self->count-1){ + push @result, $${ $data->[$_] }; + } +#warn 1; + return @result; + } + + if ($combo_end == $begin || $combo_end == $end) { + return (); + } + + my $combo = $combo_end; + my $total_set; + + --$combo; + $total_set = $self->upper_bound($combo_end,$end,$data->[$combo]); + if ($total_set != $end) { + $self->swap($combo,$total_set); + + for(0..$self->count-1){ + push @result, $${ $data->[$_] }; + } +#warn 2; + return @result; + } + + --$total_set; + $combo = $self->lower_bound($begin, $combo_end, $data->[$total_set]); + + if ($combo == $begin) { + $self->rotate($begin, $combo_end, $end); +#warn 3; + return (); + } + + my $combo_next = $combo; + --$combo; + $total_set = $self->upper_bound($combo_end, $end, $data->[$combo]); + + my $sort_pos = $end; + $sort_pos += $combo_end - $total_set - 1; + + $self->rotate($combo_next, $total_set, $end); + $self->rotate($combo, $combo_next, $end); + $self->rotate($combo_end, $sort_pos, $end); + + for(0..$self->count-1){ + push @result, $${ $data->[$_] }; + } +#warn 4; + return @result; +} + +=head2 next_multiset() + + Usage : my @multiset = $c->next_multiset(); + Function: get multisets for @data. + Returns : returns a multiset of items from @data (see L). + a multiset is a special type of combination where the set from which + combinations are drawn contains items that are indistinguishable. use + L when a B argument is passed to L. + repeated calls retrieve all unique multisets of @data elements. a + returned empty list signifies all multisets have been iterated. + Note : this method may only be used if a B argument is given to + L, otherwise use L. + Args : none. + +=cut + +sub next_multiset { + my $self = shift; + + if ( ! $self->{frequency} ) { + print STDERR "must use next_combination() if 'frequency' argument not passed to constructor\n"; + return (); + } + + my $data = $self->data(); + my $compare = $self->compare(); + + while ( my @combo = $self->_next_combination ) { + my $x = join '', map {scalar($$_)} sort @$data; + my $y = join '', map {scalar($_) } sort @combo; + + next if $self->{'cache_multiset'}{$y}++; + return @combo; + } + $self->{'cache_multiset'} = undef; + return (); +} + +=head2 next_permutation() + + Usage : my @permu = $c->next_permutation(); + Function: get permutations of elements in @data. + Returns : returns a permutation of items from @data (see L). + repeated calls retrieve all unique permutations of @data elements. + a returned empty list signifies all permutations have been iterated. + Note : this method may only be used if a B argument is B + given to L, otherwise use L. + Args : none. 
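+
+A minimal usage sketch (data values are illustrative only):
+
+  my $c = Math::Combinatorics->new(data => [qw(a b c)]);
+  while (my @p = $c->next_permutation) {
+    print join(' ', @p), "\n";   # iterates all 3! = 6 orderings
+  }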
+ +=cut + +sub next_permutation { + my $self = shift; + if ( $self->{frequency} ) { + print STDERR "must use next_string() if 'frequency' argument passed to constructor\n"; + return (); + } + return $self->_next_permutation; +} + +sub _next_permutation { + my $self = shift; + my $data = $self->data(); + + if($self->{pin}){ + $self->{pin} = 0; + return map {$$$_} @$data; + } + + my $cursor = $self->_permutation_cursor(); + + my $last= $#{$cursor}; + + if($last < 1){ + return (); + } + + # Find last item not in reverse-sorted order: + my $i = $last - 1; + $i-- while 0 <= $i && $cursor->[$i] >= $cursor->[$i+1]; + + if($i == -1){ + return (); + } + + + # Re-sort the reversely-sorted tail of the list: + @{$cursor}[$i+1..$last] = reverse @{$cursor}[$i+1..$last] + if $cursor->[$i+1] > $cursor->[$last]; + + # Find next item that will make us "greater": + my $j = $i+1; + $j++ while $cursor->[$i] >= $cursor->[$j]; + + # Swap: + @{$cursor}[$i,$j] = @{$cursor}[$j,$i]; + + # map cursor to data array + my @result; + foreach my $c (@$cursor){ + push @result, $${ $data->[$c] }; + } + return @result; +} + +=head2 next_string() + + Usage : my @string = $c->next_string(); + Function: get strings for @data. + Returns : returns a multiset of items from @data (see L). + a multiset is a special type of permutation where the set from which + combinations are drawn contains items that are indistinguishable. use + L when a B argument is passed to L. + repeated calls retrieve all unique multisets of @data elements. a + returned empty list signifies all strings have been iterated. + Note : this method may only be used if a B argument is given to + L, otherwise use L. + Args : none. + +=cut + +sub next_string { + my $self = shift; + my $data = $self->data(); + + if ( ! $self->{frequency} ) { + print STDERR "must use next_permutation() if 'frequency' argument not passed to constructor\n"; + return (); + } + + + while ( my @permu = $self->_next_permutation ) { + my $x = join '', map {scalar($$_)} @$data; + my $y = join '', map {scalar($_) } @permu; + + next if $self->{'cache_string'}{$y}++; + return @permu; + } + + $self->{'cache_string'} = undef; + return (); +} + +=head1 INTERNAL FUNCTIONS AND METHODS + +=head2 sum() + + Usage : my $sum = sum(1,2,3); # returns 6 + Function: sums a list of integers. non-integer list elements are ignored + Returns : sum of integer items in arguments passed in + Args : a list of integers + Note : this function is used internally by combine() + +=cut + +sub sum { + my $sum = 0; + foreach my $i (@_){ + $sum += $i if $i == int($i); + } + return $sum; +} + +=head2 compare() + + Usage : $obj->compare() + Function: internal, undocumented. holds a comparison coderef. + Returns : value of compare (a coderef) + + +=cut + +sub compare { + my($self,$val) = @_; + return $self->{'compare'}; +} + + +=head2 count() + + Usage : $obj->count() + Function: internal, undocumented. holds the "k" in nCk or nPk. + Returns : value of count (an int) + +=cut + +sub count { + my($self) = @_; + return $self->{'count'}; +} + + +=head2 data() + + Usage : $obj->data() + Function: internal, undocumented. holds the set "n" in nCk or nPk. + Returns : value of data (an arrayref) + +=cut + +sub data { + my($self) = @_; + return $self->{'data'}; +} + + +=head2 swap() + +internal, undocumented. 
+ +=cut + +sub swap { + my $self = shift; + my $first = shift; + my $second = shift; + my $data = $self->data(); + + my $temp = $data->[$first]; + $data->[$first] = $data->[$second]; + $data->[$second] = $temp; +} + +=head2 reverse() + +internal, undocumented. + +=cut + +sub reverse { + my $self = shift; + my $first = shift; + my $last = shift; + my $data = $self->data(); + + while (1) { + if ($first == $last || $first == --$last) { + return; + } else { + $self->swap($first++, $last); + } + } +} + +=head2 rotate() + +internal, undocumented. + +=cut + +sub rotate { + my $self = shift; + my $first = shift; + my $middle = shift; + my $last = shift; + my $data = $self->data(); + + if ($first == $middle || $last == $middle) { + return; + } + + my $first2 = $middle; + + do { + $self->swap($first++, $first2++); + + if ($first == $middle) { + $middle = $first2; + } + } while ($first2 != $last); + + $first2 = $middle; + + while ($first2 != $last) { + $self->swap($first++, $first2++); + if ($first == $middle) { + $middle = $first2; + } elsif ($first2 == $last) { + $first2 = $middle; + } + } +} + +=head2 upper_bound() + +internal, undocumented. + +=cut + +sub upper_bound { + my $self = shift; + my $first = shift; + my $last = shift; + my $value = shift; + my $compare = $self->compare(); + my $data = $self->data(); + + my $len = $last - $first; + my $half; + my $middle; + + while ($len > 0) { + $half = $len >> 1; + $middle = $first; + $middle += $half; + + if (&$compare($value,$data->[$middle]) == -1) { + $len = $half; + } else { + $first = $middle; + ++$first; + $len = $len - $half - 1; + } + } + + return $first; +} + +=head2 lower_bound() + +internal, undocumented. + +=cut + +sub lower_bound { + my $self = shift; + my $first = shift; + my $last = shift; + my $value = shift; + my $compare = $self->compare(); + my $data = $self->data(); + + my $len = $last - $first; + my $half; + my $middle; + + while ($len > 0) { + $half = $len >> 1; + $middle = $first; + $middle += $half; + + if (&$compare($data->[$middle],$value) == -1) { + $first = $middle; + ++$first; + $len = $len - $half - 1; + } else { + $len = $half; + } + } + + return $first; +} + +=head2 _permutation_cursor() + + Usage : $obj->_permutation_cursor() + Function: internal method. cursor on permutation iterator order. + Returns : value of _permutation_cursor (an arrayref) + Args : none + +=cut + +sub _permutation_cursor { + my($self,$val) = @_; + + if(!$self->{'_permutation_cursor'}){ + my $data = $self->data(); + my @tmp = (); + my $i = 0; + push @tmp, $i++ foreach @$data; + $self->{'_permutation_cursor'} = \@tmp; + } + + return $self->{'_permutation_cursor'}; +} + +1; + diff --git a/neuralcoref/scorer/scorer.bat b/neuralcoref/scorer/scorer.bat new file mode 100644 index 0000000..679faed --- /dev/null +++ b/neuralcoref/scorer/scorer.bat @@ -0,0 +1,67 @@ +@rem = '--*-Perl-*-- +@echo off +if "%OS%" == "Windows_NT" goto WinNT +perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9 +goto endofperl +:WinNT +perl -x -S %0 %* +if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl +if %errorlevel% == 9009 echo You do not have Perl in your PATH. 
+if errorlevel 1 goto script_failed_so_exit_with_non_zero_val 2>nul +goto endofperl +@rem '; +#!perl +#line 15 + +BEGIN { + $d = $0; + $d =~ s/\/[^\/][^\/]*$//g; + push(@INC, $d."/lib"); +} + +use strict; +use CorScorer; + +if (@ARGV < 3) { + print q| + use: scorer.bat [name] + + metric: the metric desired to score the results: + muc: MUCScorer (Vilain et al, 1995) + bcub: B-Cubed (Bagga and Baldwin, 1998) + ceafm: CEAF (Luo et al, 2005) using mention-based similarity + ceafe: CEAF (Luo et al, 2005) using entity-based similarity + all: uses all the metrics to score + + keys_file: file with expected coreference chains in SemEval format + + response_file: file with output of coreference system (SemEval format) + + name: [optional] the name of the document to score. If name is not + given, all the documents in the dataset will be scored. If given + name is "none" then all the documents are scored but only total + results are shown. + + |; + exit; +} + +my $metric = shift (@ARGV); +if ($metric !~ /^(muc|bcub|ceafm|ceafe|all)/i) { + print "Invalid metric\n"; + exit; +} + + +if ($metric eq 'all') { + foreach my $m ('muc', 'bcub', 'ceafm', 'ceafe') { + print "\nMETRIC $m:\n"; + &CorScorer::Score( $m, @ARGV ); + } +} +else { + &CorScorer::Score( $metric, @ARGV ); +} + +__END__ +:endofperl diff --git a/neuralcoref/scorer/scorer.pl b/neuralcoref/scorer/scorer.pl new file mode 100755 index 0000000..07b48d8 --- /dev/null +++ b/neuralcoref/scorer/scorer.pl @@ -0,0 +1,58 @@ +#!/usr/bin/perl + +BEGIN { + $d = $0; + $d =~ s/\/[^\/][^\/]*$//g; + + if ($d eq $0) { + unshift(@INC, "lib"); + } + else { + unshift(@INC, $d . "/lib"); + } +} + +use strict; +use CorScorer; + +if (@ARGV < 3) { + print q| +use: scorer.pl [name] + + metric: the metric desired to score the results: + muc: MUCScorer (Vilain et al, 1995) + bcub: B-Cubed (Bagga and Baldwin, 1998) + ceafm: CEAF (Luo et al, 2005) using mention-based similarity + ceafe: CEAF (Luo et al, 2005) using entity-based similarity + blanc: BLANC + all: uses all the metrics to score + + keys_file: file with expected coreference chains in SemEval format + + response_file: file with output of coreference system (SemEval format) + + name: [optional] the name of the document to score. If name is not + given, all the documents in the dataset will be scored. If given + name is "none" then all the documents are scored but only total + results are shown. + +|; + exit; +} + +my $metric = shift(@ARGV); +if ($metric !~ /^(muc|bcub|ceafm|ceafe|blanc|all)/i) { + print "Invalid metric\n"; + exit; +} + +if ($metric eq 'all') { + foreach my $m ('muc', 'bcub', 'ceafm', 'ceafe', 'blanc') { + print "\nMETRIC $m:\n"; + &CorScorer::Score($m, @ARGV); + } +} +else { + &CorScorer::Score($metric, @ARGV); +} + diff --git a/neuralcoref/scorer/test/CorefMetricTest.pm b/neuralcoref/scorer/test/CorefMetricTest.pm new file mode 100644 index 0000000..c5e96a3 --- /dev/null +++ b/neuralcoref/scorer/test/CorefMetricTest.pm @@ -0,0 +1,124 @@ +package CorefMetricTest; +use strict; +use warnings; +use Exporter; + +our @ISA= qw(Exporter); +our @EXPORT = qw(ComputeScoreFromCounts DiffExpectedAndActual); + +################################################################################ +# Compute recall, precision and F1. 
+# +# Input: (numerator_counts_for_recall, denominator_counts_for_recall, +# numerator_counts_for_precision, denominator_counts_for_precision) +# Output: (recall, precision, F1) +################################################################################ +sub ComputeScoreFromCounts { + # The first 4 are also coref link counts when using BLANC. + my ($recall_numerator, $recall_denominator, + $precision_numerator, $precision_denominator, @noncoref_counts) = @_; + # The coref recall, precision, and F1 when using BLANC. + my ($recall, $precision, $F1) = + RPFFromCounts($recall_numerator, $recall_denominator, + $precision_numerator, $precision_denominator); + + # BLANC: @noncoref_counts= + # (noncoref_numerator_recall, noncoref_denominator_recall, + # noncoref_numerator_precision, noncoref_denominator_precision) + if (scalar(@noncoref_counts) == 4) { + ($recall, $precision, $F1) = CorScorer::ComputeBLANCFromCounts( + $recall_numerator, $recall_denominator, $precision_denominator, + $noncoref_counts[0], $noncoref_counts[1], $noncoref_counts[3]); + } + $recall = ($recall < 0) ? 0 : $recall; + $precision = ($precision < 0) ? 0 : $precision; + $F1 = ($F1 < 0) ? 0 : $F1; + return ($recall, $precision, $F1); +} + +sub RPFFromCounts +{ + my ($recall_numerator, $recall_denominator, + $precision_numerator, $precision_denominator, @nonCorefCounts) = @_; + my ($recall, $precision, $F1) = (-1, -1, 0); + if ($recall_denominator > 0) { + $recall = $recall_numerator / $recall_denominator; + } + if ($precision_denominator > 0) { + $precision = $precision_numerator / $precision_denominator; + } + + if (($recall + $precision) > 0) { + $F1 = 2 * $recall * $precision / ($recall + $precision); + } + + return ($recall, $precision, $F1); +} + +# deprecated -- see CorScorer::ComputeBLANCFromCounts(). +sub ComputeBLANCRPF +{ + my ($coref_recall, $coref_precision, $coref_F1, + $noncoref_recall, $noncoref_precision, $noncoref_F1) = @_; + + my ($recall, $precision, $F1); + + if ($coref_recall < 0 && $noncoref_recall < 0) { + # no key mention. + $recall = $precision = $F1 = 0; + } elsif ($coref_recall < 0) { + # key: all links are non-coref (mentions are all singltons). + $recall = $noncoref_recall; + $precision = ($noncoref_precision < 0) ? 0 : $noncoref_precision; + $F1 = $noncoref_F1; + } elsif ($noncoref_recall < 0) { + # key: all links are coref (all mentions are in one entity). + $recall = $coref_recall; + $precision = ($coref_precision < 0) ? 0 : $coref_precision; + $F1 = $coref_F1; + } else { + #key contains both coref and non-coref links. + if ($coref_precision < 0 && $noncoref_precision < 0) { + # no response. + $recall = $precision = $F1 = 0; + } else { + if ($coref_precision < 0) { + # response: all links are non-coref, or response mentions are all + # singletons. + $coref_precision = 0; + } elsif ($noncoref_precision < 0) { + # response: all links are coref, or all mentions are in one entity. + $noncoref_precision = 0; + } + $recall = ($coref_recall + $noncoref_recall)/2; + $precision = ($coref_precision + $noncoref_precision)/2; + $F1 = ($coref_F1 + $noncoref_F1)/2; + } + } + + return ($recall, $precision, $F1); +} + +############################################################################## +# Compute the sum of the duifference between the expected recall, precision, +# F1 and the actual one. 
+############################################################################## +sub DiffExpectedAndActual { + my ($expected, $actual) = @_; + if (scalar(@$expected) != scalar(@$actual)) { + print STDERR "Expected and actual have diff dimensions: \n"; + print STDERR " Expected: ", join(" ", @$expected), "\n"; + print STDERR " Actual: ", join(" ", @$actual), "\n"; + return 1.0e5; + } + my $sum = 0.0; + my $i = 0; + foreach my $e (@$expected) { + $sum += abs($e - $actual->[$i]); + ++$i; + } + return $sum; +} + +1; + diff --git a/neuralcoref/scorer/test/CorefMetricTestConfig.pm b/neuralcoref/scorer/test/CorefMetricTestConfig.pm new file mode 100644 index 0000000..974f655 --- /dev/null +++ b/neuralcoref/scorer/test/CorefMetricTestConfig.pm @@ -0,0 +1,363 @@ +################################################################################ +# This is the test configuration file. Test cases are stored in an +# array, each element consisting of: +# (1) id: a unique identifier for the test case. +# (2) key_file: the key file to be tested in the CoNLL format. +# (3) response_file: the response file to be tested in the CoNLL format. +# (4) expected_metrics: is a hash label from a metric name (identical to those +# used in the scorer.{pl|bat}) to an array of expected +# metric values. All metrics have 3 expected numbers: +# (recall, precision, F-measure). +################################################################################ + +package CorefMetricTestConfig; +use strict; +use warnings; +use Exporter; + +our @ISA= qw( Exporter ); + +# these are exported by default. +our @EXPORT = qw(TestCases); + +# +# Values following metric names are [recall, precision, F1] +# +our @TestCases = ( +{ id => "A1", + key_file => "DataFiles/TC-A.key", + response_file => "DataFiles/TC-A-1.response", + expected_metrics => { "muc" => [1, 1, 1], + "bcub" => [6/6, 6/6, 1], + "ceafm" => [1, 1, 1], + "ceafe" => [1, 1, 1], + "blanc" => [1, 1, 1] } +}, +{ id => "A2", + key_file => "DataFiles/TC-A.key", + response_file => "DataFiles/TC-A-2.response", + expected_metrics => { "muc" => [1/3, 1/1, 0.5], + "bcub" => [(7/3)/6, 3/3, 14/25], + "ceafm" => [0.5, 1, 0.66667], + "ceafe" => [0.6, 0.9, 0.72], + "blanc" => [0.21591, 1, 0.35385] } +}, +{ id => "A3", + key_file => "DataFiles/TC-A.key", + response_file => "DataFiles/TC-A-3.response", + expected_metrics => { "muc" => [3/3, 3/5, 0.75], + "bcub" => [6/6, (4+7/12)/9, 110/163], + "ceafm" => [1, 0.66667, 0.8], + "ceafe" => [0.88571, 0.66429, 0.75918], + "blanc" => [1, 0.42593, 0.59717] } +}, +{ id => "A4", + key_file => "DataFiles/TC-A.key", + response_file => "DataFiles/TC-A-4.response", + expected_metrics => { "muc" => [1/3, 1/3, 1/3], + "bcub" => [(3+1/3)/6, (1+4/3+1/2)/7, 2*(5/9)*(17/42)/((5/9)+(17/42))], + "ceafm" => [0.66667, 0.57143, 0.61538], + "ceafe" => [0.73333, 0.55, 0.62857], + "blanc" => [0.35227, 0.27206, 0.30357] } +}, +{ id => "A5", + key_file => "DataFiles/TC-A.key", + response_file => "DataFiles/TC-A-5.response", + expected_metrics => { "muc" => [1/3, 1/4, 2/7], + "bcub" => [(3+1/3)/6, 2.5/8, 2*(5/9)*(5/16)/((5/9)+(5/16))], + "ceafm" => [0.66667, 0.5, 0.57143], + "ceafe" => [0.68889, 0.51667, 0.59048], + "blanc" => [0.35227, 0.19048, 0.24716] } +}, +{ id => "A6", + key_file => "DataFiles/TC-A.key", + response_file => "DataFiles/TC-A-6.response", + expected_metrics => { "muc" => [1/3, 1/4, 2/7], + "bcub" => [(10/3)/6, (1+4/3+1/2)/8, 2*(5/9)*(17/48)/((5/9)+(17/48))], + "ceafm" => [0.66667, 0.5, 0.57143], + "ceafe" => [0.73333, 0.55, 0.62857], + "blanc" => 
[0.35227, 0.20870, 0.25817] } +}, +{ id => "A7", + key_file => "DataFiles/TC-A.key", + response_file => "DataFiles/TC-A-7.response", + expected_metrics => { "muc" => [1/3, 1/3, 1/3], + "bcub" => [(10/3)/6, (1+4/3+1/2)/7, 2*(5/9)*(17/42)/((5/9)+(17/42))], + "ceafm" => [0.66667, 0.57143, 0.61538], + "ceafe" => [0.73333, 0.55, 0.62857], + "blanc" => [0.35227, 0.27206, 0.30357] } +}, +{ id => "A8", + key_file => "DataFiles/TC-A.key", + response_file => "DataFiles/TC-A-8.response", + expected_metrics => { "muc" => [1/3, 1/3, 1/3], + "bcub" => [(10/3)/6, (1+4/3+1/2)/7, 2*(5/9)*(17/42)/((5/9)+(17/42))], + "ceafm" => [0.66667, 0.57143, 0.61538], + "ceafe" => [0.73333, 0.55, 0.62857], + "blanc" => [0.35227, 0.27206, 0.30357] } +}, +{ id => "A9", + key_file => "DataFiles/TC-A.key", + response_file => "DataFiles/TC-A-9.response", + expected_metrics => { "muc" => [1/3, 1/3, 1/3], + "bcub" => [(10/3)/6, (1+4/3+1/2)/7, 2*(5/9)*(17/42)/((5/9)+(17/42))], + "ceafm" => [0.66667, 0.57143, 0.61538], + "ceafe" => [0.73333, 0.55, 0.62857], + "blanc" => [0.35227, 0.27206, 0.30357] } +}, +{ id => "A10", + key_file => "DataFiles/TC-A.key", + response_file => "DataFiles/TC-A-10.response", + expected_metrics => { "muc" => [0, 0, 0], + "bcub" => [3/6, 6/6, 2/3], + #”ceafm" => [1, 1, 1], + #”ceafe" => [1, 1, 1], + "blanc" => [0.5, 0.36667, 0.42308] } +}, +{ id => "A11", + key_file => "DataFiles/TC-A.key", + response_file => "DataFiles/TC-A-11.response", + expected_metrics => { "muc" => [3/3, 3/5, 6/8], + "bcub" => [6/6, (1/6+2*2/6+3*3/6)/6, 14/25], + #”ceafm" => [1, 1, 1], + #”ceafe" => [1, 1, 1], + "blanc" => [0.5, 0.13333, 0.21053] } +}, +{ id => "A12", + key_file => "DataFiles/TC-A.key", + response_file => "DataFiles/TC-A-12.response", + expected_metrics => { "muc" => [0, 0, 0], + "bcub" => [(1+1/2+2/3)/6, 4/7, 2*(13/36)*(4/7)/((13/36)+(4/7))], + #”ceafm" => [1, 1, 1], + #”ceafe" => [1, 1, 1], + "blanc" => [0.22727, 0.11905, 0.15625] } +}, +{ id => "A13", + key_file => "DataFiles/TC-A.key", + response_file => "DataFiles/TC-A-13.response", + expected_metrics => { "muc" => [1/3, 1/6, 2/9], + "bcub" => [(1+1/2+2*2/3)/6, (1/7+1/7+2*2/7)/7, 2*(17/36)*(6/49)/((17/36)+(6/49))], + #”ceafm" => [1, 1, 1], + #”ceafe" => [1, 1, 1], + "blanc" => [0.125, 0.02381, 0.04] } +}, +{ id => "B1", + key_file => "DataFiles/TC-B.key", + response_file => "DataFiles/TC-B-1.response", + expected_metrics => { #"muc" => [1, 1, 1], + #"bcub" => [1, 1, 1], + #”ceafm" => [1, 1, 1], + #”ceafe" => [1, 1, 1], + "blanc" => [1/2 * (1/4 + 1/3), 1/2 * (1/4 + 1/3), 1/2 * (1/4 + 1/3)] } +}, +{ id => "C1", + key_file => "DataFiles/TC-C.key", + response_file => "DataFiles/TC-C-1.response", + expected_metrics => { #"muc" => [1, 1, 1], + #"bcub" => [1, 1, 1], + #”ceafm" => [1, 1, 1], + #”ceafe" => [1, 1, 1], + "blanc" => [1/2 * (2/5 + 10/16), 1/2 * (2/5 + 10/16), 1/2 * (2/5 + 10/16)] } +}, +{ id => "D1", + key_file => "DataFiles/TC-D.key", + response_file => "DataFiles/TC-D-1.response", + expected_metrics => { "muc" => [9/9, 9/10, 2*(9/9)*(9/10)/(9/9+9/10)], + "bcub" => [12/12, 16/21, 2*(12/12)*(16/21)/(12/12+16/21)], + #"ceafm" => [1, 1, 1], + #"ceafe" => [1, 1, 1], + #"blanc" => [1, 1, 1] + } +}, +{ id => "E1", + key_file => "DataFiles/TC-E.key", + response_file => "DataFiles/TC-E-1.response", + expected_metrics => { "muc" => [9/9, 9/10, 2*(9/9)*(9/10)/(9/9+9/10)], + "bcub" => [1, 7/12, 2*1*(7/12)/(1+7/12)], + #"ceafm" => [1, 1, 1], + #"ceafe" => [1, 1, 1], + #"blanc" => [1, 1, 1] + } +}, +{ id => "F1", + key_file => "DataFiles/TC-F.key", + response_file => 
"DataFiles/TC-F-1.response", + expected_metrics => { "muc" => [2/3, 2/2, 2*(2/3)*(2/2)/(2/3+2/2)] , + #"bcub" => , + #"ceafm" => , + #"ceafe" => , + #"blanc" => + } +}, +{ id => "G1", + key_file => "DataFiles/TC-G.key", + response_file => "DataFiles/TC-G-1.response", + expected_metrics => { "muc" => [2/2, 2/3, 2*(2/2)*(2/3)/(2/2+2/3)], + #"bcub" => , + #"ceafm" => , + #"ceafe" => , + #"blanc" => + } +}, +{ id => "H1", + key_file => "DataFiles/TC-H.key", + response_file => "DataFiles/TC-H-1.response", + expected_metrics => { "muc" => [1, 1, 1], + #"bcub" => , + #"ceafm" => , + #"ceafe" => , + #"blanc" => + } +}, +{ id => "I1", + key_file => "DataFiles/TC-I.key", + response_file => "DataFiles/TC-I-1.response", + expected_metrics => { "muc" => [2/3, 2/2, 2*(2/3)*(2/2)/(2/3+2/2)], + #"bcub" => , + #"ceafm" => , + #"ceafe" => , + #"blanc" => + } +}, +{ id => "J1", + key_file => "DataFiles/TC-J.key", + response_file => "DataFiles/TC-J-1.response", + expected_metrics => { "muc" => [1/2, 1/1, 2*(1/2)*(1/1)/(1/2+1/1)], + #"bcub" => , + #"ceafm" => , + #"ceafe" => , + #"blanc" => + } +}, +{ id => "K1", + key_file => "DataFiles/TC-K.key", + response_file => "DataFiles/TC-K-1.response", + expected_metrics => { "muc" => [3/6, 3/6, 3/6], + #"bcub" => , + #"ceafm" => , + #"ceafe" => , + #"blanc" => + } +}, +{ id => "L1", + key_file => "DataFiles/TC-L.key", + response_file => "DataFiles/TC-L-1.response", + expected_metrics => { "muc" => [2/5, 2/4, 2*(2/5)*(2/4)/(2/5+2/4)], + #"bcub" => , + #"ceafm" => , + #"ceafe" => , + #"blanc" => + } +}, +{ id => "M1", + key_file => "DataFiles/TC-M.key", + response_file => "DataFiles/TC-M-1.response", + expected_metrics => { "muc" => [1, 1, 1], + "bcub" => [1, 1, 1], + "ceafm" => [1, 1, 1], + "ceafe" => [1, 1, 1], + "blanc" => [1, 1, 1] } +}, +{ id => "M2", + key_file => "DataFiles/TC-M.key", + response_file => "DataFiles/TC-M-2.response", + expected_metrics => { "muc" => [0, 0, 0], + #"bcub" => , + #"ceafm" => , + #"ceafe" => , + "blanc" => [0, 0, 0] } +}, +{ id => "M3", + key_file => "DataFiles/TC-M.key", + response_file => "DataFiles/TC-M-3.response", + expected_metrics => { #"muc" => , + #"bcub" => , + #"ceafm" => , + #"ceafe" => , + "blanc" => [0.26667, 1, 0.42105] } +}, +{ id => "M4", + key_file => "DataFiles/TC-M.key", + response_file => "DataFiles/TC-M-4.response", + expected_metrics => { #"muc" => , + #"bcub" => , + #"ceafm" => , + #"ceafe" => , + "blanc" => [0.2, 0.2, 0.2] } +}, +{ id => "M5", + key_file => "DataFiles/TC-M.key", + response_file => "DataFiles/TC-M-5.response", + expected_metrics => { "muc" => [0, 0, 0], + #"bcub" => , + #"ceafm" => , + #"ceafe" => , + "blanc" => [0, 0, 0] } +}, +{ id => "M6", + key_file => "DataFiles/TC-M.key", + response_file => "DataFiles/TC-M-6.response", + expected_metrics => { #"muc" => , + #"bcub" => , + #"ceafm" => , + #"ceafe" => , + "blanc" => [0.06667, 0.25, 0.10526] } +}, +{ id => "N1", + key_file => "DataFiles/TC-N.key", + response_file => "DataFiles/TC-N-1.response", + expected_metrics => { "muc" => [0, 0, 0], + #"bcub" => [1, 1, 1], + #"ceafm" => [1, 1, 1], + #"ceafe" => [1, 1, 1], + "blanc" => [1, 1, 1] } +}, +{ id => "N2", + key_file => "DataFiles/TC-N.key", + response_file => "DataFiles/TC-N-2.response", + expected_metrics => { "muc" => [0, 0, 0], + #"bcub" => , + #"ceafm" => , + #"ceafe" => , + "blanc" => [0, 0, 0] } +}, +{ id => "N3", + key_file => "DataFiles/TC-N.key", + response_file => "DataFiles/TC-N-3.response", + expected_metrics => { #"muc" => , + #"bcub" => , + #"ceafm" => , + #"ceafe" => , + "blanc" 
=> [0.73333, 1, 0.84615] } +}, +{ id => "N4", + key_file => "DataFiles/TC-N.key", + response_file => "DataFiles/TC-N-4.response", + expected_metrics => { "muc" => [0, 0, 0], + #"bcub" => , + #"ceafm" => , + #"ceafe" => , + "blanc" => [0.2, 0.2, 0.2] } +}, +{ id => "N5", + key_file => "DataFiles/TC-N.key", + response_file => "DataFiles/TC-N-5.response", + expected_metrics => { #"muc" => , + #"bcub" => , + #"ceafm" => , + #"ceafe" => , + "blanc" => [0, 0, 0] } +}, +{ id => "N6", + key_file => "DataFiles/TC-N.key", + response_file => "DataFiles/TC-N-6.response", + expected_metrics => { #"muc" => , + #"bcub" => , + #"ceafm" => , + #"ceafe" => , + "blanc" => [0.13333, 0.18182, 0.15385] } +} + +); + +1; diff --git a/neuralcoref/scorer/test/DataFiles/TC-A-1.response b/neuralcoref/scorer/test/DataFiles/TC-A-1.response new file mode 100644 index 0000000..445a92e --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-A-1.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (1 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 1) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (1) +test2 0 1 jnk - +test2 0 2 d1 (2 +test2 0 3 d2 2) +test2 0 4 jnk - +test2 0 5 e (2) +test2 0 6 jnk - +test2 0 7 f1 (2 +test2 0 8 f2 - +test2 0 9 f3 2) +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-A-10.response b/neuralcoref/scorer/test/DataFiles/TC-A-10.response new file mode 100644 index 0000000..e323b09 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-A-10.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (1 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 1) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (2) +test2 0 1 x - +test2 0 2 d1 (3 +test2 0 3 d2 3) +test2 0 4 z - +test2 0 5 e (4) +test2 0 6 y - +test2 0 7 f1 (5 +test2 0 8 f2 - +test2 0 9 f3 5) +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-A-11.response b/neuralcoref/scorer/test/DataFiles/TC-A-11.response new file mode 100644 index 0000000..90ea74d --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-A-11.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (0 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 0) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (0) +test2 0 1 x - +test2 0 2 d1 (0 +test2 0 3 d2 0) +test2 0 4 z - +test2 0 5 e (0) +test2 0 6 y - +test2 0 7 f1 (0 +test2 0 8 f2 - +test2 0 9 f3 0) +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-A-12.response b/neuralcoref/scorer/test/DataFiles/TC-A-12.response new file mode 100644 index 0000000..1c59f5e --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-A-12.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (1 +test1 0 4 b2 1) +test1 0 5 b3 - +test1 0 6 b4 - +test1 0 7 jnk (2) +test1 0 8 . - + +test2 0 0 c (3) +test2 0 1 x - +test2 0 2 d1 (4 +test2 0 3 d2 4) +test2 0 4 z - +test2 0 5 e (5) +test2 0 6 y - +test2 0 7 f1 (6) +test2 0 8 f2 - +test2 0 9 f3 - +test2 0 10 . 
- +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-A-13.response b/neuralcoref/scorer/test/DataFiles/TC-A-13.response new file mode 100644 index 0000000..cfe2b73 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-A-13.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (0 +test1 0 4 b2 0) +test1 0 5 b3 - +test1 0 6 b4 - +test1 0 7 jnk (0) +test1 0 8 . - + +test2 0 0 c (0) +test2 0 1 x - +test2 0 2 d1 (0 +test2 0 3 d2 0) +test2 0 4 z - +test2 0 5 e (0) +test2 0 6 y - +test2 0 7 f1 (0) +test2 0 8 f2 - +test2 0 9 f3 - +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-A-2.response b/neuralcoref/scorer/test/DataFiles/TC-A-2.response new file mode 100644 index 0000000..d0726f1 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-A-2.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 - +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 - +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c - +test2 0 1 jnk - +test2 0 2 d1 (2 +test2 0 3 d2 2) +test2 0 4 jnk - +test2 0 5 e (2) +test2 0 6 jnk - +test2 0 7 f1 - +test2 0 8 f2 - +test2 0 9 f3 - +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-A-3.response b/neuralcoref/scorer/test/DataFiles/TC-A-3.response new file mode 100644 index 0000000..49cec4b --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-A-3.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (1 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 1) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (1) +test2 0 1 x (1) +test2 0 2 d1 (2 +test2 0 3 d2 2) +test2 0 4 y (2) +test2 0 5 e (2) +test2 0 6 z (3) +test2 0 7 f1 (2 +test2 0 8 f2 - +test2 0 9 f3 2) +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-A-4.response b/neuralcoref/scorer/test/DataFiles/TC-A-4.response new file mode 100644 index 0000000..df84841 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-A-4.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (1 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 1) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (1) +test2 0 1 x (1) +test2 0 2 d1 (2 +test2 0 3 d2 2) +test2 0 4 x (3) +test2 0 5 e - +test2 0 6 y (2) +test2 0 7 f1 - +test2 0 8 f2 - +test2 0 9 f3 - +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-A-5.response b/neuralcoref/scorer/test/DataFiles/TC-A-5.response new file mode 100644 index 0000000..921e34a --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-A-5.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (1 +test1 0 4 b2 (1 +test1 0 5 b3 1) +test1 0 6 b4 1) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (1) +test2 0 1 x (1) +test2 0 2 d1 (2 +test2 0 3 d2 2) +test2 0 4 z (3) +test2 0 5 e - +test2 0 6 y (2) +test2 0 7 f1 - +test2 0 8 f2 - +test2 0 9 f3 - +test2 0 10 . 
- +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-A-6.response b/neuralcoref/scorer/test/DataFiles/TC-A-6.response new file mode 100644 index 0000000..f1a8954 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-A-6.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (1 +test1 0 4 b2 (3 +test1 0 5 b3 3) +test1 0 6 b4 1) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (1) +test2 0 1 x (1) +test2 0 2 d1 (2 +test2 0 3 d2 2) +test2 0 4 z (3) +test2 0 5 e - +test2 0 6 y (2) +test2 0 7 f1 - +test2 0 8 f2 - +test2 0 9 f3 - +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-A-7.response b/neuralcoref/scorer/test/DataFiles/TC-A-7.response new file mode 100644 index 0000000..b111e44 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-A-7.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (1(1 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 1)1) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (1) +test2 0 1 x (1) +test2 0 2 d1 (2 +test2 0 3 d2 2) +test2 0 4 z (3) +test2 0 5 e - +test2 0 6 y (2) +test2 0 7 f1 - +test2 0 8 f2 - +test2 0 9 f3 - +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-A-8.response b/neuralcoref/scorer/test/DataFiles/TC-A-8.response new file mode 100644 index 0000000..974b4f0 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-A-8.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (1(3 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 3)1) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (1) +test2 0 1 x (1) +test2 0 2 d1 (2 +test2 0 3 d2 2) +test2 0 4 z (3) +test2 0 5 e - +test2 0 6 y (2) +test2 0 7 f1 - +test2 0 8 f2 - +test2 0 9 f3 - +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-A-9.response b/neuralcoref/scorer/test/DataFiles/TC-A-9.response new file mode 100644 index 0000000..a370a5b --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-A-9.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (1(3(3(3(3(3(3(3(3(3(3 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 3)3)3)3)3)3)3)3)3)3)1) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (1) +test2 0 1 x (1) +test2 0 2 d1 (2 +test2 0 3 d2 2) +test2 0 4 z (3) +test2 0 5 e - +test2 0 6 y (2) +test2 0 7 f1 - +test2 0 8 f2 - +test2 0 9 f3 - +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-A.key b/neuralcoref/scorer/test/DataFiles/TC-A.key new file mode 100644 index 0000000..445a92e --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-A.key @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (1 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 1) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (1) +test2 0 1 jnk - +test2 0 2 d1 (2 +test2 0 3 d2 2) +test2 0 4 jnk - +test2 0 5 e (2) +test2 0 6 jnk - +test2 0 7 f1 (2 +test2 0 8 f2 - +test2 0 9 f3 2) +test2 0 10 . 
- +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-B-1.response b/neuralcoref/scorer/test/DataFiles/TC-B-1.response new file mode 100644 index 0000000..3224fbc --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-B-1.response @@ -0,0 +1,74 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10043 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 10043) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10043) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10043 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 10043) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10054 +nw/xinhua/00/chtb_0009 10054) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10054) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-B.key b/neuralcoref/scorer/test/DataFiles/TC-B.key new file mode 100644 index 0000000..59fd836 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-B.key @@ -0,0 +1,74 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10043 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 10043) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10054 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 10054) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 
- +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10043) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10054 +nw/xinhua/00/chtb_0009 10054) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10054) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-C-1.response b/neuralcoref/scorer/test/DataFiles/TC-C-1.response new file mode 100644 index 0000000..6e1ee2f --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-C-1.response @@ -0,0 +1,74 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10043 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 10043) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10043) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10043 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 10043) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10054 +nw/xinhua/00/chtb_0009 10054) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10054) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10060) +nw/xinhua/00/chtb_0009 (10060) + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-C.key b/neuralcoref/scorer/test/DataFiles/TC-C.key new file mode 100644 index 0000000..259383a --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-C.key @@ -0,0 +1,74 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10043 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - 
+nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 10043) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10054 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 10054) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10043) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10054 +nw/xinhua/00/chtb_0009 10054) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10054) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (10060) +nw/xinhua/00/chtb_0009 (10060) + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-D-1.response b/neuralcoref/scorer/test/DataFiles/TC-D-1.response new file mode 100644 index 0000000..d2be1a0 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-D-1.response @@ -0,0 +1,31 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-D.key b/neuralcoref/scorer/test/DataFiles/TC-D.key new file mode 100644 index 0000000..785d0a4 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-D.key @@ -0,0 +1,31 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 
- +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-E-1.response b/neuralcoref/scorer/test/DataFiles/TC-E-1.response new file mode 100644 index 0000000..c7710cd --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-E-1.response @@ -0,0 +1,31 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-E.key b/neuralcoref/scorer/test/DataFiles/TC-E.key new file mode 100644 index 0000000..785d0a4 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-E.key @@ -0,0 +1,31 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-F-1.response b/neuralcoref/scorer/test/DataFiles/TC-F-1.response new file mode 100644 index 0000000..f2a6355 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-F-1.response @@ -0,0 +1,31 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-F.key b/neuralcoref/scorer/test/DataFiles/TC-F.key new file mode 100644 index 
0000000..bb972d2 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-F.key @@ -0,0 +1,31 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-G-1.response b/neuralcoref/scorer/test/DataFiles/TC-G-1.response new file mode 100644 index 0000000..bb972d2 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-G-1.response @@ -0,0 +1,31 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-G.key b/neuralcoref/scorer/test/DataFiles/TC-G.key new file mode 100644 index 0000000..f2a6355 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-G.key @@ -0,0 +1,31 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-H-1.response b/neuralcoref/scorer/test/DataFiles/TC-H-1.response new file mode 100644 index 0000000..bb972d2 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-H-1.response @@ -0,0 +1,31 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - 
+nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-H.key b/neuralcoref/scorer/test/DataFiles/TC-H.key new file mode 100644 index 0000000..bb972d2 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-H.key @@ -0,0 +1,31 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-I-1.response b/neuralcoref/scorer/test/DataFiles/TC-I-1.response new file mode 100644 index 0000000..f2a6355 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-I-1.response @@ -0,0 +1,31 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-I.key b/neuralcoref/scorer/test/DataFiles/TC-I.key new file mode 100644 index 0000000..bb972d2 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-I.key @@ -0,0 +1,31 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - 
+nw/xinhua/00/chtb_0009 - + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-J-1.response b/neuralcoref/scorer/test/DataFiles/TC-J-1.response new file mode 100644 index 0000000..4f78b25 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-J-1.response @@ -0,0 +1,31 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-J.key b/neuralcoref/scorer/test/DataFiles/TC-J.key new file mode 100644 index 0000000..3519532 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-J.key @@ -0,0 +1,31 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-K-1.response b/neuralcoref/scorer/test/DataFiles/TC-K-1.response new file mode 100644 index 0000000..70a2552 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-K-1.response @@ -0,0 +1,31 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-K.key b/neuralcoref/scorer/test/DataFiles/TC-K.key new file mode 100644 index 0000000..588ff84 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-K.key @@ -0,0 +1,31 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - 
+nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-L-1.response b/neuralcoref/scorer/test/DataFiles/TC-L-1.response new file mode 100644 index 0000000..cb8ae7a --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-L-1.response @@ -0,0 +1,31 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (3) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-L.key b/neuralcoref/scorer/test/DataFiles/TC-L.key new file mode 100644 index 0000000..472c3b9 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-L.key @@ -0,0 +1,31 @@ +#begin document (nw/xinhua/00/chtb_0009); part 000 +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (1) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 (2) +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - +nw/xinhua/00/chtb_0009 - + +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-M-1.response b/neuralcoref/scorer/test/DataFiles/TC-M-1.response new file mode 100644 index 0000000..2dca5b3 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-M-1.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (0 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 0) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (0) +test2 0 1 jnk - +test2 0 2 d1 (0 +test2 0 3 d2 0) +test2 0 4 jnk - +test2 0 5 e (0) +test2 0 6 jnk - +test2 0 7 f1 (0 +test2 0 8 f2 - +test2 0 9 f3 0) +test2 0 10 . 
- +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-M-2.response b/neuralcoref/scorer/test/DataFiles/TC-M-2.response new file mode 100644 index 0000000..7fd13ec --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-M-2.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (1 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 1) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (2) +test2 0 1 jnk - +test2 0 2 d1 (3 +test2 0 3 d2 3) +test2 0 4 jnk - +test2 0 5 e (4) +test2 0 6 jnk - +test2 0 7 f1 (5 +test2 0 8 f2 - +test2 0 9 f3 5) +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-M-3.response b/neuralcoref/scorer/test/DataFiles/TC-M-3.response new file mode 100644 index 0000000..bf3fb66 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-M-3.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (0 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 0) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (1) +test2 0 1 jnk - +test2 0 2 d1 (1 +test2 0 3 d2 1) +test2 0 4 jnk - +test2 0 5 e (1) +test2 0 6 jnk - +test2 0 7 f1 (2 +test2 0 8 f2 - +test2 0 9 f3 2) +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-M-4.response b/neuralcoref/scorer/test/DataFiles/TC-M-4.response new file mode 100644 index 0000000..590914b --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-M-4.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (0 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 0) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (0) +test2 0 1 jnk (0) +test2 0 2 d1 - +test2 0 3 d2 - +test2 0 4 jnk (0) +test2 0 5 e - +test2 0 6 jnk (0) +test2 0 7 f1 - +test2 0 8 f2 - +test2 0 9 f3 - +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-M-5.response b/neuralcoref/scorer/test/DataFiles/TC-M-5.response new file mode 100644 index 0000000..00aa567 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-M-5.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (1 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 1) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (2) +test2 0 1 jnk (3) +test2 0 2 d1 - +test2 0 3 d2 - +test2 0 4 jnk (4) +test2 0 5 e - +test2 0 6 jnk (5) +test2 0 7 f1 - +test2 0 8 f2 - +test2 0 9 f3 - +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-M-6.response b/neuralcoref/scorer/test/DataFiles/TC-M-6.response new file mode 100644 index 0000000..8436410 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-M-6.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (0 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 0) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (1) +test2 0 1 jnk (1) +test2 0 2 d1 - +test2 0 3 d2 - +test2 0 4 jnk (1) +test2 0 5 e - +test2 0 6 jnk (2) +test2 0 7 f1 - +test2 0 8 f2 - +test2 0 9 f3 - +test2 0 10 . 
- +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-M.key b/neuralcoref/scorer/test/DataFiles/TC-M.key new file mode 100644 index 0000000..2dca5b3 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-M.key @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (0 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 0) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (0) +test2 0 1 jnk - +test2 0 2 d1 (0 +test2 0 3 d2 0) +test2 0 4 jnk - +test2 0 5 e (0) +test2 0 6 jnk - +test2 0 7 f1 (0 +test2 0 8 f2 - +test2 0 9 f3 0) +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-N-1.response b/neuralcoref/scorer/test/DataFiles/TC-N-1.response new file mode 100644 index 0000000..7fd13ec --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-N-1.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (1 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 1) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (2) +test2 0 1 jnk - +test2 0 2 d1 (3 +test2 0 3 d2 3) +test2 0 4 jnk - +test2 0 5 e (4) +test2 0 6 jnk - +test2 0 7 f1 (5 +test2 0 8 f2 - +test2 0 9 f3 5) +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-N-2.response b/neuralcoref/scorer/test/DataFiles/TC-N-2.response new file mode 100644 index 0000000..2dca5b3 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-N-2.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (0 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 0) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (0) +test2 0 1 jnk - +test2 0 2 d1 (0 +test2 0 3 d2 0) +test2 0 4 jnk - +test2 0 5 e (0) +test2 0 6 jnk - +test2 0 7 f1 (0 +test2 0 8 f2 - +test2 0 9 f3 0) +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-N-3.response b/neuralcoref/scorer/test/DataFiles/TC-N-3.response new file mode 100644 index 0000000..bf3fb66 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-N-3.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (0 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 0) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (1) +test2 0 1 jnk - +test2 0 2 d1 (1 +test2 0 3 d2 1) +test2 0 4 jnk - +test2 0 5 e (1) +test2 0 6 jnk - +test2 0 7 f1 (2 +test2 0 8 f2 - +test2 0 9 f3 2) +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-N-4.response b/neuralcoref/scorer/test/DataFiles/TC-N-4.response new file mode 100644 index 0000000..00aa567 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-N-4.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (1 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 1) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (2) +test2 0 1 jnk (3) +test2 0 2 d1 - +test2 0 3 d2 - +test2 0 4 jnk (4) +test2 0 5 e - +test2 0 6 jnk (5) +test2 0 7 f1 - +test2 0 8 f2 - +test2 0 9 f3 - +test2 0 10 . 
- +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-N-5.response b/neuralcoref/scorer/test/DataFiles/TC-N-5.response new file mode 100644 index 0000000..590914b --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-N-5.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (0 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 0) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (0) +test2 0 1 jnk (0) +test2 0 2 d1 - +test2 0 3 d2 - +test2 0 4 jnk (0) +test2 0 5 e - +test2 0 6 jnk (0) +test2 0 7 f1 - +test2 0 8 f2 - +test2 0 9 f3 - +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-N-6.response b/neuralcoref/scorer/test/DataFiles/TC-N-6.response new file mode 100644 index 0000000..8436410 --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-N-6.response @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (0 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 0) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (1) +test2 0 1 jnk (1) +test2 0 2 d1 - +test2 0 3 d2 - +test2 0 4 jnk (1) +test2 0 5 e - +test2 0 6 jnk (2) +test2 0 7 f1 - +test2 0 8 f2 - +test2 0 9 f3 - +test2 0 10 . - +#end document diff --git a/neuralcoref/scorer/test/DataFiles/TC-N.key b/neuralcoref/scorer/test/DataFiles/TC-N.key new file mode 100644 index 0000000..7fd13ec --- /dev/null +++ b/neuralcoref/scorer/test/DataFiles/TC-N.key @@ -0,0 +1,23 @@ +#begin document (LuoTestCase); +test1 0 0 a1 (0 +test1 0 1 a2 0) +test1 0 2 junk - +test1 0 3 b1 (1 +test1 0 4 b2 - +test1 0 5 b3 - +test1 0 6 b4 1) +test1 0 7 jnk - +test1 0 8 . - + +test2 0 0 c (2) +test2 0 1 jnk - +test2 0 2 d1 (3 +test2 0 3 d2 3) +test2 0 4 jnk - +test2 0 5 e (4) +test2 0 6 jnk - +test2 0 7 f1 (5 +test2 0 8 f2 - +test2 0 9 f3 5) +test2 0 10 . 
- +#end document diff --git a/neuralcoref/scorer/test/TestCases.README b/neuralcoref/scorer/test/TestCases.README new file mode 100644 index 0000000..d60d2bb --- /dev/null +++ b/neuralcoref/scorer/test/TestCases.README @@ -0,0 +1,390 @@ +TC-A-1 - perfect: +Key/Ref: {a} {bc} {def} +Rsp/Sys: {a} {bc} {def} +Expected: BCUB=1 [recall=6/6, prec=6/6] +Expected: MUC=1 [recall=3/3=1, prec=3/3=1] +Expected: CEAFm=1 [recall=6/6=1, prec=6/6=1] +Expected: CEAFe=1 [recall=3/3=1, prec=3/3=1] +Expected: BLANC=1 [recall_c=4/4=1, prec_c=4/4=1, recall_n=11/11=1, prec_n=11/11=1] + +TC-A-2 -- response with missing mentions/entities +Key/Ref: {a} {bc} {def} +Rsp/Sys: {a} {de} +Expected: BCUB=.5599 [recall=7/18, prec=3/3] +Expected: MUC=0.5 [recall=1/3, prec=1/1] +Expected: CEAFm=6/9=0.67 [common=3, recall=3/6=0.5, Prec=3/3=1] +Expected: CEAFe=3.6/5=0.72 [common=1+4/5=1.8, recall=1.8/3=0.6, Prec=1.8/2=0.9] +Expected: BLANC=0.35 [recall_c=1/4, prec_c=1/1, recall_n=2/11, prec_n=2/2] + +TC-A-3 -- response with false-alarm mentions/entities +Key/Ref: {a} {bc} {def} +Rsp/Sys: {a} {bcx} {defy} {z} +Expected: BCUB=.6748 [recall=6/6, prec=55/108] +Expected: MUC=0.75 [recall=3/3, prec=3/5] +Expected: CEAFm=12/15=0.8 [common=6, recall=6/6=1, prec=6/9=.67] +Expected: CEAFe=3.6/5=0.76 [common=1+4/5+6/7=2.66, recall=2.66/3=0.89, Prec=2.66/4=0.66] +Expected: BLANC=0.60 [recall_c=4/4, prec_c=4/9, recall_n=11/11, prec_n=11/27] + + +TC-A-4 -- response with both missing and false-alarm mentions/entities +Key/Ref: {a} {bc} {def} +Rsp/Sys: {a} {bcx} {dy} {z} +Expected: BCUB=.4683 [recall=5/9, prec=17/42] +Expected: MUC=1/3=.33333 [recall=1/3, prec=1/3] +Expected: CEAFm=8/13=0.62 [common=4 recall=4/6=0.67 prec=4/7=.57] +Expected: CEAFe=4.4/7=0.63 [common=1+4/5+2/5=2.2, recall=2.2/3=0.73, Prec=2.2/4=0.55] +Expected: BLANC=0.30 [recall_c=1/4, prec_c=1/4, recall_n=5/11, prec_n=5/17] + +TC-A-5 -- response with both missing and false-alarm mentions/entities, and overlapping mentions (capitalized letter: b and B). Overlapping mention B in the aligned entity. +Key/Ref: {a} {bc} {def} +Rsp/Sys: {a} {bcxB} {dy} {z} +Expected: BCUB=.4 [recall=5/9, prec=5/16] +Expected: MUC=2/7=.28571 [recall=1/3, prec=1/4] +Expected: CEAFm=8/14=0.57 [common=4 recall=4/6=0.67 prec=4/8=.5] +Expected: CEAFe=4.14/7=0.59 [common=1+4/6+2/5=2.07, recall=2.07/3=0.69, Prec=2.07/4=0.52] +Expected: BLANC=0.25 [recall_c=1/4, prec_c=1/7, recall_n=5/11, prec_n=5/21] + +TC-A-6 -- response with both missing and false-alarm mentions/entities, and overlapping mentions (capitalized letter: b and B). Overlapping mention B in an unaligned entity. +Key/Ref: {a} {bc} {def} +Rsp/Sys: {a} {bcx} {dy} {Bz} +Expected: BCUB=.4325 [recall=5/9, prec=17/48] +Expected: MUC=2/7=.28571 [recall=1/3, prec=1/4] +Expected: CEAFm=8/14=0.57 [common=4 recall=4/6=0.67 prec=4/8=.5] +Expected: CEAFe=4.4/7=0.63 [common=1+4/5+2/5=2.2, recall=2.2/3=0.73, Prec=2.2/4=0.55] +Expected: BLANC=0.26 [recall_c=1/4, prec_c=1/5, recall_n=5/11, prec_n=5/23] + +TC-A-7 -- response with both missing and false-alarm mentions/entities, and duplicate mentions (capitalized letter: b and B). Duplicate mention B in the same cluster entity (note: this is diff from TC5) -- this tests mention de-duplication. 
+Key/Ref: {a} {bc} {def} +Rsp/Sys: {a} {bcxB} {dy} {z} + de-dup: {a} {bcx} {dy} {z} + +de-dup: +Expected: BCUB=.4683 [recall=5/9, prec=17/42] +Expected: MUC=1/3=.33333 [recall=1/3, prec=1/3] +Expected: CEAFm=8/13=0.61538 [common=4, recall=4/6=0.66667, Prec=4/7=0.57143] +Expected: CEAFe=4.14/7=0.62857 [common=1+4/5+2/5=2.2, recall=2.2/3=0.73333, Prec=2.2/4=0.55] +Expected: BLANC=0.30 [recall_c=1/4, prec_c=1/4, recall_n=5/11, prec_n=5/17] + +if No de-dup: +Expected: CEAFm=8/14=0.57 [common=4 recall=4/6=0.67 prec=4/8=.5] +Expected: CEAFe=4.14/7=0.59 [common=1+4/6+2/5=2.07, recall=2.07/3=0.69, Prec=2.07/4=0.52] + + +TC-A-8 -- response with both missing and false-alarm mentions/entities, and duplicate mentions (capitalized letter: b and B). Duplicate mention B in a diff entity from b. +Key/Ref: {a} {bc} {def} +Rsp/Sys: {a} {bcx} {dy} {Bz} + +De-dup: +Expected: BCUB=.4683 [recall=5/9, prec=17/42] +Expected: MUC=1/3=.33333 [recall=1/3, prec=1/3] +Expected: CEAFm=8/13=0.61538 [common=4 recall=4/6=0.67 prec=4/7=.57143] +Expected: CEAFe=4.14/7=0.63 [common=1+4/5+2/5=2.2, recall=2.2/3=0.73, Prec=2.2/4=0.55] +Expected: BLANC=0.30 [recall_c=1/4, prec_c=1/4, recall_n=5/11, prec_n=5/17] + +If no de-dup: +Expected: CEAFm=8/14=0.57 [common=4 recall=4/6=0.67 prec=4/8=.5] +Expected: CEAFe=4.14/7=0.63 [common=1+4/5+2/5=2.2, recall=2.2/3=0.73, Prec=2.2/4=0.55] + +TC-A-9 -- show B3 can be canned: "b" is repeated 10 times so precision approaches 1 +Key/Ref: {a} {bc} {def} +Rsp/Sys: {a} {bcx} {dy} {Bx10z} +de-dup Rsp/Sys: {a} {bcx} {dy} {z} + +De-dup: +Expected: BCUB=.4683 [recall=5/9, prec=17/42] +Expected: MUC=1/3=.33333 [recall=1/3, prec=1/3] +Expected: CEAFm=8/14=0.57 [common=4 recall=4/6=0.67 prec=4/7=.57143] +Expected: CEAFe=4.4/7=0.63 [common=1+4/5+2/5=2.2, recall=2.2/3=0.73, Prec=2.2/4=0.55] +Expected: BLANC=0.30 [recall_c=1/4, prec_c=1/4, recall_n=5/11, prec_n=5/17] + + +TC-A-10 - Gold mentions. Only singletons in the response. +Key/Ref: {a} {bc} {def} +Rsp/Sys: {a} {b} {c} {d} {e} {f} +Expected: BCUB=.6667 [recall=3/6, prec=6/6] +Expected: MUC=0 [recall=0, prec=0] +Expected: BLANC=0.42 [recall_c=0/4, prec_c=0/0, f_c=0, recall_n=11/11, prec_n=11/15] + + +TC-A-11 - Gold mentions. All mentions are coreferent in the response. +Key/Ref: {a} {bc} {def} +Rsp/Sys: {abcdef} + +Expected: BCUB=0.5599 [recall=6/6, prec=7/18] +Expected: MUC=6/8=0.75 [recall=3/3, prec=3/5] +Expected: BLANC=0.21 [recall_c=4/4, prec_c=4/15, recall_n=0/11, prec_n=0/0, f_n=0] + + +TC-A-12 - System mentions. Only singletons in the response. +Key/Ref: {a} {bc} {def} +Rsp/Sys: {a} {x} {y} {c} {d} {e} {z} + +Expected: BCUB=0.4425 [recall=13/36, prec=4/7] +Expected: MUC=0 [recall=0, prec=0] +Expected: BLANC=0.16 [recall_c=0/4, prec_c=0/0, f_c=0, recall_n=5/11, prec_n=5/21] + + +TC-A-13 - System mentions. All mentions are coreferent in the response. +Key/Ref: {a} {bc} {def} +Rsp/Sys: {axycdez} + +Expected: BCUB=0.19447 [recall=17/36, prec=6/49] +Expected: MUC=2/9 [recall=1/3, prec=1/6] +Expected: BLANC=0.04 [recall_c=1/4, prec_c=1/21, recall_n=0/11, prec_n=0/0, f_n=0] + + +TC-B-1 -- spurious mention (x) and missing mention (a) in response; link (bc) is a key non-coref link and is an incorrect response coref link. 
+ + Keys: {ab} {cde} +Response: {bcx} {de} + + key coref links: C_k = {(ab), (cd), (de), (ce)} +key non-coref links: N_k = {(ac), (ad), (ae), (bc), (bd), (be)} + + response coref links: C_r = {(bc), (bx), (cx), (de)} +response non-coref links: N_r = {(bd), (be), (cd), (ce), (xd), (xe)} + +(I'll use ^ for set intersection) +C_k ^ C_r = {(de)} => R_c = |C_k^C_r| / |C_k| = 1/4, P_c = 1/|C_r| = 1/4, F_c = 1/4 +N_k ^ N_r = {(bd), (be)} => R_n = |N_k^N_r|/|N_k| = 2/6, P_n = 2/|N_r| = 2/6, F_n = 1/3 + +BLANC = 1/2 (F_c + F_n) = 7/24. + + + + + TC-C-1 -- same as TC14 plus a new entity and its correct prediction shown. this was for testing the more than two entity case. + + Keys: {ab} {cde} {fg} +Response: {bcx} {de} {fg} + + key coref links: C_k = {(ab), (cd), (de), (ce), (fg)}} + key non-coref links: N_k = {(ac), (ad), (ae), (bc), (bd), (be), (af), (ag), (bf), (bg), (cf), (cg), (df), (dg), (ef), (eg)} + + response coref links: C_r = {(bc), (bx), (cx), (de), (fg)} +response non-coref links: N_r = {(bd), (be), (cd), (ce), (xd), (xe), (bf), (bg), (cf), (cg), (xf), (xg), (df), (dg), (ef), (eg)} + +(I'll use ^ for set intersection) +C_k ^ C_r = {(de), (fg)} => R_c = |C_k^C_r| / |C_k| = 2/5, P_c = 2/|C_r| = 2/5, F_c = 2/5 = 0.40 +N_k ^ N_r = {(bd), (be), (bf), (bg), (cf), (cg), (df), (dg), (ef), (eg)} => R_n = |N_k^N_r|/|N_k| = 10/16, P_n = 10/|N_r| = 10/16, F_n = 10/16 = 0.625 + +BLANC = 1/2 (F_c + F_n) = 0.5125 + + + +# ------------ examples from the B-CUBED paper + +TC-D-1 -- merging one small cluster with a big cluster + +key: {12345} {67} {89ABC} +--- + +1-2-3-4-5 + +6-7 + +8-9-A-B-C + + + +response: {12345} {6789ABC} +--------- + +1-2-3-4-5 + +6-7 + | + 8-9-A-B-C + + +Expected: BCUB [r=12/12, p=16/21, f=0.864864865] +Expected: MUC [r=9/9, p=9/10, f=0.947368421] + + + +TC-E-1 -- merging two big clusters + + +key: {12345} {67} {89ABC} +--- + +1-2-3-4-5 + +6-7 + +8-9-A-B-C + + + +response: {123456789ABC} {67} +--------- + +1-2-3-4-5 + | +6-7 | + | + 8-9-A-B-C + + +Expected: BCUB [r=1, p=7/12, f=0.736842105] +Expected: MUC [r=9/9, p=9/10, f=0.947368421] + + +# ---------- examples from the MUC paper + +TC-F-1 -- + + key: {ABCD} ---- Links: A-B; B-C; C-D +response: {AB} {CD} ---- Links: A-B; C-D + +Expected: MUC [r=2/3, p=2/2, f=2*(2/3)*(2/2)/(2/3+2/2)] + + + +TC-G-1 -- + + key: {AB} {CD} ---- Links: A-B; C-D +response: {ABCD} ---- Links: A-B; B-C; C-D + +Expected: MUC [r=2/2, p=2/3, f=2*(2/2)*(2/3)/(2/2+2/3)] + + + +TC-H-1 -- + + key: {ABCD} ---- Links: A-B; B-C; B-D +response: {ABCD} ---- Links: A-B; B-C; C-D + +Expected: MUC [r=1, p=1, f=1] + + + +TC-I-1 -- + + key: {ABCD} ---- Links: A-B; B-C; B-D +response: {AB} {CD} ---- Links: A-B; C-D + +Expected: MUC [r=2/3, p=2/2, f=2*(2/3)*(2/2)/(2/3+2/2)] + + + +TC-J-1 -- + + key: {ABC} ---- Links: A-B; B-C +response: {AC} ---- Links: A-C + +Expected: MUC [r=1/2, p=1/1, f=2*(1/2)*(1/1)/(1/2+1/1)] + + + +TC-K-1 -- + + key: {BCDEGHJ} ---- Links: B-C; C-D; D-E; E-G; G-H; H-J +response: {ABC} {DEF} {GHI} ---- Links: A-B; B-C; D-E; E-F; G-H; H-I + +Expected: MUC [r=3/6, p=3/6, f=3/6] + + + +TC-L-1 -- + + key: {ABC} {DEFG} ---- Links: A-B; B-C; D-E; E-F; F-G +response: {AB} {CD} {FGH} ---- Links: A-B; C-D; F-G; G-H + +Expected: MUC [r=2/5, p=2/4, f=2*(2/5)*(2/4)/(2/5+2/4)] + + +TC-M-1 - Only coreferent mentions in the key. Gold mentions. Matching response. Since the key contains no non-coreference link, BLANC equals recall_c, prec_c, F_c. 
+Key/Ref: {abcdef} +Rsp/Sys: {abcdef} + +Expected: BCUB=1 +Expected: MUC=1 +Expected: CEAFm=1 +Expected: CEAFe=1 +Expected: BLANC=1 [recall_c=15/15=1, prec_c=15/15=1] + + +TC-M-2 - Only coreferent mentions in the key. Gold mentions. Response contains only non-coreference links. +Key/Ref: {abcdef} +Rsp/Sys: {a} {b} {c} {d} {e} {f} + +Expected: MUC=0 +Expected: BLANC=0 [recall_c=0/15=0, prec_c=0/0=0] + + +TC-M-3 - Only coreferent mentions in the key. Gold mentions. Response contains coreference and non-coreference links. +Key/Ref: {abcdef} +Rsp/Sys: {ab} {cde} {f} + +Expected: BLANC=0.42 [recall_c=4/15, prec_c=4/4=1] + + +TC-M-4 - Only coreferent mentions in the key. System mentions: only coreferent mentions. Since the key contains no non-coreference link, BLANC equals recall_c, prec_c, F_c. +Key/Ref: {abcdef} +Rsp/Sys: {abcxyz} + +Expected: BLANC=0.20 [recall_c=3/15, prec_c=3/15] + + +TC-M-5 - Only coreferent mentions in the key. System mentions: only singletons. +Key/Ref: {abcdef} +Rsp/Sys: {a} {b} {c} {x} {y} {z} + +Expected: MUC=0 +Expected: BLANC=0 [recall_c=0/15=0, prec_c=0/0=0] + + +TC-M-6 - Only coreferent mentions in the key. System mentions: coreference and non-coreference links. +Key/Ref: {abcdef} +Rsp/Sys: {ab} {cxy} {z} + +Expected: BLANC=0.11 [recall_c=1/15, prec_c=1/4] + + +TC-N-1 - Only singletons in the key. Gold mentions. Matching response. Since the key contains no coreference link, BLANC equals recall_n, prec_n, F_n. +Key/Ref: {a} {b} {c} {d} {e} {f} +Rsp/Sys: {a} {b} {c} {d} {e} {f} + +Expected: BCUB=1 +Expected: MUC=0 +Expected: CEAFm=1 +Expected: CEAFe=1 +Expected: BLANC=1 [recall_n=15/15=1, prec_n=15/15=1] + + +TC-N-2 - Only singletons in the key. Gold mentions. Response contains only coreference links. +Key/Ref: {a} {b} {c} {d} {e} {f} +Rsp/Sys: {abcdef} + +Expected: BLANC=0 [recall_n=0/15=0, prec_n=0/0=0] + + +TC-N-3 - Only singletons in the key. Gold mentions. Response contains coreference and non-coreference links. +Key/Ref: {a} {b} {c} {d} {e} {f} +Rsp/Sys: {ab} {cde} {f} + +Expected: BLANC=0.85 [recall_n=11/15, prec_n=11/11=1] + + +TC-N-4 - Only singletons in the key. System mentions: only singletons. Since the key contains no coreference link, BLANC equals recall_n, prec_n, F_n. +Key/Ref: {a} {b} {c} {d} {e} {f} +Rsp/Sys: {a} {b} {c} {x} {y} {z} + +Expected: MUC=0 +Expected: BLANC=0.20 [recall_n=3/15, prec_n=3/15] + + +TC-N-5 - Only singletons in the key. System mentions: only coreference links. +Key/Ref: {a} {b} {c} {d} {e} {f} +Rsp/Sys: {abcxyz} + +Expected: BLANC=0 [recall_n=0/15=0, prec_n=0/0=0] + + +TC-N-6 - Only singletons in the key. Only coreferent mentions in the key. System mentions: coreference and non-coreference links. +Key/Ref: {a} {b} {c} {d} {e} {f} +Rsp/Sys: {ab} {cxy} {z} + +Expected: BLANC=0.15 [recall_n=2/15, prec_n=2/11] + diff --git a/neuralcoref/scorer/test/test.pl b/neuralcoref/scorer/test/test.pl new file mode 100755 index 0000000..78228e8 --- /dev/null +++ b/neuralcoref/scorer/test/test.pl @@ -0,0 +1,46 @@ +#!/usr/bin/perl + +BEGIN { + $d = $0; + $d =~ s/\/[^\/][^\/]*$//g; + push(@INC, $d); + push(@INC, $d . "/../lib"); +} + +use strict; +use CorScorer; +use CorefMetricTest; +use CorefMetricTestConfig; + +my $error_tolerance = 1.e-4; +my $script_dir = $0; +$script_dir =~ s/\/[^\/][^\/]*$//g; + +foreach my $test_case (@CorefMetricTestConfig::TestCases) { + my $id = $test_case->{'id'}; + my @key_response_files = ($script_dir . "/" . $test_case->{'key_file'}, + $script_dir . "/" . 
$test_case->{'response_file'}); + print "\nTesting case ($id): keyFile=", $key_response_files[0], + " responseFile=", $key_response_files[1], "\n"; + my $expected_metrics = $test_case->{'expected_metrics'}; + foreach my $metric_name (sort keys %$expected_metrics) { + my $expected_values = $expected_metrics->{$metric_name}; + *::SAVED_STDOUT = *STDOUT; + *STDOUT = *::SUPRRES_STDOUT; + my @actual_counts = &CorScorer::Score($metric_name, @key_response_files); + # Compute R,P,and F1 from raw counts. + my @actual_values = CorefMetricTest::ComputeScoreFromCounts(@actual_counts); + *STDOUT = *::SAVED_STDOUT; + my $diff = CorefMetricTest::DiffExpectedAndActual($expected_values, \@actual_values); + printf " metric: %+10s", $metric_name; + if ($diff < $error_tolerance) { + print " => PASS\n"; + } else { + print " => FAIL\n"; + print " Expected (recall, prec, F1) = (", join(" ", @$expected_values), ")\n"; + print " Actual (recall, prec, F1) = (", join(" ", @actual_values), ")\n"; + #exit(1); + } + } +} + diff --git a/neuralcoref/scorer_wrapper.pl b/neuralcoref/scorer_wrapper.pl new file mode 100644 index 0000000..b43fc30 --- /dev/null +++ b/neuralcoref/scorer_wrapper.pl @@ -0,0 +1,35 @@ +#!/usr/bin/perl + +BEGIN { + $d = $0; + $d =~ s/\/[^\/][^\/]*$//g; + + if ($d eq $0) { + unshift(@INC, "scorer/lib"); + } + else { + unshift(@INC, $d . "/scorer/lib"); + } +} + +use strict; +use CorScorer; + +my $metric = shift(@ARGV); +if ($metric !~ /^(muc|bcub|ceafm|ceafe|blanc|all)/i) { + print "Invalid metric\n"; + exit; +} + +if ($metric eq 'all') { + foreach my $m ('muc', 'bcub', 'ceafm', 'ceafe', 'blanc') { +# print "\nMETRIC $m:\n"; + my ($acumNR, $acumDR, $acumNP, $acumDP, $identNR, $identDR, $identNP, $identDP) = &CorScorer::Score($m, @ARGV); + print "$acumNR $acumDR $acumNP $acumDP\n$identNR $identDR $identNP $identDP"; + } +} +else { + my ($acumNR, $acumDR, $acumNP, $acumDP, $identNR, $identDR, $identNP, $identDP) = &CorScorer::Score($metric, @ARGV); + print "$acumNR $acumDR $acumNP $acumDP\n$identNR $identDR $identNP $identDP"; +} + diff --git a/neuralcoref/server.py b/neuralcoref/server.py index 7a0973f..6540f1b 100644 --- a/neuralcoref/server.py +++ b/neuralcoref/server.py @@ -6,15 +6,12 @@ from __future__ import unicode_literals import json -import sys from wsgiref.simple_server import make_server import falcon from neuralcoref.algorithm import Coref -from neuralcoref.data import MENTION_LABEL - -is_python2 = int(sys.version[0]) == 2 -unicode_ = unicode if is_python2 else str +from neuralcoref.document import MENTION_LABEL +from neuralcoref.compat import unicode_ class CorefWrapper(Coref): def parse_and_get_mentions(self, utterances, utterances_speakers_id=None, context=None, diff --git a/neuralcoref/utils.py b/neuralcoref/utils.py new file mode 100644 index 0000000..333ad28 --- /dev/null +++ b/neuralcoref/utils.py @@ -0,0 +1,78 @@ +# coding: utf8 +"""Utils""" + +from __future__ import absolute_import +from __future__ import unicode_literals +from __future__ import print_function + +from concurrent.futures import ThreadPoolExecutor, as_completed + +import numpy as np +from tqdm import tqdm + +DISTANCE_BINS = list(range(5)) + [5]*3 + [6]*8 + [7]*16 +[8]*32 + +def encode_distance(x): + ''' Encode an integer or an array of integers as a (bined) one-hot numpy array ''' + def _encode_distance(d): + ''' Encode an integer as a (bined) one-hot numpy array ''' + dist_vect = np.zeros((11,)) + if d < 64: + dist_vect[DISTANCE_BINS[d]] = 1 + else: + dist_vect[9] = 1 + dist_vect[10] = min(float(d), 64.0) / 64.0 + 
return dist_vect + + if isinstance(x, np.ndarray): + arr_l = [_encode_distance(y)[np.newaxis, :] for y in x] + out_arr = np.concatenate(arr_l) + else: + out_arr = _encode_distance(x) + return out_arr + +def parallel_process(array, function, n_jobs=16, use_kwargs=False, front_num=10): + """ + A parallel version of the map function with a progress bar. + + Args: + array (array-like): An array to iterate over. + function (function): A python function to apply to the elements of array + n_jobs (int, default=16): The number of cores to use + use_kwargs (boolean, default=False): Whether to consider the elements of array as dictionaries of + keyword arguments to function + front_num (int, default=3): The number of iterations to run serially before kicking off the parallel job. + Useful for catching bugs + Returns: + [function(array[0]), function(array[1]), ...] + """ + #We run the first few iterations serially to catch bugs + if front_num > 0: + front = [function(**a) if use_kwargs else function(a) for a in array[:front_num]] + #If we set n_jobs to 1, just run a list comprehension. This is useful for benchmarking and debugging. + if n_jobs==1: + return front + [function(**a) if use_kwargs else function(a) for a in tqdm(array[front_num:])] + #Assemble the workers + with ThreadPoolExecutor(max_workers=n_jobs) as pool: + #Pass the elements of array into function + if use_kwargs: + futures = [pool.submit(function, **a) for a in array[front_num:]] + else: + futures = [pool.submit(function, a) for a in array[front_num:]] + kwargs = { + 'total': len(futures), + 'unit': 'it', + 'unit_scale': True, + 'leave': True + } + #Print out the progress as tasks complete + for _ in tqdm(as_completed(futures), **kwargs): + pass + out = [] + #Get the results from the futures. + for future in tqdm(futures): + try: + out.append(future.result()) + except Exception as e: + out.append(e) + return front + out diff --git a/readme.md b/readme.md index 4449df4..70bc0cb 100644 --- a/readme.md +++ b/readme.md @@ -1,11 +1,11 @@ -# Neural coref +# Neural coref v2.0 State-of-the-art coreference resolution library using neural nets and spaCy. [Try it online !](https://huggingface.co/coref/) ![Neuralcoref demo](https://huggingface.co/coref/assets/thumbnail-large.png) This coreference resolution module is based on the super fast [spaCy](https://spacy.io/) parser and uses the neural net scoring model described in [Deep Reinforcement Learning for Mention-Ranking Coreference Models](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf) by Kevin Clark and Christopher D. Manning, EMNLP 2016. -Be sure to check out [our medium post](https://medium.com/huggingface/state-of-the-art-neural-coreference-resolution-for-chatbots-3302365dcf30) in which we talk more about neuralcoref and coreference resolution. +With ✨Neuralcoref v2.0, you should now be able to train the coreference resolution system on your own dataset — e.g., another language than English! — **provided you have an annotated dataset**. Be sure to check [our medium post detailing the release of v2.0 and how to train the model](https://medium.com/huggingface/how-to-train-a-neural-coreference-model-neuralcoref-2-7bb30c1abdfe) as well as our [first medium post](https://medium.com/huggingface/state-of-the-art-neural-coreference-resolution-for-chatbots-3302365dcf30) in which we talk more about coreference resolution in general. 
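A minimal usage sketch, to give a feel for the API before the sections below (illustration only — the `one_shot_coref` entry point is assumed from `algorithm.py` and may differ in your version; see the Usage section for the supported entry points):

```python
from neuralcoref.algorithm import Coref

# Loads the spaCy model and the neural scoring weights shipped with the package
coref = Coref()

# `one_shot_coref` is assumed here for illustration; check algorithm.py for the
# exact resolution entry points available in your version.
clusters = coref.one_shot_coref(utterances=u"My sister has a dog. She loves him.")

# Print the coreference clusters that were found
coref.display_clusters()
```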
## Installation Clone the repo and install using pip (the trained model weights are too large for PyPI) ``` git clone https://github.com/huggingface/neuralcoref.git cd neuralcoref pip install . ``` +The install script will install `spacy` and `falcon` (`falcon` is only used by the server). -You will also need an English model for spaCy if you don't already have spaCy installed in your environment. +You will also need an English model for spaCy if you don't already have one. ``` python -m spacy download en ``` The mention extraction module is strongly influenced by the quality of the parsing, so we recommend selecting a model with a higher accuracy than usual. +## Re-train the model / Extend to another language +If you want to retrain the model or train it on another language, see our detailed [training instructions](training.md) as well as our [detailed blog post](https://medium.com/huggingface/how-to-train-a-neural-coreference-model-neuralcoref-2-7bb30c1abdfe). + ## Usage ### As a standalone server `python -m neuralcoref.server` starts a wsgiref simple server. diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index d40c3f5..0000000 --- a/requirements.txt +++ /dev/null @@ -1,28 +0,0 @@ -certifi==2017.4.17 -chardet==3.0.4 -cymem==1.31.2 -cytoolz==0.8.2 -dill==0.2.7 -falcon==1.2.0 -ftfy==4.4.3 -html5lib==0.999999999 -idna==2.5 -murmurhash==0.26.4 -numpy==1.13.1 -pathlib==1.0.1 -plac==0.9.6 -preshed==1.0.0 -python-mimeparse==1.6.0 -regex==2017.4.5 -requests==2.18.1 -six==1.10.0 -spacy==2.0.7 -termcolor==1.1.0 -thinc==6.5.2 -toolz==0.8.2 -tqdm==4.14.0 -ujson==1.35 -urllib3==1.21.1 -wcwidth==0.1.7 -webencodings==0.5.1 -wrapt==1.10.10 diff --git a/setup.py b/setup.py index ad59efc..b2a414c 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,6 @@ 'Programming Language :: Python :: 3.5' ], install_requires=[ - 'numpy', 'spacy', 'falcon'], packages=['neuralcoref'], diff --git a/training.md b/training.md new file mode 100644 index 0000000..27b85f0 --- /dev/null +++ b/training.md @@ -0,0 +1,84 @@ +# How to train and modify the neural coreference model + +Please check our [detailed blog post](https://medium.com/huggingface/how-to-train-a-neural-coreference-model-neuralcoref-2-7bb30c1abdfe) together with these short notes. + +## Install +As always, we recommend creating a clean environment (conda or virtual env) to install and train the model. + +You will need to install [pyTorch](http://pytorch.org/) and the neuralcoref package with the additional training requirements, and download a language model for spaCy. +Currently this can be done (assuming an English language model) with +````bash +conda install pytorch -c pytorch +pip install -r ./training_requirements.txt -e . +python -m spacy download en +```` + +## Get the data +To train on English, download: +- the [OntoNotes dataset](https://catalog.ldc.upenn.edu/ldc2013t19) from LDC, and +- the [CoNLL-2012 skeleton files](http://conll.cemantix.org/2012/data.html) from the CoNLL 2012 shared task site + +(If you want to train on another language, see the section [train on a new language](#train-on-a-new-language) below.) + +You will then need to combine the skeleton files with the OntoNotes files to get the `*._conll` text files which can be used as input for training. + +To this end, the [CoNLL 2012 shared task site](http://conll.cemantix.org/2012/data.html) supplies processing scripts. I have updated them to fix some outdated dependencies; you will find the updated scripts in [conll_processing_scripts/](/neuralcoref/conll_processing_scripts/).
To use the scripts, follow the instructions given on the [CoNLL 2012 shared task site](http://conll.cemantix.org/2012/data.html), i.e. +````bash +skeleton2conll.sh -D [path_to_ontonotes_train_folder] [path_to_skeleton_train_folder] +skeleton2conll.sh -D [path_to_ontonotes_test_folder] [path_to_skeleton_test_folder] +skeleton2conll.sh -D [path_to_ontonotes_dev_folder] [path_to_skeleton_dev_folder] +```` + +## Prepare the data +Once you have the set of `*._conll` files, you can prepare the training data by running [conllparser.py](/neuralcoref/conllparser.py) on each split of the data set (train, test, dev) as +````bash +python -m neuralcoref.conllparser --path ./data/train/ +python -m neuralcoref.conllparser --path ./data/test/ +python -m neuralcoref.conllparser --path ./data/dev/ +```` + +Conllparser will: +- parse the `*._conll` files using spaCy, +- identify predicted mentions, +- compute the mention features (see our blog post), and +- gather the mention features in a set of numpy arrays to be used as input for the neural net model. + +## Train the model +Once the files have been pre-processed (you should have a set of `*.npy` files in a sub-directory `/numpy` in each of your (train|test|dev) data folders), you can start the training process using [learn.py](/neuralcoref/learn.py), for example as +````bash +python -m neuralcoref.learn --train ./data/train/ --eval ./data/dev/ +```` + +There are many parameters and options for the training. You can list them with the usual +````bash +python -m neuralcoref.learn --help +```` + +You can follow the training by running [Tensorboard for pyTorch](https://github.com/lanpa/tensorboard-pytorch) (it requires a version of TensorFlow; any version will be fine). Run it with `tensorboard --logdir runs`. + +## Some details on the training +The model and the training are thoroughly described in our [very detailed blog post](https://medium.com/huggingface/how-to-train-a-neural-coreference-model-neuralcoref-2-7bb30c1abdfe). The training process is similar to the mention-ranking training described in [Clark and Manning (2016)](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf), namely: +- A first step of training uses a standard cross entropy loss on the mention pair labels, +- A second step of training uses a cross entropy loss on the top pairs only, and +- A third step of training uses a slack-rescaled ranking loss. + +With the default options, the training switches from one step to the next as soon as the evaluation score stops increasing. + +Training the model with the default hyper-parameters reaches a test loss of about 61.2, which is lower than the mention ranking test loss of 64.7 reported in [Clark and Manning (2016)](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf). + +Some possible explanations: +- Our mention extraction function is a simple rule-based function (in [document.py](/neuralcoref/document.py)) that was not extensively tuned on the CoNLL dataset and as a result only identifies about 90% of the gold mentions in the CoNLL-2012 dataset (see the evaluation at the start of the training), thereby reducing the maximum possible score. Manually tuning a mention identification module can be a lengthy process that basically involves designing a lot of heuristics to prune spurious mentions while keeping a high recall (see for example the [rule-based mention extraction used in CoreNLP](http://www.aclweb.org/anthology/D10-1048)).
An alternative is to train an end-to-end identification module, as used in the AllenAI coreference module, but this is a lot more complex (you have to learn a pruning function) and the focus of the neuralcoref project is to have a coreference module with a good trade-off between accuracy and simplicity/speed. +- The hyper-parameters and the optimization procedure have not been fully tuned, and it is likely possible to find better hyper-parameters and smarter ways to optimize. One possibility is to adjust the balance between the gradients backpropagated in the single-mention and the mention-pair feedforward networks (see our [blog post](https://medium.com/huggingface/how-to-train-a-neural-coreference-model-neuralcoref-2-7bb30c1abdfe) for more details on the model architecture). Here again, we aimed for a balance between the accuracy and the training speed. As a result, the model trains in about 18h versus about a week for the original model of [Clark and Manning (2016)](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf) and 2 days for the current state-of-the-art model of AllenAI. +- Again for the sake of high throughput, the parse trees output by the [standard English model](https://spacy.io/models/en#en_core_web_sm) of spaCy 2 (which we used for these tests) are slightly less accurate than the carefully tuned CoreNLP parse trees (but they are way faster to compute!) and will lead to a slightly higher percentage of wrong parsing annotations. +- Finally, it may also be interesting to use newer word vectors such as [ELMo](https://arxiv.org/abs/1802.05365), which were shown to increase the F1 test score of a state-of-the-art coreference model by more than 3 points. + +## Train on a new language +Training on a new language is now possible. However, do not expect it to be a plug-and-play operation, as it involves finding a good annotated dataset and adapting the file-loading and mention-extraction functions to your file format and your language syntax (parse tree). + +To bootstrap your work, I detail here the general steps you should follow: +- Find a corpus with coreference annotations (as always, the bigger, the better). +- Check that spaCy [supports your language](https://spacy.io/models/) (i.e. is able to parse it). If not, you will have to find another parser that is able to parse your language and integrate it with the project (this might involve quite large modifications to neuralcoref, depending on the parser). +- Find a set of pre-trained word vectors in your language (GloVe or others). +- If your dataset does not follow the tabular `*_conll` file format (see [details on the CoNLL file format](http://conll.cemantix.org/2012/data.html) on the CoNLL website), you will have to tweak the `load_file` function in [conllparser.py](/neuralcoref/conllparser.py) to adapt it to your file format. +- Adapt the mention extraction function to your language's parse trees (`extract_mentions_spans` in [document.py](/neuralcoref/document.py)) to reach an acceptable identification of mentions (the function should output the list of all possible mentions in a document: pronouns, nouns, noun phrases and all the possible nested combinations); a toy spaCy-based sketch is given at the end of this document. +- Re-train the model and tune the hyper-parameters. \ No newline at end of file diff --git a/training_requirements.txt b/training_requirements.txt new file mode 100644 index 0000000..637f078 --- /dev/null +++ b/training_requirements.txt @@ -0,0 +1,3 @@ +spacy +torch +tensorboardX
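For the mention-extraction adaptation mentioned above, here is a toy, heavily simplified sketch of what a rule-based extractor over spaCy parse trees can look like. It is not the actual `extract_mentions_spans` function from [document.py](/neuralcoref/document.py); the function name and the rules below are illustrative only, and assume a spaCy model with a parser and a named-entity recognizer for your language.

````python
import spacy

nlp = spacy.load('en')  # replace with a model for your language

def toy_extract_mention_spans(doc):
    """Toy mention extraction: pronouns, base noun phrases and named entities."""
    spans = []
    # Pronouns are single-token mentions.
    spans.extend(doc[i:i + 1] for i, tok in enumerate(doc) if tok.pos_ == 'PRON')
    # Base noun phrases from the dependency parse.
    spans.extend(doc.noun_chunks)
    # Named entities are usually mentions as well.
    spans.extend(doc.ents)
    # Deduplicate by span boundaries and sort by position in the document.
    unique = {(span.start, span.end): span for span in spans}
    return [unique[key] for key in sorted(unique)]

doc = nlp(u"My sister has a dog. She loves him.")
print([span.text for span in toy_extract_mention_spans(doc)])
````

A real extractor would also walk the parse tree to produce the nested combinations (for example a noun phrase together with its prepositional attachments), which is where most of the language-specific work lies.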